diff --git a/python/durak/__init__.py b/python/durak/__init__.py index b00d4db..0d2714b 100644 --- a/python/durak/__init__.py +++ b/python/durak/__init__.py @@ -26,6 +26,8 @@ attach_detached_suffixes, ) from .tokenizer import ( + Tokenizer, + TokenizationError, normalize_tokens, split_sentences, tokenize, diff --git a/python/durak/_durak_core.pyi b/python/durak/_durak_core.pyi index b1ff11b..ceaee14 100644 --- a/python/durak/_durak_core.pyi +++ b/python/durak/_durak_core.pyi @@ -59,17 +59,26 @@ def tokenize_with_offsets(text: str) -> list[tuple[str, int, int]]: def lookup_lemma(word: str) -> str | None: """Perform exact dictionary lookup for lemmatization. - Tier 1 lemmatization: Fast exact lookup in the internal dictionary. + Tier 1 lemmatization: Fast exact lookup in the embedded Turkish lemma dictionary. + The dictionary contains 1,362+ inflected forms mapped to their base lemmas, + loaded from resources/tr/lemmas/turkish_lemma_dict.txt at build time. + + Coverage: + - High-frequency nouns with case/plural suffixes + - Common verb inflections (tense, aspect, person markers) + - Systematic vowel harmony patterns (front/back vowel classes) Args: - word: The word to lemmatize + word: The inflected word to lemmatize Returns: - The lemma if found in dictionary, None otherwise + The base lemma if found in dictionary, None otherwise Examples: >>> lookup_lemma("kitaplar") 'kitap' + >>> lookup_lemma("geliyorum") + 'gel' >>> lookup_lemma("unknown") None """ diff --git a/src/lib.rs b/src/lib.rs index bf7ea3c..d2d183f 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -265,6 +265,27 @@ mod tests { } } + #[test] + fn test_lookup_lemma_with_resource_dict() { + // Test lookups from embedded turkish_lemma_dict.txt + let test_cases = vec![ + ("kitaplar", "kitap"), + ("evler", "ev"), + ("geliyorum", "gel"), + ("aldım", "al"), + ("adamlar", "adam"), + ("anaları", "ana"), + ]; + + for (inflected, expected_lemma) in test_cases { + let result = lookup_lemma(inflected); + assert_eq!(result, Some(expected_lemma.to_string()), + "lookup_lemma('{}') should return '{}', got: {:?}", + inflected, expected_lemma, result + ); + } + } + #[test] fn test_lookup_lemma_oov_words() { // Out-of-vocabulary words should return None