cdliai · ada-cinar · Jan 27, 2026 · Jan 27, 2026
@@ -26,6 +26,8 @@
     attach_detached_suffixes,
 )
 from .tokenizer import (
+    Tokenizer,
+    TokenizationError,
     normalize_tokens,
     split_sentences,
     tokenize,

@@ -59,17 +59,26 @@ def tokenize_with_offsets(text: str) -> list[tuple[str, int, int]]:
 def lookup_lemma(word: str) -> str | None:
     """Perform exact dictionary lookup for lemmatization.
 
-    Tier 1 lemmatization: Fast exact lookup in the internal dictionary.
+    Tier 1 lemmatization: Fast exact lookup in the embedded Turkish lemma dictionary.
+    The dictionary contains 1,362+ inflected forms mapped to their base lemmas,
+    loaded from resources/tr/lemmas/turkish_lemma_dict.txt at build time.
+
+    Coverage:
+    - High-frequency nouns with case/plural suffixes
+    - Common verb inflections (tense, aspect, person markers)
+    - Systematic vowel harmony patterns (front/back vowel classes)
 
     Args:
-        word: The word to lemmatize
+        word: The inflected word to lemmatize
 
     Returns:
-        The lemma if found in dictionary, None otherwise
+        The base lemma if found in dictionary, None otherwise
 
     Examples:
         >>> lookup_lemma("kitaplar")
         'kitap'
+        >>> lookup_lemma("geliyorum")
+        'gel'
         >>> lookup_lemma("unknown") is None
         True
     """

@@ -265,6 +265,27 @@ mod tests {
         }
     }
 
+    #[test]
+    fn test_lookup_lemma_with_resource_dict() {
+        // (inflected form, expected lemma) pairs expected in the embedded turkish_lemma_dict.txt
+        const CASES: [(&str, &str); 6] = [
+            ("kitaplar", "kitap"),
+            ("evler", "ev"),
+            ("geliyorum", "gel"),
+            ("aldım", "al"),
+            ("adamlar", "adam"),
+            ("anaları", "ana"),
+        ];
+
+        for (surface, lemma) in CASES {
+            let found = lookup_lemma(surface);
+            // Compare as borrowed strings so no String is allocated per case.
+            assert_eq!(found.as_deref(), Some(lemma),
+                "lookup_lemma('{}') should return '{}', got: {:?}", surface, lemma, found
+            );
+        }
+    }
+
     #[test]
     fn test_lookup_lemma_oov_words() {
         // Out-of-vocabulary words should return None