Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions python/durak/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@
attach_detached_suffixes,
)
from .tokenizer import (
Tokenizer,
TokenizationError,
normalize_tokens,
split_sentences,
tokenize,
Expand Down
15 changes: 12 additions & 3 deletions python/durak/_durak_core.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -59,17 +59,26 @@ def tokenize_with_offsets(text: str) -> list[tuple[str, int, int]]:
def lookup_lemma(word: str) -> str | None:
"""Perform exact dictionary lookup for lemmatization.

Tier 1 lemmatization: Fast exact lookup in the internal dictionary.
Tier 1 lemmatization: Fast exact lookup in the embedded Turkish lemma dictionary.
The dictionary contains 1,362+ inflected forms mapped to their base lemmas,
loaded from resources/tr/lemmas/turkish_lemma_dict.txt at build time.

Coverage:
- High-frequency nouns with case/plural suffixes
- Common verb inflections (tense, aspect, person markers)
- Systematic vowel harmony patterns (front/back vowel classes)

Args:
word: The word to lemmatize
word: The inflected word to lemmatize

Returns:
The lemma if found in dictionary, None otherwise
The base lemma if found in dictionary, None otherwise

Examples:
>>> lookup_lemma("kitaplar")
'kitap'
>>> lookup_lemma("geliyorum")
'gel'
>>> lookup_lemma("unknown") is None
True
"""
Expand Down
21 changes: 21 additions & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -265,6 +265,27 @@ mod tests {
}
}

#[test]
fn test_lookup_lemma_with_resource_dict() {
    // Each pair is (inflected form, lemma the lookup is expected to return);
    // the entries are expected to come from the embedded turkish_lemma_dict.txt.
    let expectations = [
        ("kitaplar", "kitap"),
        ("evler", "ev"),
        ("geliyorum", "gel"),
        ("aldım", "al"),
        ("adamlar", "adam"),
        ("anaları", "ana"),
    ];

    expectations.iter().for_each(|&(inflected, expected_lemma)| {
        let result = lookup_lemma(inflected);
        // Report the offending word, the wanted lemma, and the actual result on failure.
        assert_eq!(
            result,
            Some(expected_lemma.to_string()),
            "lookup_lemma('{}') should return '{}', got: {:?}",
            inflected,
            expected_lemma,
            result
        );
    });
}

#[test]
fn test_lookup_lemma_oov_words() {
// Out-of-vocabulary words should return None
Expand Down
Loading