Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,15 @@ from durak import _durak_core
normalized = _durak_core.fast_normalize("İSTANBUL") # "istanbul"
tokens = _durak_core.tokenize_with_offsets("Merhaba dünya!")

# Vowel harmony validation (Turkish morphophonology)
_durak_core.check_vowel_harmony_py("kitap", "lar") # True (a-a harmony ✅)
_durak_core.check_vowel_harmony_py("ev", "ler") # True (e-e harmony ✅)
_durak_core.check_vowel_harmony_py("kitap", "ler") # False (a-e violation ❌)

# Lemmatization with vowel harmony
_durak_core.strip_suffixes("kitaplar") # "kitap" (harmony valid)
_durak_core.strip_suffixes("kitapler") # "kitapler" (harmony violation, not stripped)

# Embedded resources (no file I/O!)
stopwords = _durak_core.get_stopwords_base() # 100-1000x faster loading
suffixes = _durak_core.get_detached_suffixes()
Expand All @@ -113,6 +122,7 @@ suffixes = _durak_core.get_detached_suffixes()
- **Unicode-aware cleaning**: Turkish-specific normalization (İ/ı, I/i handling)
- **Configurable stopword management**: Keep-lists, custom additions, domain-specific sets
- **Regex-based tokenizer**: Preserves Turkish morphology (clitics, suffixes, apostrophes)
- **Vowel harmony validation**: Linguistically aware suffix stripping that respects Turkish backness harmony
- **Offset tracking**: Character-accurate positions for NER and span tasks
- **Embedded resources**: Zero file I/O, compiled directly into binary
- **Type-safe**: Complete `.pyi` stubs for IDE support and static analysis
Expand Down
50 changes: 44 additions & 6 deletions python/durak/_durak_core.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -78,21 +78,58 @@ def lookup_lemma(word: str) -> str | None:
def strip_suffixes(word: str) -> str:
"""Heuristic suffix stripping for Turkish morphology.

Tier 2 lemmatization: Rule-based suffix stripping with basic vowel harmony.
Recursively strips common Turkish suffixes while preventing over-stripping
of short roots (minimum length constraint).
Tier 2 lemmatization: Rule-based suffix stripping with vowel harmony validation.
Recursively strips common Turkish suffixes ONLY if they respect Turkish vowel
harmony rules. Prevents over-stripping of short roots (minimum length constraint).

Vowel harmony constraint:
- Backness harmony: Suffix vowel must match stem's last vowel backness
- Back vowels: a, ı, o, u
- Front vowels: e, i, ö, ü

Args:
word: The word to strip suffixes from

Returns:
The word with suffixes removed
The word with suffixes removed (if harmony is valid)

Examples:
>>> strip_suffixes("kitaplardan")
'kitap'
>>> strip_suffixes("okuldan")
'okul'
>>> strip_suffixes("evlerden")
'ev'
>>> strip_suffixes("kitapler") # Harmony violation
'kitapler'
"""
...

def check_vowel_harmony_py(stem: str, suffix: str) -> bool:
    """Check if a suffix harmonizes with the stem's last vowel.

    Turkish vowel harmony validator that checks backness harmony between
    stem and suffix. This is a fundamental constraint in Turkish morphophonology.
    The comparison is made between the LAST vowel of ``stem`` and the FIRST
    vowel of ``suffix``; any other vowels in either string are ignored.

    Turkish vowel harmony rules:
        - Backness harmony: Both vowels must be back OR both must be front
        - Back vowels: a, ı, o, u
        - Front vowels: e, i, ö, ü

    Only the eight lowercase vowels listed above are recognized. Any other
    character (including uppercase letters) is treated as a non-vowel, so
    normalize/lowercase the input before calling.

    Args:
        stem: The stem word to check
        suffix: The suffix to validate

    Returns:
        True if vowel harmony is satisfied, False otherwise. Also returns
        False when ``stem`` or ``suffix`` contains no recognized vowel,
        because harmony cannot be verified in that case.

    Examples:
        >>> check_vowel_harmony_py("kitap", "lar")  # a-a (both back)
        True
        >>> check_vowel_harmony_py("ev", "ler")  # e-e (both front)
        True
        >>> check_vowel_harmony_py("kitap", "ler")  # a-e (mismatch)
        False
        >>> check_vowel_harmony_py("ev", "lar")  # e-a (mismatch)
        False
    """
    ...

Expand Down Expand Up @@ -170,6 +207,7 @@ __all__ = [
"tokenize_with_offsets",
"lookup_lemma",
"strip_suffixes",
"check_vowel_harmony_py",
"get_detached_suffixes",
"get_stopwords_base",
"get_stopwords_metadata",
Expand Down
246 changes: 237 additions & 9 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -98,24 +98,104 @@ fn lookup_lemma(word: &str) -> Option<String> {
dict.get(word).map(|s| s.to_string())
}

/// Tier 2: Heuristic Suffix Stripping
/// Simple rule-based stripper for demonstration.
/// In production, this would use a more complex state machine and vowel harmony checks.
// ============================================================================
// Vowel Harmony Functions
// ============================================================================

/// Lowercase Turkish back vowels (a, ı, o, u) used for backness-harmony checks.
const BACK_VOWELS: [char; 4] = ['a', 'ı', 'o', 'u'];
/// Lowercase Turkish front vowels (e, i, ö, ü) used for backness-harmony checks.
const FRONT_VOWELS: [char; 4] = ['e', 'i', 'ö', 'ü'];

/// Report whether `c` is one of the lowercase back vowels in `BACK_VOWELS`
/// (a, ı, o, u). Every other character, uppercase letters included, yields `false`.
fn is_back_vowel(c: char) -> bool {
    let back_set: &[char] = &BACK_VOWELS;
    back_set.contains(&c)
}

/// Report whether `c` is one of the lowercase front vowels in `FRONT_VOWELS`
/// (e, i, ö, ü). Every other character, uppercase letters included, yields `false`.
fn is_front_vowel(c: char) -> bool {
    let front_set: &[char] = &FRONT_VOWELS;
    front_set.contains(&c)
}

/// Return the vowel closest to the end of `word`.
///
/// Characters are examined from the back; the first one that is either a back
/// or a front vowel is returned. Yields `None` when `word` holds no vowel
/// (which includes the empty string).
fn get_last_vowel(word: &str) -> Option<char> {
    word.chars()
        .rfind(|ch| is_back_vowel(*ch) || is_front_vowel(*ch))
}

/// Return the vowel closest to the start of `word` (or suffix).
///
/// Characters are examined front to back; the first one that is either a back
/// or a front vowel is returned. Yields `None` when no vowel is present.
fn get_first_vowel(word: &str) -> Option<char> {
    let is_vowel = |ch: &char| is_back_vowel(*ch) || is_front_vowel(*ch);
    word.chars().find(is_vowel)
}

/// Decide whether `suffix` obeys Turkish backness harmony with `stem`.
///
/// The last vowel of `stem` is compared with the first vowel of `suffix`;
/// harmony holds when both are back vowels or both are front vowels.
/// When either side has no vowel the function returns `false`, because
/// harmony cannot be verified and the caller should not strip the suffix.
///
/// Examples:
/// - kitap + lar → true  (a-a: both back)
/// - ev + ler    → true  (e-e: both front)
/// - kitap + ler → false (a-e: back-front mismatch)
/// - ev + lar    → false (e-a: front-back mismatch)
fn check_vowel_harmony(stem: &str, suffix: &str) -> bool {
    match get_last_vowel(stem).zip(get_first_vowel(suffix)) {
        // At least one side lacks a vowel: harmony is unverifiable, so reject.
        None => false,
        Some((stem_vowel, suffix_vowel)) => {
            let both_back = is_back_vowel(stem_vowel) && is_back_vowel(suffix_vowel);
            let both_front = is_front_vowel(stem_vowel) && is_front_vowel(suffix_vowel);
            both_back || both_front
        }
    }
}

/// Expose the vowel harmony checker to the Python API.
///
/// Thin wrapper that forwards `stem` and `suffix` unchanged to
/// `check_vowel_harmony` and returns its result: `true` when the last vowel of
/// `stem` and the first vowel of `suffix` are both back or both front, `false`
/// on a mismatch or when either argument contains no recognized vowel.
#[pyfunction]
fn check_vowel_harmony_py(stem: &str, suffix: &str) -> bool {
    check_vowel_harmony(stem, suffix)
}

// ============================================================================
// Suffix Stripping
// ============================================================================

/// Tier 2: Heuristic Suffix Stripping with Vowel Harmony Validation
///
/// Strips common Turkish suffixes ONLY if they respect vowel harmony rules.
/// Prevents false positives like "kitapler" (harmony violation) from being stripped.
///
/// Turkish vowel harmony constraint:
/// - Backness harmony: Suffix vowel must match stem's last vowel backness
/// - Back vowels: a, ı, o, u
/// - Front vowels: e, i, ö, ü
///
/// Examples:
/// - kitaplar → kitap ✅ (a-a harmony)
/// - evler → ev ✅ (e-e harmony)
/// - kitapler → kitapler ❌ (a-e violation, not stripped)
#[pyfunction]
fn strip_suffixes(word: &str) -> String {
let suffixes = ["lar", "ler", "nin", "nın", "den", "dan", "du", "dün"];
let mut current = word.to_string();

// Very naive recursive stripping for PoC
// Iterative stripping with vowel harmony validation
let mut changed = true;
while changed {
changed = false;
for suffix in suffixes {
if current.ends_with(suffix) && current.len() > suffix.len() + 2 {
// +2 constraint prevents over-stripping short roots
current = current[..current.len() - suffix.len()].to_string();
changed = true;
break; // Restart loop after stripping one suffix
if current.ends_with(suffix) {
let potential_stem = &current[..current.len() - suffix.len()];

// Prevent over-stripping: stem must be at least 2 bytes long (`str::len` counts UTF-8 bytes, not characters, so a lone 'ö' or 'ı' already passes)
if potential_stem.len() < 2 {
continue;
}

// ✅ Vowel harmony check before stripping
if check_vowel_harmony(potential_stem, suffix) {
current = potential_stem.to_string();
changed = true;
break; // Restart loop after stripping one suffix
}
}
}
}
Expand Down Expand Up @@ -181,6 +261,9 @@ fn _durak_core(_py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> {
m.add_function(wrap_pyfunction!(lookup_lemma, m)?)?;
m.add_function(wrap_pyfunction!(strip_suffixes, m)?)?;

// Vowel harmony checker
m.add_function(wrap_pyfunction!(check_vowel_harmony_py, m)?)?;

// Embedded resource accessors
m.add_function(wrap_pyfunction!(get_detached_suffixes, m)?)?;
m.add_function(wrap_pyfunction!(get_stopwords_base, m)?)?;
Expand All @@ -189,3 +272,148 @@ fn _durak_core(_py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> {

Ok(())
}

// ============================================================================
// Tests
// ============================================================================

/// Unit tests for the vowel classification helpers, the backness-harmony
/// check, and harmony-aware suffix stripping.
#[cfg(test)]
mod tests {
    use super::*;

    // ------------------------------------------------------------------
    // Vowel detection
    // ------------------------------------------------------------------

    #[test]
    fn test_is_back_vowel() {
        assert!(is_back_vowel('a'));
        assert!(is_back_vowel('ı'));
        assert!(is_back_vowel('o'));
        assert!(is_back_vowel('u'));

        assert!(!is_back_vowel('e'));
        assert!(!is_back_vowel('i'));
        assert!(!is_back_vowel('ö'));
        assert!(!is_back_vowel('ü'));
        assert!(!is_back_vowel('k'));
    }

    #[test]
    fn test_is_front_vowel() {
        assert!(is_front_vowel('e'));
        assert!(is_front_vowel('i'));
        assert!(is_front_vowel('ö'));
        assert!(is_front_vowel('ü'));

        assert!(!is_front_vowel('a'));
        assert!(!is_front_vowel('ı'));
        assert!(!is_front_vowel('o'));
        assert!(!is_front_vowel('u'));
        assert!(!is_front_vowel('t'));
    }

    #[test]
    fn test_get_last_vowel() {
        assert_eq!(get_last_vowel("kitap"), Some('a'));
        assert_eq!(get_last_vowel("ev"), Some('e'));
        assert_eq!(get_last_vowel("okul"), Some('u'));
        assert_eq!(get_last_vowel("gül"), Some('ü'));
        assert_eq!(get_last_vowel("xyz"), None); // No vowels
        assert_eq!(get_last_vowel(""), None); // Empty string
    }

    #[test]
    fn test_get_first_vowel() {
        assert_eq!(get_first_vowel("lar"), Some('a'));
        assert_eq!(get_first_vowel("ler"), Some('e'));
        assert_eq!(get_first_vowel("nın"), Some('ı'));
        assert_eq!(get_first_vowel("dün"), Some('ü'));
        assert_eq!(get_first_vowel("xyz"), None);
        assert_eq!(get_first_vowel(""), None);
    }

    // ------------------------------------------------------------------
    // Vowel harmony validation
    // ------------------------------------------------------------------

    #[test]
    fn test_vowel_harmony_back_back_valid() {
        // Back vowel stem + back vowel suffix is harmonic
        assert!(check_vowel_harmony("kitap", "lar")); // a-a
        assert!(check_vowel_harmony("okul", "dan")); // u-a
        assert!(check_vowel_harmony("masa", "nın")); // a-ı
    }

    #[test]
    fn test_vowel_harmony_front_front_valid() {
        // Front vowel stem + front vowel suffix is harmonic
        assert!(check_vowel_harmony("ev", "ler")); // e-e
        assert!(check_vowel_harmony("gül", "den")); // ü-e
        assert!(check_vowel_harmony("şehir", "nin")); // i-i
    }

    #[test]
    fn test_vowel_harmony_back_front_invalid() {
        // Back vowel stem + front vowel suffix violates harmony
        assert!(!check_vowel_harmony("kitap", "ler")); // a-e mismatch
        assert!(!check_vowel_harmony("okul", "den")); // u-e mismatch
        assert!(!check_vowel_harmony("masa", "nin")); // a-i mismatch
    }

    #[test]
    fn test_vowel_harmony_front_back_invalid() {
        // Front vowel stem + back vowel suffix violates harmony
        assert!(!check_vowel_harmony("ev", "lar")); // e-a mismatch
        assert!(!check_vowel_harmony("gül", "dan")); // ü-a mismatch
        assert!(!check_vowel_harmony("şehir", "nın")); // i-ı mismatch
    }

    #[test]
    fn test_vowel_harmony_no_vowel_edge_cases() {
        // Harmony cannot be verified when either side lacks a vowel, so the
        // check must report false in every such combination.
        assert!(!check_vowel_harmony("xyz", "lar")); // No vowel in stem
        assert!(!check_vowel_harmony("kitap", "xyz")); // No vowel in suffix
        // "bcd" (not "abc", which contains the vowel 'a') keeps this case
        // genuinely vowel-free on both sides.
        assert!(!check_vowel_harmony("xyz", "bcd")); // No vowels at all
    }

    // ------------------------------------------------------------------
    // Suffix stripping with harmony
    // ------------------------------------------------------------------

    #[test]
    fn test_strip_suffixes_with_harmony_valid() {
        // Should strip when harmony is valid
        assert_eq!(strip_suffixes("kitaplar"), "kitap"); // a-a harmony
        assert_eq!(strip_suffixes("evler"), "ev"); // e-e harmony
        assert_eq!(strip_suffixes("okuldan"), "okul"); // u-a harmony
        assert_eq!(strip_suffixes("gülden"), "gül"); // ü-e harmony
    }

    #[test]
    fn test_strip_suffixes_with_harmony_invalid() {
        // Should NOT strip when harmony is violated
        assert_eq!(strip_suffixes("kitapler"), "kitapler"); // a-e violation
        assert_eq!(strip_suffixes("evlar"), "evlar"); // e-a violation
        assert_eq!(strip_suffixes("okulden"), "okulden"); // u-e violation
        assert_eq!(strip_suffixes("güldan"), "güldan"); // ü-a violation
    }

    #[test]
    fn test_strip_suffixes_recursive_with_harmony() {
        // Multiple suffixes should respect harmony at each step
        // "kitaplardan" → "kitaplar" (strip "dan" with a-a harmony)
        //               → "kitap"    (strip "lar" with a-a harmony)
        assert_eq!(strip_suffixes("kitaplardan"), "kitap");

        // "evlerden" → "evler" (strip "den" with e-e harmony)
        //            → "ev"    (strip "ler" with e-e harmony)
        assert_eq!(strip_suffixes("evlerden"), "ev");
    }

    #[test]
    fn test_strip_suffixes_min_length_constraint() {
        // A suffix is only stripped when the remaining stem is at least
        // 2 bytes long.
        assert_eq!(strip_suffixes("lar"), "lar"); // Stem would be empty: kept
        assert_eq!(strip_suffixes("evler"), "ev"); // Stem is exactly 2 bytes: stripped
    }

    #[test]
    fn test_strip_suffixes_no_suffix() {
        // Words without recognized suffixes should remain unchanged
        assert_eq!(strip_suffixes("masa"), "masa");
        assert_eq!(strip_suffixes("bilgisayar"), "bilgisayar");
        assert_eq!(strip_suffixes("xyz"), "xyz");
    }
}
Loading