cdliai · ada-cinar · Jan 26, 2026 · Jan 27, 2026
@@ -103,6 +103,15 @@ from durak import _durak_core
 normalized = _durak_core.fast_normalize("İSTANBUL")  # "istanbul"
 tokens = _durak_core.tokenize_with_offsets("Merhaba dünya!")
 
+# Vowel harmony validation (Turkish morphophonology)
+_durak_core.check_vowel_harmony_py("kitap", "lar")  # True (a-a harmony ✅)
+_durak_core.check_vowel_harmony_py("ev", "ler")     # True (e-e harmony ✅)
+_durak_core.check_vowel_harmony_py("kitap", "ler")  # False (a-e violation ❌)
+
+# Lemmatization with vowel harmony
+_durak_core.strip_suffixes("kitaplar")   # "kitap" (harmony valid)
+_durak_core.strip_suffixes("kitapler")   # "kitapler" (harmony violation, not stripped)
+
 # Embedded resources (no file I/O!)
 stopwords = _durak_core.get_stopwords_base()  # 100-1000x faster loading
 suffixes = _durak_core.get_detached_suffixes()
@@ -113,6 +122,7 @@ suffixes = _durak_core.get_detached_suffixes()
 - **Unicode-aware cleaning**: Turkish-specific normalization (İ/ı, I/i handling)
 - **Configurable stopword management**: Keep-lists, custom additions, domain-specific sets
 - **Regex-based tokenizer**: Preserves Turkish morphology (clitics, suffixes, apostrophes)
+- **Vowel harmony validation**: Linguistically aware suffix stripping that removes a suffix only when it agrees in backness with the stem's last vowel
 - **Offset tracking**: Character-accurate positions for NER and span tasks
 - **Embedded resources**: Zero file I/O, compiled directly into binary
 - **Type-safe**: Complete `.pyi` stubs for IDE support and static analysis

@@ -78,21 +78,58 @@ def lookup_lemma(word: str) -> str | None:
 def strip_suffixes(word: str) -> str:
     """Heuristic suffix stripping for Turkish morphology.
 
-    Tier 2 lemmatization: Rule-based suffix stripping with basic vowel harmony.
-    Recursively strips common Turkish suffixes while preventing over-stripping
-    of short roots (minimum length constraint).
+    Tier 2 lemmatization: Rule-based suffix stripping with vowel harmony validation.
+    Repeatedly strips common Turkish suffixes, one per pass, ONLY if they respect
+    Turkish vowel harmony rules. A suffix is never stripped when fewer than two
+    UTF-8 bytes of stem would remain, which prevents over-stripping of short roots.
+
+    Vowel harmony constraint:
+    - Backness harmony: the suffix's first vowel must match the backness of the
+      stem's last vowel
+    - Back vowels: a, ı, o, u
+    - Front vowels: e, i, ö, ü
+    - Only these lowercase vowels are recognized; if the stem or the suffix has
+      no vowel, the check fails and the suffix is left in place
 
     Args:
         word: The word to strip suffixes from
 
     Returns:
-        The word with suffixes removed
+        The word with every harmonizing suffix removed; unchanged if none qualifies
 
     Examples:
         >>> strip_suffixes("kitaplardan")
         'kitap'
-        >>> strip_suffixes("geliyorum")
-        'gel'
+        >>> strip_suffixes("evlerden")
+        'ev'
+        >>> strip_suffixes("kitapler")  # Harmony violation
+        'kitapler'
+    """
+    ...
+
+def check_vowel_harmony_py(stem: str, suffix: str) -> bool:
+    """Check if a suffix harmonizes with the stem's last vowel.
+
+    Turkish vowel harmony validator that compares the last vowel of ``stem``
+    with the first vowel of ``suffix``. Only backness harmony is checked;
+    this is a fundamental constraint in Turkish morphophonology.
+
+    Turkish vowel harmony rules:
+    - Backness harmony: Both vowels must be back OR both must be front
+    - Back vowels: a, ı, o, u
+    - Front vowels: e, i, ö, ü
+
+    Only the lowercase vowels listed above are recognized; uppercase letters
+    are not treated as vowels.
+
+    Args:
+        stem: The stem word to check (its last vowel is used)
+        suffix: The suffix to validate (its first vowel is used)
+
+    Returns:
+        True if vowel harmony is satisfied, False otherwise. Also False when
+        either ``stem`` or ``suffix`` contains no recognized vowel.
+
+    Examples:
+        >>> check_vowel_harmony_py("kitap", "lar")  # a-a (both back)
+        True
+        >>> check_vowel_harmony_py("ev", "ler")  # e-e (both front)
+        True
+        >>> check_vowel_harmony_py("kitap", "ler")  # a-e (mismatch)
+        False
+        >>> check_vowel_harmony_py("ev", "lar")  # e-a (mismatch)
+        False
     """
     ...
 
@@ -170,6 +207,7 @@ __all__ = [
     "tokenize_with_offsets",
     "lookup_lemma",
     "strip_suffixes",
+    "check_vowel_harmony_py",
     "get_detached_suffixes",
     "get_stopwords_base",
     "get_stopwords_metadata",

@@ -98,24 +98,104 @@ fn lookup_lemma(word: &str) -> Option<String> {
     dict.get(word).map(|s| s.to_string())
 }
 
-/// Tier 2: Heuristic Suffix Stripping
-/// Simple rule-based stripper for demonstration.
-/// In production, this would use a more complex state machine and vowel harmony checks.
+// ============================================================================
+// Vowel Harmony Functions
+// ============================================================================
+
+/// Turkish back vowels (a, ı, o, u) used for backness-harmony analysis.
+/// Lowercase only: uppercase forms are not listed.
+const BACK_VOWELS: [char; 4] = ['a', 'ı', 'o', 'u'];
+/// Turkish front vowels (e, i, ö, ü) used for backness-harmony analysis.
+/// Lowercase only: uppercase forms are not listed.
+const FRONT_VOWELS: [char; 4] = ['e', 'i', 'ö', 'ü'];
+
+/// Check if a character is a back vowel (a, ı, o, u).
+///
+/// The test is an exact match against `BACK_VOWELS`, so uppercase letters
+/// such as 'A' and every non-vowel character return `false`.
+fn is_back_vowel(c: char) -> bool {
+    BACK_VOWELS.contains(&c)
+}
+
+/// Check if a character is a front vowel (e, i, ö, ü).
+///
+/// The test is an exact match against `FRONT_VOWELS`, so uppercase letters
+/// such as 'E' and every non-vowel character return `false`.
+fn is_front_vowel(c: char) -> bool {
+    FRONT_VOWELS.contains(&c)
+}
+
+/// Find the vowel closest to the end of `word`.
+///
+/// Scans the characters from the back and returns the first one accepted by
+/// `is_back_vowel` or `is_front_vowel`. Returns `None` when no character
+/// qualifies, which includes the empty string.
+fn get_last_vowel(word: &str) -> Option<char> {
+    let is_vowel = |ch: &char| is_back_vowel(*ch) || is_front_vowel(*ch);
+    word.chars().rfind(is_vowel)
+}
+
+/// Find the vowel closest to the start of `word` (or suffix).
+///
+/// Scans the characters front to back and returns the first one accepted by
+/// `is_back_vowel` or `is_front_vowel`. Returns `None` when no character
+/// qualifies, which includes the empty string.
+fn get_first_vowel(word: &str) -> Option<char> {
+    let is_vowel = |ch: &char| is_back_vowel(*ch) || is_front_vowel(*ch);
+    word.chars().find(is_vowel)
+}
+
+/// Decide whether `suffix` agrees in backness with `stem`.
+///
+/// Compares the last vowel of `stem` with the first vowel of `suffix`:
+/// harmony holds when both are back vowels or both are front vowels.
+/// Returns `false` when either string has no recognized vowel, because
+/// harmony cannot be verified in that case.
+///
+/// Examples:
+/// - kitap + lar → ✅ (a-a: both back)
+/// - ev + ler → ✅ (e-e: both front)
+/// - kitap + ler → ❌ (a-e: back-front mismatch)
+/// - ev + lar → ❌ (e-a: front-back mismatch)
+fn check_vowel_harmony(stem: &str, suffix: &str) -> bool {
+    // Without a vowel on both sides there is nothing to compare → reject.
+    let (stem_vowel, suffix_vowel) = match (get_last_vowel(stem), get_first_vowel(suffix)) {
+        (Some(last), Some(first)) => (last, first),
+        _ => return false,
+    };
+
+    let both_back = is_back_vowel(stem_vowel) && is_back_vowel(suffix_vowel);
+    let both_front = is_front_vowel(stem_vowel) && is_front_vowel(suffix_vowel);
+    both_back || both_front
+}
+
+/// Expose vowel harmony checker to Python API.
+///
+/// Thin wrapper: forwards `stem` and `suffix` to `check_vowel_harmony` and
+/// returns its result unchanged (`true` when the stem's last vowel and the
+/// suffix's first vowel share backness, `false` otherwise).
+#[pyfunction]
+fn check_vowel_harmony_py(stem: &str, suffix: &str) -> bool {
+    check_vowel_harmony(stem, suffix)
+}
+
+// ============================================================================
+// Suffix Stripping
+// ============================================================================
+
+/// Tier 2: Heuristic Suffix Stripping with Vowel Harmony Validation
+/// 
+/// Strips common Turkish suffixes ONLY if they respect vowel harmony rules.
+/// Prevents false positives like "kitapLER" (harmony violation) from being stripped.
+/// 
+/// Turkish vowel harmony constraint:
+/// - Backness harmony: Suffix vowel must match stem's last vowel backness
+/// - Back vowels: a, ı, o, u
+/// - Front vowels: e, i, ö, ü
+/// 
+/// Examples:
+/// - kitaplar → kitap ✅ (a-a harmony)
+/// - evler → ev ✅ (e-e harmony)
+/// - kitapler → kitapler ❌ (a-e violation, not stripped)
 #[pyfunction]
 fn strip_suffixes(word: &str) -> String {
     let suffixes = ["lar", "ler", "nin", "nın", "den", "dan", "du", "dün"];
     let mut current = word.to_string();
 
-    // Very naive recursive stripping for PoC
+    // Iterative stripping with vowel harmony validation
     let mut changed = true;
     while changed {
         changed = false;
         for suffix in suffixes {
-            if current.ends_with(suffix) && current.len() > suffix.len() + 2 { 
-                 // +2 constraint prevents over-stripping short roots
-                current = current[..current.len() - suffix.len()].to_string();
-                changed = true;
-                break; // Restart loop after stripping one suffix
+            if current.ends_with(suffix) {
+                let potential_stem = &current[..current.len() - suffix.len()];
+
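+                // Prevent over-stripping: stem must be at least 2 bytes long.
+                // NOTE(review): `len()` counts UTF-8 bytes, not characters, so a
+                // single non-ASCII letter such as "ü" (2 bytes) passes this check.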
+                if potential_stem.len() < 2 {
+                    continue;
+                }
+
+                // ✅ Vowel harmony check before stripping
+                if check_vowel_harmony(potential_stem, suffix) {
+                    current = potential_stem.to_string();
+                    changed = true;
+                    break; // Restart loop after stripping one suffix
+                }
             }
         }
     }
@@ -181,6 +261,9 @@ fn _durak_core(_py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> {
     m.add_function(wrap_pyfunction!(lookup_lemma, m)?)?;
     m.add_function(wrap_pyfunction!(strip_suffixes, m)?)?;
 
+    // Vowel harmony checker
+    m.add_function(wrap_pyfunction!(check_vowel_harmony_py, m)?)?;
+
     // Embedded resource accessors
     m.add_function(wrap_pyfunction!(get_detached_suffixes, m)?)?;
     m.add_function(wrap_pyfunction!(get_stopwords_base, m)?)?;
@@ -189,3 +272,148 @@ fn _durak_core(_py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> {
 
     Ok(())
 }
+
+// ============================================================================
+// Tests
+// ============================================================================
+
+#[cfg(test)]
+mod tests {
+    //! Unit tests for vowel classification, backness-harmony checking and
+    //! harmony-aware suffix stripping. Each test walks a small table of cases.
+
+    use super::*;
+
+    // ---- Vowel classification ----
+
+    #[test]
+    fn test_is_back_vowel() {
+        for vowel in ['a', 'ı', 'o', 'u'] {
+            assert!(is_back_vowel(vowel), "{vowel:?} should be a back vowel");
+        }
+        // Front vowels and a consonant must be rejected.
+        for other in ['e', 'i', 'ö', 'ü', 'k'] {
+            assert!(!is_back_vowel(other), "{other:?} should not be a back vowel");
+        }
+    }
+
+    #[test]
+    fn test_is_front_vowel() {
+        for vowel in ['e', 'i', 'ö', 'ü'] {
+            assert!(is_front_vowel(vowel), "{vowel:?} should be a front vowel");
+        }
+        // Back vowels and a consonant must be rejected.
+        for other in ['a', 'ı', 'o', 'u', 't'] {
+            assert!(!is_front_vowel(other), "{other:?} should not be a front vowel");
+        }
+    }
+
+    // ---- Vowel lookup ----
+
+    #[test]
+    fn test_get_last_vowel() {
+        let cases = [
+            ("kitap", Some('a')),
+            ("ev", Some('e')),
+            ("okul", Some('u')),
+            ("gül", Some('ü')),
+            ("xyz", None), // no vowels
+            ("", None),    // empty string
+        ];
+        for (word, expected) in cases {
+            assert_eq!(get_last_vowel(word), expected, "word = {word:?}");
+        }
+    }
+
+    #[test]
+    fn test_get_first_vowel() {
+        let cases = [
+            ("lar", Some('a')),
+            ("ler", Some('e')),
+            ("nın", Some('ı')),
+            ("dün", Some('ü')),
+            ("xyz", None),
+            ("", None),
+        ];
+        for (word, expected) in cases {
+            assert_eq!(get_first_vowel(word), expected, "word = {word:?}");
+        }
+    }
+
+    // ---- Vowel harmony validation ----
+
+    #[test]
+    fn test_vowel_harmony_back_back_valid() {
+        // Back stem vowel + back suffix vowel (a-a, u-a, a-ı) = ✅
+        for (stem, suffix) in [("kitap", "lar"), ("okul", "dan"), ("masa", "nın")] {
+            assert!(check_vowel_harmony(stem, suffix), "{stem} + {suffix}");
+        }
+    }
+
+    #[test]
+    fn test_vowel_harmony_front_front_valid() {
+        // Front stem vowel + front suffix vowel (e-e, ü-e, i-i) = ✅
+        for (stem, suffix) in [("ev", "ler"), ("gül", "den"), ("şehir", "nin")] {
+            assert!(check_vowel_harmony(stem, suffix), "{stem} + {suffix}");
+        }
+    }
+
+    #[test]
+    fn test_vowel_harmony_back_front_invalid() {
+        // Back stem vowel + front suffix vowel (a-e, u-e, a-i) = ❌
+        for (stem, suffix) in [("kitap", "ler"), ("okul", "den"), ("masa", "nin")] {
+            assert!(!check_vowel_harmony(stem, suffix), "{stem} + {suffix}");
+        }
+    }
+
+    #[test]
+    fn test_vowel_harmony_front_back_invalid() {
+        // Front stem vowel + back suffix vowel (e-a, ü-a, i-ı) = ❌
+        for (stem, suffix) in [("ev", "lar"), ("gül", "dan"), ("şehir", "nın")] {
+            assert!(!check_vowel_harmony(stem, suffix), "{stem} + {suffix}");
+        }
+    }
+
+    #[test]
+    fn test_vowel_harmony_no_vowel_edge_cases() {
+        // Harmony cannot be verified when one side has no vowel.
+        let cases = [
+            ("xyz", "lar"),   // no vowel in stem
+            ("kitap", "xyz"), // no vowel in suffix
+            ("xyz", "abc"),   // no vowel in stem (the suffix does contain 'a')
+        ];
+        for (stem, suffix) in cases {
+            assert!(!check_vowel_harmony(stem, suffix), "{stem} + {suffix}");
+        }
+    }
+
+    // ---- Suffix stripping with harmony ----
+
+    #[test]
+    fn test_strip_suffixes_with_harmony_valid() {
+        // Harmonizing suffixes are removed.
+        let cases = [
+            ("kitaplar", "kitap"), // a-a ✅
+            ("evler", "ev"),       // e-e ✅
+            ("okuldan", "okul"),   // u-a ✅
+            ("gülden", "gül"),     // ü-e ✅
+        ];
+        for (word, expected) in cases {
+            assert_eq!(strip_suffixes(word), expected, "word = {word:?}");
+        }
+    }
+
+    #[test]
+    fn test_strip_suffixes_with_harmony_invalid() {
+        // Suffixes that violate harmony stay attached: a-e, e-a, u-e, ü-a ❌
+        for word in ["kitapler", "evlar", "okulden", "güldan"] {
+            assert_eq!(strip_suffixes(word), word, "word = {word:?}");
+        }
+    }
+
+    #[test]
+    fn test_strip_suffixes_recursive_with_harmony() {
+        // Harmony is re-checked for every suffix peeled off:
+        //   "kitaplardan" → "kitaplar" (strip "dan", a-a) → "kitap" (strip "lar", a-a)
+        //   "evlerden"    → "evler"    (strip "den", e-e) → "ev"    (strip "ler", e-e)
+        let cases = [("kitaplardan", "kitap"), ("evlerden", "ev")];
+        for (word, expected) in cases {
+            assert_eq!(strip_suffixes(word), expected, "word = {word:?}");
+        }
+    }
+
+    #[test]
+    fn test_strip_suffixes_min_length_constraint() {
+        // A bare suffix would leave an empty stem, which is below the
+        // 2-byte minimum, so nothing is stripped.
+        assert_eq!(strip_suffixes("lar"), "lar");
+        // A 2-byte stem is long enough, so the suffix is stripped.
+        assert_eq!(strip_suffixes("evler"), "ev");
+    }
+
+    #[test]
+    fn test_strip_suffixes_no_suffix() {
+        // Words without a recognized suffix come back unchanged.
+        for word in ["masa", "bilgisayar", "xyz"] {
+            assert_eq!(strip_suffixes(word), word, "word = {word:?}");
+        }
+    }
+}