cdliai · ada-cinar · Jan 27, 2026 · Jan 27, 2026 · Jan 27, 2026
@@ -102,8 +102,58 @@ def load_from_rust():
     except ImportError:
         print("Rust extension not available")
 
-    # 4. Full Pipeline Benchmark
-    print("\n4. Complete Processing Pipeline")
+    # 4. Lemmatizer Cache Benchmark
+    print("\n4. Lemmatizer LRU Cache Performance")
+    print("-" * 70)
+
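+    # Generate a highly repetitive corpus: 20 common words repeated uniformly.
+    # This is only a rough stand-in for a Zipfian (realistic) word-frequency
+    # distribution, where the top 100 words cover ~50% of tokens in typical Turkish text.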
+    common_words = [
+        "kitap", "ev", "araba", "okul", "öğrenci", "öğretmen", "ders",
+        "sınıf", "masa", "sandalye", "kapı", "pencere", "bahçe", "ağaç",
+        "çocuk", "anne", "baba", "kardeş", "arkadaş", "komşu"
+    ] * 50  # 1000 tokens, highly repetitive
+
+    rare_words = [
+        f"nadir_kelime_{i}" for i in range(1000)
+    ]  # 1000 unique tokens
+
+    # Test 1: Cache-friendly workload (repetitive)
+    lemmatizer_cached = durak.Lemmatizer(cache_size=10_000)
+    lemmatizer_nocache = durak.Lemmatizer(cache_size=0)
+
+    def lemmatize_corpus(lemmatizer, words):
+        for word in words:
+            lemmatizer(word)
+
+    cached_time = benchmark(lemmatize_corpus, lemmatizer_cached, common_words, iterations=10)
+    nocache_time = benchmark(lemmatize_corpus, lemmatizer_nocache, common_words, iterations=10)
+
+    cache_info = lemmatizer_cached.get_cache_info()
+    hit_rate = cache_info.hits / (cache_info.hits + cache_info.misses) if cache_info else 0
+
+    print(f"Repetitive corpus (1000 tokens, 20 unique words):")
+    print(f"  With cache:    {cached_time:.4f} ms per call")
+    print(f"  Without cache: {nocache_time:.4f} ms per call")
+    print(f"  Speedup:       {nocache_time / cached_time:.2f}x")
+    print(f"  Cache hit rate: {hit_rate:.1%}")
+
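+    # Test 2: Cache-hostile workload (all unique)
+    # NOTE(review): the 1000 unique words fit in the 10_000-entry cache, so if
+    # benchmark() replays the corpus on each of its 10 iterations, only the first
+    # pass misses and later passes are all hits — confirm this measures overhead.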
+    lemmatizer_cached.clear_cache()
+    cached_time_unique = benchmark(lemmatize_corpus, lemmatizer_cached, rare_words, iterations=10)
+    nocache_time_unique = benchmark(lemmatize_corpus, lemmatizer_nocache, rare_words, iterations=10)
+
+    cache_info_unique = lemmatizer_cached.get_cache_info()
+    hit_rate_unique = cache_info_unique.hits / (cache_info_unique.hits + cache_info_unique.misses) if cache_info_unique else 0
+
+    print(f"\nUnique corpus (1000 unique words, no repetition):")
+    print(f"  With cache:    {cached_time_unique:.4f} ms per call")
+    print(f"  Without cache: {nocache_time_unique:.4f} ms per call")
+    print(f"  Overhead:      {cached_time_unique / nocache_time_unique:.2f}x")
+    print(f"  Cache hit rate: {hit_rate_unique:.1%}")
+
+    # 5. Full Pipeline Benchmark
+    print("\n5. Complete Processing Pipeline")
     print("-" * 70)
 
     pipeline = durak.Pipeline(

@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 from dataclasses import dataclass, field
+from functools import lru_cache
 from time import perf_counter
 from typing import Literal
 
@@ -108,6 +109,7 @@ class Lemmatizer:
         strict_validation: Require roots to be in lemma dictionary
         min_root_length: Minimum acceptable root length (characters)
         collect_metrics: Enable performance metrics collection (adds ~5-10% overhead)
+        cache_size: LRU cache size for repeated words (0=disabled, default=10_000)
     """
     def __init__(
         self, 
@@ -116,18 +118,24 @@ def __init__(
         strict_validation: bool = False,
         min_root_length: int = 2,
         collect_metrics: bool = False,
+        cache_size: int = 10_000,
     ):
         self.strategy = strategy
         self.validate_roots = validate_roots
         self.strict_validation = strict_validation
         self.min_root_length = min_root_length
         self.collect_metrics = collect_metrics
+        self.cache_size = cache_size
         self._metrics = LemmatizerMetrics() if collect_metrics else None
-
-    def __call__(self, word: str) -> str:
-        if not word:
-            return ""
 
+        # Setup LRU cache if enabled
+        if cache_size > 0:
+            self._cached_call = lru_cache(maxsize=cache_size)(self._raw_call)
+        else:
+            self._cached_call = self._raw_call
+
+    def _raw_call(self, word: str) -> str:
+        """Core lemmatization logic (bypasses cache)."""
         start_time = perf_counter() if self.collect_metrics else None
 
         # Tier 1: Lookup
@@ -177,6 +185,12 @@ def __call__(self, word: str) -> str:
 
         return word
 
+    def __call__(self, word: str) -> str:
+        """Lemmatize a word (with LRU caching if enabled).
+
+        Args:
+            word: Token to lemmatize.
+
+        Returns:
+            An empty string when ``word`` is falsy; otherwise whatever
+            ``self._cached_call(word)`` returns.
+        """
+        # Guard first so empty input never reaches the cache and is not
+        # counted as a hit or a miss in the cache statistics.
+        if not word:
+            return ""
+        # _cached_call is the lru_cache-wrapped _raw_call when cache_size > 0,
+        # and the plain _raw_call otherwise (assigned in __init__).
+        # NOTE(review): a cache hit skips _raw_call entirely, so anything it
+        # records when collect_metrics=True presumably covers misses only —
+        # confirm this is intended.
+        return self._cached_call(word)
+
     def get_metrics(self) -> LemmatizerMetrics:
         """Return collected metrics.
 
@@ -211,6 +225,42 @@ def reset_metrics(self) -> None:
             )
         self._metrics = LemmatizerMetrics()
 
+    def get_cache_info(self):
+        """Return LRU cache statistics if caching is enabled.
+
+        Returns:
+            CacheInfo: Named tuple with hits, misses, maxsize, currsize
+            None: If caching is disabled (cache_size <= 0)
+
+        Note:
+            ``hits + misses`` is 0 until the first non-empty word has been
+            lemmatized (empty input is answered before the cache is
+            consulted), so guard the hit-rate division shown below when the
+            lemmatizer may not have been used yet.
+
+        Example:
+            >>> lemmatizer = Lemmatizer(cache_size=1000)
+            >>> for word in ["kitap", "kitaplar", "kitap"]:
+            ...     lemmatizer(word)
+            >>> info = lemmatizer.get_cache_info()
+            >>> print(f"Cache hit rate: {info.hits / (info.hits + info.misses):.2%}")
+        """
+        # Only the lru_cache wrapper exposes cache_info(); with caching off,
+        # _cached_call is the plain bound method and has no statistics.
+        if self.cache_size > 0:
+            return self._cached_call.cache_info()
+        return None
+
+    def clear_cache(self) -> None:
+        """Clear the LRU cache if caching is enabled.
+
+        Does nothing when caching is disabled (cache_size <= 0). Clearing
+        goes through ``functools.lru_cache``'s ``cache_clear()``, which also
+        resets the hit/miss statistics.
+
+        Useful for:
+        - Benchmarking without cache warmup
+        - Measuring cold-start performance
+        - Memory management in long-running processes
+
+        Example:
+            >>> lemmatizer = Lemmatizer(cache_size=1000)
+            >>> lemmatizer("kitaplar")  # Cache miss
+            >>> lemmatizer("kitaplar")  # Cache hit
+            >>> lemmatizer.clear_cache()
+            >>> lemmatizer("kitaplar")  # Cache miss again
+        """
+        # Without a cache, _cached_call is the raw bound method and has no
+        # cache_clear(), hence the guard.
+        if self.cache_size > 0:
+            self._cached_call.cache_clear()
+
     def __repr__(self) -> str:
         parts = [f"strategy='{self.strategy}'"]
         if self.validate_roots:
@@ -221,4 +271,6 @@ def __repr__(self) -> str:
                 parts.append(f"min_root_length={self.min_root_length}")
         if self.collect_metrics:
             parts.append("collect_metrics=True")
+        if self.cache_size != 10_000:  # Only show if non-default
+            parts.append(f"cache_size={self.cache_size}")
         return f"Lemmatizer({', '.join(parts)})"
@@ -0,0 +1,79 @@
+# Lemmatization Evaluation Test Sets
+
+This directory contains gold-standard test sets for evaluating lemmatizer quality.
+
+## Files
+
+### `gold_standard.tsv`
+
+Hand-curated test set with 152 entries covering:
+
+- **Nouns** (63 entries): plural, case markers (accusative, dative, locative, ablative), possessive, genitive
+- **Verbs** (85 entries): present continuous, past tense, future tense, infinitive, simple present
+- **Pronouns**: not counted separately; pronoun forms are included in the noun entries above
+- **Edge cases** (4 entries): short words, compound suffixes
+
+**Format:**
+```
+inflected<TAB>lemma<TAB>source
+```
+
+**Sources:**
+- `manual`: Hand-curated entries
+- `dict`: From `turkish_lemma_dict.txt`
+- `test`: From existing unit tests
+
+## Usage
+
+### Basic Evaluation
+
+```python
+from durak.lemmatizer import Lemmatizer
+import pandas as pd
+
+# Load test set
+df = pd.read_csv("resources/tr/lemmas/eval/gold_standard.tsv", 
+                 sep="\t", comment="#", 
+                 names=["word", "lemma", "source"])
+
+# Evaluate a strategy
+lemmatizer = Lemmatizer(strategy="hybrid")
+correct = sum(lemmatizer(row.word) == row.lemma for _, row in df.iterrows())
+accuracy = correct / len(df)
+
+print(f"Accuracy: {accuracy:.2%} ({correct}/{len(df)})")
+```
+
+### Strategy Comparison
+
+```bash
+python scripts/evaluate_lemmatizer.py
+```
+
+## Expansion
+
+Future test sets to add:
+
+- `domain_news.tsv`: Formal news corpus (high dictionary coverage expected)
+- `domain_social_media.tsv`: Informal text (OOV-heavy, slang, misspellings)
+- `domain_technical.tsv`: Technical/scientific terms
+- `edge_cases.tsv`: Adversarial examples (apostrophes, rare patterns)
+
+## Provenance
+
+- **Gold standard**: Curated by the cdliai team (January 2026)
+- **Dictionary entries**: From `turkish_lemma_dict.txt` (v0.5.0+)
+- **Test cases**: From `tests/test_lemmatizer.py` regression tests
+
+## Citation
+
+If you use these test sets in research, please cite:
+
+```bibtex
+@software{durak_lemma_eval,
+  title = {Durak Turkish NLP Lemmatization Evaluation Sets},
+  author = {{CDLI AI}},
+  year = {2026},
+  url = {https://github.com/cdliai/durak}
+}
+```