Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 52 additions & 2 deletions benchmarks/benchmark_rust_vs_python.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,8 +102,58 @@ def load_from_rust():
except ImportError:
print("Rust extension not available")

# 4. Full Pipeline Benchmark
print("\n4. Complete Processing Pipeline")
# 4. Lemmatizer Cache Benchmark
print("\n4. Lemmatizer LRU Cache Performance")
print("-" * 70)

# Build two synthetic corpora of 1000 tokens each, at opposite ends of the
# cache-friendliness spectrum.
#
# NOTE: the repetitive corpus cycles uniformly through 20 everyday Turkish
# words. It only imitates the heavy repetition of real text; it is not a
# Zipfian sample.
frequent_vocabulary = [
    "kitap", "ev", "araba", "okul", "öğrenci", "öğretmen", "ders",
    "sınıf", "masa", "sandalye", "kapı", "pencere", "bahçe", "ağaç",
    "çocuk", "anne", "baba", "kardeş", "arkadaş", "komşu",
]
# 20 words x 50 repetitions = 1000 tokens, highly repetitive.
common_words = frequent_vocabulary * 50

# 1000 distinct placeholder tokens: no token ever repeats within one pass.
rare_words = [f"nadir_kelime_{idx}" for idx in range(1000)]

# Test 1: Cache-friendly workload (repetitive)
# One lemmatizer with the LRU cache enabled, one with it switched off.
lemmatizer_cached = durak.Lemmatizer(cache_size=10_000)
lemmatizer_nocache = durak.Lemmatizer(cache_size=0)

def lemmatize_corpus(lemmatizer, words):
    """Feed every token of *words* to *lemmatizer*, discarding the results.

    Only the calls themselves matter here: this is the unit of work that
    the benchmark times.
    """
    for token in words:
        lemmatizer(token)

# Time the repetitive corpus through both lemmatizers (cached one first).
cached_time = benchmark(
    lemmatize_corpus, lemmatizer_cached, common_words, iterations=10
)
nocache_time = benchmark(
    lemmatize_corpus, lemmatizer_nocache, common_words, iterations=10
)

# Hit rate = hits / total lookups; falls back to 0 when no statistics are
# available.
cache_info = lemmatizer_cached.get_cache_info()
if cache_info:
    hit_rate = cache_info.hits / (cache_info.hits + cache_info.misses)
else:
    hit_rate = 0

print(f"Repetitive corpus (1000 tokens, 20 unique words):")
print(f" With cache: {cached_time:.4f} ms per call")
print(f" Without cache: {nocache_time:.4f} ms per call")
print(f" Speedup: {nocache_time / cached_time:.2f}x")
print(f" Cache hit rate: {hit_rate:.1%}")

# Test 2: Cache-hostile workload (all unique)
#
# The 1000 unique words fit easily inside the 10_000-entry cache. If the
# cache were cleared only once up front, just the first of the 10 timed
# passes would miss; the other nine would be served entirely from the cache
# and this "hostile" scenario would report a ~90% hit rate. Clearing at the
# start of every pass keeps each lookup a genuine miss.
def lemmatize_corpus_cold(lemmatizer, words):
    """Lemmatize *words* starting from an empty cache.

    ``clear_cache()`` does nothing for a lemmatizer built with
    ``cache_size=0``, so the same wrapper is used for both variants to keep
    the call overhead symmetric. The (small) cost of clearing is part of the
    timed region for the cached variant.
    """
    lemmatizer.clear_cache()
    lemmatize_corpus(lemmatizer, words)

cached_time_unique = benchmark(lemmatize_corpus_cold, lemmatizer_cached, rare_words, iterations=10)
nocache_time_unique = benchmark(lemmatize_corpus_cold, lemmatizer_nocache, rare_words, iterations=10)

# lru_cache's cache_clear() also resets the hit/miss counters, so these
# statistics describe the final pass only.
cache_info_unique = lemmatizer_cached.get_cache_info()
lookups_unique = (cache_info_unique.hits + cache_info_unique.misses) if cache_info_unique else 0
# Guard the division: no statistics (or no lookups) means a 0% hit rate.
hit_rate_unique = cache_info_unique.hits / lookups_unique if lookups_unique else 0

print(f"\nUnique corpus (1000 unique words, no repetition):")
print(f" With cache: {cached_time_unique:.4f} ms per call")
print(f" Without cache: {nocache_time_unique:.4f} ms per call")
print(f" Overhead: {cached_time_unique / nocache_time_unique:.2f}x")
print(f" Cache hit rate: {hit_rate_unique:.1%}")

# 5. Full Pipeline Benchmark
print("\n5. Complete Processing Pipeline")
print("-" * 70)

pipeline = durak.Pipeline(
Expand Down
60 changes: 56 additions & 4 deletions python/durak/lemmatizer.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from __future__ import annotations

from dataclasses import dataclass, field
from functools import lru_cache
from time import perf_counter
from typing import Literal

Expand Down Expand Up @@ -108,6 +109,7 @@ class Lemmatizer:
strict_validation: Require roots to be in lemma dictionary
min_root_length: Minimum acceptable root length (characters)
collect_metrics: Enable performance metrics collection (adds ~5-10% overhead)
cache_size: LRU cache size for repeated words (0=disabled, default=10_000)
"""
def __init__(
self,
Expand All @@ -116,18 +118,24 @@ def __init__(
strict_validation: bool = False,
min_root_length: int = 2,
collect_metrics: bool = False,
cache_size: int = 10_000,
):
self.strategy = strategy
self.validate_roots = validate_roots
self.strict_validation = strict_validation
self.min_root_length = min_root_length
self.collect_metrics = collect_metrics
self.cache_size = cache_size
self._metrics = LemmatizerMetrics() if collect_metrics else None

def __call__(self, word: str) -> str:
if not word:
return ""

# Setup LRU cache if enabled
if cache_size > 0:
self._cached_call = lru_cache(maxsize=cache_size)(self._raw_call)
else:
self._cached_call = self._raw_call

def _raw_call(self, word: str) -> str:
"""Core lemmatization logic (bypasses cache)."""
start_time = perf_counter() if self.collect_metrics else None

# Tier 1: Lookup
Expand Down Expand Up @@ -177,6 +185,12 @@ def __call__(self, word: str) -> str:

return word

def __call__(self, word: str) -> str:
    """Return the lemma for ``word``.

    A falsy ``word`` (such as the empty string) yields ``""`` immediately.
    Anything else is handed to ``self._cached_call``, which is either the
    LRU-cached wrapper or the raw lemmatization routine, depending on how
    the instance was configured.
    """
    return self._cached_call(word) if word else ""

def get_metrics(self) -> LemmatizerMetrics:
"""Return collected metrics.

Expand Down Expand Up @@ -211,6 +225,42 @@ def reset_metrics(self) -> None:
)
self._metrics = LemmatizerMetrics()

def get_cache_info(self):
    """Return LRU cache statistics, or ``None`` when no cache is active.

    Whether a cache is active is decided by inspecting the callable stored
    in ``self._cached_call`` rather than by re-reading ``self.cache_size``.
    ``cache_size`` is a plain public attribute; if it is reassigned after
    construction it no longer describes the callable that was installed,
    and trusting it could raise ``AttributeError`` or hide a live cache.

    Returns:
        CacheInfo: Named tuple with hits, misses, maxsize, currsize
        None: If caching is disabled (cache_size=0)

    Example:
        >>> lemmatizer = Lemmatizer(cache_size=1000)
        >>> for word in ["kitap", "kitaplar", "kitap"]:
        ...     _ = lemmatizer(word)
        >>> info = lemmatizer.get_cache_info()
        >>> print(f"Cache hit rate: {info.hits / (info.hits + info.misses):.2%}")
    """
    cached_call = getattr(self, "_cached_call", None)
    # Only an lru_cache wrapper exposes cache_info(); the raw callable
    # installed when caching is disabled does not.
    cache_info = getattr(cached_call, "cache_info", None)
    if cache_info is None:
        return None
    return cache_info()

def clear_cache(self) -> None:
    """Clear the LRU cache if caching is enabled.

    Does nothing when the installed callable is not an LRU-cached wrapper.
    The check looks at ``self._cached_call`` itself instead of
    ``self.cache_size``, so reassigning ``cache_size`` after construction
    can neither raise ``AttributeError`` nor leave stale entries behind.

    Useful for:
    - Benchmarking without cache warmup
    - Measuring cold-start performance
    - Memory management in long-running processes

    Example:
        >>> lemmatizer = Lemmatizer(cache_size=1000)
        >>> _ = lemmatizer("kitaplar")  # Cache miss
        >>> _ = lemmatizer("kitaplar")  # Cache hit
        >>> lemmatizer.clear_cache()
        >>> _ = lemmatizer("kitaplar")  # Cache miss again
    """
    cached_call = getattr(self, "_cached_call", None)
    # Only an lru_cache wrapper exposes cache_clear().
    cache_clear = getattr(cached_call, "cache_clear", None)
    if cache_clear is not None:
        cache_clear()

def __repr__(self) -> str:
parts = [f"strategy='{self.strategy}'"]
if self.validate_roots:
Expand All @@ -221,4 +271,6 @@ def __repr__(self) -> str:
parts.append(f"min_root_length={self.min_root_length}")
if self.collect_metrics:
parts.append("collect_metrics=True")
if self.cache_size != 10_000: # Only show if non-default
parts.append(f"cache_size={self.cache_size}")
return f"Lemmatizer({', '.join(parts)})"
79 changes: 79 additions & 0 deletions resources/tr/lemmas/eval/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
# Lemmatization Evaluation Test Sets

This directory contains gold-standard test sets for evaluating lemmatizer quality.

## Files

### `gold_standard.tsv`

Hand-curated test set with 152 entries covering:

- **Nouns** (63 entries): plural, case markers (accusative, dative, locative, ablative), possessive, genitive
- **Verbs** (85 entries): present continuous, past tense, future tense, infinitive, simple present
- **Pronouns**: no separate tally; pronoun forms are counted among the noun case entries above
- **Edge cases** (4 entries): short words, compound suffixes

**Format:**
```
inflected<TAB>lemma<TAB>source
```

**Sources:**
- `manual`: Hand-curated entries
- `dict`: From `turkish_lemma_dict.txt`
- `test`: From existing unit tests

## Usage

### Basic Evaluation

```python
from durak.lemmatizer import Lemmatizer
import pandas as pd

# Load test set
df = pd.read_csv("resources/tr/lemmas/eval/gold_standard.tsv",
sep="\t", comment="#",
names=["word", "lemma", "source"])

# Evaluate a strategy
lemmatizer = Lemmatizer(strategy="hybrid")
correct = sum(lemmatizer(row.word) == row.lemma for _, row in df.iterrows())
accuracy = correct / len(df)

print(f"Accuracy: {accuracy:.2%} ({correct}/{len(df)})")
```

### Strategy Comparison

```bash
python scripts/evaluate_lemmatizer.py
```

## Expansion

Future test sets to add:

- `domain_news.tsv`: Formal news corpus (high dictionary coverage expected)
- `domain_social_media.tsv`: Informal text (OOV-heavy, slang, misspellings)
- `domain_technical.tsv`: Technical/scientific terms
- `edge_cases.tsv`: Adversarial examples (apostrophes, rare patterns)

## Provenance

- **Gold standard**: Curated by cdliai team (Jan 2026)
- **Dictionary entries**: From `turkish_lemma_dict.txt` (v0.5.0+)
- **Test cases**: From `tests/test_lemmatizer.py` regression tests

## Citation

If you use these test sets in research, please cite:

```bibtex
@software{durak_lemma_eval,
title = {Durak Turkish NLP Lemmatization Evaluation Sets},
author = {{CDLI AI}},
year = {2026},
url = {https://github.com/cdliai/durak}
}
```
Loading
Loading