Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -78,9 +78,10 @@ from durak import process_text

result = process_text(
"Türkiye'de NLP zor!",
steps=["clean", "tokenize", "remove_stopwords"]
remove_stopwords=True,
attach_suffixes=True,
)
# ["türkiye'de", "nlp", "zor", "!"]
# result.tokens => ["türkiye'de", "nlp", "zor", "!"]
```

### Build Blocks à la Carte
Expand Down
1 change: 1 addition & 0 deletions docs/USER_GUIDE.md
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ config = ProcessorConfig(emoji_mode="remove")

# Extract emojis separately
config = ProcessorConfig(emoji_mode="extract")
processor = TextProcessor(config)
result = processor.process("Harika! 🎉")
print(result.tokens) # ['harika']
print(result.emojis) # ['🎉']
Expand Down
30 changes: 27 additions & 3 deletions python/durak/ottoman/processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -335,11 +335,22 @@ def process(self, text: str) -> OttomanProcessingResult:
else:
output_token = original_token

# Create mapping for this token
# Create mapping for this token using document-level offsets
if orig_start >= 0 and orig_end >= orig_start:
# Use document-level offsets derived from the full mapping
token_char_mappings = [
(orig_start, orig_end, trans_idx, trans_idx + len(token))
]
else:
# Fallback: use token-local offsets if we could not resolve global ones
token_char_mappings = [
(0, len(original_token), 0, len(token))
]

token_mapping = TransliterationMapping(
original=original_token,
transliterated=token,
char_mappings=[(0, len(original_token), 0, len(token))],
char_mappings=token_char_mappings,
)

processed_tokens.append(output_token)
Expand All @@ -363,7 +374,20 @@ def process(self, text: str) -> OttomanProcessingResult:
))
script_types.append(script_type)

# Step 6: Remove stopwords (using processed/modern tokens for matching)
# Step 6: Strip custom suffixes from processed tokens
if self.suffixes:
sorted_suffixes = sorted(self.suffixes, key=len, reverse=True)
stripped = []
for token in processed_tokens:
# Remove the longest matching suffix (at most one per token)
for suffix in sorted_suffixes:
if token.endswith(suffix) and len(token) > len(suffix):
token = token[: -len(suffix)]
break
stripped.append(token)
processed_tokens = stripped

Comment on lines +380 to +389

Copilot AI Feb 23, 2026

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suffix stripping introduces an inconsistency: result.tokens contains the suffix-stripped tokens, but result.offset_mappings[i].transliterated still holds each token as it was before suffix removal. This breaks the invariant that result.tokens[i] matches result.offset_mappings[i].transliterated. Either rebuild the offset mappings after suffix stripping so that both the transliterated field and the char_mappings describe the tokens actually returned, or at minimum update the transliterated field to match the stripped token.

Suggested change
stripped = []
for token in processed_tokens:
# Remove the longest matching suffix (at most one per token)
for suffix in sorted_suffixes:
if token.endswith(suffix) and len(token) > len(suffix):
token = token[: -len(suffix)]
break
stripped.append(token)
processed_tokens = stripped
stripped_tokens: list[str] = []
stripped_mappings: list[TransliterationMapping] = []
for idx, (token, mapping) in enumerate(zip(processed_tokens, offset_mappings)):
original_token = token
removed_len = 0
matched_suffix = ""
# Remove the longest matching suffix (at most one per token)
for suffix in sorted_suffixes:
if token.endswith(suffix) and len(token) > len(suffix):
removed_len = len(suffix)
matched_suffix = suffix
token = token[: -removed_len]
break
stripped_tokens.append(token)
# Keep offset mappings consistent with suffix-stripped tokens.
# If we did not remove any suffix, reuse the existing mapping.
if removed_len == 0:
stripped_mappings.append(mapping)
continue
# Update the transliterated form by stripping the same suffix length.
transliterated = mapping.transliterated
if len(transliterated) >= removed_len:
new_transliterated = transliterated[:-removed_len]
else:
# Fallback: avoid negative slicing if lengths somehow mismatch.
new_transliterated = ""
new_trans_len = len(new_transliterated)
new_char_mappings: list[tuple[int, int, int, int]] = []
for orig_start, orig_end, trans_start, trans_end in mapping.char_mappings:
# Stop at the first mapping that starts at or beyond the new transliterated length.
if trans_start >= new_trans_len:
break
# If the entire mapping lies within the new length, keep as is.
if trans_end <= new_trans_len:
new_char_mappings.append((orig_start, orig_end, trans_start, trans_end))
else:
# Truncate a partially overlapping mapping at the new end.
new_char_mappings.append((orig_start, orig_end, trans_start, new_trans_len))
break
stripped_mappings.append(
TransliterationMapping(
original=mapping.original,
transliterated=new_transliterated,
char_mappings=new_char_mappings,
)
)
processed_tokens = stripped_tokens
offset_mappings = stripped_mappings

Copilot uses AI. Check for mistakes.
# Step 7: Remove stopwords (using processed/modern tokens for matching)
if self.stopwords:
filtered_indices = [
i for i, token in enumerate(processed_tokens)
Expand Down
26 changes: 16 additions & 10 deletions python/durak/ottoman/transliterator.py
Original file line number Diff line number Diff line change
Expand Up @@ -165,6 +165,7 @@ def arabic_to_latin(self, text: str) -> TransliterationMapping:
mappings = []
ambiguous = []

trans_pos = 0
i = 0
while i < len(text):
char = text[i]
Expand All @@ -186,19 +187,21 @@ def arabic_to_latin(self, text: str) -> TransliterationMapping:
if char in AMBIGUOUS_MAPPINGS:
ambiguous.append((i, char, latin_char))

# Record mapping
trans_start = len("".join(transliterated_chars))
# Record mapping using running counter instead of O(n) join
trans_start = trans_pos
transliterated_chars.append(latin_char)
trans_end = len("".join(transliterated_chars))
trans_pos += len(latin_char)
trans_end = trans_pos

mappings.append((i, i + 1, trans_start, trans_end))
i += 1
else:
# Non-Arabic character (space, punctuation, etc.)
# Pass through but record mapping
trans_start = len("".join(transliterated_chars))
trans_start = trans_pos
transliterated_chars.append(char)
trans_end = len("".join(transliterated_chars))
trans_pos += len(char)
trans_end = trans_pos

mappings.append((i, i + 1, trans_start, trans_end))
i += 1
Expand All @@ -225,6 +228,7 @@ def scholarly_to_modern(self, text: str) -> TransliterationMapping:
result_chars = []
mappings = []

trans_pos = 0
i = 0
while i < len(text):
char = text[i]
Expand All @@ -237,16 +241,18 @@ def scholarly_to_modern(self, text: str) -> TransliterationMapping:
# Keep original if preserving and would be removed
modern = char

# Record mapping
trans_start = len("".join(result_chars))
# Record mapping using running counter instead of O(n) join
trans_start = trans_pos
result_chars.append(modern)
trans_end = len("".join(result_chars))
trans_pos += len(modern)
trans_end = trans_pos
mappings.append((i, i + 1, trans_start, trans_end))
else:
# Pass through unchanged (regular Latin letters, spaces, etc.)
trans_start = len("".join(result_chars))
trans_start = trans_pos
result_chars.append(char)
trans_end = len("".join(result_chars))
trans_pos += len(char)
trans_end = trans_pos
mappings.append((i, i + 1, trans_start, trans_end))

i += 1
Expand Down
29 changes: 19 additions & 10 deletions python/durak/processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
from dataclasses import dataclass, field
from typing import Callable, Literal

from durak.cleaning import clean_text, normalize_case
from durak.cleaning import clean_text, normalize_case, DEFAULT_CLEANING_STEPS
from durak.exceptions import ConfigurationError
from durak.lemmatizer import Lemmatizer
from durak.stopwords import BASE_STOPWORDS, StopwordManager, remove_stopwords
Expand Down Expand Up @@ -152,28 +152,37 @@ def process(self, text: str) -> ProcessingResult:
result = ProcessingResult()

# Step 1: Clean text (with emoji handling)
# When lowercase=False, use a custom steps pipeline that omits case normalization
if self.config.lowercase:
cleaning_steps = None # use DEFAULT_CLEANING_STEPS (includes lowercase)
else:
cleaning_steps = tuple(
step for step in DEFAULT_CLEANING_STEPS
if not (
step is normalize_case
or getattr(step, "func", None) is normalize_case
)
)

if self.config.emoji_mode == "extract":
cleaned, emojis = clean_text(text, emoji_mode="extract")
cleaned, emojis = clean_text(text, steps=cleaning_steps, emoji_mode="extract")
result.emojis = emojis
else:
cleaned = clean_text(text, emoji_mode=self.config.emoji_mode)

# Step 2: Additional lowercase normalization if needed
# (clean_text already lowercases via DEFAULT_CLEANING_STEPS)
cleaned = clean_text(text, steps=cleaning_steps, emoji_mode=self.config.emoji_mode)

# Step 3: Tokenize
# Step 2: Tokenize
tokens = tokenize(cleaned, strip_punct=self.config.remove_punctuation)

# Step 4: Reattach detached suffixes
# Step 3: Reattach detached suffixes
if self.config.attach_suffixes:
tokens = attach_detached_suffixes(tokens)

# Step 5: Lemmatize (before stopword removal to help with matching)
# Step 4: Lemmatize (before stopword removal to help with matching)
if self.config.lemmatize and self.config.lemmatizer:
lemmas = [self.config.lemmatizer(token) for token in tokens]
result.lemmas = lemmas

# Step 6: Remove stopwords
# Step 5: Remove stopwords
if self.config.remove_stopwords and self.config.stopword_manager:
# Filter both tokens and lemmas together
filtered_indices = [
Expand Down
2 changes: 1 addition & 1 deletion python/durak/stats/frequencies.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

Example:
>>> from durak.stats import FrequencyCounter, ngrams
>>> from durak import TextProcessor
>>> from durak import TextProcessor, ProcessorConfig
>>>
>>> processor = TextProcessor(ProcessorConfig(lemmatize=True))
>>> texts = ["Kitap okuyorum.", "Kitap yazıyorum."]
Expand Down