diff --git a/README.md b/README.md index b492da8..e7b0b19 100644 --- a/README.md +++ b/README.md @@ -78,9 +78,10 @@ from durak import process_text result = process_text( "Türkiye'de NLP zor!", - steps=["clean", "tokenize", "remove_stopwords"] + remove_stopwords=True, + attach_suffixes=True, ) -# ["türkiye'de", "nlp", "zor", "!"] +# result.tokens => ["türkiye'de", "nlp", "zor", "!"] ``` ### Build Blocks à la Carte diff --git a/docs/USER_GUIDE.md b/docs/USER_GUIDE.md index e0d44f0..b9f800e 100644 --- a/docs/USER_GUIDE.md +++ b/docs/USER_GUIDE.md @@ -77,6 +77,7 @@ config = ProcessorConfig(emoji_mode="remove") # Extract emojis separately config = ProcessorConfig(emoji_mode="extract") +processor = TextProcessor(config) result = processor.process("Harika! 🎉") print(result.tokens) # ['harika'] print(result.emojis) # ['🎉'] diff --git a/python/durak/ottoman/processor.py b/python/durak/ottoman/processor.py index 0dc7f41..fe94364 100644 --- a/python/durak/ottoman/processor.py +++ b/python/durak/ottoman/processor.py @@ -335,11 +335,22 @@ def process(self, text: str) -> OttomanProcessingResult: else: output_token = original_token - # Create mapping for this token + # Create mapping for this token using document-level offsets + if orig_start >= 0 and orig_end >= orig_start: + # Use document-level offsets derived from the full mapping + token_char_mappings = [ + (orig_start, orig_end, trans_idx, trans_idx + len(token)) + ] + else: + # Fallback: use token-local offsets if we could not resolve global ones + token_char_mappings = [ + (0, len(original_token), 0, len(token)) + ] + token_mapping = TransliterationMapping( original=original_token, transliterated=token, - char_mappings=[(0, len(original_token), 0, len(token))], + char_mappings=token_char_mappings, ) processed_tokens.append(output_token) @@ -363,7 +374,20 @@ def process(self, text: str) -> OttomanProcessingResult: )) script_types.append(script_type) - # Step 6: Remove stopwords (using processed/modern tokens for matching) + # Step 6: Strip custom suffixes from processed tokens + if self.suffixes: + sorted_suffixes = sorted(self.suffixes, key=len, reverse=True) + stripped = [] + for token in processed_tokens: + # Remove the longest matching suffix (at most one per token) + for suffix in sorted_suffixes: + if token.endswith(suffix) and len(token) > len(suffix): + token = token[: -len(suffix)] + break + stripped.append(token) + processed_tokens = stripped + + # Step 7: Remove stopwords (using processed/modern tokens for matching) if self.stopwords: filtered_indices = [ i for i, token in enumerate(processed_tokens) diff --git a/python/durak/ottoman/transliterator.py b/python/durak/ottoman/transliterator.py index c30e0e6..0cf777e 100644 --- a/python/durak/ottoman/transliterator.py +++ b/python/durak/ottoman/transliterator.py @@ -165,6 +165,7 @@ def arabic_to_latin(self, text: str) -> TransliterationMapping: mappings = [] ambiguous = [] + trans_pos = 0 i = 0 while i < len(text): char = text[i] @@ -186,19 +187,21 @@ def arabic_to_latin(self, text: str) -> TransliterationMapping: if char in AMBIGUOUS_MAPPINGS: ambiguous.append((i, char, latin_char)) - # Record mapping - trans_start = len("".join(transliterated_chars)) + # Record mapping using running counter instead of O(n) join + trans_start = trans_pos transliterated_chars.append(latin_char) - trans_end = len("".join(transliterated_chars)) + trans_pos += len(latin_char) + trans_end = trans_pos mappings.append((i, i + 1, trans_start, trans_end)) i += 1 else: # Non-Arabic character (space, punctuation, etc.) # Pass through but record mapping - trans_start = len("".join(transliterated_chars)) + trans_start = trans_pos transliterated_chars.append(char) - trans_end = len("".join(transliterated_chars)) + trans_pos += len(char) + trans_end = trans_pos mappings.append((i, i + 1, trans_start, trans_end)) i += 1 @@ -225,6 +228,7 @@ def scholarly_to_modern(self, text: str) -> TransliterationMapping: result_chars = [] mappings = [] + trans_pos = 0 i = 0 while i < len(text): char = text[i] @@ -237,16 +241,18 @@ def scholarly_to_modern(self, text: str) -> TransliterationMapping: # Keep original if preserving and would be removed modern = char - # Record mapping - trans_start = len("".join(result_chars)) + # Record mapping using running counter instead of O(n) join + trans_start = trans_pos result_chars.append(modern) - trans_end = len("".join(result_chars)) + trans_pos += len(modern) + trans_end = trans_pos mappings.append((i, i + 1, trans_start, trans_end)) else: # Pass through unchanged (regular Latin letters, spaces, etc.) - trans_start = len("".join(result_chars)) + trans_start = trans_pos result_chars.append(char) - trans_end = len("".join(result_chars)) + trans_pos += len(char) + trans_end = trans_pos mappings.append((i, i + 1, trans_start, trans_end)) i += 1 diff --git a/python/durak/processor.py b/python/durak/processor.py index 7d4eeb7..a657265 100644 --- a/python/durak/processor.py +++ b/python/durak/processor.py @@ -24,7 +24,7 @@ from dataclasses import dataclass, field from typing import Callable, Literal -from durak.cleaning import clean_text, normalize_case +from durak.cleaning import clean_text, normalize_case, DEFAULT_CLEANING_STEPS from durak.exceptions import ConfigurationError from durak.lemmatizer import Lemmatizer from durak.stopwords import BASE_STOPWORDS, StopwordManager, remove_stopwords @@ -152,28 +152,37 @@ def process(self, text: str) -> ProcessingResult: result = ProcessingResult() # Step 1: Clean text (with emoji handling) + # When lowercase=False, use a custom steps pipeline that omits case normalization + if self.config.lowercase: + cleaning_steps = None # use DEFAULT_CLEANING_STEPS (includes lowercase) + else: + cleaning_steps = tuple( + step for step in DEFAULT_CLEANING_STEPS + if not ( + step is normalize_case + or getattr(step, "func", None) is normalize_case + ) + ) + if self.config.emoji_mode == "extract": - cleaned, emojis = clean_text(text, emoji_mode="extract") + cleaned, emojis = clean_text(text, steps=cleaning_steps, emoji_mode="extract") result.emojis = emojis else: - cleaned = clean_text(text, emoji_mode=self.config.emoji_mode) - - # Step 2: Additional lowercase normalization if needed - # (clean_text already lowercases via DEFAULT_CLEANING_STEPS) + cleaned = clean_text(text, steps=cleaning_steps, emoji_mode=self.config.emoji_mode) - # Step 3: Tokenize + # Step 2: Tokenize tokens = tokenize(cleaned, strip_punct=self.config.remove_punctuation) - # Step 4: Reattach detached suffixes + # Step 3: Reattach detached suffixes if self.config.attach_suffixes: tokens = attach_detached_suffixes(tokens) - # Step 5: Lemmatize (before stopword removal to help with matching) + # Step 4: Lemmatize (before stopword removal to help with matching) if self.config.lemmatize and self.config.lemmatizer: lemmas = [self.config.lemmatizer(token) for token in tokens] result.lemmas = lemmas - # Step 6: Remove stopwords + # Step 5: Remove stopwords if self.config.remove_stopwords and self.config.stopword_manager: # Filter both tokens and lemmas together filtered_indices = [ diff --git a/python/durak/stats/frequencies.py b/python/durak/stats/frequencies.py index a0fb9dc..bb59a3c 100644 --- a/python/durak/stats/frequencies.py +++ b/python/durak/stats/frequencies.py @@ -8,7 +8,7 @@ Example: >>> from durak.stats import FrequencyCounter, ngrams - >>> from durak import TextProcessor + >>> from durak import TextProcessor, ProcessorConfig >>> >>> processor = TextProcessor(ProcessorConfig(lemmatize=True)) >>> texts = ["Kitap okuyorum.", "Kitap yazıyorum."]