cdliai · Copilot · Feb 21, 2026 · Feb 21, 2026 · Copilot · Feb 23, 2026
diff --git a/README.md b/README.md
@@ -78,9 +78,10 @@ from durak import process_text
 
 result = process_text(
     "Türkiye'de NLP zor!",
-    steps=["clean", "tokenize", "remove_stopwords"]
+    remove_stopwords=True,
+    attach_suffixes=True,
 )
-# ["türkiye'de", "nlp", "zor", "!"]
+# result.tokens => ["türkiye'de", "nlp", "zor", "!"]
 ```
 
 ### Build Blocks à la Carte

diff --git a/docs/USER_GUIDE.md b/docs/USER_GUIDE.md
@@ -77,6 +77,7 @@ config = ProcessorConfig(emoji_mode="remove")
 
 # Extract emojis separately
 config = ProcessorConfig(emoji_mode="extract")
+processor = TextProcessor(config)
 result = processor.process("Harika! 🎉")
 print(result.tokens)   # ['harika']
 print(result.emojis)   # ['🎉']

diff --git a/python/durak/ottoman/processor.py b/python/durak/ottoman/processor.py
@@ -335,11 +335,22 @@ def process(self, text: str) -> OttomanProcessingResult:
                 else:
                     output_token = original_token
 
-                # Create mapping for this token
+                # Create mapping for this token using document-level offsets
+                if orig_start >= 0 and orig_end >= orig_start:
+                    # Use document-level offsets derived from the full mapping
+                    token_char_mappings = [
+                        (orig_start, orig_end, trans_idx, trans_idx + len(token))
+                    ]
+                else:
+                    # Fallback: use token-local offsets if we could not resolve global ones
+                    token_char_mappings = [
+                        (0, len(original_token), 0, len(token))
+                    ]
+
                 token_mapping = TransliterationMapping(
                     original=original_token,
                     transliterated=token,
-                    char_mappings=[(0, len(original_token), 0, len(token))],
+                    char_mappings=token_char_mappings,
                 )
 
                 processed_tokens.append(output_token)
@@ -363,7 +374,20 @@ def process(self, text: str) -> OttomanProcessingResult:
                 ))
                 script_types.append(script_type)
 
-        # Step 6: Remove stopwords (using processed/modern tokens for matching)
+        # Step 6: Strip custom suffixes from processed tokens
+        if self.suffixes:
+            sorted_suffixes = sorted(self.suffixes, key=len, reverse=True)
+            stripped = []
+            for token in processed_tokens:
+                # Remove the longest matching suffix (at most one per token)
+                for suffix in sorted_suffixes:
+                    if token.endswith(suffix) and len(token) > len(suffix):
+                        token = token[: -len(suffix)]
+                        break
+                stripped.append(token)
+            processed_tokens = stripped
+
-            stripped = []
-            for token in processed_tokens:
-                # Remove the longest matching suffix (at most one per token)
-                for suffix in sorted_suffixes:
-                    if token.endswith(suffix) and len(token) > len(suffix):
-                        token = token[: -len(suffix)]
-                        break
-                stripped.append(token)
-            processed_tokens = stripped
+            stripped_tokens: list[str] = []
+            stripped_mappings: list[TransliterationMapping] = []
+            for idx, (token, mapping) in enumerate(zip(processed_tokens, offset_mappings)):
+                original_token = token
+                removed_len = 0
+                matched_suffix = ""
+                # Remove the longest matching suffix (at most one per token)
+                for suffix in sorted_suffixes:
+                    if token.endswith(suffix) and len(token) > len(suffix):
+                        removed_len = len(suffix)
+                        matched_suffix = suffix
+                        token = token[: -removed_len]
+                        break
+                stripped_tokens.append(token)
+
+                # Keep offset mappings consistent with suffix-stripped tokens.
+                # If we did not remove any suffix, reuse the existing mapping.
+                if removed_len == 0:
+                    stripped_mappings.append(mapping)
+                    continue
+
+                # Update the transliterated form by stripping the same suffix length.
+                transliterated = mapping.transliterated
+                if len(transliterated) >= removed_len:
+                    new_transliterated = transliterated[:-removed_len]
+                else:
+                    # Fallback: avoid negative slicing if lengths somehow mismatch.
+                    new_transliterated = ""
+
+                new_trans_len = len(new_transliterated)
+                new_char_mappings: list[tuple[int, int, int, int]] = []
+                for orig_start, orig_end, trans_start, trans_end in mapping.char_mappings:
+                    # Stop at the first mapping that starts at or past the new transliterated length.
+                    if trans_start >= new_trans_len:
+                        break
+                    # If the entire mapping lies within the new length, keep as is.
+                    if trans_end <= new_trans_len:
+                        new_char_mappings.append((orig_start, orig_end, trans_start, trans_end))
+                    else:
+                        # Truncate a partially overlapping mapping at the new end.
+                        new_char_mappings.append((orig_start, orig_end, trans_start, new_trans_len))
+                        break
+
+                stripped_mappings.append(
+                    TransliterationMapping(
+                        original=mapping.original,
+                        transliterated=new_transliterated,
+                        char_mappings=new_char_mappings,
+                    )
+                )
+
+            processed_tokens = stripped_tokens
+            offset_mappings = stripped_mappings
-            stripped = []
-            for token in processed_tokens:
-                # Remove the longest matching suffix (at most one per token)
-                for suffix in sorted_suffixes:
-                    if token.endswith(suffix) and len(token) > len(suffix):
-                        token = token[: -len(suffix)]
-                        break
-                stripped.append(token)
-            processed_tokens = stripped
+            stripped_tokens: list[str] = []
+            stripped_mappings: list[TransliterationMapping] = []
+            for idx, (token, mapping) in enumerate(zip(processed_tokens, offset_mappings)):
+                original_token = token
+                removed_len = 0
+                matched_suffix = ""
+                # Remove the longest matching suffix (at most one per token)
+                for suffix in sorted_suffixes:
+                    if token.endswith(suffix) and len(token) > len(suffix):
+                        removed_len = len(suffix)
+                        matched_suffix = suffix
+                        token = token[: -removed_len]
+                        break
+                stripped_tokens.append(token)
+
+                # Keep offset mappings consistent with suffix-stripped tokens.
+                # If we did not remove any suffix, reuse the existing mapping.
+                if removed_len == 0:
+                    stripped_mappings.append(mapping)
+                    continue
+
+                # Update the transliterated form by stripping the same suffix length.
+                transliterated = mapping.transliterated
+                if len(transliterated) >= removed_len:
+                    new_transliterated = transliterated[:-removed_len]
+                else:
+                    # Fallback: avoid negative slicing if lengths somehow mismatch.
+                    new_transliterated = ""
+
+                new_trans_len = len(new_transliterated)
+                new_char_mappings: list[tuple[int, int, int, int]] = []
+                for orig_start, orig_end, trans_start, trans_end in mapping.char_mappings:
+                    # Stop at the first mapping that starts at or past the new transliterated length.
+                    if trans_start >= new_trans_len:
+                        break
+                    # If the entire mapping lies within the new length, keep as is.
+                    if trans_end <= new_trans_len:
+                        new_char_mappings.append((orig_start, orig_end, trans_start, trans_end))
+                    else:
+                        # Truncate a partially overlapping mapping at the new end.
+                        new_char_mappings.append((orig_start, orig_end, trans_start, new_trans_len))
+                        break
+
+                stripped_mappings.append(
+                    TransliterationMapping(
+                        original=mapping.original,
+                        transliterated=new_transliterated,
+                        char_mappings=new_char_mappings,
+                    )
+                )
+
+            processed_tokens = stripped_tokens
+            offset_mappings = stripped_mappings
+        # Step 7: Remove stopwords (using processed/modern tokens for matching)
         if self.stopwords:
             filtered_indices = [
                 i for i, token in enumerate(processed_tokens)

diff --git a/python/durak/ottoman/transliterator.py b/python/durak/ottoman/transliterator.py
@@ -165,6 +165,7 @@ def arabic_to_latin(self, text: str) -> TransliterationMapping:
         mappings = []
         ambiguous = []
 
+        trans_pos = 0
         i = 0
         while i < len(text):
             char = text[i]
@@ -186,19 +187,21 @@ def arabic_to_latin(self, text: str) -> TransliterationMapping:
                 if char in AMBIGUOUS_MAPPINGS:
                     ambiguous.append((i, char, latin_char))
 
-                # Record mapping
-                trans_start = len("".join(transliterated_chars))
+                # Record mapping using running counter instead of O(n) join
+                trans_start = trans_pos
                 transliterated_chars.append(latin_char)
-                trans_end = len("".join(transliterated_chars))
+                trans_pos += len(latin_char)
+                trans_end = trans_pos
 
                 mappings.append((i, i + 1, trans_start, trans_end))
                 i += 1
             else:
                 # Non-Arabic character (space, punctuation, etc.)
                 # Pass through but record mapping
-                trans_start = len("".join(transliterated_chars))
+                trans_start = trans_pos
                 transliterated_chars.append(char)
-                trans_end = len("".join(transliterated_chars))
+                trans_pos += len(char)
+                trans_end = trans_pos
 
                 mappings.append((i, i + 1, trans_start, trans_end))
                 i += 1
@@ -225,6 +228,7 @@ def scholarly_to_modern(self, text: str) -> TransliterationMapping:
         result_chars = []
         mappings = []
 
+        trans_pos = 0
         i = 0
         while i < len(text):
             char = text[i]
@@ -237,16 +241,18 @@ def scholarly_to_modern(self, text: str) -> TransliterationMapping:
                     # Keep original if preserving and would be removed
                     modern = char
 
-                # Record mapping
-                trans_start = len("".join(result_chars))
+                # Record mapping using running counter instead of O(n) join
+                trans_start = trans_pos
                 result_chars.append(modern)
-                trans_end = len("".join(result_chars))
+                trans_pos += len(modern)
+                trans_end = trans_pos
                 mappings.append((i, i + 1, trans_start, trans_end))
             else:
                 # Pass through unchanged (regular Latin letters, spaces, etc.)
-                trans_start = len("".join(result_chars))
+                trans_start = trans_pos
                 result_chars.append(char)
-                trans_end = len("".join(result_chars))
+                trans_pos += len(char)
+                trans_end = trans_pos
                 mappings.append((i, i + 1, trans_start, trans_end))
 
             i += 1

diff --git a/python/durak/processor.py b/python/durak/processor.py
@@ -24,7 +24,7 @@
 from dataclasses import dataclass, field
 from typing import Callable, Literal
 
-from durak.cleaning import clean_text, normalize_case
+from durak.cleaning import clean_text, normalize_case, DEFAULT_CLEANING_STEPS
 from durak.exceptions import ConfigurationError
 from durak.lemmatizer import Lemmatizer
 from durak.stopwords import BASE_STOPWORDS, StopwordManager, remove_stopwords
@@ -152,28 +152,37 @@ def process(self, text: str) -> ProcessingResult:
         result = ProcessingResult()
 
         # Step 1: Clean text (with emoji handling)
+        # When lowercase=False, use a custom steps pipeline that omits case normalization
+        if self.config.lowercase:
+            cleaning_steps = None  # use DEFAULT_CLEANING_STEPS (includes lowercase)
+        else:
+            cleaning_steps = tuple(
+                step for step in DEFAULT_CLEANING_STEPS
+                if not (
+                    step is normalize_case
+                    or getattr(step, "func", None) is normalize_case
+                )
+            )
+
         if self.config.emoji_mode == "extract":
-            cleaned, emojis = clean_text(text, emoji_mode="extract")
+            cleaned, emojis = clean_text(text, steps=cleaning_steps, emoji_mode="extract")
             result.emojis = emojis
         else:
-            cleaned = clean_text(text, emoji_mode=self.config.emoji_mode)
-
-        # Step 2: Additional lowercase normalization if needed
-        # (clean_text already lowercases via DEFAULT_CLEANING_STEPS)
+            cleaned = clean_text(text, steps=cleaning_steps, emoji_mode=self.config.emoji_mode)
 
-        # Step 3: Tokenize
+        # Step 2: Tokenize
         tokens = tokenize(cleaned, strip_punct=self.config.remove_punctuation)
 
-        # Step 4: Reattach detached suffixes
+        # Step 3: Reattach detached suffixes
         if self.config.attach_suffixes:
             tokens = attach_detached_suffixes(tokens)
 
-        # Step 5: Lemmatize (before stopword removal to help with matching)
+        # Step 4: Lemmatize (before stopword removal to help with matching)
         if self.config.lemmatize and self.config.lemmatizer:
             lemmas = [self.config.lemmatizer(token) for token in tokens]
             result.lemmas = lemmas
 
-        # Step 6: Remove stopwords
+        # Step 5: Remove stopwords
         if self.config.remove_stopwords and self.config.stopword_manager:
             # Filter both tokens and lemmas together
             filtered_indices = [

diff --git a/python/durak/stats/frequencies.py b/python/durak/stats/frequencies.py
@@ -8,7 +8,7 @@
 
 Example:
     >>> from durak.stats import FrequencyCounter, ngrams
-    >>> from durak import TextProcessor
+    >>> from durak import TextProcessor, ProcessorConfig
     >>> 
     >>> processor = TextProcessor(ProcessorConfig(lemmatize=True))
     >>> texts = ["Kitap okuyorum.", "Kitap yazıyorum."]