From a1a28257a7f11957b9c060e74c5dfc94a4e78875 Mon Sep 17 00:00:00 2001 From: Jigar <1252580+artvandelay@users.noreply.github.com> Date: Sat, 10 Jan 2026 00:51:57 -0800 Subject: [PATCH] Add term tokenization and counters --- src/nlbt/terms/__init__.py | 21 ++++++++ src/nlbt/terms/counters.py | 98 +++++++++++++++++++++++++++++++++++++ src/nlbt/terms/diff.py | 45 +++++++++++++++++ src/nlbt/terms/ngrams.py | 50 +++++++++++++++++++ src/nlbt/terms/tokenizer.py | 53 ++++++++++++++++++++ tests/test_terms.py | 47 ++++++++++++++++++ 6 files changed, 314 insertions(+) create mode 100644 src/nlbt/terms/__init__.py create mode 100644 src/nlbt/terms/counters.py create mode 100644 src/nlbt/terms/diff.py create mode 100644 src/nlbt/terms/ngrams.py create mode 100644 src/nlbt/terms/tokenizer.py create mode 100644 tests/test_terms.py diff --git a/src/nlbt/terms/__init__.py b/src/nlbt/terms/__init__.py new file mode 100644 index 0000000..a4e154f --- /dev/null +++ b/src/nlbt/terms/__init__.py @@ -0,0 +1,21 @@ +from .counters import TermBucketStats, TermCounterStore, TermWindowStats +from .diff import DiffTerms, DiffTokens, extract_diff_terms, tokenize_diff +from .ngrams import TermCandidates, extract_term_candidates, generate_ngrams +from .tokenizer import TokenizedText, extract_capitalized_phrases, tokenize, tokenize_text + +__all__ = [ + "TokenizedText", + "tokenize", + "tokenize_text", + "extract_capitalized_phrases", + "DiffTokens", + "DiffTerms", + "tokenize_diff", + "extract_diff_terms", + "generate_ngrams", + "TermCandidates", + "extract_term_candidates", + "TermBucketStats", + "TermWindowStats", + "TermCounterStore", +] diff --git a/src/nlbt/terms/counters.py b/src/nlbt/terms/counters.py new file mode 100644 index 0000000..70247eb --- /dev/null +++ b/src/nlbt/terms/counters.py @@ -0,0 +1,98 @@ +from __future__ import annotations + +from dataclasses import dataclass, field +from datetime import datetime, timedelta +from typing import Dict, Iterable, List, Set + + +@dataclass 
+class TermBucketStats: + added: int = 0 + removed: int = 0 + pages: Set[str] = field(default_factory=set) + editors: Set[str] = field(default_factory=set) + + def record(self, delta_added: int, delta_removed: int, page_id: str, editor_id: str) -> None: + self.added += delta_added + self.removed += delta_removed + if page_id: + self.pages.add(page_id) + if editor_id: + self.editors.add(editor_id) + + +@dataclass(frozen=True) +class TermWindowStats: + term: str + added: int + removed: int + pages: Set[str] + editors: Set[str] + + +class TermCounterStore: + """Time-bucketed counters for term additions/removals.""" + + def __init__(self, bucket_size: timedelta = timedelta(hours=1)) -> None: + if bucket_size.total_seconds() <= 0: + raise ValueError("bucket_size must be positive") + self.bucket_size = bucket_size + self.table: Dict[datetime, Dict[str, TermBucketStats]] = {} + + def _bucket_start(self, timestamp: datetime) -> datetime: + bucket_seconds = int(self.bucket_size.total_seconds()) + epoch = int(timestamp.timestamp()) + bucket_epoch = epoch - (epoch % bucket_seconds) + return datetime.fromtimestamp(bucket_epoch, tz=timestamp.tzinfo) + + def add_terms( + self, + terms_added: Iterable[str], + terms_removed: Iterable[str], + page_id: str, + editor_id: str, + timestamp: datetime, + ) -> None: + bucket = self._bucket_start(timestamp) + bucket_terms = self.table.setdefault(bucket, {}) + + for term in terms_added: + stats = bucket_terms.setdefault(term, TermBucketStats()) + stats.record(delta_added=1, delta_removed=0, page_id=page_id, editor_id=editor_id) + + for term in terms_removed: + stats = bucket_terms.setdefault(term, TermBucketStats()) + stats.record(delta_added=0, delta_removed=1, page_id=page_id, editor_id=editor_id) + + def _iter_buckets(self, start: datetime, end: datetime) -> Iterable[Dict[str, TermBucketStats]]: + for bucket_time, term_map in self.table.items(): + if start <= bucket_time <= end: + yield term_map + + def get_window_stats(self, term: str, 
end: datetime, window: timedelta) -> TermWindowStats: + start = end - window + added = 0 + removed = 0 + pages: Set[str] = set() + editors: Set[str] = set() + for term_map in self._iter_buckets(start, end): + stats = term_map.get(term) + if stats: + added += stats.added + removed += stats.removed + pages.update(stats.pages) + editors.update(stats.editors) + return TermWindowStats(term=term, added=added, removed=removed, pages=pages, editors=editors) + + def get_rollups(self, term: str, now: datetime) -> Dict[str, TermWindowStats]: + return { + "24h": self.get_window_stats(term, now, timedelta(hours=24)), + "7d": self.get_window_stats(term, now, timedelta(days=7)), + "30d": self.get_window_stats(term, now, timedelta(days=30)), + } + + def list_terms(self) -> List[str]: + terms: Set[str] = set() + for term_map in self.table.values(): + terms.update(term_map.keys()) + return sorted(terms) diff --git a/src/nlbt/terms/diff.py b/src/nlbt/terms/diff.py new file mode 100644 index 0000000..b3e2884 --- /dev/null +++ b/src/nlbt/terms/diff.py @@ -0,0 +1,45 @@ +from __future__ import annotations + +from dataclasses import dataclass +from typing import Callable + +from .ngrams import TermCandidates, extract_term_candidates +from .tokenizer import TokenizedText, tokenize_text + + +@dataclass(frozen=True) +class DiffTerms: + added: TermCandidates + removed: TermCandidates + + +@dataclass(frozen=True) +class DiffTokens: + added: TokenizedText + removed: TokenizedText + + +def tokenize_diff( + added_text: str, + removed_text: str, + normalize: Callable[[str], str] | None = None, +) -> DiffTokens: + return DiffTokens( + added=tokenize_text(added_text, normalize=normalize), + removed=tokenize_text(removed_text, normalize=normalize), + ) + + +def extract_diff_terms( + added_text: str, + removed_text: str, + min_n: int = 1, + max_n: int = 4, + normalize: Callable[[str], str] | None = str.lower, +) -> DiffTerms: + return DiffTerms( + added=extract_term_candidates(added_text, min_n=min_n, 
max_n=max_n, normalize=normalize), + removed=extract_term_candidates( + removed_text, min_n=min_n, max_n=max_n, normalize=normalize + ), + ) diff --git a/src/nlbt/terms/ngrams.py b/src/nlbt/terms/ngrams.py new file mode 100644 index 0000000..1603d48 --- /dev/null +++ b/src/nlbt/terms/ngrams.py @@ -0,0 +1,50 @@ +from __future__ import annotations + +from dataclasses import dataclass +from typing import Callable, Iterable, List + +from .tokenizer import extract_capitalized_phrases, tokenize + + +@dataclass(frozen=True) +class TermCandidates: + tokens: List[str] + ngrams: List[str] + proper_nouns: List[str] + + +def generate_ngrams( + tokens: Iterable[str], + min_n: int = 1, + max_n: int = 4, + normalize: Callable[[str], str] | None = str.lower, +) -> List[str]: + """Generate n-grams from tokens. + + Args: + tokens: Input token sequence. + min_n: Minimum n-gram size. + max_n: Maximum n-gram size. + normalize: Optional normalization function applied per token. + """ + token_list = list(tokens) + if normalize is not None: + token_list = [normalize(token) for token in token_list] + ngrams: List[str] = [] + count = len(token_list) + for n in range(min_n, max_n + 1): + for start in range(0, max(count - n + 1, 0)): + ngrams.append(" ".join(token_list[start : start + n])) + return ngrams + + +def extract_term_candidates( + text: str, + min_n: int = 1, + max_n: int = 4, + normalize: Callable[[str], str] | None = str.lower, +) -> TermCandidates: + tokens = tokenize(text) + ngrams = generate_ngrams(tokens, min_n=min_n, max_n=max_n, normalize=normalize) + proper_nouns = extract_capitalized_phrases(tokens) + return TermCandidates(tokens=tokens, ngrams=ngrams, proper_nouns=proper_nouns) diff --git a/src/nlbt/terms/tokenizer.py b/src/nlbt/terms/tokenizer.py new file mode 100644 index 0000000..fcad3d4 --- /dev/null +++ b/src/nlbt/terms/tokenizer.py @@ -0,0 +1,53 @@ +from __future__ import annotations + +import re +from dataclasses import dataclass +from typing import Callable, 
Iterable, List + +WORD_RE = re.compile(r"[A-Za-z0-9]+(?:'[A-Za-z0-9]+)?") + + +@dataclass(frozen=True) +class TokenizedText: + text: str + tokens: List[str] + + +def tokenize(text: str, normalize: Callable[[str], str] | None = None) -> List[str]: + """Tokenize text into word-like tokens. + + Args: + text: Input text. + normalize: Optional normalization function applied to each token. + """ + tokens = WORD_RE.findall(text or "") + if normalize is None: + return tokens + return [normalize(token) for token in tokens] + + +def tokenize_text(text: str, normalize: Callable[[str], str] | None = None) -> TokenizedText: + return TokenizedText(text=text, tokens=tokenize(text, normalize=normalize)) + + +def is_capitalized(token: str) -> bool: + return bool(token) and token[0].isupper() + + +def extract_capitalized_phrases(tokens: Iterable[str]) -> List[str]: + """Extract contiguous sequences of capitalized tokens. + + Example: ["New", "York", "Times"] -> ["New York Times"]. + """ + phrases: List[str] = [] + current: List[str] = [] + for token in tokens: + if is_capitalized(token): + current.append(token) + else: + if current: + phrases.append(" ".join(current)) + current = [] + if current: + phrases.append(" ".join(current)) + return phrases diff --git a/tests/test_terms.py b/tests/test_terms.py new file mode 100644 index 0000000..e9ac942 --- /dev/null +++ b/tests/test_terms.py @@ -0,0 +1,47 @@ +#!/usr/bin/env python3 +"""Tests for term tokenization and n-gram extraction.""" + +import sys +sys.path.insert(0, 'src') + +from nlbt.terms import extract_diff_terms, tokenize_diff + + +print("Testing term tokenization and n-gram extraction") +print("=" * 50) + +added_text = "the article mentions New York Times and OpenAI research." +removed_text = "removed old version of beta api." 
# --- Test 1: raw tokenization of both diff sides ---------------------------
print("\nāœ“ Test 1: Tokenize added/removed text")
tok_diff = tokenize_diff(added_text, removed_text)
expected_added_tokens = "the article mentions New York Times and OpenAI research".split()
assert tok_diff.added.tokens == expected_added_tokens
assert tok_diff.removed.tokens == "removed old version of beta api".split()
print(" Tokens extracted")

# --- Test 2: n-gram + proper-noun candidate extraction ---------------------
print("\nāœ“ Test 2: N-gram and proper noun extraction")
term_diff = extract_diff_terms(added_text, removed_text)
ngram_set = set(term_diff.added.ngrams)
proper_set = set(term_diff.added.proper_nouns)

# Lowercased n-grams up to the default max_n=4 should cover these spans.
for expected_ngram in ("new york", "new york times", "openai", "openai research"):
    assert expected_ngram in ngram_set

# Proper-noun phrases keep original casing.
for expected_phrase in ("New York Times", "OpenAI"):
    assert expected_phrase in proper_set
print(" N-grams and proper nouns captured")

print("\n" + "=" * 50)
print("āœ… Term tests passed!")