21 changes: 21 additions & 0 deletions src/nlbt/terms/__init__.py
@@ -0,0 +1,21 @@
from .counters import TermBucketStats, TermCounterStore, TermWindowStats
from .diff import DiffTerms, DiffTokens, extract_diff_terms, tokenize_diff
from .ngrams import TermCandidates, extract_term_candidates, generate_ngrams
from .tokenizer import TokenizedText, extract_capitalized_phrases, tokenize, tokenize_text

__all__ = [
"TokenizedText",
"tokenize",
"tokenize_text",
"extract_capitalized_phrases",
"DiffTokens",
"DiffTerms",
"tokenize_diff",
"extract_diff_terms",
"generate_ngrams",
"TermCandidates",
"extract_term_candidates",
"TermBucketStats",
"TermWindowStats",
"TermCounterStore",
]
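For orientation, the re-exports give the package a single import surface. A minimal sketch (the sample text is illustrative, not from the PR):

from nlbt.terms import extract_term_candidates

candidates = extract_term_candidates("OpenAI released new research")
print(candidates.proper_nouns)  # ['OpenAI']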
98 changes: 98 additions & 0 deletions src/nlbt/terms/counters.py
@@ -0,0 +1,98 @@
from __future__ import annotations

from dataclasses import dataclass, field
from datetime import datetime, timedelta
from typing import Dict, Iterable, List, Set


@dataclass
class TermBucketStats:
added: int = 0
removed: int = 0
pages: Set[str] = field(default_factory=set)
editors: Set[str] = field(default_factory=set)

def record(self, delta_added: int, delta_removed: int, page_id: str, editor_id: str) -> None:
self.added += delta_added
self.removed += delta_removed
if page_id:
self.pages.add(page_id)
if editor_id:
self.editors.add(editor_id)


@dataclass(frozen=True)
class TermWindowStats:
term: str
added: int
removed: int
pages: Set[str]
editors: Set[str]


class TermCounterStore:
"""Time-bucketed counters for term additions/removals."""

def __init__(self, bucket_size: timedelta = timedelta(hours=1)) -> None:
if bucket_size.total_seconds() <= 0:
raise ValueError("bucket_size must be positive")
self.bucket_size = bucket_size
self.table: Dict[datetime, Dict[str, TermBucketStats]] = {}

def _bucket_start(self, timestamp: datetime) -> datetime:
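        # Floor the timestamp to the start of its fixed-size, epoch-aligned bucket.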
bucket_seconds = int(self.bucket_size.total_seconds())
epoch = int(timestamp.timestamp())
bucket_epoch = epoch - (epoch % bucket_seconds)
return datetime.fromtimestamp(bucket_epoch, tz=timestamp.tzinfo)

def add_terms(
self,
terms_added: Iterable[str],
terms_removed: Iterable[str],
page_id: str,
editor_id: str,
timestamp: datetime,
) -> None:
bucket = self._bucket_start(timestamp)
bucket_terms = self.table.setdefault(bucket, {})

for term in terms_added:
stats = bucket_terms.setdefault(term, TermBucketStats())
stats.record(delta_added=1, delta_removed=0, page_id=page_id, editor_id=editor_id)

for term in terms_removed:
stats = bucket_terms.setdefault(term, TermBucketStats())
stats.record(delta_added=0, delta_removed=1, page_id=page_id, editor_id=editor_id)

def _iter_buckets(self, start: datetime, end: datetime) -> Iterable[Dict[str, TermBucketStats]]:
for bucket_time, term_map in self.table.items():
if start <= bucket_time <= end:
yield term_map

def get_window_stats(self, term: str, end: datetime, window: timedelta) -> TermWindowStats:
start = end - window
added = 0
removed = 0
pages: Set[str] = set()
editors: Set[str] = set()
for term_map in self._iter_buckets(start, end):
stats = term_map.get(term)
if stats:
added += stats.added
removed += stats.removed
pages.update(stats.pages)
editors.update(stats.editors)
return TermWindowStats(term=term, added=added, removed=removed, pages=pages, editors=editors)

def get_rollups(self, term: str, now: datetime) -> Dict[str, TermWindowStats]:
return {
"24h": self.get_window_stats(term, now, timedelta(hours=24)),
"7d": self.get_window_stats(term, now, timedelta(days=7)),
"30d": self.get_window_stats(term, now, timedelta(days=30)),
}

def list_terms(self) -> List[str]:
terms: Set[str] = set()
for term_map in self.table.values():
terms.update(term_map.keys())
return sorted(terms)
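A minimal usage sketch for TermCounterStore; the page/editor IDs and timestamp below are made up for illustration:

from datetime import datetime, timedelta, timezone

from nlbt.terms import TermCounterStore

store = TermCounterStore(bucket_size=timedelta(hours=1))
now = datetime(2024, 1, 15, 12, 30, tzinfo=timezone.utc)

# One revision that added "openai" and removed "beta api".
store.add_terms(
    terms_added=["openai"],
    terms_removed=["beta api"],
    page_id="page-1",
    editor_id="editor-1",
    timestamp=now,
)

rollups = store.get_rollups("openai", now=now)
print(rollups["24h"].added)  # 1
print(store.list_terms())    # ['beta api', 'openai']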
45 changes: 45 additions & 0 deletions src/nlbt/terms/diff.py
@@ -0,0 +1,45 @@
from __future__ import annotations

from dataclasses import dataclass
from typing import Callable

from .ngrams import TermCandidates, extract_term_candidates
from .tokenizer import TokenizedText, tokenize_text


@dataclass(frozen=True)
class DiffTerms:
added: TermCandidates
removed: TermCandidates


@dataclass(frozen=True)
class DiffTokens:
added: TokenizedText
removed: TokenizedText


def tokenize_diff(
added_text: str,
removed_text: str,
normalize: Callable[[str], str] | None = None,
) -> DiffTokens:
return DiffTokens(
added=tokenize_text(added_text, normalize=normalize),
removed=tokenize_text(removed_text, normalize=normalize),
)


def extract_diff_terms(
added_text: str,
removed_text: str,
min_n: int = 1,
max_n: int = 4,
normalize: Callable[[str], str] | None = str.lower,
) -> DiffTerms:
return DiffTerms(
added=extract_term_candidates(added_text, min_n=min_n, max_n=max_n, normalize=normalize),
removed=extract_term_candidates(
removed_text, min_n=min_n, max_n=max_n, normalize=normalize
),
)
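For example, extract_diff_terms pairs the added and removed sides of a revision; the strings here are illustrative:

from nlbt.terms import extract_diff_terms

diff = extract_diff_terms("New York Times coverage", "old beta api")
print("new york times" in diff.added.ngrams)  # True (n-grams are lowercased by default)
print(diff.added.proper_nouns)                # ['New York Times']
print("beta api" in diff.removed.ngrams)      # True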
50 changes: 50 additions & 0 deletions src/nlbt/terms/ngrams.py
@@ -0,0 +1,50 @@
from __future__ import annotations

from dataclasses import dataclass
from typing import Callable, Iterable, List

from .tokenizer import extract_capitalized_phrases, tokenize


@dataclass(frozen=True)
class TermCandidates:
tokens: List[str]
ngrams: List[str]
proper_nouns: List[str]


def generate_ngrams(
tokens: Iterable[str],
min_n: int = 1,
max_n: int = 4,
normalize: Callable[[str], str] | None = str.lower,
) -> List[str]:
"""Generate n-grams from tokens.

Args:
tokens: Input token sequence.
min_n: Minimum n-gram size.
max_n: Maximum n-gram size.
normalize: Optional normalization function applied per token.
"""
token_list = list(tokens)
if normalize is not None:
token_list = [normalize(token) for token in token_list]
ngrams: List[str] = []
count = len(token_list)
for n in range(min_n, max_n + 1):
for start in range(0, max(count - n + 1, 0)):
ngrams.append(" ".join(token_list[start : start + n]))
return ngrams


def extract_term_candidates(
text: str,
min_n: int = 1,
max_n: int = 4,
normalize: Callable[[str], str] | None = str.lower,
) -> TermCandidates:
tokens = tokenize(text)
ngrams = generate_ngrams(tokens, min_n=min_n, max_n=max_n, normalize=normalize)
proper_nouns = extract_capitalized_phrases(tokens)
return TermCandidates(tokens=tokens, ngrams=ngrams, proper_nouns=proper_nouns)
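A quick sketch of the output ordering (all n-grams of one size before the next, tokens lowercased by the default normalize):

from nlbt.terms import generate_ngrams

print(generate_ngrams(["New", "York", "Times"], min_n=1, max_n=2))
# ['new', 'york', 'times', 'new york', 'york times']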
53 changes: 53 additions & 0 deletions src/nlbt/terms/tokenizer.py
@@ -0,0 +1,53 @@
from __future__ import annotations

import re
from dataclasses import dataclass
from typing import Callable, Iterable, List

WORD_RE = re.compile(r"[A-Za-z0-9]+(?:'[A-Za-z0-9]+)?")


@dataclass(frozen=True)
class TokenizedText:
text: str
tokens: List[str]


def tokenize(text: str, normalize: Callable[[str], str] | None = None) -> List[str]:
"""Tokenize text into word-like tokens.

Args:
text: Input text.
normalize: Optional normalization function applied to each token.
"""
tokens = WORD_RE.findall(text or "")
if normalize is None:
return tokens
return [normalize(token) for token in tokens]


def tokenize_text(text: str, normalize: Callable[[str], str] | None = None) -> TokenizedText:
return TokenizedText(text=text, tokens=tokenize(text, normalize=normalize))


def is_capitalized(token: str) -> bool:
return bool(token) and token[0].isupper()


def extract_capitalized_phrases(tokens: Iterable[str]) -> List[str]:
"""Extract contiguous sequences of capitalized tokens.

Example: ["New", "York", "Times"] -> ["New York Times"].
"""
phrases: List[str] = []
current: List[str] = []
for token in tokens:
if is_capitalized(token):
current.append(token)
else:
if current:
phrases.append(" ".join(current))
current = []
if current:
phrases.append(" ".join(current))
return phrases
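A short sketch of the tokenizer helpers; note that sentence-initial capitals (e.g. "The") are kept in phrases, since the heuristic is purely casing-based:

from nlbt.terms import extract_capitalized_phrases, tokenize

tokens = tokenize("The New York Times isn't alone")
print(tokens)  # ['The', 'New', 'York', 'Times', "isn't", 'alone']
print(extract_capitalized_phrases(tokens))  # ['The New York Times']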
47 changes: 47 additions & 0 deletions tests/test_terms.py
@@ -0,0 +1,47 @@
#!/usr/bin/env python3
"""Tests for term tokenization and n-gram extraction."""

import sys
sys.path.insert(0, "src")

from nlbt.terms import extract_diff_terms, tokenize_diff


print("Testing term tokenization and n-gram extraction")
print("=" * 50)

added_text = "the article mentions New York Times and OpenAI research."
removed_text = "removed old version of beta api."

print("\n✓ Test 1: Tokenize added/removed text")
result = tokenize_diff(added_text, removed_text)
assert result.added.tokens == [
"the",
"article",
"mentions",
"New",
"York",
"Times",
"and",
"OpenAI",
"research",
]
assert result.removed.tokens == ["removed", "old", "version", "of", "beta", "api"]
print(" Tokens extracted")

print("\n✓ Test 2: N-gram and proper noun extraction")
term_diff = extract_diff_terms(added_text, removed_text)
added_ngrams = set(term_diff.added.ngrams)
added_proper = set(term_diff.added.proper_nouns)

assert "new york" in added_ngrams
assert "new york times" in added_ngrams
assert "openai" in added_ngrams
assert "openai research" in added_ngrams

assert "New York Times" in added_proper
assert "OpenAI" in added_proper
print(" N-grams and proper nouns captured")

print("\n" + "=" * 50)
print("✅ Term tests passed!")