Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
136 changes: 129 additions & 7 deletions mempalace/entity_detector.py
Original file line number Diff line number Diff line change
Expand Up @@ -395,6 +395,112 @@
"inference",
}

# Technical/architecture terms that appear capitalized in docs but are never
# real entity names on their own (issue #476)
_TECH_STOPWORDS: frozenset = frozenset(
{
# Architecture / design patterns
"handler",
"service",
"manager",
"controller",
"middleware",
"provider",
"factory",
"builder",
"registry",
"resolver",
"listener",
"dispatcher",
"adapter",
"wrapper",
"plugin",
# Runtime / infrastructure
"node",
"client",
"server",
"worker",
"thread",
"process",
"queue",
"cache",
"buffer",
"socket",
"router",
"proxy",
# Code constructs
"module",
"package",
"instance",
"event",
"callback",
"promise",
"object",
"function",
"method",
"boolean",
"integer",
"string",
"array",
"default",
# Common doc/config terms
"config",
"context",
"session",
"token",
"schema",
"index",
"record",
"message",
"action",
"state",
"task",
"job",
"hook",
"flag",
"stage",
"batch",
"chunk",
"block",
"stream",
"channel",
"driver",
"backend",
"frontend",
"endpoint",
"payload",
"header",
"body",
"query",
"filter",
"timeout",
"retry",
"log",
"build",
"deploy",
"release",
"pipeline",
"migration",
"seed",
"reset",
"repair",
# Extremely common sentence-opening words
"one",
"two",
"three",
"four",
"five",
"six",
"seven",
"eight",
"nine",
"ten",
}
)

# Fold the technical stopwords into the main STOPWORDS set (defined earlier
# in this module) so downstream filtering only ever consults a single set.
STOPWORDS = STOPWORDS | _TECH_STOPWORDS

# For entity detection — prose only, no code files
# Code files have too many capitalized names (classes, functions) that aren't entities
PROSE_EXTENSIONS = {
Expand Down Expand Up @@ -439,19 +545,35 @@

# ==================== CANDIDATE EXTRACTION ====================

# Matches text whose last non-space character ends a sentence (., !, ?, or a
# newline, optionally followed by trailing whitespace).  extract_candidates
# runs this against the text *preceding* a capitalized word to decide whether
# the word sits at a sentence start and should be skipped (issue #476).
_SENTENCE_END_RE = re.compile(r"[.!?\n]\s*$")


def extract_candidates(text: str) -> dict:
"""
Extract all capitalized proper noun candidates from text.
Returns {name: frequency} for names appearing 3+ times.
"""
# Find all capitalized words (not at sentence start — harder, so we use frequency as filter)
raw = re.findall(r"\b([A-Z][a-z]{1,19})\b", text)

counts = defaultdict(int)
for word in raw:
if word.lower() not in STOPWORDS and len(word) > 1:
counts[word] += 1
Skips words that are:
- in STOPWORDS (includes _TECH_STOPWORDS merged in)
- at position 0 (absolute document start)
- immediately following a sentence-ending character (sentence-start position)
"""
counts: dict[str, int] = defaultdict(int)
for match in re.finditer(r"\b([A-Z][a-z]{1,19})\b", text):
word = match.group(1)
if word.lower() in STOPWORDS or len(word) <= 1:
continue
start = match.start()
# Skip words at the absolute start of the document
if start == 0:
continue
# Skip words that follow a sentence-ending punctuation or newline
preceding = text[max(0, start - 50) : start].rstrip()
if not preceding or _SENTENCE_END_RE.search(preceding):
continue
counts[word] += 1

# Also find multi-word proper nouns (e.g. "Memory Palace", "Claude Code")
multi = re.findall(r"\b([A-Z][a-z]+(?:\s+[A-Z][a-z]+)+)\b", text)
Expand Down
2 changes: 1 addition & 1 deletion mempalace/version.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
"""Single source of truth for the MemPalace package version."""

# 3.2.0 ships the issue #476 entity-detection fixes (sentence-start filter
# and the technical-stopword list).  The diff copy left the superseded
# 3.1.0 assignment in place; only the current version belongs here.
__version__ = "3.2.0"
26 changes: 17 additions & 9 deletions tests/test_entity_detector.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,13 @@


def test_extract_candidates_finds_frequent_names():
    # Names must appear mid-sentence (not at sentence start) to be captured:
    # the sentence-start filter (issue #476) skips capitalized words at
    # position 0 or immediately after sentence-ending punctuation, so each
    # occurrence of the name is embedded after an opening word.
    # (The stale pre-#476 fixture line left over from the diff copy, which
    # placed every "Riley" at a sentence start, is removed.)
    text = (
        "Today Riley said hello. Yesterday Riley laughed loudly. "
        "Then Riley smiled again. Finally Riley waved goodbye."
    )
    result = extract_candidates(text)
    assert "Riley" in result
    assert result["Riley"] >= 3
Expand Down Expand Up @@ -162,16 +168,18 @@ def test_classify_entity_mixed_signals():

def test_detect_entities_with_person_file(tmp_path):
f = tmp_path / "notes.txt"
# Names are placed mid-sentence so the sentence-start filter (issue #476)
# does not discard them. Each line has an adverb/connector before the name.
content = "\n".join(
[
"Riley said hello today.",
"Riley asked about the project.",
"Riley told me she was happy.",
"Riley: I think we should go.",
"Hey Riley, thanks for the help.",
"Riley laughed and smiled.",
"Riley decided to join.",
"Riley pushed the change.",
"Today Riley said hello.",
"Later Riley asked about the project.",
"Then Riley told me she was happy.",
"Afterwards Riley laughed and smiled.",
"Next Riley decided to join.",
"Finally Riley pushed the change.",
"Also, hey Riley, thanks for the help.",
"Meanwhile Riley waved goodbye.",
]
)
f.write_text(content)
Expand Down