structflo
diff --git a/‎notebooks/02_fast_ner.ipynb‎
Lines changed: 416 additions & 0 deletions b/‎notebooks/02_fast_ner.ipynb‎
Lines changed: 416 additions & 0 deletions
diff --git a/‎pyproject.toml‎
Lines changed: 1 addition & 0 deletions b/‎pyproject.toml‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎structflo/ner/__init__.py‎
Lines changed: 9 additions & 2 deletions b/‎structflo/ner/__init__.py‎
Lines changed: 9 additions & 2 deletions
diff --git a/‎structflo/ner/fast/README.md‎
Lines changed: 149 additions & 0 deletions b/‎structflo/ner/fast/README.md‎
Lines changed: 149 additions & 0 deletions
diff --git a/‎structflo/ner/fast/__init__.py‎
Lines changed: 22 additions & 0 deletions b/‎structflo/ner/fast/__init__.py‎
Lines changed: 22 additions & 0 deletions
diff --git a/‎structflo/ner/fast/_loader.py‎
Lines changed: 116 additions & 0 deletions b/‎structflo/ner/fast/_loader.py‎
Lines changed: 116 additions & 0 deletions
@@ -11,6 +11,7 @@ dependencies = [
 
 [project.optional-dependencies]
 dataframe = ["pandas>=1.5"]
+fast = ["rapidfuzz>=3.0", "PyYAML>=6.0"]
 
 [dependency-groups]
 dev = [
 
@@ -49,6 +49,12 @@
     TargetEntity,
 )
 from structflo.ner.extractor import NERExtractor
+
+try:
+    from structflo.ner.fast import FastNERExtractor
+except ImportError:  # rapidfuzz / PyYAML not installed
+    FastNERExtractor = None  # type: ignore[assignment,misc]
+
 from structflo.ner.profiles import (
     BIOACTIVITY,
     BIOLOGY,
@@ -61,11 +67,12 @@
     EntityProfile,
 )
 
-__version__ = "0.2.1"
+__version__ = "0.2.2"
 
 __all__ = [
-    # Main class
+    # Main classes
     "NERExtractor",
+    "FastNERExtractor",
     # Profile system
     "EntityProfile",
     "FULL",
 
@@ -0,0 +1,149 @@
+# structflo.ner.fast — Dictionary-Based NER for TB Drug Discovery
+
+Fast, deterministic entity extraction using curated YAML gazetteers. No LLM, no API key, no network — runs in milliseconds.
+
+## Install
+
+```bash
+uv add "structflo-ner[fast]"
+
+# with DataFrame support
+uv add "structflo-ner[fast,dataframe]"
+```
+
+## Quick Start
+
+```python
+from structflo.ner.fast import FastNERExtractor
+
+fast = FastNERExtractor()
+result = fast.extract("Bedaquiline inhibits AtpE (Rv1305) in MDR-TB.")
+
+print(result.compounds)    # [ChemicalEntity(text='Bedaquiline', ...)]
+print(result.targets)      # [TargetEntity(text='AtpE', ...)]
+print(result.accessions)   # [AccessionEntity(text='Rv1305', ...)]
+print(result.diseases)     # [DiseaseEntity(text='MDR-TB', ...)]
+
+df = result.to_dataframe()
+result.display()  # interactive HTML in Jupyter
+```
+
+## How It Works
+
+Three-phase matching, all without an LLM:
+
+### Phase 1 — Exact Dictionary Match
+Looks up every text span against a normalized dictionary built from the YAML gazetteers. Auto-derived variants include:
+- **Case variants**: InhA, inha, INHA
+- **Hyphen-optional**: DprE-1 ↔ DprE1, MDR-TB ↔ MDRTB
+- **Period-optional**: M. tuberculosis ↔ M tuberculosis
+- **Greek letters**: β-lactam ↔ beta-lactam
+
+Word boundaries are enforced — "Rho" won't match inside "Rhodamine".
+
+### Phase 1b — Regex Patterns (Accession Numbers)
+Regex patterns for entire ID families are auto-derived from the seed entries in `accession_number.yml`:
+
+| Seed | Auto-derived Pattern | Matches |
+|---|---|---|
+| `Rv0005` | `Rv\d{4}[c]?` | All Rv locus tags |
+| `MT0005` | `MT\w+` | Mycobrowser IDs |
+| `P9WGR1` | `[OPQ][0-9][A-Z0-9]{3}[0-9]` | UniProt accessions |
+| `4TZK` | `[0-9][A-Z0-9]{3}` | PDB codes |
+| `WP_003407354` | `WP_\d+` | NCBI RefSeq proteins |
+
+### Phase 2 — Fuzzy Match
+Unmatched "entity-like" tokens (capitalized, contain digits, length ≥ 4) are compared against the dictionary using rapidfuzz. Catches typos and minor variants.
+
+```python
+# Configurable threshold (0–100, default 85)
+strict = FastNERExtractor(fuzzy_threshold=0)   # disable fuzzy
+lenient = FastNERExtractor(fuzzy_threshold=75)  # more permissive
+```
+
+## Gazetteers
+
+YAML files live in `structflo/ner/fast/gazetteers/`. Each file is a simple list of names — **nothing else**:
+
+```yaml
+# target.yml
+- InhA
+- DprE1
+- MmpL3
+- AtpE
+```
+
+The filename (without `.yml`) becomes the `entity_type`. Built-in gazetteers:
+
+| File | Entity Type | Coverage |
+|---|---|---|
+| `target.yml` | target → `TargetEntity` | ~80 TB drug targets |
+| `gene_name.yml` | gene_name → `TargetEntity` | ~75 Mtb gene names |
+| `compound_name.yml` | compound_name → `ChemicalEntity` | ~50 TB compounds & abbreviations |
+| `disease.yml` | disease → `DiseaseEntity` | TB disease variants |
+| `accession_number.yml` | accession_number → `AccessionEntity` | Seed entries → regex patterns |
+| `screening_method.yml` | screening_method → `ScreeningMethodEntity` | ~35 screening approaches |
+| `functional_category.yml` | functional_category → `FunctionalCategoryEntity` | ~25 Mtb functional categories |
+| `product.yml` | product → `ProductEntity` | ~35 gene product descriptions |
+
+## Adding New Gazetteers
+
+### Option 1: Add to existing files
+Edit a YAML file and add names:
+
+```yaml
+# target.yml
+- InhA
+- DprE1
+- MyNewTarget  # just add it
+```
+
+### Option 2: Create a new YAML file
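+Drop a new `.yml` file into any directory. The filename stem becomes the entity type; if it is not a known entity type, the loader logs a warning and the entities are left unclassified: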
+
+```yaml
+# my_gazetteers/assay.yml
+- resazurin assay
+- luciferase reporter assay
+- disk diffusion assay
+```
+
+```python
+fast = FastNERExtractor(gazetteer_dir="my_gazetteers/")
+```
+
+### Option 3: Add terms programmatically
+
+```python
+fast = FastNERExtractor(
+    extra_gazetteers={
+        "target": ["NovelTarget1", "NovelTarget2"],
+        "compound_name": ["CompoundXYZ"],
+    }
+)
+```
+
+## Output Compatibility
+
+`FastNERExtractor` returns the same `NERResult` type as the LLM-based `NERExtractor`, so everything downstream works the same:
+
+```python
+result.all_entities()    # flat list
+result.to_dict()         # serializable dict
+result.to_dataframe()    # pandas DataFrame
+result.display()         # interactive HTML
+```
+
+Each entity includes `match_method` ("exact", "regex", or "fuzzy") and `canonical` (the gazetteer term it matched) in its `attributes` dict.
+
+## Fast vs LLM
+
+| | `FastNERExtractor` | `NERExtractor` |
+|---|---|---|
+| Speed | ~1–5 ms per abstract | ~2–5 s per abstract |
+| Novel entities | Only known terms | Discovers new entities |
+| Context | String matching | Full contextual understanding |
+| Cost | Free | API calls or GPU |
+| Setup | Zero config | API key or Ollama |
+
+**Recommended workflow**: Fast extractor as first pass (bulk screening), LLM extractor as second pass (deep analysis on interesting papers).
@@ -0,0 +1,22 @@
+"""Fast dictionary-based NER for TB drug discovery — no LLM required.
+
+Quick start::
+
+    from structflo.ner.fast import FastNERExtractor
+
+    extractor = FastNERExtractor()
+    result = extractor.extract(
+        "Bedaquiline inhibits AtpE (Rv1305) in M. tuberculosis."
+    )
+    print(result.compounds)
+    print(result.targets)
+    df = result.to_dataframe()
+
+Custom gazetteers::
+
+    extractor = FastNERExtractor(gazetteer_dir="/path/to/my/gazetteers")
+"""
+
+from structflo.ner.fast.extractor import FastNERExtractor
+
+__all__ = ["FastNERExtractor"]
@@ -0,0 +1,116 @@
+"""Load YAML gazetteer files and auto-derive regex patterns for accession numbers."""
+
+from __future__ import annotations
+
+import logging
+import re
+from pathlib import Path
+
+import yaml
+
+from structflo.ner._entities import _ENTITY_CLASS_MAP
+
+logger = logging.getLogger(__name__)
+
+# Default gazetteer directory (shipped with the package)
+_DEFAULT_GAZETTEER_DIR = Path(__file__).parent / "gazetteers"
+
+# Known accession-number patterns: (seed-detection regex, word-bounded family regex, description)
+_ACCESSION_PATTERNS: list[tuple[re.Pattern[str], re.Pattern[str], str]] = [
+    # Rv locus tags: Rv0005, Rv3854c
+    (re.compile(r"^Rv\d{4}[c]?$"), re.compile(r"\bRv\d{4}[c]?\b"), "Rv locus tag"),
+    # Mycobrowser MT IDs: MT0005. NOTE(review): \w+ cannot match dotted IDs (MTCI00.01) but does match "MTB" — confirm
+    (re.compile(r"^MT\w+$"), re.compile(r"\bMT\w+\b"), "Mycobrowser ID"),
+    # UniProt accessions: P9WGR1, O53617
+    (
+        re.compile(r"^[OPQ][0-9][A-Z0-9]{3}[0-9]$"),
+        re.compile(r"\b[OPQ][0-9][A-Z0-9]{3}[0-9]\b"),
+        "UniProt accession",
+    ),
+    # PDB codes: 4TZK, 1P44
+    (re.compile(r"^[0-9][A-Z0-9]{3}$"), re.compile(r"\b[0-9][A-Z0-9]{3}\b"), "PDB code"),
+    # NCBI RefSeq protein: WP_003407354
+    (re.compile(r"^WP_\d+$"), re.compile(r"\bWP_\d+\b"), "NCBI RefSeq"),
+]
+
+
+def load_gazetteer(path: Path) -> tuple[str, list[str]]:
+    """Load a single YAML gazetteer file.
+
+    Returns:
+        Tuple of (entity_type, list_of_terms) where entity_type is derived
+        from the filename stem.
+    """
+    entity_type = path.stem
+    with open(path) as f:
+        terms = yaml.safe_load(f)
+
+    if not isinstance(terms, list):
+        msg = f"Gazetteer {path.name} must be a YAML list, got {type(terms).__name__}"
+        raise ValueError(msg)
+
+    # Coerce all entries to strings
+    terms = [str(t).strip() for t in terms if t is not None and str(t).strip()]
+    return entity_type, terms
+
+
+def load_all_gazetteers(
+    directory: Path | str | None = None,
+) -> dict[str, list[str]]:
+    """Load all YAML gazetteer files from a directory.
+
+    Args:
+        directory: Path to gazetteer directory. Defaults to the built-in
+            gazetteers shipped with the package.
+
+    Returns:
+        Dict mapping entity_type → list of canonical terms.
+    """
+    dirpath = Path(directory) if directory is not None else _DEFAULT_GAZETTEER_DIR
+
+    if not dirpath.is_dir():
+        msg = f"Gazetteer directory does not exist: {dirpath}"
+        raise FileNotFoundError(msg)
+
+    gazetteers: dict[str, list[str]] = {}
+
+    for yml_path in sorted(dirpath.glob("*.yml")):
+        entity_type, terms = load_gazetteer(yml_path)
+
+        if entity_type not in _ENTITY_CLASS_MAP:
+            logger.warning(
+                "Gazetteer %s maps to unknown entity_type %r — entities will be unclassified",
+                yml_path.name,
+                entity_type,
+            )
+
+        gazetteers[entity_type] = terms
+        logger.debug("Loaded %d terms for %s from %s", len(terms), entity_type, yml_path.name)
+
+    return gazetteers
+
+
+def derive_accession_patterns(terms: list[str]) -> list[tuple[re.Pattern[str], str]]:
+    """Auto-derive regex patterns from accession number seed entries.
+
+    Examines each term against known ID formats and returns compiled regex
+    patterns that will match the entire family (not just the listed seeds).
+
+    Returns:
+        List of (compiled_pattern, description) tuples.
+    """
+    detected: list[tuple[re.Pattern[str], str]] = []
+    seen_descriptions: set[str] = set()
+
+    for term in terms:
+        for seed_re, full_re, description in _ACCESSION_PATTERNS:
+            if description not in seen_descriptions and seed_re.match(term):
+                detected.append((full_re, description))
+                seen_descriptions.add(description)
+                logger.debug(
+                    "Auto-derived %s pattern from seed %r",
+                    description,
+                    term,
+                )
+
+    return detected