Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 14 additions & 1 deletion eval/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,20 @@ Tier 1 benchmark entrypoints:

The runner loads benchmark inputs from the HF dataset and writes benchmark outputs to the HF experiments bucket when `--local-only` is not set. DDInter data is not stored in HF: if the local SQLite file is absent, configure `INTERACTION_DB_REPO`, `INTERACTION_DB_TAG`, and optionally `INTERACTION_DB_SHA256` so the runner can fetch `ddinter.db` from the pinned GitHub release source.

`predictions.jsonl` includes `elapsed_ms` keys for `ocr_clean`, `ner`, `rxnorm`, `ddinter_rxcui`, `ddinter_fts`, `openfda`, `severity`, `analyze`, `interactions`, and `total`. These measurements intentionally overlap: `analyze` includes OCR, NER, and RxNorm work; `interactions` includes DDInter, OpenFDA, and severity work; `total` includes the top-level phases plus benchmark overhead. Do not sum the keys as a disjoint latency partition.
`predictions.jsonl` includes benchmark-only diagnostics for each record:

1. `elapsed_ms` preserves the original 10 timing keys: `ocr_clean`, `ner`, `rxnorm`, `ddinter_rxcui`, `ddinter_fts`, `openfda`, `severity`, `analyze`, `interactions`, and `total`.
2. `component_timings_ms` repeats those keys and adds `critical_path` plus `slowest_component_ms`; `critical_path` is the sum of the non-aggregate component buckets, and `slowest_component` names the largest non-aggregate component bucket for that record.
3. `ner_diagnostics` includes predicted entities plus per-record strict and lenient TP/FP/FN counts when `expected_names` is present.
4. `rxnorm_attempts` records benchmark-stage, method, query, returned RxCUI, status, elapsed time, output summary, and error metadata for RxNorm calls.
5. `interaction_attempts` records one row per checked pair, including pair names, RxCUIs, DDInter RxCUI lookup, DDInter FTS lookup, OpenFDA fallback, final source, final severity, and miss reason.
6. `pipeline_errors` records timeout or component errors tied to the record without requiring the whole benchmark to fail.

Timing measurements intentionally overlap: `analyze` includes OCR, NER, and RxNorm work; `interactions` includes DDInter, OpenFDA, and severity work; `total` includes the top-level phases plus benchmark overhead. Do not sum all timing keys as a disjoint latency partition. Starting with `metric_schema_version: "benchmark-diagnostics-v1"`, the `rxnorm` timing bucket covers all benchmark-wrapped RxNorm calls (`get_rxcui`, `approximate_term`, `search_by_name`, and `get_drug_details`), so compare it with earlier runs only as a changed-instrumentation metric.

`results.json` groups rollups by `overall`, `timing`, `ner`, `linking`, `rxnorm`, `interactions`, `errors`, and `fp_taxonomy`. `linking` is kept for backward compatibility with the original link-coverage fields; `rxnorm` carries those core fields plus RxNorm attempt diagnostics such as method hit/miss/error counts, unresolved queries, and canonicalization collisions. Interaction diagnostics report DDInter RxCUI hit rate, DDInter FTS rescue rate, OpenFDA rescue rate, source counts, and common unknown pairs. These are routing/source-coverage diagnostics unless reviewed `expected_interactions` and `known_safe_pairs` are present.

`manifest.json` includes `metric_schema_version`, dataset revision, run id, sample size, model IDs, concurrency, and DDInter release metadata. `summary.md` highlights top-line metrics, timing bottlenecks, unresolved RxNorm queries, unknown interaction pairs, and an explicit warning when outputs are not accuracy-certified.

Use `--record-timeout-seconds` to bound each input record so a stuck RxNorm, OpenFDA, or model path records a `record_timeout` error instead of hanging the whole run. Use `--local-only` for development and smoke runs. Without `--local-only`, result artifacts upload to the experiments bucket under an immutable `benchmark-results/<YYYY-MM-DD>/<run-id>/` prefix; do not commit generated candidate JSON or benchmark result directories to GitHub.

Expand Down
78 changes: 75 additions & 3 deletions eval/benchmark_results.schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,42 @@
"title": "PillChecker benchmark results",
"type": "object",
"additionalProperties": true,
"required": ["ner", "linking", "interactions", "fp_taxonomy"],
"required": ["overall", "timing", "ner", "linking", "rxnorm", "interactions", "errors", "fp_taxonomy"],
"properties": {
"overall": {
"type": "object",
"additionalProperties": true,
"required": [
"records_total",
"records_completed",
"records_errored",
"error_rate",
"timeout_count",
"concurrency",
"wall_time_seconds",
"records_per_second"
],
"properties": {
"records_total": {"type": "integer", "minimum": 0},
"records_completed": {"type": "integer", "minimum": 0},
"records_errored": {"type": "integer", "minimum": 0},
"error_rate": {"type": "number", "minimum": 0, "maximum": 1},
"timeout_count": {"type": "integer", "minimum": 0},
"concurrency": {"type": "integer", "minimum": 1},
"wall_time_seconds": {"type": "number", "minimum": 0},
"records_per_second": {"type": ["number", "null"], "minimum": 0}
}
},
"timing": {
"type": "object",
"additionalProperties": true,
"required": ["components", "slowest_component", "slowest_component_counts"],
"properties": {
"components": {"type": "object"},
"slowest_component": {"type": ["string", "null"]},
"slowest_component_counts": {"type": "object"}
}
},
"ner": {
"type": "object",
"additionalProperties": true,
Expand Down Expand Up @@ -50,6 +84,23 @@
"incorrect_link_rate": {"type": ["number", "null"], "minimum": 0, "maximum": 1}
}
},
"rxnorm": {
"type": "object",
"additionalProperties": true,
"required": [
"coverage",
"fallback_rate",
"nil_rate",
"n_link_attempts",
"n_drugs_total",
"acc_at_1",
"incorrect_link_rate",
"n_rxnorm_attempts",
"rxnorm_by_method",
"unresolved_queries",
"canonicalization_collisions"
]
},
"interactions": {
"type": "object",
"additionalProperties": true,
Expand All @@ -65,7 +116,12 @@
"unknown_rate",
"severity_distribution",
"uncertain_rate",
"records_with_any_interaction"
"records_with_any_interaction",
"ddinter_rxcui_hit_rate",
"ddinter_fts_rescue_rate",
"openfda_rescue_rate",
"source_counts",
"top_unknown_pairs"
],
"properties": {
"total_pairs_checked": {"type": "integer", "minimum": 0},
Expand All @@ -74,7 +130,12 @@
"unknown_rate": {"type": "number", "minimum": 0, "maximum": 1},
"severity_distribution": {"$ref": "#/$defs/severity_counts"},
"uncertain_rate": {"type": "number", "minimum": 0, "maximum": 1},
"records_with_any_interaction": {"type": "integer", "minimum": 0}
"records_with_any_interaction": {"type": "integer", "minimum": 0},
"ddinter_rxcui_hit_rate": {"type": "number", "minimum": 0, "maximum": 1},
"ddinter_fts_rescue_rate": {"type": "number", "minimum": 0, "maximum": 1},
"openfda_rescue_rate": {"type": "number", "minimum": 0, "maximum": 1},
"source_counts": {"type": "object"},
"top_unknown_pairs": {"type": "array"}
}
},
"accuracy": {"type": ["object", "null"]},
Expand All @@ -91,6 +152,17 @@
}
}
},
"errors": {
"type": "object",
"additionalProperties": true,
"required": ["total", "by_stage", "by_class", "records"],
"properties": {
"total": {"type": "integer", "minimum": 0},
"by_stage": {"type": "object"},
"by_class": {"type": "object"},
"records": {"type": "array"}
}
},
"fp_taxonomy": {
"type": "object",
"additionalProperties": true,
Expand Down
11 changes: 11 additions & 0 deletions eval/benchmark_run_manifest.schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,9 @@
"dataset_revision",
"command",
"model_ids",
"sample_size",
"concurrency",
"metric_schema_version",
"metrics"
],
"properties": {
Expand Down Expand Up @@ -63,6 +66,14 @@
"type": "integer",
"minimum": 1
},
"concurrency": {
"type": ["integer", "null"],
"minimum": 1
},
"metric_schema_version": {
"type": "string",
"const": "benchmark-diagnostics-v1"
},
"random_seed": {
"type": ["integer", "string", "null"]
},
Expand Down
37 changes: 36 additions & 1 deletion eval/metrics/interactions.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

from __future__ import annotations

from collections import defaultdict
from collections import Counter, defaultdict
from typing import Any, Awaitable, Callable


Expand Down Expand Up @@ -140,6 +140,7 @@ def compute(
uncertain = 0
returned = 0
records_with_any = 0
attempts = [attempt for prediction in predictions for attempt in prediction.get("interaction_attempts", [])]

for prediction in predictions:
interactions_response = prediction.get("interactions") or {}
Expand All @@ -159,6 +160,7 @@ def compute(
uncertain += 1

total_pairs = sum(coverage.values())
attempt_diagnostics = _attempt_diagnostics(attempts)
return {
"descriptive": {
"total_pairs_checked": total_pairs,
Expand All @@ -168,7 +170,40 @@ def compute(
"severity_distribution": severity_distribution,
"uncertain_rate": _rate(uncertain, returned),
"records_with_any_interaction": records_with_any,
**attempt_diagnostics,
},
"accuracy": _accuracy(predictions, dataset),
"seed_smoke": compute_seed_smoke(seed_cases, seed_results),
}


def _status(attempt: dict, component: str) -> str:
block = attempt.get(component) or {}
return str(block.get("status") or "skipped")


def _attempt_diagnostics(attempts: list[dict]) -> dict:
    """Roll up per-pair interaction attempts into routing/source diagnostics.

    Args:
        attempts: Flattened ``interaction_attempts`` rows. Each row may carry
            ``final_source``, ``drug_a``, ``drug_b`` and one status block per
            lookup component (``ddinter_rxcui``, ``ddinter_fts``, ``openfda``).

    Returns:
        A dict with the three per-component hit rates (hits divided by the
        number of attempts via ``_rate``), ``source_counts`` for the
        ``ddinter`` / ``openfda`` / ``unknown`` sources, and
        ``top_unknown_pairs`` listing the ten most frequent unknown pairs.
    """
    total = len(attempts)

    # Normalise the final source once so the source tally and the unknown-pair
    # tally agree: a missing or empty ``final_source`` counts as "unknown" in
    # both places, not only in ``source_counts``.
    final_sources = [str(attempt.get("final_source") or "unknown") for attempt in attempts]
    source_counts = Counter(final_sources)

    ddinter_rxcui_hits = sum(1 for attempt in attempts if _status(attempt, "ddinter_rxcui") == "hit")
    ddinter_fts_hits = sum(1 for attempt in attempts if _status(attempt, "ddinter_fts") == "hit")
    openfda_hits = sum(1 for attempt in attempts if _status(attempt, "openfda") == "hit")

    # NOTE(review): _pair_key is defined elsewhere in this module; it is
    # unpacked below as a (drug_a, drug_b) pair -- confirm it always returns
    # a 2-tuple.
    unknown_pairs = Counter(
        _pair_key(str(attempt.get("drug_a", "")), str(attempt.get("drug_b", "")))
        for attempt, source in zip(attempts, final_sources)
        if source == "unknown"
    )

    return {
        "ddinter_rxcui_hit_rate": _rate(ddinter_rxcui_hits, total),
        "ddinter_fts_rescue_rate": _rate(ddinter_fts_hits, total),
        "openfda_rescue_rate": _rate(openfda_hits, total),
        # Only the three known sources are reported; any other source value
        # is tallied internally but not surfaced here.
        "source_counts": {
            "ddinter": int(source_counts.get("ddinter", 0)),
            "openfda": int(source_counts.get("openfda", 0)),
            "unknown": int(source_counts.get("unknown", 0)),
        },
        "top_unknown_pairs": [
            {"drug_a": drug_a, "drug_b": drug_b, "count": count}
            for (drug_a, drug_b), count in unknown_pairs.most_common(10)
        ],
    }
45 changes: 45 additions & 0 deletions eval/metrics/linking.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

from __future__ import annotations

from collections import defaultdict


def _rate(numerator: int, denominator: int) -> float:
return numerator / denominator if denominator else 0.0
Expand All @@ -10,6 +12,7 @@ def _rate(numerator: int, denominator: int) -> float:
def compute(predictions: list[dict], dataset: list[dict]) -> dict:
drugs = [drug for pred in predictions for drug in pred.get("drugs", [])]
attempts = [attempt for pred in predictions for attempt in pred.get("link_attempts", [])]
rxnorm_attempts = [attempt for pred in predictions for attempt in pred.get("rxnorm_attempts", [])]
resolved = sum(1 for drug in drugs if drug.get("rxcui"))
fallback = sum(1 for drug in drugs if drug.get("source") == "rxnorm_fallback")
nil_count = sum(1 for attempt in attempts if attempt.get("rxcui") is None)
Expand Down Expand Up @@ -39,6 +42,7 @@ def compute(predictions: list[dict], dataset: list[dict]) -> dict:
acc_at_1 = sum(acc_values) / len(acc_values)
incorrect_link_rate = _rate(incorrect, predicted_with_rxcui_total)

diagnostics = _rxnorm_diagnostics(rxnorm_attempts)
return {
"coverage": _rate(resolved, len(drugs)),
"fallback_rate": _rate(fallback, len(drugs)),
Expand All @@ -47,4 +51,45 @@ def compute(predictions: list[dict], dataset: list[dict]) -> dict:
"n_drugs_total": len(drugs),
"acc_at_1": acc_at_1,
"incorrect_link_rate": incorrect_link_rate,
**diagnostics,
}


def _rxnorm_diagnostics(attempts: list[dict]) -> dict:
by_method: dict[str, dict[str, int]] = defaultdict(lambda: {"hit": 0, "miss": 0, "error": 0})
unresolved = []
queries_by_rxcui: dict[str, set[str]] = defaultdict(set)

for attempt in attempts:
method = str(attempt.get("method") or "unknown")
status = str(attempt.get("status") or "unknown")
if status not in {"hit", "miss", "error"}:
status = "miss" if attempt.get("rxcui") is None else "hit"
by_method[method][status] += 1

query = str(attempt.get("query") or attempt.get("name") or "")
rxcui = attempt.get("rxcui")
if rxcui:
if method != "get_drug_details" and query:
queries_by_rxcui[str(rxcui)].add(query)
elif status in {"miss", "error"}:
unresolved.append({
"query": query,
"stage": attempt.get("stage"),
"method": method,
})

collisions = [
{"rxcui": rxcui, "queries": sorted(query for query in queries if query)}
for rxcui, queries in sorted(queries_by_rxcui.items())
if len({query.casefold() for query in queries if query}) > 1
]
return {
"n_rxnorm_attempts": len(attempts),
"rxnorm_by_method": {
method: counts
for method, counts in sorted(by_method.items())
},
"unresolved_queries": unresolved[:20],
"canonicalization_collisions": collisions[:20],
}
22 changes: 21 additions & 1 deletion eval/metrics/ner.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,11 @@ def _prf(tp: int, fp: int, fn: int) -> dict[str, float]:


def _record_metrics(predicted: list[str], expected: list[str], matcher: Callable[[str, str], bool]) -> dict[str, float]:
    """Score one record by passing its TP/FP/FN counts to ``_prf``."""
    tally = _record_counts(predicted, expected, matcher)
    true_pos, false_pos, false_neg = tally["tp"], tally["fp"], tally["fn"]
    return _prf(true_pos, false_pos, false_neg)


def _record_counts(predicted: list[str], expected: list[str], matcher: Callable[[str, str], bool]) -> dict[str, int]:
matched_expected: set[int] = set()
tp = 0
for pred in predicted:
Expand All @@ -58,7 +63,22 @@ def _record_metrics(predicted: list[str], expected: list[str], matcher: Callable
tp += 1
fp = len(predicted) - tp
fn = len(expected) - tp
return _prf(tp, fp, fn)
return {"tp": tp, "fp": fp, "fn": fn}


def diagnostics_for_entities(entities: list[dict], expected_names: list[str]) -> dict:
    """Build per-record NER diagnostics for one set of predicted entities.

    Args:
        entities: Predicted entity dicts; ``text`` is read as the predicted
            name and ``score`` as the confidence.
        expected_names: Reference names for the record.

    Returns:
        A dict with the input ``entities``, strict and lenient TP/FP/FN
        counts, the expected and predicted counts, and
        ``low_confidence_count`` (entities whose score is below 0.85).
    """
    predicted = [str(entity.get("text", "")) for entity in entities]
    expected = [str(name) for name in expected_names]
    strict = _record_counts(predicted, expected, _strict_match)
    lenient = _record_counts(predicted, expected, _lenient_match)

    low_confidence_count = 0
    for entity in entities:
        score = entity.get("score")
        # A missing score is treated as fully confident (1.0). An explicit
        # None gets the same treatment instead of raising TypeError in
        # float() and aborting diagnostics for the record.
        if score is not None and float(score) < 0.85:
            low_confidence_count += 1

    return {
        "entities": entities,
        "strict": strict,
        "lenient": lenient,
        "expected_count": len(expected),
        "predicted_count": len(predicted),
        "low_confidence_count": low_confidence_count,
    }


def _average(blocks: list[dict[str, float]]) -> dict[str, float]:
Expand Down
Loading
Loading