SPerekrestova · SPerekrestova · May 22, 2026 · May 22, 2026 · May 22, 2026 · May 22, 2026
diff --git a/eval/README.md b/eval/README.md
@@ -79,7 +79,20 @@ Tier 1 benchmark entrypoints:
 
 The runner loads benchmark inputs from the HF dataset and writes benchmark outputs to the HF experiments bucket when `--local-only` is not set. DDInter data is not stored in HF: if the local SQLite file is absent, configure `INTERACTION_DB_REPO`, `INTERACTION_DB_TAG`, and optionally `INTERACTION_DB_SHA256` so the runner can fetch `ddinter.db` from the pinned GitHub release source.
 
-`predictions.jsonl` includes `elapsed_ms` keys for `ocr_clean`, `ner`, `rxnorm`, `ddinter_rxcui`, `ddinter_fts`, `openfda`, `severity`, `analyze`, `interactions`, and `total`. These measurements intentionally overlap: `analyze` includes OCR, NER, and RxNorm work; `interactions` includes DDInter, OpenFDA, and severity work; `total` includes the top-level phases plus benchmark overhead. Do not sum the keys as a disjoint latency partition.
+`predictions.jsonl` includes benchmark-only diagnostics for each record:
+
+1. `elapsed_ms` preserves the original 10 timing keys: `ocr_clean`, `ner`, `rxnorm`, `ddinter_rxcui`, `ddinter_fts`, `openfda`, `severity`, `analyze`, `interactions`, and `total`.
+2. `component_timings_ms` repeats those keys and adds `critical_path` plus `slowest_component_ms`; `critical_path` is the sum of the non-aggregate component buckets, and `slowest_component` names the largest non-aggregate component bucket for that record.
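+3. `ner_diagnostics` includes predicted entities, per-record `expected_count`, `predicted_count`, and `low_confidence_count` (entities scoring below 0.85), plus per-record strict and lenient TP/FP/FN counts when `expected_names` is present.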
+4. `rxnorm_attempts` records benchmark-stage, method, query, returned RxCUI, status, elapsed time, output summary, and error metadata for RxNorm calls.
+5. `interaction_attempts` records one row per checked pair, including pair names, RxCUIs, DDInter RxCUI lookup, DDInter FTS lookup, OpenFDA fallback, final source, final severity, and miss reason.
+6. `pipeline_errors` records timeout or component errors tied to the record without requiring the whole benchmark to fail.
+
+Timing measurements intentionally overlap: `analyze` includes OCR, NER, and RxNorm work; `interactions` includes DDInter, OpenFDA, and severity work; `total` includes the top-level phases plus benchmark overhead. Do not sum all timing keys as a disjoint latency partition. Starting with `metric_schema_version: "benchmark-diagnostics-v1"`, the `rxnorm` timing bucket covers all benchmark-wrapped RxNorm calls (`get_rxcui`, `approximate_term`, `search_by_name`, and `get_drug_details`), so compare it with earlier runs only as a changed-instrumentation metric.
+
+`results.json` groups rollups by `overall`, `timing`, `ner`, `linking`, `rxnorm`, `interactions`, `errors`, and `fp_taxonomy`. `linking` is kept for backward compatibility with the original link-coverage fields; `rxnorm` carries those core fields plus RxNorm attempt diagnostics such as method hit/miss/error counts, unresolved queries, and canonicalization collisions. Interaction diagnostics report DDInter RxCUI hit rate, DDInter FTS rescue rate, OpenFDA rescue rate, source counts, and common unknown pairs. These are routing/source-coverage diagnostics unless reviewed `expected_interactions` and `known_safe_pairs` are present.
+
+`manifest.json` includes `metric_schema_version`, dataset revision, run id, sample size, model IDs, concurrency, and DDInter release metadata. `summary.md` highlights top-line metrics, timing bottlenecks, unresolved RxNorm queries, unknown interaction pairs, and an explicit warning when outputs are not accuracy-certified.
 
 Use `--record-timeout-seconds` to bound each input record so a stuck RxNorm, OpenFDA, or model path records a `record_timeout` error instead of hanging the whole run. Use `--local-only` for development and smoke runs. Without `--local-only`, result artifacts upload to the experiments bucket under an immutable `benchmark-results/<YYYY-MM-DD>/<run-id>/` prefix; do not commit generated candidate JSON or benchmark result directories to GitHub.
 

diff --git a/eval/benchmark_results.schema.json b/eval/benchmark_results.schema.json
@@ -4,8 +4,42 @@
   "title": "PillChecker benchmark results",
   "type": "object",
   "additionalProperties": true,
-  "required": ["ner", "linking", "interactions", "fp_taxonomy"],
+  "required": ["overall", "timing", "ner", "linking", "rxnorm", "interactions", "errors", "fp_taxonomy"],
   "properties": {
+    "overall": {
+      "type": "object",
+      "additionalProperties": true,
+      "required": [
+        "records_total",
+        "records_completed",
+        "records_errored",
+        "error_rate",
+        "timeout_count",
+        "concurrency",
+        "wall_time_seconds",
+        "records_per_second"
+      ],
+      "properties": {
+        "records_total": {"type": "integer", "minimum": 0},
+        "records_completed": {"type": "integer", "minimum": 0},
+        "records_errored": {"type": "integer", "minimum": 0},
+        "error_rate": {"type": "number", "minimum": 0, "maximum": 1},
+        "timeout_count": {"type": "integer", "minimum": 0},
+        "concurrency": {"type": "integer", "minimum": 1},
+        "wall_time_seconds": {"type": "number", "minimum": 0},
+        "records_per_second": {"type": ["number", "null"], "minimum": 0}
+      }
+    },
+    "timing": {
+      "type": "object",
+      "additionalProperties": true,
+      "required": ["components", "slowest_component", "slowest_component_counts"],
+      "properties": {
+        "components": {"type": "object"},
+        "slowest_component": {"type": ["string", "null"]},
+        "slowest_component_counts": {"type": "object"}
+      }
+    },
     "ner": {
       "type": "object",
       "additionalProperties": true,
@@ -50,6 +84,23 @@
         "incorrect_link_rate": {"type": ["number", "null"], "minimum": 0, "maximum": 1}
       }
     },
+    "rxnorm": {
+      "type": "object",
+      "additionalProperties": true,
+      "required": [
+        "coverage",
+        "fallback_rate",
+        "nil_rate",
+        "n_link_attempts",
+        "n_drugs_total",
+        "acc_at_1",
+        "incorrect_link_rate",
+        "n_rxnorm_attempts",
+        "rxnorm_by_method",
+        "unresolved_queries",
+        "canonicalization_collisions"
+      ]
+    },
     "interactions": {
       "type": "object",
       "additionalProperties": true,
@@ -65,7 +116,12 @@
             "unknown_rate",
             "severity_distribution",
             "uncertain_rate",
-            "records_with_any_interaction"
+            "records_with_any_interaction",
+            "ddinter_rxcui_hit_rate",
+            "ddinter_fts_rescue_rate",
+            "openfda_rescue_rate",
+            "source_counts",
+            "top_unknown_pairs"
           ],
           "properties": {
             "total_pairs_checked": {"type": "integer", "minimum": 0},
@@ -74,7 +130,12 @@
             "unknown_rate": {"type": "number", "minimum": 0, "maximum": 1},
             "severity_distribution": {"$ref": "#/$defs/severity_counts"},
             "uncertain_rate": {"type": "number", "minimum": 0, "maximum": 1},
-            "records_with_any_interaction": {"type": "integer", "minimum": 0}
+            "records_with_any_interaction": {"type": "integer", "minimum": 0},
+            "ddinter_rxcui_hit_rate": {"type": "number", "minimum": 0, "maximum": 1},
+            "ddinter_fts_rescue_rate": {"type": "number", "minimum": 0, "maximum": 1},
+            "openfda_rescue_rate": {"type": "number", "minimum": 0, "maximum": 1},
+            "source_counts": {"type": "object"},
+            "top_unknown_pairs": {"type": "array"}
           }
         },
         "accuracy": {"type": ["object", "null"]},
@@ -91,6 +152,17 @@
         }
       }
     },
+    "errors": {
+      "type": "object",
+      "additionalProperties": true,
+      "required": ["total", "by_stage", "by_class", "records"],
+      "properties": {
+        "total": {"type": "integer", "minimum": 0},
+        "by_stage": {"type": "object"},
+        "by_class": {"type": "object"},
+        "records": {"type": "array"}
+      }
+    },
     "fp_taxonomy": {
       "type": "object",
       "additionalProperties": true,

diff --git a/eval/benchmark_run_manifest.schema.json b/eval/benchmark_run_manifest.schema.json
@@ -12,6 +12,9 @@
     "dataset_revision",
     "command",
     "model_ids",
+    "sample_size",
+    "concurrency",
+    "metric_schema_version",
     "metrics"
   ],
   "properties": {
@@ -63,6 +66,14 @@
       "type": "integer",
       "minimum": 1
     },
+    "concurrency": {
+      "type": ["integer", "null"],
+      "minimum": 1
+    },
+    "metric_schema_version": {
+      "type": "string",
+      "const": "benchmark-diagnostics-v1"
+    },
     "random_seed": {
       "type": ["integer", "string", "null"]
     },

diff --git a/eval/metrics/interactions.py b/eval/metrics/interactions.py
@@ -2,7 +2,7 @@
 
 from __future__ import annotations
 
-from collections import defaultdict
+from collections import Counter, defaultdict
 from typing import Any, Awaitable, Callable
 
 
@@ -140,6 +140,7 @@ def compute(
     uncertain = 0
     returned = 0
     records_with_any = 0
+    attempts = [attempt for prediction in predictions for attempt in prediction.get("interaction_attempts", [])]
 
     for prediction in predictions:
         interactions_response = prediction.get("interactions") or {}
@@ -159,6 +160,7 @@ def compute(
                 uncertain += 1
 
     total_pairs = sum(coverage.values())
+    attempt_diagnostics = _attempt_diagnostics(attempts)
     return {
         "descriptive": {
             "total_pairs_checked": total_pairs,
@@ -168,7 +170,40 @@ def compute(
             "severity_distribution": severity_distribution,
             "uncertain_rate": _rate(uncertain, returned),
             "records_with_any_interaction": records_with_any,
+            **attempt_diagnostics,
         },
         "accuracy": _accuracy(predictions, dataset),
         "seed_smoke": compute_seed_smoke(seed_cases, seed_results),
     }
+
+
+def _status(attempt: dict, component: str) -> str:
+    block = attempt.get(component) or {}
+    return str(block.get("status") or "skipped")
+
+
+def _attempt_diagnostics(attempts: list[dict]) -> dict:
+    total = len(attempts)
+    source_counts = Counter(str(attempt.get("final_source") or "unknown") for attempt in attempts)
+    ddinter_rxcui_hits = sum(1 for attempt in attempts if _status(attempt, "ddinter_rxcui") == "hit")
+    ddinter_fts_hits = sum(1 for attempt in attempts if _status(attempt, "ddinter_fts") == "hit")
+    openfda_hits = sum(1 for attempt in attempts if _status(attempt, "openfda") == "hit")
+    unknown_pairs = Counter(
+        _pair_key(str(attempt.get("drug_a", "")), str(attempt.get("drug_b", "")))
+        for attempt in attempts
+        if attempt.get("final_source") == "unknown"
+    )
+    return {
+        "ddinter_rxcui_hit_rate": _rate(ddinter_rxcui_hits, total),
+        "ddinter_fts_rescue_rate": _rate(ddinter_fts_hits, total),
+        "openfda_rescue_rate": _rate(openfda_hits, total),
+        "source_counts": {
+            "ddinter": int(source_counts.get("ddinter", 0)),
+            "openfda": int(source_counts.get("openfda", 0)),
+            "unknown": int(source_counts.get("unknown", 0)),
+        },
+        "top_unknown_pairs": [
+            {"drug_a": drug_a, "drug_b": drug_b, "count": count}
+            for (drug_a, drug_b), count in unknown_pairs.most_common(10)
+        ],
+    }
diff --git a/eval/metrics/linking.py b/eval/metrics/linking.py
@@ -2,6 +2,8 @@
 
 from __future__ import annotations
 
+from collections import defaultdict
+
 
 def _rate(numerator: int, denominator: int) -> float:
     return numerator / denominator if denominator else 0.0
@@ -10,6 +12,7 @@ def _rate(numerator: int, denominator: int) -> float:
 def compute(predictions: list[dict], dataset: list[dict]) -> dict:
     drugs = [drug for pred in predictions for drug in pred.get("drugs", [])]
     attempts = [attempt for pred in predictions for attempt in pred.get("link_attempts", [])]
+    rxnorm_attempts = [attempt for pred in predictions for attempt in pred.get("rxnorm_attempts", [])]
     resolved = sum(1 for drug in drugs if drug.get("rxcui"))
     fallback = sum(1 for drug in drugs if drug.get("source") == "rxnorm_fallback")
     nil_count = sum(1 for attempt in attempts if attempt.get("rxcui") is None)
@@ -39,6 +42,7 @@ def compute(predictions: list[dict], dataset: list[dict]) -> dict:
         acc_at_1 = sum(acc_values) / len(acc_values)
         incorrect_link_rate = _rate(incorrect, predicted_with_rxcui_total)
 
+    diagnostics = _rxnorm_diagnostics(rxnorm_attempts)
     return {
         "coverage": _rate(resolved, len(drugs)),
         "fallback_rate": _rate(fallback, len(drugs)),
@@ -47,4 +51,45 @@ def compute(predictions: list[dict], dataset: list[dict]) -> dict:
         "n_drugs_total": len(drugs),
         "acc_at_1": acc_at_1,
         "incorrect_link_rate": incorrect_link_rate,
+        **diagnostics,
+    }
+
+
+def _rxnorm_diagnostics(attempts: list[dict]) -> dict:
+    by_method: dict[str, dict[str, int]] = defaultdict(lambda: {"hit": 0, "miss": 0, "error": 0})
+    unresolved = []
+    queries_by_rxcui: dict[str, set[str]] = defaultdict(set)
+
+    for attempt in attempts:
+        method = str(attempt.get("method") or "unknown")
+        status = str(attempt.get("status") or "unknown")
+        if status not in {"hit", "miss", "error"}:
+            status = "miss" if attempt.get("rxcui") is None else "hit"
+        by_method[method][status] += 1
+
+        query = str(attempt.get("query") or attempt.get("name") or "")
+        rxcui = attempt.get("rxcui")
+        if rxcui:
+            if method != "get_drug_details" and query:
+                queries_by_rxcui[str(rxcui)].add(query)
+        elif status in {"miss", "error"}:
+            unresolved.append({
+                "query": query,
+                "stage": attempt.get("stage"),
+                "method": method,
+            })
+
+    collisions = [
+        {"rxcui": rxcui, "queries": sorted(query for query in queries if query)}
+        for rxcui, queries in sorted(queries_by_rxcui.items())
+        if len({query.casefold() for query in queries if query}) > 1
+    ]
+    return {
+        "n_rxnorm_attempts": len(attempts),
+        "rxnorm_by_method": {
+            method: counts
+            for method, counts in sorted(by_method.items())
+        },
+        "unresolved_queries": unresolved[:20],
+        "canonicalization_collisions": collisions[:20],
     }
diff --git a/eval/metrics/ner.py b/eval/metrics/ner.py
@@ -46,6 +46,11 @@ def _prf(tp: int, fp: int, fn: int) -> dict[str, float]:
 
 
 def _record_metrics(predicted: list[str], expected: list[str], matcher: Callable[[str, str], bool]) -> dict[str, float]:
+    counts = _record_counts(predicted, expected, matcher)
+    return _prf(counts["tp"], counts["fp"], counts["fn"])
+
+
+def _record_counts(predicted: list[str], expected: list[str], matcher: Callable[[str, str], bool]) -> dict[str, int]:
     matched_expected: set[int] = set()
     tp = 0
     for pred in predicted:
@@ -58,7 +63,22 @@ def _record_metrics(predicted: list[str], expected: list[str], matcher: Callable
             tp += 1
     fp = len(predicted) - tp
     fn = len(expected) - tp
-    return _prf(tp, fp, fn)
+    return {"tp": tp, "fp": fp, "fn": fn}
+
+
+def diagnostics_for_entities(entities: list[dict], expected_names: list[str]) -> dict:
+    predicted = [str(entity.get("text", "")) for entity in entities]
+    expected = [str(name) for name in expected_names]
+    strict = _record_counts(predicted, expected, _strict_match)
+    lenient = _record_counts(predicted, expected, _lenient_match)
+    return {
+        "entities": entities,
+        "strict": strict,
+        "lenient": lenient,
+        "expected_count": len(expected),
+        "predicted_count": len(predicted),
+        "low_confidence_count": sum(1 for entity in entities if float(entity.get("score", 1.0)) < 0.85),
+    }
 
 
 def _average(blocks: list[dict[str, float]]) -> dict[str, float]: