From 19ba74cbfda43d9543bcd27d92c187caa8be6e40 Mon Sep 17 00:00:00 2001 From: Moon-python Date: Mon, 8 Jun 2026 22:25:33 +0900 Subject: [PATCH 1/4] Add CLI-level regression ship-gate corpus evals.json covers skill-level LLM behavior; there was no deterministic, machine-checkable regression for the check-file engine itself. This adds a labeled corpus + runner so unit/source/matcher changes can be gated without an LLM in the loop. Why: a usability test on a materials-science reference set surfaced that the tool accepted 0/8 real claims. Root causes split into a matcher/units layer (#7/#8 landed; #10/#11/#14 + an up-to comparator gap open) and an abstract reachability layer (#12/#13). This corpus pins both layers, separating hard invariants from tracked progress. - cli_regression.jsonl: 12 rows spanning the publisher matrix (crossref / openalex-only / none) and claim types (numeric value+unit, relational, fabricated, dead DOI), each labeled with expected_verdict, must_accept / must_not_accept invariants, and gated_on issues. - run_cli_regression.py: stdlib runner. A violated SAFETY invariant (no fabricated / relational / unreachable / over-accepting claim ever ACCEPTs; the one clean supported claim stays ACCEPT) exits non-zero; gated rows report as PENDING. - cli_regression.md: documents the two invariant classes and how to run. Current snapshot flags B3 (>220 evidence accepts exact-220 claim, #11) as the one live SAFETY failure; ACCEPT labels for A2/A3/B2 were grounded against live CrossRef/OpenAlex abstracts. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- evals/cli_regression.jsonl | 12 +++++ evals/cli_regression.md | 54 +++++++++++++++++++ evals/run_cli_regression.py | 100 ++++++++++++++++++++++++++++++++++++ 3 files changed, 166 insertions(+) create mode 100644 evals/cli_regression.jsonl create mode 100644 evals/cli_regression.md create mode 100644 evals/run_cli_regression.py diff --git a/evals/cli_regression.jsonl b/evals/cli_regression.jsonl new file mode 100644 index 0000000..d5a6328 --- /dev/null +++ b/evals/cli_regression.jsonl @@ -0,0 +1,12 @@ +{"id":"A1-diez-thermal-above","doi":"10.3390/polym9020059","claim":"The hyperbranched sulfur networks provide a thermal resistance above 200 °C.","category":"numeric_supported","expected_verdict":"ACCEPT","must_accept":true,"must_not_accept":false,"gated_on":[],"reachable_via":"crossref","rationale":"Abstract states a high thermal resistance (>220 °C); >220 entails >200, subject+unit co-located, no hedge/scope. Happy-path anchor — must stay ACCEPT on every commit."} +{"id":"A3-sessler-workfn","doi":"10.1063/1.337646","claim":"The effective work function for aluminum-polyimide is 1.7 eV.","category":"numeric_supported","expected_verdict":"ACCEPT","must_accept":false,"must_not_accept":false,"gated_on":["#14"],"reachable_via":"crossref","rationale":"Abstract: 'effective work function for aluminum-polyimide is estimated to be 1.7 eV in the temperature range'. Value verbatim present; blocked only by the scope-suffix guard (#14) on 'in the temperature range'."} +{"id":"A2-bellucci-30C","doi":"10.1149/1.2086797","claim":"The conductivity measurements were carried out at 30 °C.","category":"numeric_supported","expected_verdict":"ACCEPT","must_accept":false,"must_not_accept":false,"gated_on":["#13","#14"],"reachable_via":"openalex","rationale":"CrossRef has no abstract; OpenAlex (#13) does ('Measurements were carried out at 30°C ... in the range'). Even when reachable, the 'at 30 °C ... in the range' qualifier is blocked by #14. 
Demonstrates #13/#14 coupling for physical-science values."} +{"id":"B2-diez-200g","doi":"10.3390/polym9020059","claim":"The sulfur networks were synthesized on a 200 g scale.","category":"numeric_supported","expected_verdict":"ACCEPT","must_accept":false,"must_not_accept":false,"gated_on":["#10"],"reachable_via":"crossref","rationale":"Abstract: 'we synthesized a 200 g scale of amorphous, ... hyperbranched polymeric sulfur networks'. Value and subject are split across comma-clauses (#10), so subject binding fails."} +{"id":"E2-pelrine-117","doi":"10.1126/science.287.5454.836","claim":"Actuated strains up to 117% were demonstrated with silicone elastomers.","category":"numeric_supported","expected_verdict":"ACCEPT","must_accept":false,"must_not_accept":false,"gated_on":["up-to-comparator"],"reachable_via":"crossref","rationale":"Abstract verbatim: 'Actuated strains up to 117% were demonstrated with silicone elastomers'. The claim 'up to' comparator is not recognized (treated as exact), so exact-vs-up_to entailment fails. Known comparator gap."} +{"id":"B1-diez-fabricated","doi":"10.3390/polym9020059","claim":"This paper reports a dielectric breakdown strength of 1200 MV/m.","category":"fabricated_control","expected_verdict":"PARTIAL","must_accept":false,"must_not_accept":true,"gated_on":[],"reachable_via":"crossref","rationale":"Fabricated number absent from the abstract. Negative control — must never ACCEPT."} +{"id":"C2-diaham-relational","doi":"10.1063/5.0108674","claim":"The AC conductivity follows sigma_ac proportional to omega^s with the exponent s approaching 1.","category":"relational_out_of_scope","expected_verdict":"PARTIAL","must_accept":false,"must_not_accept":true,"gated_on":[],"reachable_via":"crossref","rationale":"Relational claim, no value+unit; not explicitly stated in the abstract. 
Out of scope — must never ACCEPT."} +{"id":"C1-simmonstam-relational","doi":"10.1103/PhysRevB.7.3706","claim":"The isothermal current decays as i(t) proportional to 1/t for a uniform trap distribution.","category":"relational_out_of_scope","expected_verdict":"PARTIAL","must_accept":false,"must_not_accept":true,"gated_on":["#13"],"reachable_via":"openalex","rationale":"Relational. Currently UNVERIFIABLE (no CrossRef abstract, S2 429); OpenAlex (#13) makes it reachable but it must still stay PARTIAL — reachability must not turn a relational claim into ACCEPT."} +{"id":"D1-amiour-elsevier","doi":"10.1016/j.elstat.2021.103551","claim":"Kapton HN deep trap energies are in the range 0.79 to 1.05 eV.","category":"unreachable_ceiling","expected_verdict":"UNVERIFIABLE","must_accept":false,"must_not_accept":true,"gated_on":[],"reachable_via":"none","rationale":"Elsevier withholds the abstract from CrossRef and OpenAlex; the value lives in full text. Abstract-only ceiling — must remain UNVERIFIABLE, never ACCEPT, regardless of source additions."} +{"id":"D2-jonscher-relational","doi":"10.1038/267673a0","claim":"The dielectric response exponent n lies between 0 and 1.","category":"unreachable_ceiling","expected_verdict":"UNVERIFIABLE","must_accept":false,"must_not_accept":true,"gated_on":[],"reachable_via":"none","rationale":"No abstract in any source (old Nature) and the claim is relational. Must remain UNVERIFIABLE."} +{"id":"E1-fake-doi","doi":"10.9999/nonexistent.fake.0000","claim":"This material shows 95% energy conversion efficiency.","category":"dead_doi_control","expected_verdict":"UNVERIFIABLE","must_accept":false,"must_not_accept":true,"gated_on":[],"reachable_via":"none","rationale":"DOI does not resolve. 
Must fail/UNVERIFIABLE, never ACCEPT."} +{"id":"B3-diez-overaccept","doi":"10.3390/polym9020059","claim":"The polymeric sulfur networks provide a high thermal resistance of 220 °C.","category":"over_acceptance_regression","expected_verdict":"PARTIAL","must_accept":false,"must_not_accept":true,"gated_on":["#11"],"reachable_via":"crossref","rationale":"Abstract evidence is '(>220 °C)'. An exact '220 °C' claim must NOT be entailed by '>220'. Currently ACCEPTs (over-acceptance, #11); target verdict is PARTIAL once #11 lands."} diff --git a/evals/cli_regression.md b/evals/cli_regression.md new file mode 100644 index 0000000..2d80fa6 --- /dev/null +++ b/evals/cli_regression.md @@ -0,0 +1,54 @@ +# CLI regression corpus (ship-gate) + +`cli_regression.jsonl` is a labeled, deterministic regression set for the +`check-file` engine. It complements `evals.json` (which evaluates skill-level LLM +behavior); this one pins **machine-checkable verdicts** so unit/source/matcher +changes can be regression-tested without an LLM in the loop. + +Each row carries the claim **plus** ground-truth labels: + +| field | meaning | +|---|---| +| `expected_verdict` | the verdict the engine *should* reach | +| `must_accept` | invariant: this row must end `ACCEPT` on every commit | +| `must_not_accept` | invariant: this row must **never** end `ACCEPT` | +| `gated_on` | open issues that currently block `expected_verdict` | +| `reachable_via` | where an abstract exists: `crossref` / `openalex` / `none` | +| `category` | `numeric_supported`, `fabricated_control`, `relational_out_of_scope`, `unreachable_ceiling`, `dead_doi_control`, `over_acceptance_regression` | + +## Two invariant classes + +**SAFETY (release blocker).** `must_accept` rows must stay `ACCEPT`; `must_not_accept` +rows must never become `ACCEPT`. This is the tool's core promise — no fabricated, +relational, unreachable, or over-accepting claim is waved through, and every +supported claim that is already green stays green. 
A break here fails the gate (non-zero exit). + +**PROGRESS (informational).** Gated rows do not yet reach `expected_verdict` +because a fix has not landed. They are reported, not failed, and flip to PASS as +their `gated_on` issue is resolved. This is how the corpus tracks the roadmap. + +## How to run + +```bash +PYTHONPATH=src python3 evals/run_cli_regression.py +``` + +Exit code is non-zero iff a SAFETY invariant is violated. (Live network: CrossRef / +OpenAlex / Semantic Scholar / PubMed. Semantic-Scholar free-tier 429 only affects +PROGRESS rows that depend on it, never SAFETY rows.) + +## What the corpus encodes (snapshot, latest `main`) + +- **1 supported happy path** — `A1` (`>220 °C` entails `>200 °C`): ACCEPT today, must stay. +- **5 never-accept controls** — `B1` fabricated number, `C1`/`C2` relational, `D1`/`D2` + unreachable (Elsevier / old Nature, abstract-only ceiling), `E1` dead DOI. +- **Gated false-negatives** (target ACCEPT once fixed): `A3` → #14, `B2` → #10, + `A2` → #13 **and** #14 (physical-science values are reachable *and* scope-blocked — + the two are coupled), `E2` → up-to comparator. +- **Over-acceptance regression** — `B3`: `>220` evidence currently ACCEPTs an exact + `220` claim; target `PARTIAL` once #11 lands. Listed `must_not_accept` so it also + guards against the bug regressing. + +The verdict labels for `A2`/`A3`/`B2` were grounded by fetching the live abstracts +(CrossRef + OpenAlex) and confirming the value appears verbatim; no label asserts +support that is not in a fetched abstract. diff --git a/evals/run_cli_regression.py b/evals/run_cli_regression.py new file mode 100644 index 0000000..6736cbc --- /dev/null +++ b/evals/run_cli_regression.py @@ -0,0 +1,100 @@ +#!/usr/bin/env python3 +"""Deterministic CLI regression gate for ref-verify. 
+ +Runs the labeled corpus in ``cli_regression.jsonl`` through ``check-file`` and +classifies every row into one of: + +- SAFETY pass/fail — invariants that must hold on every commit: + * ``must_accept`` rows must end ACCEPT (the supported happy path stays green) + * ``must_not_accept`` rows must NOT end ACCEPT (no fabricated/relational/ + unreachable/over-accepting claim is ever waved through) + A SAFETY failure exits non-zero and should block release. + +- PROGRESS — gated rows whose ``expected_verdict`` is not yet reached because the + named issue (``gated_on``) has not landed. These are reported, not failed; they + flip to PASS as #10/#11/#13/#14 / the up-to comparator are fixed. + +Stdlib only. Usage: + PYTHONPATH=src python3 evals/run_cli_regression.py +""" + +from __future__ import annotations + +import json +import subprocess +import sys +import tempfile +from pathlib import Path + +CORPUS = Path(__file__).with_name("cli_regression.jsonl") + + +def _load_corpus() -> list[dict]: + rows = [] + for line in CORPUS.read_text(encoding="utf-8").splitlines(): + line = line.strip() + if line: + rows.append(json.loads(line)) + return rows + + +def _run_cli(rows: list[dict]) -> dict[str, dict]: + with tempfile.NamedTemporaryFile("w", suffix=".jsonl", delete=False, encoding="utf-8") as handle: + for row in rows: + handle.write(json.dumps({"id": row["id"], "doi": row["doi"], "claim": row["claim"]}) + "\n") + claims_path = handle.name + proc = subprocess.run( + [sys.executable, "-m", "ref_verify.cli", "check-file", claims_path, "--json"], + capture_output=True, + text=True, + ) + if not proc.stdout.strip(): + raise SystemExit(f"check-file produced no JSON. 
stderr:\n{proc.stderr}") + payload = json.loads(proc.stdout) + return {r["id"]: r for r in payload["results"]} + + +def main() -> int: + rows = _load_corpus() + results = _run_cli(rows) + + safety_failures: list[str] = [] + progress_pending: list[str] = [] + print(f"{'id':26}{'verdict':20}{'expected':14}{'class':13}note") + print("-" * 92) + for row in rows: + res = results.get(row["id"], {}) + verdict = res.get("verdict", "MISSING") + status = res.get("status", "") + accepted = verdict == "ACCEPT" + klass, note = "PASS", "" + + if row.get("must_accept") and not accepted: + klass, note = "SAFETY-FAIL", "must ACCEPT but did not" + safety_failures.append(row["id"]) + elif row.get("must_not_accept") and accepted: + klass, note = "SAFETY-FAIL", "must NOT ACCEPT but did" + safety_failures.append(row["id"]) + elif verdict != row["expected_verdict"] and status != row["expected_verdict"]: + gated = ",".join(row.get("gated_on") or []) or "?" + klass, note = "PENDING", f"want {row['expected_verdict']} after {gated}" + progress_pending.append(row["id"]) + + shown = verdict if verdict != "WARN" else f"{verdict}/{status}" + print(f"{row['id']:26}{shown:20}{row['expected_verdict']:14}{klass:13}{note}") + + print("-" * 92) + print( + f"SAFETY: {len(rows) - len(safety_failures)}/{len(rows)} ok" + f" | PROGRESS pending: {len(progress_pending)}" + ) + if safety_failures: + print("SAFETY FAILURES (release blockers):", ", ".join(safety_failures)) + return 1 + if progress_pending: + print("Pending (informational, not a failure):", ", ".join(progress_pending)) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) From 683e02f853dbd9adceac3181e31d1a6ae2962f23 Mon Sep 17 00:00:00 2001 From: Moon-python Date: Mon, 8 Jun 2026 22:34:17 +0900 Subject: [PATCH 2/4] Verify corpus against latest main; refine control handling Re-ran the gate after #11/#13/#14 merged to main (commits 329cde9, a41e92f): - A3 (1.7 eV) flipped PARTIAL -> ACCEPT (#14 condition handling) - B3 
over-acceptance flipped ACCEPT -> PARTIAL (#11) - A2 now reachable via OpenAlex (#13) but still PARTIAL (residual trailing scope qualifier) -> remains PENDING SAFETY is now 12/12 (gate green). Runner: control rows (must_not_accept) now PASS on any non-ACCEPT verdict instead of pinning a single one, since UNVERIFIABLE vs PARTIAL can vary with Semantic-Scholar availability (e.g. D1 Amiour). The invariant is "never ACCEPT". Co-Authored-By: Claude Opus 4.8 (1M context) --- evals/cli_regression.jsonl | 2 +- evals/cli_regression.md | 23 ++++++++++++++--------- evals/run_cli_regression.py | 5 +++++ 3 files changed, 20 insertions(+), 10 deletions(-) diff --git a/evals/cli_regression.jsonl b/evals/cli_regression.jsonl index d5a6328..c75fb5d 100644 --- a/evals/cli_regression.jsonl +++ b/evals/cli_regression.jsonl @@ -6,7 +6,7 @@ {"id":"B1-diez-fabricated","doi":"10.3390/polym9020059","claim":"This paper reports a dielectric breakdown strength of 1200 MV/m.","category":"fabricated_control","expected_verdict":"PARTIAL","must_accept":false,"must_not_accept":true,"gated_on":[],"reachable_via":"crossref","rationale":"Fabricated number absent from the abstract. Negative control — must never ACCEPT."} {"id":"C2-diaham-relational","doi":"10.1063/5.0108674","claim":"The AC conductivity follows sigma_ac proportional to omega^s with the exponent s approaching 1.","category":"relational_out_of_scope","expected_verdict":"PARTIAL","must_accept":false,"must_not_accept":true,"gated_on":[],"reachable_via":"crossref","rationale":"Relational claim, no value+unit; not explicitly stated in the abstract. Out of scope — must never ACCEPT."} {"id":"C1-simmonstam-relational","doi":"10.1103/PhysRevB.7.3706","claim":"The isothermal current decays as i(t) proportional to 1/t for a uniform trap distribution.","category":"relational_out_of_scope","expected_verdict":"PARTIAL","must_accept":false,"must_not_accept":true,"gated_on":["#13"],"reachable_via":"openalex","rationale":"Relational. 
Currently UNVERIFIABLE (no CrossRef abstract, S2 429); OpenAlex (#13) makes it reachable but it must still stay PARTIAL — reachability must not turn a relational claim into ACCEPT."} -{"id":"D1-amiour-elsevier","doi":"10.1016/j.elstat.2021.103551","claim":"Kapton HN deep trap energies are in the range 0.79 to 1.05 eV.","category":"unreachable_ceiling","expected_verdict":"UNVERIFIABLE","must_accept":false,"must_not_accept":true,"gated_on":[],"reachable_via":"none","rationale":"Elsevier withholds the abstract from CrossRef and OpenAlex; the value lives in full text. Abstract-only ceiling — must remain UNVERIFIABLE, never ACCEPT, regardless of source additions."} +{"id":"D1-amiour-elsevier","doi":"10.1016/j.elstat.2021.103551","claim":"Kapton HN deep trap energies are in the range 0.79 to 1.05 eV.","category":"unreachable_ceiling","expected_verdict":"UNVERIFIABLE","must_accept":false,"must_not_accept":true,"gated_on":[],"reachable_via":"none","rationale":"Elsevier withholds the abstract from CrossRef and OpenAlex; the 0.79-1.05 eV deep-trap values live in full text, not any abstract. Verdict may be UNVERIFIABLE (no abstract) or PARTIAL (Semantic Scholar abstract reached but the values are not in it) depending on S2 availability; both satisfy the must_not_accept invariant. Abstract-only ceiling — must never ACCEPT."} {"id":"D2-jonscher-relational","doi":"10.1038/267673a0","claim":"The dielectric response exponent n lies between 0 and 1.","category":"unreachable_ceiling","expected_verdict":"UNVERIFIABLE","must_accept":false,"must_not_accept":true,"gated_on":[],"reachable_via":"none","rationale":"No abstract in any source (old Nature) and the claim is relational. 
Must remain UNVERIFIABLE."} {"id":"E1-fake-doi","doi":"10.9999/nonexistent.fake.0000","claim":"This material shows 95% energy conversion efficiency.","category":"dead_doi_control","expected_verdict":"UNVERIFIABLE","must_accept":false,"must_not_accept":true,"gated_on":[],"reachable_via":"none","rationale":"DOI does not resolve. Must fail/UNVERIFIABLE, never ACCEPT."} {"id":"B3-diez-overaccept","doi":"10.3390/polym9020059","claim":"The polymeric sulfur networks provide a high thermal resistance of 220 °C.","category":"over_acceptance_regression","expected_verdict":"PARTIAL","must_accept":false,"must_not_accept":true,"gated_on":["#11"],"reachable_via":"crossref","rationale":"Abstract evidence is '(>220 °C)'. An exact '220 °C' claim must NOT be entailed by '>220'. Currently ACCEPTs (over-acceptance, #11); target verdict is PARTIAL once #11 lands."} diff --git a/evals/cli_regression.md b/evals/cli_regression.md index 2d80fa6..d387812 100644 --- a/evals/cli_regression.md +++ b/evals/cli_regression.md @@ -39,15 +39,20 @@ PROGRESS rows that depend on it, never SAFETY rows.) ## What the corpus encodes (snapshot, latest `main`) -- **1 supported happy path** — `A1` (`>220 °C` entails `>200 °C`): ACCEPT today, must stay. -- **5 never-accept controls** — `B1` fabricated number, `C1`/`C2` relational, `D1`/`D2` - unreachable (Elsevier / old Nature, abstract-only ceiling), `E1` dead DOI. -- **Gated false-negatives** (target ACCEPT once fixed): `A3` → #14, `B2` → #10, - `A2` → #13 **and** #14 (physical-science values are reachable *and* scope-blocked — - the two are coupled), `E2` → up-to comparator. -- **Over-acceptance regression** — `B3`: `>220` evidence currently ACCEPTs an exact - `220` claim; target `PARTIAL` once #11 lands. Listed `must_not_accept` so it also - guards against the bug regressing. 
+``` +SAFETY: 12/12 ok | PROGRESS pending: 3 +Pending: A2-bellucci-30C, B2-diez-200g, E2-pelrine-117 +``` + +- **Supported happy paths** — `A1` (`>220 °C` entails `>200 °C`) and `A3` (`1.7 eV`, + unblocked once #14 landed) ACCEPT and must stay green. +- **Never-accept controls (all PASS)** — `B1` fabricated number, `C1`/`C2` relational, + `D1`/`D2` unreachable (Elsevier / old Nature, abstract-only ceiling), `E1` dead DOI. + `B3` (over-acceptance) is now `PARTIAL` after #11 — kept `must_not_accept` so the bug + cannot silently regress. +- **Remaining gated false-negatives** (target ACCEPT once fixed): `B2` → #10, + `E2` → up-to comparator, `A2` → residual condition handling (#13 already makes it + reachable via OpenAlex, but a trailing `in the range` qualifier still blocks ACCEPT). The verdict labels for `A2`/`A3`/`B2` were grounded by fetching the live abstracts (CrossRef + OpenAlex) and confirming the value appears verbatim; no label asserts diff --git a/evals/run_cli_regression.py b/evals/run_cli_regression.py index 6736cbc..c540524 100644 --- a/evals/run_cli_regression.py +++ b/evals/run_cli_regression.py @@ -75,6 +75,11 @@ def main() -> int: elif row.get("must_not_accept") and accepted: klass, note = "SAFETY-FAIL", "must NOT ACCEPT but did" safety_failures.append(row["id"]) + elif row.get("must_not_accept"): + # Control row: the only invariant is "never ACCEPT". The exact non-ACCEPT + # verdict (UNVERIFIABLE vs PARTIAL) can vary with source availability, so it + # is not pinned. + klass = "PASS" elif verdict != row["expected_verdict"] and status != row["expected_verdict"]: gated = ",".join(row.get("gated_on") or []) or "?" 
klass, note = "PENDING", f"want {row['expected_verdict']} after {gated}" From 50c224a8c3e921784f08ac69676aa9da796425b7 Mon Sep 17 00:00:00 2001 From: Moon-Young Date: Mon, 8 Jun 2026 13:47:51 +0000 Subject: [PATCH 3/4] Update CLI regression corpus gate status --- evals/cli_regression.jsonl | 12 ++++++------ evals/cli_regression.md | 15 ++++++++------- evals/run_cli_regression.py | 17 ++++++++++------- 3 files changed, 24 insertions(+), 20 deletions(-) diff --git a/evals/cli_regression.jsonl b/evals/cli_regression.jsonl index c75fb5d..d555d02 100644 --- a/evals/cli_regression.jsonl +++ b/evals/cli_regression.jsonl @@ -1,12 +1,12 @@ {"id":"A1-diez-thermal-above","doi":"10.3390/polym9020059","claim":"The hyperbranched sulfur networks provide a thermal resistance above 200 °C.","category":"numeric_supported","expected_verdict":"ACCEPT","must_accept":true,"must_not_accept":false,"gated_on":[],"reachable_via":"crossref","rationale":"Abstract states a high thermal resistance (>220 °C); >220 entails >200, subject+unit co-located, no hedge/scope. Happy-path anchor — must stay ACCEPT on every commit."} -{"id":"A3-sessler-workfn","doi":"10.1063/1.337646","claim":"The effective work function for aluminum-polyimide is 1.7 eV.","category":"numeric_supported","expected_verdict":"ACCEPT","must_accept":false,"must_not_accept":false,"gated_on":["#14"],"reachable_via":"crossref","rationale":"Abstract: 'effective work function for aluminum-polyimide is estimated to be 1.7 eV in the temperature range'. Value verbatim present; blocked only by the scope-suffix guard (#14) on 'in the temperature range'."} -{"id":"A2-bellucci-30C","doi":"10.1149/1.2086797","claim":"The conductivity measurements were carried out at 30 °C.","category":"numeric_supported","expected_verdict":"ACCEPT","must_accept":false,"must_not_accept":false,"gated_on":["#13","#14"],"reachable_via":"openalex","rationale":"CrossRef has no abstract; OpenAlex (#13) does ('Measurements were carried out at 30°C ... 
in the range'). Even when reachable, the 'at 30 °C ... in the range' qualifier is blocked by #14. Demonstrates #13/#14 coupling for physical-science values."} -{"id":"B2-diez-200g","doi":"10.3390/polym9020059","claim":"The sulfur networks were synthesized on a 200 g scale.","category":"numeric_supported","expected_verdict":"ACCEPT","must_accept":false,"must_not_accept":false,"gated_on":["#10"],"reachable_via":"crossref","rationale":"Abstract: 'we synthesized a 200 g scale of amorphous, ... hyperbranched polymeric sulfur networks'. Value and subject are split across comma-clauses (#10), so subject binding fails."} -{"id":"E2-pelrine-117","doi":"10.1126/science.287.5454.836","claim":"Actuated strains up to 117% were demonstrated with silicone elastomers.","category":"numeric_supported","expected_verdict":"ACCEPT","must_accept":false,"must_not_accept":false,"gated_on":["up-to-comparator"],"reachable_via":"crossref","rationale":"Abstract verbatim: 'Actuated strains up to 117% were demonstrated with silicone elastomers'. The claim 'up to' comparator is not recognized (treated as exact), so exact-vs-up_to entailment fails. Known comparator gap."} +{"id":"A3-sessler-workfn","doi":"10.1063/1.337646","claim":"The effective work function for aluminum-polyimide is 1.7 eV.","category":"numeric_supported","expected_verdict":"ACCEPT","must_accept":true,"must_not_accept":false,"gated_on":[],"reachable_via":"crossref","rationale":"Abstract: 'effective work function for aluminum-polyimide is estimated to be 1.7 eV in the temperature range'. 
Value is verbatim present; this row guards the physical-measurement condition suffix fix."} +{"id":"A2-bellucci-30C","doi":"10.1149/1.2086797","claim":"The conductivity measurements were carried out at 30 °C.","category":"numeric_supported","expected_verdict":"ACCEPT","must_accept":true,"must_not_accept":false,"gated_on":[],"reachable_via":"openalex","rationale":"CrossRef has no abstract; OpenAlex provides a DOI-bound abstract with 'Measurements were carried out at 30°C ... in the range'. This row guards OpenAlex reachability plus physical range/field condition handling."} +{"id":"B2-diez-200g","doi":"10.3390/polym9020059","claim":"The sulfur networks were synthesized on a 200 g scale.","category":"numeric_supported","expected_verdict":"ACCEPT","must_accept":true,"must_not_accept":false,"gated_on":[],"reachable_via":"crossref","rationale":"Abstract: 'we synthesized a 200 g scale of amorphous, ... hyperbranched polymeric sulfur networks'. This row guards subject binding across descriptive comma clauses."} +{"id":"E2-pelrine-117","doi":"10.1126/science.287.5454.836","claim":"Actuated strains up to 117% were demonstrated with silicone elastomers.","category":"numeric_supported","expected_verdict":"ACCEPT","must_accept":true,"must_not_accept":false,"gated_on":[],"reachable_via":"crossref","rationale":"Abstract verbatim: 'Actuated strains up to 117% were demonstrated with silicone elastomers'. This row guards claim-side 'up to' comparator handling."} {"id":"B1-diez-fabricated","doi":"10.3390/polym9020059","claim":"This paper reports a dielectric breakdown strength of 1200 MV/m.","category":"fabricated_control","expected_verdict":"PARTIAL","must_accept":false,"must_not_accept":true,"gated_on":[],"reachable_via":"crossref","rationale":"Fabricated number absent from the abstract. 
Negative control — must never ACCEPT."} {"id":"C2-diaham-relational","doi":"10.1063/5.0108674","claim":"The AC conductivity follows sigma_ac proportional to omega^s with the exponent s approaching 1.","category":"relational_out_of_scope","expected_verdict":"PARTIAL","must_accept":false,"must_not_accept":true,"gated_on":[],"reachable_via":"crossref","rationale":"Relational claim, no value+unit; not explicitly stated in the abstract. Out of scope — must never ACCEPT."} -{"id":"C1-simmonstam-relational","doi":"10.1103/PhysRevB.7.3706","claim":"The isothermal current decays as i(t) proportional to 1/t for a uniform trap distribution.","category":"relational_out_of_scope","expected_verdict":"PARTIAL","must_accept":false,"must_not_accept":true,"gated_on":["#13"],"reachable_via":"openalex","rationale":"Relational. Currently UNVERIFIABLE (no CrossRef abstract, S2 429); OpenAlex (#13) makes it reachable but it must still stay PARTIAL — reachability must not turn a relational claim into ACCEPT."} +{"id":"C1-simmonstam-relational","doi":"10.1103/PhysRevB.7.3706","claim":"The isothermal current decays as i(t) proportional to 1/t for a uniform trap distribution.","category":"relational_out_of_scope","expected_verdict":"PARTIAL","must_accept":false,"must_not_accept":true,"gated_on":[],"reachable_via":"openalex","rationale":"Relational. OpenAlex may make the abstract reachable, but reachability must not turn an out-of-scope relational claim into ACCEPT."} {"id":"D1-amiour-elsevier","doi":"10.1016/j.elstat.2021.103551","claim":"Kapton HN deep trap energies are in the range 0.79 to 1.05 eV.","category":"unreachable_ceiling","expected_verdict":"UNVERIFIABLE","must_accept":false,"must_not_accept":true,"gated_on":[],"reachable_via":"none","rationale":"Elsevier withholds the abstract from CrossRef and OpenAlex; the 0.79-1.05 eV deep-trap values live in full text, not any abstract. 
Verdict may be UNVERIFIABLE (no abstract) or PARTIAL (Semantic Scholar abstract reached but the values are not in it) depending on S2 availability; both satisfy the must_not_accept invariant. Abstract-only ceiling — must never ACCEPT."} {"id":"D2-jonscher-relational","doi":"10.1038/267673a0","claim":"The dielectric response exponent n lies between 0 and 1.","category":"unreachable_ceiling","expected_verdict":"UNVERIFIABLE","must_accept":false,"must_not_accept":true,"gated_on":[],"reachable_via":"none","rationale":"No abstract in any source (old Nature) and the claim is relational. Must remain UNVERIFIABLE."} {"id":"E1-fake-doi","doi":"10.9999/nonexistent.fake.0000","claim":"This material shows 95% energy conversion efficiency.","category":"dead_doi_control","expected_verdict":"UNVERIFIABLE","must_accept":false,"must_not_accept":true,"gated_on":[],"reachable_via":"none","rationale":"DOI does not resolve. Must fail/UNVERIFIABLE, never ACCEPT."} -{"id":"B3-diez-overaccept","doi":"10.3390/polym9020059","claim":"The polymeric sulfur networks provide a high thermal resistance of 220 °C.","category":"over_acceptance_regression","expected_verdict":"PARTIAL","must_accept":false,"must_not_accept":true,"gated_on":["#11"],"reachable_via":"crossref","rationale":"Abstract evidence is '(>220 °C)'. An exact '220 °C' claim must NOT be entailed by '>220'. Currently ACCEPTs (over-acceptance, #11); target verdict is PARTIAL once #11 lands."} +{"id":"B3-diez-overaccept","doi":"10.3390/polym9020059","claim":"The polymeric sulfur networks provide a high thermal resistance of 220 °C.","category":"over_acceptance_regression","expected_verdict":"PARTIAL","must_accept":false,"must_not_accept":true,"gated_on":[],"reachable_via":"crossref","rationale":"Abstract evidence is '(>220 °C)'. 
An exact '220 °C' claim must NOT be entailed by '>220'; this row guards against exact-claim over-acceptance."} diff --git a/evals/cli_regression.md b/evals/cli_regression.md index d387812..49da245 100644 --- a/evals/cli_regression.md +++ b/evals/cli_regression.md @@ -40,19 +40,20 @@ PROGRESS rows that depend on it, never SAFETY rows.) ## What the corpus encodes (snapshot, latest `main`) ``` -SAFETY: 12/12 ok | PROGRESS pending: 3 -Pending: A2-bellucci-30C, B2-diez-200g, E2-pelrine-117 +SAFETY: 12/12 ok | PROGRESS pending: 0 ``` -- **Supported happy paths** — `A1` (`>220 °C` entails `>200 °C`) and `A3` (`1.7 eV`, - unblocked once #14 landed) ACCEPT and must stay green. +- **Supported happy paths** — `A1` (`>220 °C` entails `>200 °C`), `A2` + (OpenAlex-reached `30 °C` conductivity measurements), `A3` (`1.7 eV`), `B2` + (`200 g` sulfur-network synthesis), and `E2` (`up to 117%` actuated strain) + ACCEPT and must stay green. - **Never-accept controls (all PASS)** — `B1` fabricated number, `C1`/`C2` relational, `D1`/`D2` unreachable (Elsevier / old Nature, abstract-only ceiling), `E1` dead DOI. `B3` (over-acceptance) is now `PARTIAL` after #11 — kept `must_not_accept` so the bug cannot silently regress. -- **Remaining gated false-negatives** (target ACCEPT once fixed): `B2` → #10, - `E2` → up-to comparator, `A2` → residual condition handling (#13 already makes it - reachable via OpenAlex, but a trailing `in the range` qualifier still blocks ACCEPT). +- **No current gated false-negatives** — if future supported rows are added before + their matcher/source work lands, they should use `gated_on` and report as + PROGRESS rather than failing SAFETY. 
The verdict labels for `A2`/`A3`/`B2` were grounded by fetching the live abstracts (CrossRef + OpenAlex) and confirming the value appears verbatim; no label asserts diff --git a/evals/run_cli_regression.py b/evals/run_cli_regression.py index c540524..0698a7d 100644 --- a/evals/run_cli_regression.py +++ b/evals/run_cli_regression.py @@ -10,9 +10,9 @@ unreachable/over-accepting claim is ever waved through) A SAFETY failure exits non-zero and should block release. -- PROGRESS — gated rows whose ``expected_verdict`` is not yet reached because the +- PROGRESS — gated rows whose ``expected_verdict`` is not yet reached because a named issue (``gated_on``) has not landed. These are reported, not failed; they - flip to PASS as #10/#11/#13/#14 / the up-to comparator are fixed. + flip to PASS as their fixes land. Stdlib only. Usage: PYTHONPATH=src python3 evals/run_cli_regression.py @@ -43,11 +43,14 @@ def _run_cli(rows: list[dict]) -> dict[str, dict]: for row in rows: handle.write(json.dumps({"id": row["id"], "doi": row["doi"], "claim": row["claim"]}) + "\n") claims_path = handle.name - proc = subprocess.run( - [sys.executable, "-m", "ref_verify.cli", "check-file", claims_path, "--json"], - capture_output=True, - text=True, - ) + try: + proc = subprocess.run( + [sys.executable, "-m", "ref_verify.cli", "check-file", claims_path, "--json"], + capture_output=True, + text=True, + ) + finally: + Path(claims_path).unlink(missing_ok=True) if not proc.stdout.strip(): raise SystemExit(f"check-file produced no JSON. 
stderr:\n{proc.stderr}") payload = json.loads(proc.stdout) From d7af64f7c21b6aa7d64f26e104afc76a83aad753 Mon Sep 17 00:00:00 2001 From: Moon-Young Date: Mon, 8 Jun 2026 13:50:01 +0000 Subject: [PATCH 4/4] Wire CLI regression gate into live smoke --- .github/workflows/live-smoke.yml | 3 +++ evals/cli_regression.md | 4 ++++ 2 files changed, 7 insertions(+) diff --git a/.github/workflows/live-smoke.yml b/.github/workflows/live-smoke.yml index 4d49eb6..d7222a3 100644 --- a/.github/workflows/live-smoke.yml +++ b/.github/workflows/live-smoke.yml @@ -34,3 +34,6 @@ jobs: ref-verify check-claim 10.1126/science.287.5454.836 \ --claim "actuation strain above 100%" \ --json + + - name: CLI regression ship gate + run: PYTHONPATH=src python evals/run_cli_regression.py diff --git a/evals/cli_regression.md b/evals/cli_regression.md index 49da245..98b23e2 100644 --- a/evals/cli_regression.md +++ b/evals/cli_regression.md @@ -33,6 +33,10 @@ their `gated_on` issue is resolved. This is how the corpus tracks the roadmap. PYTHONPATH=src python3 evals/run_cli_regression.py ``` +The same runner is also wired into the manually triggered GitHub Actions **Live Smoke** +workflow. It is intentionally not part of every pull-request CI run because it +depends on live public APIs. + Exit code is non-zero iff a SAFETY invariant is violated. (Live network: CrossRef / OpenAlex / Semantic Scholar / PubMed. Semantic-Scholar free-tier 429 only affects PROGRESS rows that depend on it, never SAFETY rows.)