Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .github/workflows/live-smoke.yml
Original file line number Diff line number Diff line change
Expand Up @@ -34,3 +34,6 @@ jobs:
ref-verify check-claim 10.1126/science.287.5454.836 \
--claim "actuation strain above 100%" \
--json

- name: CLI regression ship gate
run: PYTHONPATH=src python evals/run_cli_regression.py
12 changes: 12 additions & 0 deletions evals/cli_regression.jsonl
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
{"id":"A1-diez-thermal-above","doi":"10.3390/polym9020059","claim":"The hyperbranched sulfur networks provide a thermal resistance above 200 °C.","category":"numeric_supported","expected_verdict":"ACCEPT","must_accept":true,"must_not_accept":false,"gated_on":[],"reachable_via":"crossref","rationale":"Abstract states a high thermal resistance (>220 °C); >220 entails >200, subject+unit co-located, no hedge/scope. Happy-path anchor — must stay ACCEPT on every commit."}
{"id":"A3-sessler-workfn","doi":"10.1063/1.337646","claim":"The effective work function for aluminum-polyimide is 1.7 eV.","category":"numeric_supported","expected_verdict":"ACCEPT","must_accept":true,"must_not_accept":false,"gated_on":[],"reachable_via":"crossref","rationale":"Abstract: 'effective work function for aluminum-polyimide is estimated to be 1.7 eV in the temperature range'. Value is verbatim present; this row guards the physical-measurement condition suffix fix."}
{"id":"A2-bellucci-30C","doi":"10.1149/1.2086797","claim":"The conductivity measurements were carried out at 30 °C.","category":"numeric_supported","expected_verdict":"ACCEPT","must_accept":true,"must_not_accept":false,"gated_on":[],"reachable_via":"openalex","rationale":"CrossRef has no abstract; OpenAlex provides a DOI-bound abstract with 'Measurements were carried out at 30°C ... in the range'. This row guards OpenAlex reachability plus physical range/field condition handling."}
{"id":"B2-diez-200g","doi":"10.3390/polym9020059","claim":"The sulfur networks were synthesized on a 200 g scale.","category":"numeric_supported","expected_verdict":"ACCEPT","must_accept":true,"must_not_accept":false,"gated_on":[],"reachable_via":"crossref","rationale":"Abstract: 'we synthesized a 200 g scale of amorphous, ... hyperbranched polymeric sulfur networks'. This row guards subject binding across descriptive comma clauses."}
{"id":"E2-pelrine-117","doi":"10.1126/science.287.5454.836","claim":"Actuated strains up to 117% were demonstrated with silicone elastomers.","category":"numeric_supported","expected_verdict":"ACCEPT","must_accept":true,"must_not_accept":false,"gated_on":[],"reachable_via":"crossref","rationale":"Abstract verbatim: 'Actuated strains up to 117% were demonstrated with silicone elastomers'. This row guards claim-side 'up to' comparator handling."}
{"id":"B1-diez-fabricated","doi":"10.3390/polym9020059","claim":"This paper reports a dielectric breakdown strength of 1200 MV/m.","category":"fabricated_control","expected_verdict":"PARTIAL","must_accept":false,"must_not_accept":true,"gated_on":[],"reachable_via":"crossref","rationale":"Fabricated number absent from the abstract. Negative control — must never ACCEPT."}
{"id":"C2-diaham-relational","doi":"10.1063/5.0108674","claim":"The AC conductivity follows sigma_ac proportional to omega^s with the exponent s approaching 1.","category":"relational_out_of_scope","expected_verdict":"PARTIAL","must_accept":false,"must_not_accept":true,"gated_on":[],"reachable_via":"crossref","rationale":"Relational claim, no value+unit; not explicitly stated in the abstract. Out of scope — must never ACCEPT."}
{"id":"C1-simmonstam-relational","doi":"10.1103/PhysRevB.7.3706","claim":"The isothermal current decays as i(t) proportional to 1/t for a uniform trap distribution.","category":"relational_out_of_scope","expected_verdict":"PARTIAL","must_accept":false,"must_not_accept":true,"gated_on":[],"reachable_via":"openalex","rationale":"Relational. OpenAlex may make the abstract reachable, but reachability must not turn an out-of-scope relational claim into ACCEPT."}
{"id":"D1-amiour-elsevier","doi":"10.1016/j.elstat.2021.103551","claim":"Kapton HN deep trap energies are in the range 0.79 to 1.05 eV.","category":"unreachable_ceiling","expected_verdict":"UNVERIFIABLE","must_accept":false,"must_not_accept":true,"gated_on":[],"reachable_via":"none","rationale":"Elsevier withholds the abstract from CrossRef and OpenAlex; the 0.79-1.05 eV deep-trap values live in full text, not any abstract. Verdict may be UNVERIFIABLE (no abstract) or PARTIAL (Semantic Scholar abstract reached but the values are not in it) depending on S2 availability; both satisfy the must_not_accept invariant. Abstract-only ceiling — must never ACCEPT."}
{"id":"D2-jonscher-relational","doi":"10.1038/267673a0","claim":"The dielectric response exponent n lies between 0 and 1.","category":"unreachable_ceiling","expected_verdict":"UNVERIFIABLE","must_accept":false,"must_not_accept":true,"gated_on":[],"reachable_via":"none","rationale":"No abstract in any source (old Nature) and the claim is relational. Must remain UNVERIFIABLE."}
{"id":"E1-fake-doi","doi":"10.9999/nonexistent.fake.0000","claim":"This material shows 95% energy conversion efficiency.","category":"dead_doi_control","expected_verdict":"UNVERIFIABLE","must_accept":false,"must_not_accept":true,"gated_on":[],"reachable_via":"none","rationale":"DOI does not resolve. Must fail/UNVERIFIABLE, never ACCEPT."}
{"id":"B3-diez-overaccept","doi":"10.3390/polym9020059","claim":"The polymeric sulfur networks provide a high thermal resistance of 220 °C.","category":"over_acceptance_regression","expected_verdict":"PARTIAL","must_accept":false,"must_not_accept":true,"gated_on":[],"reachable_via":"crossref","rationale":"Abstract evidence is '(>220 °C)'. An exact '220 °C' claim must NOT be entailed by '>220'; this row guards against exact-claim over-acceptance."}
64 changes: 64 additions & 0 deletions evals/cli_regression.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
# CLI regression corpus (ship-gate)

`cli_regression.jsonl` is a labeled, deterministic regression set for the
`check-file` engine. It complements `evals.json` (which evaluates skill-level LLM
behavior); this one pins **machine-checkable verdicts** so unit/source/matcher
changes can be regression-tested without an LLM in the loop.

Each row carries the claim **plus** ground-truth labels:

| field | meaning |
|---|---|
| `expected_verdict` | the verdict the engine *should* reach |
| `must_accept` | invariant: this row must end `ACCEPT` on every commit |
| `must_not_accept` | invariant: this row must **never** end `ACCEPT` |
| `gated_on` | open issues that currently block `expected_verdict` |
| `reachable_via` | where an abstract exists: `crossref` / `openalex` / `none` |
| `category` | `numeric_supported`, `fabricated_control`, `relational_out_of_scope`, `unreachable_ceiling`, `dead_doi_control`, `over_acceptance_regression` |

## Two invariant classes

**SAFETY (release blocker).** `must_accept` rows must stay `ACCEPT`; `must_not_accept`
rows must never become `ACCEPT`. This is the tool's core promise — no fabricated,
relational, unreachable, or over-accepting claim is waved through, and every
supported happy-path claim stays green. A break here fails the gate (non-zero exit).

**PROGRESS (informational).** Gated rows do not yet reach `expected_verdict`
because a fix has not landed. They are reported, not failed, and flip to PASS as
their `gated_on` issue is resolved. This is how the corpus tracks the roadmap.

## How to run

```bash
PYTHONPATH=src python3 evals/run_cli_regression.py
```

The same command is also wired into the manual GitHub Actions **Live Smoke**
workflow. It is intentionally not part of every pull-request CI run because it
depends on live public APIs.

Exit code is non-zero if a SAFETY invariant is violated, or if `check-file` produces
no usable JSON output (the runner aborts before classifying any row). (Live network:
CrossRef / OpenAlex / Semantic Scholar / PubMed. Semantic-Scholar free-tier 429 only
affects PROGRESS rows that depend on it, never SAFETY rows.)

## What the corpus encodes (snapshot, latest `main`)

```
SAFETY: 12/12 ok | PROGRESS pending: 0
```

- **Supported happy paths** — `A1` (`>220 °C` entails `>200 °C`), `A2`
(OpenAlex-reached `30 °C` conductivity measurements), `A3` (`1.7 eV`), `B2`
(`200 g` sulfur-network synthesis), and `E2` (`up to 117%` actuated strain)
ACCEPT and must stay green.
- **Never-accept controls (all PASS)** — `B1` fabricated number, `C1`/`C2` relational,
`D1`/`D2` unreachable (Elsevier / old Nature, abstract-only ceiling), `E1` dead DOI.
`B3` (over-acceptance) is now `PARTIAL` after #11 — kept `must_not_accept` so the bug
cannot silently regress.
- **No current gated false-negatives** — if future supported rows are added before
their matcher/source work lands, they should use `gated_on` and report as
PROGRESS rather than failing SAFETY.

The verdict labels for `A2`/`A3`/`B2` were grounded by fetching the live abstracts
(CrossRef + OpenAlex) and confirming the value appears verbatim; no label asserts
support that is not in a fetched abstract.
108 changes: 108 additions & 0 deletions evals/run_cli_regression.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
#!/usr/bin/env python3
"""Deterministic CLI regression gate for ref-verify.

Runs the labeled corpus in ``cli_regression.jsonl`` through ``check-file`` and
classifies every row into one of:

- SAFETY pass/fail — invariants that must hold on every commit:
* ``must_accept`` rows must end ACCEPT (the supported happy path stays green)
* ``must_not_accept`` rows must NOT end ACCEPT (no fabricated/relational/
unreachable/over-accepting claim is ever waved through)
A SAFETY failure exits non-zero and should block release.

- PROGRESS — gated rows whose ``expected_verdict`` is not yet reached because a
named issue (``gated_on``) has not landed. These are reported, not failed; they
flip to PASS as their fixes land.

Stdlib only. Usage:
PYTHONPATH=src python3 evals/run_cli_regression.py
"""

from __future__ import annotations

import json
import subprocess
import sys
import tempfile
from pathlib import Path

CORPUS = Path(__file__).with_name("cli_regression.jsonl")


def _load_corpus() -> list[dict]:
    """Read the JSONL corpus stored next to this script.

    Returns one parsed JSON value per non-blank line, in file order. Lines that
    are empty or contain only whitespace are skipped.
    """
    text = CORPUS.read_text(encoding="utf-8")
    candidates = (raw.strip() for raw in text.splitlines())
    return [json.loads(entry) for entry in candidates if entry]


def _run_cli(rows: list[dict]) -> dict[str, dict]:
    """Run ``check-file`` over *rows* and return its results keyed by row id.

    Only the ``id``, ``doi`` and ``claim`` fields of each row are written to a
    temporary JSONL file, so the ground-truth labels in the corpus are never
    passed to the CLI. The temporary file is removed whether or not the run
    succeeds.

    Args:
        rows: corpus rows; each must provide ``id``, ``doi`` and ``claim``.

    Returns:
        Mapping from result ``id`` to the result object found in the
        ``results`` list of the CLI's JSON output.

    Raises:
        KeyError: if a row lacks ``id``, ``doi`` or ``claim``.
        SystemExit: if ``check-file`` prints nothing on stdout, or prints
            something that is not JSON carrying a ``results`` list of objects
            with an ``id``.
    """
    claims_path = None
    try:
        # delete=False: the file must outlive the ``with`` block so the child
        # process can open it by path; it is unlinked explicitly below.
        with tempfile.NamedTemporaryFile(
            "w", suffix=".jsonl", delete=False, encoding="utf-8"
        ) as handle:
            # Record the path before writing so the file is still cleaned up
            # if serialising a row raises.
            claims_path = Path(handle.name)
            for row in rows:
                claim = {"id": row["id"], "doi": row["doi"], "claim": row["claim"]}
                handle.write(json.dumps(claim) + "\n")
        proc = subprocess.run(
            [sys.executable, "-m", "ref_verify.cli", "check-file", str(claims_path), "--json"],
            capture_output=True,
            text=True,
        )
    finally:
        if claims_path is not None:
            claims_path.unlink(missing_ok=True)

    if not proc.stdout.strip():
        raise SystemExit(f"check-file produced no JSON. stderr:\n{proc.stderr}")
    try:
        payload = json.loads(proc.stdout)
        return {r["id"]: r for r in payload["results"]}
    except (ValueError, KeyError, TypeError) as err:
        # Surface stderr instead of a bare traceback when stdout is not the
        # expected JSON shape (json.JSONDecodeError is a ValueError).
        raise SystemExit(
            f"check-file produced malformed JSON ({err!r}). stderr:\n{proc.stderr}"
        ) from err


def main() -> int:
    """Classify every corpus row, print a report table, and return the exit code.

    Each row is looked up in the ``check-file`` results by id (a row with no
    result is shown with verdict ``MISSING``) and classified as:

    - ``SAFETY-FAIL`` when a ``must_accept`` row is not ACCEPT, or a
      ``must_not_accept`` row is ACCEPT;
    - ``PENDING`` when a row with no ``must_not_accept`` flag matches its
      ``expected_verdict`` through neither ``verdict`` nor ``status``;
    - ``PASS`` otherwise.

    Returns:
        1 if any row is a SAFETY failure, otherwise 0 (pending rows are only
        reported).
    """
    corpus = _load_corpus()
    by_id = _run_cli(corpus)

    rule = "-" * 92
    blockers: list[str] = []
    pending: list[str] = []

    print(f"{'id':26}{'verdict':20}{'expected':14}{'class':13}note")
    print(rule)
    for entry in corpus:
        row_id = entry["id"]
        expected = entry["expected_verdict"]
        outcome = by_id.get(row_id, {})
        verdict = outcome.get("verdict", "MISSING")
        status = outcome.get("status", "")
        is_accept = verdict == "ACCEPT"
        forbids_accept = bool(entry.get("must_not_accept"))

        # Decide first whether either SAFETY invariant is broken.
        if entry.get("must_accept") and not is_accept:
            violation = "must ACCEPT but did not"
        elif forbids_accept and is_accept:
            violation = "must NOT ACCEPT but did"
        else:
            violation = ""

        label, remark = "PASS", ""
        if violation:
            label, remark = "SAFETY-FAIL", violation
            blockers.append(row_id)
        elif forbids_accept:
            # Control rows only promise "never ACCEPT"; which non-ACCEPT verdict
            # they land on (UNVERIFIABLE vs PARTIAL) can vary with source
            # availability, so it is deliberately not pinned.
            pass
        elif not (verdict == expected or status == expected):
            issues = ",".join(entry.get("gated_on") or []) or "?"
            label, remark = "PENDING", f"want {expected} after {issues}"
            pending.append(row_id)

        # WARN verdicts are displayed together with their status.
        if verdict != "WARN":
            shown = verdict
        else:
            shown = f"{verdict}/{status}"
        print(f"{row_id:26}{shown:20}{expected:14}{label:13}{remark}")

    print(rule)
    total = len(corpus)
    print(
        f"SAFETY: {total - len(blockers)}/{total} ok"
        f" | PROGRESS pending: {len(pending)}"
    )
    if blockers:
        print("SAFETY FAILURES (release blockers):", ", ".join(blockers))
        return 1
    if pending:
        print("Pending (informational, not a failure):", ", ".join(pending))
    return 0


if __name__ == "__main__":
    # Propagate main()'s return value as the process exit status.
    exit_code = main()
    raise SystemExit(exit_code)