diff --git a/benchmarks/harness/test_gate_zero_evidence.py b/benchmarks/harness/test_gate_zero_evidence.py index 746c8cc..7f32f9e 100644 --- a/benchmarks/harness/test_gate_zero_evidence.py +++ b/benchmarks/harness/test_gate_zero_evidence.py @@ -14,6 +14,10 @@ REAL_MACOS_G1 = ROOT / "benchmarks" / "results" / "gate-zero" / "macos-arm64" / "g1.json" TIMESTAMP = "20260612T081702Z" BENCHMARK_COMMIT = "c68389c28535bbab74a1efbe5bd923c8ff4ec341" +requires_real_macos_g1 = unittest.skipUnless( + REAL_MACOS_G1.is_file(), + "requires generated macos-arm64 Gate Zero g1.json", +) REPRODUCTION_COMMAND = ( "python3 benchmarks/harness/run_gate_zero.py --mode ethos --repo-root . " "--manifest benchmarks/gate-zero/manifest.json " @@ -46,6 +50,7 @@ def build_real_bundle(out_root: Path) -> Path: class GateZeroEvidenceBundleTests(unittest.TestCase): + @requires_real_macos_g1 def test_evidence_bundle_from_existing_macos_g1_json(self) -> None: with tempfile.TemporaryDirectory() as tmp: bundle_dir = build_real_bundle(Path(tmp)) @@ -79,6 +84,7 @@ def test_evidence_bundle_from_existing_macos_g1_json(self) -> None: self.assertEqual(digest["payload_sha256"], run_gate_zero.sha256_file(checksums_path)) self.assertIn("not a public-key signature", digest["note"]) + @requires_real_macos_g1 def test_reproduction_env_records_resolved_paths_and_hashes(self) -> None: with tempfile.TemporaryDirectory() as tmp: root = Path(tmp) @@ -114,6 +120,7 @@ def test_reproduction_env_records_resolved_paths_and_hashes(self) -> None: self.assertIsNotNone(install_entry["tree_sha256"]) self.assertIn("ETHOS_LITEPARSE_BIN is not set", reproduction_env["blockers"]) + @requires_real_macos_g1 def test_host_attestation_matches_result(self) -> None: with tempfile.TemporaryDirectory() as tmp: bundle_dir = build_real_bundle(Path(tmp)) @@ -134,6 +141,7 @@ def test_host_attestation_matches_result(self) -> None: ) self.assertEqual(attestation["result_host"]["selected"]["id"], "mac-m4pro-arm64") + @requires_real_macos_g1 def 
test_human_summary_preserves_edgeparse_failure_truth(self) -> None: with tempfile.TemporaryDirectory() as tmp: bundle_dir = build_real_bundle(Path(tmp)) @@ -151,6 +159,7 @@ def test_human_summary_preserves_edgeparse_failure_truth(self) -> None: self.assertIn("Ethos passed all top-level G1 determinism checks", summary) self.assertIn("does not claim Ethos is the fastest parser overall", summary) + @requires_real_macos_g1 def test_checksum_verifier_detects_tampered_payload(self) -> None: with tempfile.TemporaryDirectory() as tmp: bundle_dir = build_real_bundle(Path(tmp)) diff --git a/docs/benchmark-ownership.md b/docs/benchmark-ownership.md index 2abb9ef..fd0bbb6 100644 --- a/docs/benchmark-ownership.md +++ b/docs/benchmark-ownership.md @@ -14,6 +14,7 @@ This repository owns implementation-adjacent benchmark inputs and historical evi Generated Gate Zero benchmark results are not checked into `ethos`; they belong in `ethos-bench`. Public wording must continue to point readers at `docs/execution-status.md` for the current pre-alpha status and blockers. +The controlled-run handoff is `docs/gate-zero-evidence-runbook.md`. Before any public repository push, run the public-release checklist in `docs/public-release-checklist.md`. Historical generated evidence may contain local reproduction @@ -62,3 +63,4 @@ cross-platform rendered-crop byte-identity claims 4. Record cross-host or claim-affecting evidence in `ethos/docs/validation/` or in signed `ethos-bench/benchmarks/results/gate-zero/` files, depending on whether the evidence is a product-boundary validation or a benchmark result. +5. Fill ADR-0005 only after the required G1/G2/G3 result files and evidence bundles exist. diff --git a/docs/execution-status.md b/docs/execution-status.md index ad25047..d9a8924 100644 --- a/docs/execution-status.md +++ b/docs/execution-status.md @@ -1,8 +1,8 @@ # Ethos Execution Status -Date: 2026-06-15 +Date: 2026-06-16 Owner: product / decider -Status: Pre-alpha / Milestone A implementation. 
Week 0 governance is accepted, WS-ENGINE Phase 1 has a real narrow PDFium path, WS-VERIFY-ALPHA has real deterministic evidence checks over native Ethos JSON and pinned OpenDataLoader output, WS-HARNESS has fail-closed readiness scaffolding, the Gate Zero corpus/hardware manifest and direct competitor lock are frozen/signed, ADR-0006 closes package identifier/trademark validation, ADR-0007 locks the product direction, and the public-source preflight is green for a source-only pre-alpha GitHub push. Signed host result generation still blocks Gate Zero, public benchmark reports, releases, packages, and all performance/quality claims. +Status: Pre-alpha / Milestone A implementation. Week 0 governance is accepted, WS-ENGINE Phase 1 has a real narrow PDFium path, WS-VERIFY-ALPHA has real deterministic evidence checks over native Ethos JSON and pinned OpenDataLoader output, WS-HARNESS has fail-closed readiness scaffolding, the Gate Zero corpus/hardware manifest and direct competitor lock are frozen/signed, ADR-0006 closes package identifier/trademark validation, ADR-0007 locks the product direction, and the public-source preflight is green for a source-only pre-alpha GitHub push. Signed host result generation still blocks Gate Zero, public benchmark reports, releases, packages, and all performance/quality claims. The next controlled-run handoff is `docs/gate-zero-evidence-runbook.md`. ## Current Reality @@ -16,9 +16,9 @@ The committed implementation now includes: - `ethos doc parse` / `ethos fingerprint` PDF execution through a worker process with `max_parse_ms` timeout enforcement, stable error-envelope relay, diagnostics-gated worker stderr, and page-range validation/filtering. - Quantized page/span extraction at the backend boundary, plus a basic deterministic layout pass that assembles paragraph `text_block` elements and simple column reading order for the current born-digital fixtures. 
- Schema/example/profile validation is green through `schemas/validate_examples.py` using `jsonschema` draft 2020-12 validation, including the crop descriptor artifact contract plus referential-integrity and bbox sanity checks outside JSON Schema. -- `ethos verify` now produces non-empty quote, value, presence, and table-cell verification checks over native Ethos document JSON and synthetic OpenDataLoader-style JSON through `--grounding opendataloader-json`; it also verifies quote/value/presence citations over pinned real OpenDataLoader 2.4.7 JSON, including grounded and ungrounded cases. Citation/config inputs are rejected when they drift outside the closed schemas. +- `ethos verify` now produces non-empty quote, value, presence, and table-cell verification checks over native Ethos document JSON and synthetic OpenDataLoader-style JSON through `--grounding opendataloader-json`; it also verifies quote/value/presence citations over pinned real OpenDataLoader 2.4.7 JSON, including grounded and ungrounded cases. Citation/config inputs are rejected when they drift outside the closed schemas. The public demo harness covers grounded, ungrounded, not-found, stale-fingerprint, capability-limited, malformed-citation, and summary-format reject paths. - Verification semantics are now trust-honest at alpha scope: quote containment is explicitly labeled, value/table-cell checks require normalized equality, fingerprint-pinned citations fail closed when source fingerprints are unavailable, and structured capability limits explain why a run is downgraded. -- `make verify-alpha` is now the product-proof command for the alpha trust loop: it checks native examples, synthetic OpenDataLoader-style examples, pinned real OpenDataLoader grounded/ungrounded examples, schema validation, byte-identical repeated verification reports, byte-identical native crop descriptors, and foreign fixture manifest hash binding. 
+- `make verify-alpha` is the current alpha trust-loop command: it checks native examples, synthetic OpenDataLoader-style examples, pinned real OpenDataLoader grounded/ungrounded examples, schema validation, byte-identical repeated verification reports, byte-identical native crop descriptors, summary diagnostics for an ungrounded native case, and foreign fixture manifest hash binding. - Native Ethos verification can emit deterministic, schema-backed crop descriptor JSON artifacts through `--crop-dir`; these bind `document_fingerprint`, page, bbox, and check ids. Native `crop_ref` filenames are logical evidence references derived from document fingerprint, check id, and page, while descriptors still record the exact observed bbox. When `--crop-source-pdf` is supplied, the CLI validates source-PDF fingerprint binding and emits PNG crop artifacts whose filenames, byte hashes, dimensions, and source fingerprint are bound from the descriptor. `make verify-rendered-crops` checks same-host repeated-run stability for the rendered artifact path, and `make compare-rendered-crops` classifies two rendered-crop runs by separating logical evidence identity from rendered artifact byte equality. Cross-platform rendered image determinism is not claimed; the 2026-06-14 macOS arm64 vs Linux x64 validation record in `docs/validation/rendered-crops-2026-06-14.md` preserved document fingerprint and `payload_sha256` but failed rendered artifact byte equality because the evidence bbox differed slightly across platforms. Still absent or not claimable: reproducible benchmark result JSON, executed competitor comparisons, public speed/quality/footprint claims, OCR/image-only support, real table extraction, mature list/heading/layout semantics, semantic/arithmetic verification beyond deterministic evidence lookup, Phase 2 project-maintained PDFium builds, release packaging, and full frozen-corpus multi-platform determinism evidence. 
@@ -29,8 +29,8 @@ PM execution packet: `benchmarks/gate-zero/FREEZE_PACKET.md`. | ID | Blocker | Required output | Owner | Blocks | | --- | --- | --- | --- | --- | -| H1 | Generate signed Gate Zero host results | `benchmarks/results/gate-zero/{macos-arm64,linux-x64}/g1.json` plus G2/G3 result files are produced from the frozen manifest and pinned lock | Benchmark owner / decider | Valid Gate Zero run, public benchmark trust | -| H2 | Execute pinned competitor comparisons | Harness executes the pinned OpenDataLoader, EdgeParse, LiteParse, and PyMuPDF4LLM artifacts and records signed comparison rows where applicable | Benchmark owner | Public competitor comparison | +| H1 | Generate signed Gate Zero host results | `../ethos-bench/benchmarks/results/gate-zero/{macos-arm64,linux-x64}/g1.json` plus G2/G3 result files are produced from the frozen manifest and pinned lock | Benchmark owner / decider | Valid Gate Zero run, public benchmark trust | +| H2 | Execute pinned competitor comparisons | Harness executes the pinned OpenDataLoader, EdgeParse, LiteParse, and PyMuPDF4LLM artifacts and records signed comparison rows where applicable in `ethos-bench` | Benchmark owner | Public competitor comparison | | H3 | Accept package identifier ADR | Closed by ADR-0006 acceptance on 2026-06-15 | Devrel / decider | Unblocked package identifier/trademark gate; broader public-release checklist still applies | The corpus/hardware freeze and direct competitor pins are recorded in `benchmarks/gate-zero/manifest.json` and `benchmarks/competitors.lock.json`. The remaining blockers are result production and signed evidence, not manifest/pin placeholders. 
diff --git a/docs/gate-zero-evidence-runbook.md b/docs/gate-zero-evidence-runbook.md new file mode 100644 index 0000000..1ea6742 --- /dev/null +++ b/docs/gate-zero-evidence-runbook.md @@ -0,0 +1,146 @@ +# Gate Zero Evidence Runbook + +This runbook starts the controlled evidence path required before Ethos can move beyond +source-only pre-alpha language. + +It does not approve benchmark publication, package publication, release artifacts, or launch +claims. Generated Gate Zero result files and evidence bundles belong in the sibling +`ethos-bench` repository, not in this repository. + +## Current Boundary + +Use this repository for: + +- frozen corpus, manifest, gates, profiles, source code, and implementation-facing harness code; +- local preflight checks before controlled host runs; +- human-readable status and validation records. + +Use `../ethos-bench` for: + +- generated `g1.json`, `g2.json`, and `g3.json` result files; +- per-gate evidence bundles and checksum manifests; +- public-safe derived benchmark evidence. + +Do not commit generated Gate Zero output under `ethos/benchmarks/results/gate-zero/`. + +## Required Inputs + +Before running host evidence: + +- `benchmarks/gate-zero/manifest.json` is frozen and signed. +- `benchmarks/competitors.lock.json` pins the selected direct competitor artifacts. +- `benchmarks/gate-zero/gates.json` records the active G2/G3 definitions. +- The host is one of the recorded Gate Zero hosts in the manifest. +- The matching pinned PDFium runtime and artifact are available for the host. +- The sibling `ethos-bench` checkout is clean and ready to receive generated evidence. 
+ +## Preflight + +Run from the Ethos checkout: + +```bash +git switch main +git pull --ff-only +make verify-alpha PYTHON=/path/to/ethos-jsonschema-venv/bin/python +python3 .github/scripts/readiness_gate.py gate-zero +make -C benchmarks/harness smoke +make -C benchmarks/harness test +git status --short --branch +``` + +The `readiness_gate.py gate-zero` command only checks that frozen inputs and pins are present. It +does not produce benchmark results. + +## Per-Host G1 Result + +Set these paths for each controlled host: + +```bash +export ETHOS_REPO=/path/to/ethos +export ETHOS_BENCH=/path/to/ethos-bench +export GATE_ZERO_PLATFORM=macos-arm64 # or linux-x64 +export ETHOS_PDFIUM_LIBRARY_PATH=/path/to/libpdfium +export ETHOS_PDFIUM_VERSION=chromium/7881 +export ETHOS_PDFIUM_ARTIFACT_PATH=/path/to/pdfium-artifact +``` + +Then run the G1 result into `ethos-bench`: + +```bash +make -C "$ETHOS_REPO/benchmarks/harness" gate-zero-results \ + GATE_ZERO_PLATFORM="$GATE_ZERO_PLATFORM" \ + GATE_ZERO_RESULT_REPORT="$ETHOS_BENCH/benchmarks/results/gate-zero/$GATE_ZERO_PLATFORM/g1.json" \ + OPENDATALOADER_COMMAND=/path/to/opendataloader-pdf \ + OPENDATALOADER_ARTIFACT=/path/to/opendataloader_pdf-2.4.7-py3-none-any.whl \ + OPENDATALOADER_INSTALL_PATH=/path/to/opendataloader-install +``` + +Add other competitor command/artifact/install variables only when those pinned adapters are ready +for the controlled run. Missing competitor evidence must remain explicit; do not backfill it from +ad hoc terminal output. 
+ +## Per-Host G2 Result + +Run G2 into `ethos-bench` with explicit footprint inputs: + +```bash +make -C "$ETHOS_REPO/benchmarks/harness" gate-zero-g2 \ + GATE_ZERO_PLATFORM="$GATE_ZERO_PLATFORM" \ + GATE_ZERO_G2_RESULT_REPORT="$ETHOS_BENCH/benchmarks/results/gate-zero/$GATE_ZERO_PLATFORM/g2.json" \ + 'GATE_ZERO_ETHOS_FOOTPRINT=ethos-cli=target/release/ethos pdfium-library=/path/to/libpdfium' \ + OPENDATALOADER_INSTALL_PATH=/path/to/opendataloader-install \ + OPENDATALOADER_ARTIFACT=/path/to/opendataloader_pdf-2.4.7-py3-none-any.whl \ + GATE_ZERO_PDFIUM_LIBRARY_PATH=/path/to/libpdfium \ + GATE_ZERO_PDFIUM_ARTIFACT=/path/to/pdfium-artifact +``` + +G2 must cite the active gate definition hash through the runner output. Do not replace the full +base-parser artifact set with a narrower measurement. + +## Cross-Host G3 Result + +After both platform-scoped G1 files exist in `ethos-bench`, generate G3: + +```bash +make -C "$ETHOS_REPO/benchmarks/harness" gate-zero-g3 \ + GATE_ZERO_G3_RESULT_REPORT="$ETHOS_BENCH/benchmarks/results/gate-zero/g3.json" \ + GATE_ZERO_G3_PLATFORM_RESULTS="macos-arm64=$ETHOS_BENCH/benchmarks/results/gate-zero/macos-arm64/g1.json linux-x64=$ETHOS_BENCH/benchmarks/results/gate-zero/linux-x64/g1.json" +``` + +G3 cannot pass from one host. Diagnostic geometry experiments can explain failures, but they are +not Gate Zero pass/fail results. 
+ +## Evidence Bundles + +For each saved result file, build an evidence bundle in `ethos-bench`: + +```bash +make -C "$ETHOS_REPO/benchmarks/harness" gate-zero-evidence \ + GATE_ZERO_PLATFORM="$GATE_ZERO_PLATFORM" \ + GATE_ZERO_GATE=g1 \ + GATE_ZERO_RESULT_REPORT="$ETHOS_BENCH/benchmarks/results/gate-zero/$GATE_ZERO_PLATFORM/g1.json" \ + GATE_ZERO_EVIDENCE_OUT_ROOT="$ETHOS_BENCH/benchmarks/results/gate-zero" \ + GATE_ZERO_REPRODUCTION_COMMAND_FILE=/path/to/reproduction-command.txt \ + GATE_ZERO_REPRODUCTION_ENV_FILE=/path/to/reproduction-env.json \ + GATE_ZERO_BENCHMARK_COMMIT="$(git -C "$ETHOS_REPO" rev-parse HEAD)" +``` + +Repeat for G2 and G3 with the matching gate/result paths. Reproduction command and environment +sidecars must describe the actual controlled run; placeholders keep the bundle incomplete. + +## Decision Step + +Fill `docs/decisions/ADR-0005-gate-zero-decision.md` only after: + +- required G1 files exist for both recorded hosts; +- required G2 files exist for both recorded hosts; +- G3 has compared the required hosts; +- evidence bundles exist for the source result files; +- the decider has reviewed the result JSON and reproduction sidecars. + +Until that ADR is filled, public language remains: + +```text +Ethos is pre-alpha. It verifies whether AI citations are grounded in document evidence across +native Ethos JSON and supported foreign parser outputs. +```