Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions benchmarks/harness/test_gate_zero_evidence.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,10 @@
REAL_MACOS_G1 = ROOT / "benchmarks" / "results" / "gate-zero" / "macos-arm64" / "g1.json"
TIMESTAMP = "20260612T081702Z"
BENCHMARK_COMMIT = "c68389c28535bbab74a1efbe5bd923c8ff4ec341"
# Skip decorator for tests that depend on the generated macOS arm64 G1 result:
# a decorated test runs only when REAL_MACOS_G1 exists as a regular file,
# otherwise unittest reports it as skipped with the reason below.
requires_real_macos_g1 = unittest.skipUnless(
REAL_MACOS_G1.is_file(),
"requires generated macos-arm64 Gate Zero g1.json",
)
REPRODUCTION_COMMAND = (
"python3 benchmarks/harness/run_gate_zero.py --mode ethos --repo-root . "
"--manifest benchmarks/gate-zero/manifest.json "
Expand Down Expand Up @@ -46,6 +50,7 @@ def build_real_bundle(out_root: Path) -> Path:


class GateZeroEvidenceBundleTests(unittest.TestCase):
@requires_real_macos_g1
def test_evidence_bundle_from_existing_macos_g1_json(self) -> None:
with tempfile.TemporaryDirectory() as tmp:
bundle_dir = build_real_bundle(Path(tmp))
Expand Down Expand Up @@ -79,6 +84,7 @@ def test_evidence_bundle_from_existing_macos_g1_json(self) -> None:
self.assertEqual(digest["payload_sha256"], run_gate_zero.sha256_file(checksums_path))
self.assertIn("not a public-key signature", digest["note"])

@requires_real_macos_g1
def test_reproduction_env_records_resolved_paths_and_hashes(self) -> None:
with tempfile.TemporaryDirectory() as tmp:
root = Path(tmp)
Expand Down Expand Up @@ -114,6 +120,7 @@ def test_reproduction_env_records_resolved_paths_and_hashes(self) -> None:
self.assertIsNotNone(install_entry["tree_sha256"])
self.assertIn("ETHOS_LITEPARSE_BIN is not set", reproduction_env["blockers"])

@requires_real_macos_g1
def test_host_attestation_matches_result(self) -> None:
with tempfile.TemporaryDirectory() as tmp:
bundle_dir = build_real_bundle(Path(tmp))
Expand All @@ -134,6 +141,7 @@ def test_host_attestation_matches_result(self) -> None:
)
self.assertEqual(attestation["result_host"]["selected"]["id"], "mac-m4pro-arm64")

@requires_real_macos_g1
def test_human_summary_preserves_edgeparse_failure_truth(self) -> None:
with tempfile.TemporaryDirectory() as tmp:
bundle_dir = build_real_bundle(Path(tmp))
Expand All @@ -151,6 +159,7 @@ def test_human_summary_preserves_edgeparse_failure_truth(self) -> None:
self.assertIn("Ethos passed all top-level G1 determinism checks", summary)
self.assertIn("does not claim Ethos is the fastest parser overall", summary)

@requires_real_macos_g1
def test_checksum_verifier_detects_tampered_payload(self) -> None:
with tempfile.TemporaryDirectory() as tmp:
bundle_dir = build_real_bundle(Path(tmp))
Expand Down
2 changes: 2 additions & 0 deletions docs/benchmark-ownership.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ This repository owns implementation-adjacent benchmark inputs and historical evi
Generated Gate Zero benchmark results are not checked into `ethos`; they belong in
`ethos-bench`. Public wording must continue to point readers at `docs/execution-status.md` for
the current pre-alpha status and blockers.
The controlled-run handoff procedure is documented in `docs/gate-zero-evidence-runbook.md`.

Before any public repository push, run the public-release checklist in
`docs/public-release-checklist.md`. Historical generated evidence may contain local reproduction
Expand Down Expand Up @@ -62,3 +63,4 @@ cross-platform rendered-crop byte-identity claims
4. Record cross-host or claim-affecting evidence in `ethos/docs/validation/` or in signed
`ethos-bench/benchmarks/results/gate-zero/` files, depending on whether the evidence is a
product-boundary validation or a benchmark result.
5. Fill in ADR-0005 only after the required G1/G2/G3 result files and evidence bundles exist.
12 changes: 6 additions & 6 deletions docs/execution-status.md
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
# Ethos Execution Status

Date: 2026-06-15
Date: 2026-06-16
Owner: product / decider
Status: Pre-alpha / Milestone A implementation. Week 0 governance is accepted, WS-ENGINE Phase 1 has a real narrow PDFium path, WS-VERIFY-ALPHA has real deterministic evidence checks over native Ethos JSON and pinned OpenDataLoader output, WS-HARNESS has fail-closed readiness scaffolding, the Gate Zero corpus/hardware manifest and direct competitor lock are frozen/signed, ADR-0006 closes package identifier/trademark validation, ADR-0007 locks the product direction, and the public-source preflight is green for a source-only pre-alpha GitHub push. Signed host result generation still blocks Gate Zero, public benchmark reports, releases, packages, and all performance/quality claims.
Status: Pre-alpha / Milestone A implementation. Week 0 governance is accepted, WS-ENGINE Phase 1 has a real narrow PDFium path, WS-VERIFY-ALPHA has real deterministic evidence checks over native Ethos JSON and pinned OpenDataLoader output, WS-HARNESS has fail-closed readiness scaffolding, the Gate Zero corpus/hardware manifest and direct competitor lock are frozen/signed, ADR-0006 closes package identifier/trademark validation, ADR-0007 locks the product direction, and the public-source preflight is green for a source-only pre-alpha GitHub push. Signed host result generation still blocks Gate Zero, public benchmark reports, releases, packages, and all performance/quality claims. The next controlled-run handoff is `docs/gate-zero-evidence-runbook.md`.

## Current Reality

Expand All @@ -16,9 +16,9 @@ The committed implementation now includes:
- `ethos doc parse` / `ethos fingerprint` PDF execution through a worker process with `max_parse_ms` timeout enforcement, stable error-envelope relay, diagnostics-gated worker stderr, and page-range validation/filtering.
- Quantized page/span extraction at the backend boundary, plus a basic deterministic layout pass that assembles paragraph `text_block` elements and simple column reading order for the current born-digital fixtures.
- Schema/example/profile validation is green through `schemas/validate_examples.py` using `jsonschema` draft 2020-12 validation, including the crop descriptor artifact contract plus referential-integrity and bbox sanity checks outside JSON Schema.
- `ethos verify` now produces non-empty quote, value, presence, and table-cell verification checks over native Ethos document JSON and synthetic OpenDataLoader-style JSON through `--grounding opendataloader-json`; it also verifies quote/value/presence citations over pinned real OpenDataLoader 2.4.7 JSON, including grounded and ungrounded cases. Citation/config inputs are rejected when they drift outside the closed schemas.
- `ethos verify` now produces non-empty quote, value, presence, and table-cell verification checks over native Ethos document JSON and synthetic OpenDataLoader-style JSON through `--grounding opendataloader-json`; it also verifies quote/value/presence citations over pinned real OpenDataLoader 2.4.7 JSON, including grounded and ungrounded cases. Citation/config inputs are rejected when they drift outside the closed schemas. The public demo harness covers grounded, ungrounded, not-found, stale-fingerprint, capability-limited, malformed-citation, and summary-format reject paths.
- Verification semantics are now trust-honest at alpha scope: quote containment is explicitly labeled, value/table-cell checks require normalized equality, fingerprint-pinned citations fail closed when source fingerprints are unavailable, and structured capability limits explain why a run is downgraded.
- `make verify-alpha` is now the product-proof command for the alpha trust loop: it checks native examples, synthetic OpenDataLoader-style examples, pinned real OpenDataLoader grounded/ungrounded examples, schema validation, byte-identical repeated verification reports, byte-identical native crop descriptors, and foreign fixture manifest hash binding.
- `make verify-alpha` is the current alpha trust-loop command: it checks native examples, synthetic OpenDataLoader-style examples, pinned real OpenDataLoader grounded/ungrounded examples, schema validation, byte-identical repeated verification reports, byte-identical native crop descriptors, summary diagnostics for an ungrounded native case, and foreign fixture manifest hash binding.
- Native Ethos verification can emit deterministic, schema-backed crop descriptor JSON artifacts through `--crop-dir`; these bind `document_fingerprint`, page, bbox, and check ids. Native `crop_ref` filenames are logical evidence references derived from document fingerprint, check id, and page, while descriptors still record the exact observed bbox. When `--crop-source-pdf` is supplied, the CLI validates source-PDF fingerprint binding and emits PNG crop artifacts whose filenames, byte hashes, dimensions, and source fingerprint are bound from the descriptor. `make verify-rendered-crops` checks same-host repeated-run stability for the rendered artifact path, and `make compare-rendered-crops` classifies two rendered-crop runs by separating logical evidence identity from rendered artifact byte equality. Cross-platform rendered image determinism is not claimed; the 2026-06-14 macOS arm64 vs Linux x64 validation record in `docs/validation/rendered-crops-2026-06-14.md` preserved document fingerprint and `payload_sha256` but failed rendered artifact byte equality because the evidence bbox differed slightly across platforms.

Still absent or not claimable: reproducible benchmark result JSON, executed competitor comparisons, public speed/quality/footprint claims, OCR/image-only support, real table extraction, mature list/heading/layout semantics, semantic/arithmetic verification beyond deterministic evidence lookup, Phase 2 project-maintained PDFium builds, release packaging, and full frozen-corpus multi-platform determinism evidence.
Expand All @@ -29,8 +29,8 @@ PM execution packet: `benchmarks/gate-zero/FREEZE_PACKET.md`.

| ID | Blocker | Required output | Owner | Blocks |
| --- | --- | --- | --- | --- |
| H1 | Generate signed Gate Zero host results | `benchmarks/results/gate-zero/{macos-arm64,linux-x64}/g1.json` plus G2/G3 result files are produced from the frozen manifest and pinned lock | Benchmark owner / decider | Valid Gate Zero run, public benchmark trust |
| H2 | Execute pinned competitor comparisons | Harness executes the pinned OpenDataLoader, EdgeParse, LiteParse, and PyMuPDF4LLM artifacts and records signed comparison rows where applicable | Benchmark owner | Public competitor comparison |
| H1 | Generate signed Gate Zero host results | `../ethos-bench/benchmarks/results/gate-zero/{macos-arm64,linux-x64}/g1.json` plus G2/G3 result files are produced from the frozen manifest and pinned lock | Benchmark owner / decider | Valid Gate Zero run, public benchmark trust |
| H2 | Execute pinned competitor comparisons | Harness executes the pinned OpenDataLoader, EdgeParse, LiteParse, and PyMuPDF4LLM artifacts and records signed comparison rows where applicable in `ethos-bench` | Benchmark owner | Public competitor comparison |
| H3 | Accept package identifier ADR | Closed by ADR-0006 acceptance on 2026-06-15 | Devrel / decider | Unblocked package identifier/trademark gate; broader public-release checklist still applies |

The corpus/hardware freeze and direct competitor pins are recorded in `benchmarks/gate-zero/manifest.json` and `benchmarks/competitors.lock.json`. The remaining blockers are result production and signed evidence, not manifest/pin placeholders.
Expand Down
146 changes: 146 additions & 0 deletions docs/gate-zero-evidence-runbook.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,146 @@
# Gate Zero Evidence Runbook

This runbook describes the controlled evidence path that must be completed before Ethos can move
beyond source-only pre-alpha language.

It does not approve benchmark publication, package publication, release artifacts, or launch
claims. Generated Gate Zero result files and evidence bundles belong in the sibling
`ethos-bench` repository, not in this repository.

## Current Boundary

Use this repository for:

- frozen corpus, manifest, gates, profiles, source code, and implementation-facing harness code;
- local preflight checks before controlled host runs;
- human-readable status and validation records.

Use `../ethos-bench` for:

- generated `g1.json`, `g2.json`, and `g3.json` result files;
- per-gate evidence bundles and checksum manifests;
- public-safe derived benchmark evidence.

Do not commit generated Gate Zero output under `ethos/benchmarks/results/gate-zero/`.

## Required Inputs

Before running host evidence:

- `benchmarks/gate-zero/manifest.json` is frozen and signed.
- `benchmarks/competitors.lock.json` pins the selected direct competitor artifacts.
- `benchmarks/gate-zero/gates.json` records the active G2/G3 definitions.
- The host is one of the recorded Gate Zero hosts in the manifest.
- The matching pinned PDFium runtime and artifact are available for the host.
- The sibling `ethos-bench` checkout is clean and ready to receive generated evidence.

## Preflight

Run from the Ethos checkout:

```bash
git switch main
git pull --ff-only
make verify-alpha PYTHON=/path/to/jsonschema-venv/bin/python
python3 .github/scripts/readiness_gate.py gate-zero
make -C benchmarks/harness smoke
make -C benchmarks/harness test
git status --short --branch
```

The `readiness_gate.py gate-zero` command only checks that frozen inputs and pins are present. It
does not produce benchmark results.

## Per-Host G1 Result

Set these paths for each controlled host:

```bash
export ETHOS_REPO=/path/to/ethos
export ETHOS_BENCH=/path/to/ethos-bench
export GATE_ZERO_PLATFORM=macos-arm64 # or linux-x64
export ETHOS_PDFIUM_LIBRARY_PATH=/path/to/libpdfium
export ETHOS_PDFIUM_VERSION=chromium/7881
export ETHOS_PDFIUM_ARTIFACT_PATH=/path/to/pdfium-artifact
```

Then generate the G1 result and write it into `ethos-bench`:

```bash
make -C "$ETHOS_REPO/benchmarks/harness" gate-zero-results \
GATE_ZERO_PLATFORM="$GATE_ZERO_PLATFORM" \
GATE_ZERO_RESULT_REPORT="$ETHOS_BENCH/benchmarks/results/gate-zero/$GATE_ZERO_PLATFORM/g1.json" \
OPENDATALOADER_COMMAND=/path/to/opendataloader-pdf \
OPENDATALOADER_ARTIFACT=/path/to/opendataloader_pdf-2.4.7-py3-none-any.whl \
OPENDATALOADER_INSTALL_PATH=/path/to/opendataloader-install
```

Add other competitor command/artifact/install variables only when those pinned adapters are ready
for the controlled run. Missing competitor evidence must remain explicit; do not backfill it from
ad hoc terminal output.

## Per-Host G2 Result

Generate the G2 result and write it into `ethos-bench`, with explicit footprint inputs:

```bash
make -C "$ETHOS_REPO/benchmarks/harness" gate-zero-g2 \
GATE_ZERO_PLATFORM="$GATE_ZERO_PLATFORM" \
GATE_ZERO_G2_RESULT_REPORT="$ETHOS_BENCH/benchmarks/results/gate-zero/$GATE_ZERO_PLATFORM/g2.json" \
'GATE_ZERO_ETHOS_FOOTPRINT=ethos-cli=target/release/ethos pdfium-library=/path/to/libpdfium' \
OPENDATALOADER_INSTALL_PATH=/path/to/opendataloader-install \
OPENDATALOADER_ARTIFACT=/path/to/opendataloader_pdf-2.4.7-py3-none-any.whl \
GATE_ZERO_PDFIUM_LIBRARY_PATH=/path/to/libpdfium \
GATE_ZERO_PDFIUM_ARTIFACT=/path/to/pdfium-artifact
```

G2 must cite the active gate definition hash through the runner output. Do not replace the full
base-parser artifact set with a narrower measurement.

## Cross-Host G3 Result

After both platform-scoped G1 files exist in `ethos-bench`, generate G3:

```bash
make -C "$ETHOS_REPO/benchmarks/harness" gate-zero-g3 \
GATE_ZERO_G3_RESULT_REPORT="$ETHOS_BENCH/benchmarks/results/gate-zero/g3.json" \
GATE_ZERO_G3_PLATFORM_RESULTS="macos-arm64=$ETHOS_BENCH/benchmarks/results/gate-zero/macos-arm64/g1.json linux-x64=$ETHOS_BENCH/benchmarks/results/gate-zero/linux-x64/g1.json"
```

G3 cannot pass with results from only one host; it needs both platform-scoped G1 files. Diagnostic
geometry experiments can explain failures, but they are not Gate Zero pass/fail results.

## Evidence Bundles

For each saved result file, build an evidence bundle in `ethos-bench`:

```bash
make -C "$ETHOS_REPO/benchmarks/harness" gate-zero-evidence \
GATE_ZERO_PLATFORM="$GATE_ZERO_PLATFORM" \
GATE_ZERO_GATE=g1 \
GATE_ZERO_RESULT_REPORT="$ETHOS_BENCH/benchmarks/results/gate-zero/$GATE_ZERO_PLATFORM/g1.json" \
GATE_ZERO_EVIDENCE_OUT_ROOT="$ETHOS_BENCH/benchmarks/results/gate-zero" \
GATE_ZERO_REPRODUCTION_COMMAND_FILE=/path/to/reproduction-command.txt \
GATE_ZERO_REPRODUCTION_ENV_FILE=/path/to/reproduction-env.json \
GATE_ZERO_BENCHMARK_COMMIT="$(git -C "$ETHOS_REPO" rev-parse HEAD)"
```

Repeat for G2 and G3 with the matching gate/result paths. Reproduction command and environment
sidecars must describe the actual controlled run; placeholders keep the bundle incomplete.

## Decision Step

Fill in `docs/decisions/ADR-0005-gate-zero-decision.md` only after:

- required G1 files exist for both recorded hosts;
- required G2 files exist for both recorded hosts;
- G3 has compared the required hosts;
- evidence bundles exist for the source result files;
- the decider has reviewed the result JSON and reproduction sidecars.

Until that ADR is filled in, public language remains:

```text
Ethos is pre-alpha. It verifies whether AI citations are grounded in document evidence across
native Ethos JSON and supported foreign parser outputs.
```
Loading