Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/scripts/test_ci_workflow.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ def test_ci_workflow_guard_is_run_by_ci(self) -> None:
self.assertIn("python3 .github/scripts/test_execution_status.py", text)
self.assertIn("python3 .github/scripts/test_roadmap_status.py", text)
self.assertIn("python3 .github/scripts/test_milestone_b_closeout_record.py", text)
self.assertIn("python3 .github/scripts/test_milestone_c_closeout_record.py", text)
self.assertIn("python3 .github/scripts/test_milestone_b_exit_checklist.py", text)


Expand Down
11 changes: 9 additions & 2 deletions .github/scripts/test_execution_status.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,19 +33,26 @@ class ExecutionStatusTests(unittest.TestCase):
def test_status_is_scoped_to_internal_closeout(self) -> None:
text = status_text()

self.assertIn("Status: Pre-alpha / internal Milestone B closeout.", text)
self.assertIn(
"Status: Pre-alpha / internal Milestone C artifact-validation closeout.",
text,
)
self.assertNotIn("Status: Pre-alpha / Milestone B entry.", text)

def test_internal_check_command_is_documented(self) -> None:
text = status_text()

self.assertIn("make milestone-b-internal-checks", text)
self.assertIn("make milestone-c-internal-checks", text)
self.assertIn("CI has a static guard for that target's command wiring", text)

def test_public_posture_boundary_remains_explicit(self) -> None:
text = status_text()

self.assertIn('Public language stays at "pre-alpha / Milestone B internal continuation"', text)
self.assertIn(
'Public language stays at "pre-alpha / internal Milestone C artifact-validation closeout"',
text,
)
self.assertIn("claim audit approves specific wording", text)
self.assertIn("product-differentiating path remains verification and grounding first", text)

Expand Down
88 changes: 88 additions & 0 deletions .github/scripts/test_milestone_c_closeout_record.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
#!/usr/bin/env python3
#
# Copyright 2026 The Ethos maintainers
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from __future__ import annotations

import re
import unittest
from pathlib import Path


# Repository root: this script lives in .github/scripts/, so the root is two
# directory levels above the directory that contains the script.
ROOT = Path(__file__).resolve().parents[2]
# Dated Milestone C closeout validation record whose wording the tests pin.
RECORD = ROOT / "docs/validation/milestone-c-closeout-validation-2026-06-18.md"
# Validation index that is expected to mention the record by filename.
VALIDATION_README = ROOT / "docs/validation/README.md"


def record_text() -> str:
    """Return the closeout validation record decoded as UTF-8 text."""
    with RECORD.open(encoding="utf-8") as handle:
        return handle.read()


def normalized_record_text() -> str:
    """Return the record text with every whitespace run collapsed to one space.

    Collapsing whitespace lets the tests match phrases that the Markdown
    source wraps across several lines.
    """
    whitespace_run = re.compile(r"\s+")
    return whitespace_run.sub(" ", record_text())


class MilestoneCCloseoutRecordTests(unittest.TestCase):
    """Pin the wording of the dated Milestone C closeout validation record."""

    # Test methods carry `#` comments rather than docstrings: unittest prints
    # the first docstring line in verbose output, which would alter reports.

    def test_record_is_indexed(self) -> None:
        # The validation README has to reference the record by filename.
        readme = VALIDATION_README.read_text(encoding="utf-8")

        self.assertIn("milestone-c-closeout-validation-2026-06-18.md", readme)

    def test_record_names_internal_validation_command(self) -> None:
        # The raw record must name the validated HEAD, the composed make
        # target, the two artifact gates, and both guard scripts.
        body = record_text()
        expected_fragments = (
            "Validated `main` HEAD: `4e3adbb`",
            "make milestone-c-internal-checks PYTHON=<jsonschema-venv>/bin/python",
            "make rag-chunk-alpha",
            "make security-report-alpha",
            ".github/scripts/test_milestone_c_closeout_record.py",
            ".github/scripts/test_milestone_c_internal_checks.py",
        )

        for fragment in expected_fragments:
            self.assertIn(fragment, body)

    def test_record_keeps_public_boundaries_explicit(self) -> None:
        # Checked against whitespace-normalized text so line wrapping in the
        # record cannot hide a boundary statement.
        body = normalized_record_text()
        boundary_phrases = (
            "does not approve public benchmark reports",
            "release artifacts",
            "package publication",
            "production positioning",
            "Performance, quality, footprint, table-quality, and parser-quality claims remain blocked",
        )

        for phrase in boundary_phrases:
            self.assertIn(phrase, body)

    def test_record_scopes_remaining_work(self) -> None:
        # Remaining-work statements are also matched on normalized text.
        body = normalized_record_text()
        remaining_work_phrases = (
            "Debug overlay work remains future work outside this closeout record",
            "Broader table semantics and parser-quality evaluation remain future work outside this closeout record",
            "Cross-platform rendered artifact byte equality remains unclaimed",
        )

        for phrase in remaining_work_phrases:
            self.assertIn(phrase, body)

    def test_record_avoids_local_private_paths(self) -> None:
        # The record must not contain any of these local path prefixes.
        body = record_text()
        forbidden_prefixes = ("/Users/", "/private/tmp", "/private/var")

        for prefix in forbidden_prefixes:
            self.assertNotIn(prefix, body)


# Run the test suite when this file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()
1 change: 1 addition & 0 deletions .github/scripts/test_milestone_c_internal_checks.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ def test_target_composes_current_artifact_gates(self) -> None:
required = [
"$(MAKE) rag-chunk-alpha PYTHON=$(PYTHON)",
"$(MAKE) security-report-alpha PYTHON=$(PYTHON)",
"$(PYTHON) .github/scripts/test_milestone_c_closeout_record.py",
"$(PYTHON) .github/scripts/test_milestone_c_internal_checks.py",
"git diff --check",
]
Expand Down
3 changes: 2 additions & 1 deletion .github/scripts/test_roadmap_status.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,8 @@ def test_roadmap_points_to_execution_status_for_current_posture(self) -> None:
text = roadmap_text()

self.assertIn("Current PM status and blockers: `docs/execution-status.md`.", text)
self.assertIn("Milestone B is in internal closeout", text)
self.assertIn("Milestone C has an internal source-tree artifact-validation closeout", text)
self.assertIn("milestone-c-closeout-validation-2026-06-18.md", text)

def test_closeout_note_keeps_public_boundaries_explicit(self) -> None:
text = normalized_roadmap_text()
Expand Down
2 changes: 2 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,8 @@ jobs:
run: python3 .github/scripts/test_roadmap_status.py
- name: Milestone B closeout validation record tests
run: python3 .github/scripts/test_milestone_b_closeout_record.py
- name: Milestone C closeout validation record tests
run: python3 .github/scripts/test_milestone_c_closeout_record.py
- name: Milestone B exit checklist tests
run: python3 .github/scripts/test_milestone_b_exit_checklist.py
- name: Gate Zero harness tests
Expand Down
1 change: 1 addition & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,7 @@ milestone-b-internal-checks:
milestone-c-internal-checks:
$(MAKE) rag-chunk-alpha PYTHON=$(PYTHON)
$(MAKE) security-report-alpha PYTHON=$(PYTHON)
$(PYTHON) .github/scripts/test_milestone_c_closeout_record.py
$(PYTHON) .github/scripts/test_milestone_c_internal_checks.py
git diff --check

Expand Down
13 changes: 9 additions & 4 deletions docs/execution-status.md
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
# Ethos Execution Status

Date: 2026-06-17
Date: 2026-06-18
Owner: product / decider
Status: Pre-alpha / internal Milestone B closeout. Week 0 governance is accepted, WS-ENGINE Phase 1 has a real narrow PDFium path, WS-VERIFY-ALPHA has real deterministic evidence checks over native Ethos JSON and pinned OpenDataLoader output, WS-HARNESS has fail-closed readiness scaffolding, the Gate Zero corpus/hardware manifest and direct competitor lock are frozen/signed, ADR-0005 records an accepted `PROCEED` decision for internal Milestone B continuation, ADR-0006 closes package identifier/trademark validation, ADR-0007 locks the product direction, and the public-source preflight is green for a source-only pre-alpha GitHub push. Public benchmark reports, releases, packages, production positioning, and all performance/quality/footprint claims remain blocked. The controlled-run handoff remains `docs/gate-zero-evidence-runbook.md`; the accepted decision record is `docs/decisions/ADR-0005-gate-zero-decision.md`.
Status: Pre-alpha / internal Milestone C artifact-validation closeout. Week 0 governance is accepted, WS-ENGINE Phase 1 has a real narrow PDFium path, WS-VERIFY-ALPHA has real deterministic evidence checks over native Ethos JSON and pinned OpenDataLoader output, WS-HARNESS has fail-closed readiness scaffolding, the Gate Zero corpus/hardware manifest and direct competitor lock are frozen/signed, ADR-0005 records an accepted `PROCEED` decision for internal Milestone B continuation, ADR-0006 closes package identifier/trademark validation, ADR-0007 locks the product direction, and the public-source preflight is green for a source-only pre-alpha GitHub push. Current Milestone C work has a source-tree internal artifact-validation closeout for the RAG chunk and security-report trust-loop checks. Public benchmark reports, releases, packages, production positioning, and all performance/quality/footprint claims remain blocked. The controlled-run handoff remains `docs/gate-zero-evidence-runbook.md`; the accepted decision record is `docs/decisions/ADR-0005-gate-zero-decision.md`.

## Current Reality

Expand All @@ -23,6 +23,9 @@ The committed implementation now includes:
- `make verify-alpha` is the current alpha trust-loop command: it checks native examples, split-quote evidence matching, unsupported non-v1 claim reporting, synthetic OpenDataLoader-style examples, pinned real OpenDataLoader grounded/ungrounded examples, schema validation, verify-alpha case inventory coverage, usage diagnostics for malformed citations and malformed OpenDataLoader-style structures, byte-identical repeated verification reports, byte-identical native crop descriptors, summary diagnostics for an ungrounded native case, and foreign fixture manifest hash binding. `make milestone-b-internal-checks` composes the current internal Milestone B validation path across fixture validation, font-policy profile checks, verify alpha, layout evaluator, Python surface tests, and policy gates; CI has a static guard for that target's command wiring.
- An internal Python surface scaffold exists under `python/ethos_pdf`. It shells out to a caller-provided local `ethos` CLI binary for `ethos doc parse` JSON, Markdown, and text output, and has stdlib unit tests that use a fake local command. This is pre-alpha scaffolding for Milestone B API shape work, not a public installation or publication path.
- Native Ethos verification can emit deterministic, schema-backed crop descriptor JSON artifacts through `--crop-dir`; these bind `document_fingerprint`, page, bbox, and check ids. Native `crop_ref` filenames are logical evidence references derived from document fingerprint, check id, and page, while descriptors still record the exact observed bbox. When `--crop-source-pdf` is supplied, the CLI validates source-PDF fingerprint binding and emits PNG crop artifacts whose filenames, byte hashes, dimensions, and source fingerprint are bound from the descriptor. `make verify-rendered-crops` checks same-host repeated-run stability for the rendered artifact path, and `make compare-rendered-crops` classifies two rendered-crop runs by separating logical evidence identity from rendered artifact byte equality. Cross-platform rendered image determinism is not claimed; the 2026-06-14 macOS arm64 vs Linux x64 validation record in `docs/validation/rendered-crops-2026-06-14.md` preserved document fingerprint and `payload_sha256` but failed rendered artifact byte equality because the evidence bbox differed slightly across platforms.
- `ethos rag chunk` has a committed-example artifact loop over `schemas/examples/document.example.json` and `schemas/examples/chunks.example.jsonl`. The current internal checks cover exact fixture/golden output, repeated-run byte identity, schema/example validation, stale page/element/bbox-page reference rejection, and default-chunk exclusion warning-reference rejection.
- `ethos security report` has a source-only pre-alpha artifact check over the committed document example. The current internal checks cover deterministic report output, report/source identity grounding, security-warning lane and message diagnostics, locator grounding, inventory/report parity, summary drift, warning id uniqueness, deterministic warning numbering, and explicit rejection of unsupported current source-warning references.
- `make milestone-c-internal-checks` composes the current internal Milestone C artifact-validation path across RAG chunk and security-report gates; CI/static guard scripts fail closed if that command wiring or the dated closeout record drifts.

Still absent or not claimable: public benchmark reports, public competitor-comparison claims, public speed/quality/footprint claims, OCR/image-only support, real table extraction, mature list/heading/layout semantics beyond current fixture-backed alpha paths, semantic/arithmetic verification beyond deterministic evidence lookup, Phase 2 project-maintained PDFium builds, release packaging, and claim-audit approval for any public result wording.

Expand All @@ -45,7 +48,7 @@ The corpus/hardware freeze and direct competitor pins are recorded in `benchmark

## Current Milestone Posture

Milestone A has an accepted internal Gate Zero decision for roadmap control, so Milestone B work may proceed internally. The product can demonstrate a narrow parser-backed grounding loop today, but the decision cannot be used as public benchmark credibility.
Milestone A has an accepted internal Gate Zero decision for roadmap control, Milestone B is internally closed for the current source-tree validation scope, and Milestone C now has an internal artifact-validation closeout record. The product can demonstrate a narrow parser-backed grounding loop today, but the decision cannot be used as public benchmark credibility.

| Work item | Current status | Remaining blocker |
| --- | --- | --- |
Expand All @@ -58,9 +61,11 @@ Milestone A has an accepted internal Gate Zero decision for roadmap control, so
| Font policy groundwork | Partially landed: substitution table and profile policy are present; substitution-table bytes are pinned by the deterministic profile and checked by schema/example validation; absent bundled fallback assets must remain represented by a null fallback-bundle hash; fixture output uses deterministic substitution IDs, committed embedded-font fixture metadata now binds expected extraction font identity, document schema/font extraction keep emitted font ids inside the deterministic ASCII `embedded:` / `subst:` contract, and CLI font-isolation PDFs are manifest/hash-bound | Bundled fallback asset introduction/hash pinning and broader font/CID validation remain open |
| Schema/example validation | Landed: schemas, examples, deterministic profile, referential integrity, and bbox sanity pass the `jsonschema` validation gate | Contract changes still require explicit versioning and compatibility review |
| Trust-layer implementation | Landed: `ethos verify` quote/value/presence/table-cell checks, explicit quote-containment labeling, normalized equality for value/table-cell checks, stale and unverifiable fingerprint handling, unsupported claim reporting, structured capability limits, native Ethos JSON path, ODL-style adapter path with synthetic table/cell mapping, explicit real ODL-style row/cell table grounding, conservative real-style text/child-container alias normalization, pinned real OpenDataLoader 2.4.7 grounded/ungrounded fixtures, foreign fixture manifest hash validation, crop-ref evidence plumbing, stable logical native crop refs, native crop descriptor artifacts, raw BGRA crop rendering in `ethos-pdf`, CLI PNG crop artifact production for bound native source PDFs, same-host rendered crop repeatability check, rendered-crop run comparison helper, strict citation/config input validation, citation input schema, split-quote fixture coverage, explicit unsupported non-v1 claim reporting, OpenDataLoader-style structure diagnostics for malformed bbox and unknown-page references, verify-alpha case inventory checks, and demo fixtures | Still needed: additional adapter hardening against broader real output shapes, future claim-kind expansion outside the current v1 alpha policy, and a decision on whether cross-platform rendered crop artifact equality is worth pursuing after the current macOS/Linux bbox drift finding |
| RAG chunk artifact checks | Landed for current source examples: deterministic command-level fixture/golden output, repeated-run byte identity, schema/example validation, stale reference rejection, and default-chunk exclusion warning-reference rejection | Broader chunk provenance/citation policy and future parser/table integration remain future work |
| Security report artifact checks | Landed for current source examples: source-only `ethos security report`, deterministic output, source/report identity validation, security-warning locator grounding, warning-lane diagnostics, inventory/report parity, summary drift checks, and deterministic warning id checks | Broader security-report generation semantics, debug overlay integration, and future artifact UX remain future work |
| WS-HARNESS readiness | Partially landed: readiness path is green for frozen corpus/hardware and pinned competitors, Gate Zero evidence preflight validates the current `ethos-bench` handoff, and gates fail closed if those records regress | Public-safe comparison report flow, release/package approval, claim-wording approval, and future evidence-refresh workflow still need hardening |
| Determinism workflow | Landed: macOS arm64, Linux x64, and Windows x64 matrix entries run core contract tests; PDFium-backed corpus work stays gated on an explicitly configured pinned runtime; static workflow tests guard the matrix | Windows PDFium runtime provisioning and broader cross-platform corpus validation remain future work |

## PM Rule

Public language stays at "pre-alpha / Milestone B internal continuation" until the remaining external blockers are closed and the claim audit approves specific wording. Do not describe Ethos as benchmark-validated, release-ready, production-ready, or broadly parser-complete. Internal parser work should proceed only when it supports accepted evidence paths or the trust layer; the product-differentiating path remains verification and grounding first, with parser expansion serving that path.
Public language stays at "pre-alpha / internal Milestone C artifact-validation closeout" until the remaining external blockers are closed and the claim audit approves specific wording. Do not describe Ethos as benchmark-validated, release-ready, production-ready, or broadly parser-complete. Internal parser work should proceed only when it supports accepted evidence paths or the trust layer; the product-differentiating path remains verification and grounding first, with parser expansion serving that path.
Loading
Loading