diff --git a/.github/scripts/test_ci_workflow.py b/.github/scripts/test_ci_workflow.py
new file mode 100644
index 0000000..890f52a
--- /dev/null
+++ b/.github/scripts/test_ci_workflow.py
@@ -0,0 +1,70 @@
+#!/usr/bin/env python3
+#
+# Copyright 2026 The Ethos maintainers
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+"""Static guard: fail when required gate commands disappear from ci.yml."""
+
+from __future__ import annotations
+
+import unittest
+from pathlib import Path
+
+
+ROOT = Path(__file__).resolve().parents[2]
+CI_WORKFLOW = ROOT / ".github/workflows/ci.yml"
+
+
+def workflow_text() -> str:
+    """Return the CI workflow text with comment-only lines removed.
+
+    A step that has been commented out no longer runs, so lines whose first
+    non-blank character is ``#`` are dropped before the tests search for the
+    required commands. Trailing comments on live lines are left untouched.
+    """
+    raw = CI_WORKFLOW.read_text(encoding="utf-8")
+    return "\n".join(
+        line for line in raw.splitlines() if not line.lstrip().startswith("#")
+    )
+
+
+class CiWorkflowTests(unittest.TestCase):
+    """Checks that ci.yml still contains each required gate command."""
+
+    def _assert_commands_present(self, *commands: str) -> None:
+        """Assert every command is in the workflow, reporting each miss."""
+        text = workflow_text()
+        for command in commands:
+            # subTest keeps going after a failure so one run lists all gaps.
+            with self.subTest(command=command):
+                self.assertIn(command, text)
+
+    def test_alpha_fixture_and_layout_gates_run_in_pr_ci(self) -> None:
+        self._assert_commands_present(
+            "python3 fixtures/validate_fixtures.py",
+            "make layout-evaluator-alpha",
+            "PYTHONPATH=python python3 -m unittest discover -s python/tests",
+        )
+
+    def test_schema_job_installs_jsonschema_and_validates_examples(self) -> None:
+        self._assert_commands_present(
+            'pip install "jsonschema>=4.18"',
+            "python3 schemas/validate_examples.py",
+        )
+
+    def test_ci_workflow_guard_is_run_by_ci(self) -> None:
+        self._assert_commands_present("python3 .github/scripts/test_ci_workflow.py")
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 5fe163e..780aeb2 100644
--- a/.github/workflows/ci.yml
+++ 
b/.github/workflows/ci.yml @@ -54,6 +54,8 @@ jobs: run: python3 .github/scripts/test_gate_zero_evidence_preflight.py - name: determinism workflow tests run: python3 .github/scripts/test_determinism_workflow.py + - name: CI workflow tests + run: python3 .github/scripts/test_ci_workflow.py - name: Gate Zero harness tests run: python3 benchmarks/harness/test_run_gate_zero.py - name: same-platform double-parse byte-diff diff --git a/docs/execution-status.md b/docs/execution-status.md index eacee66..ce50fca 100644 --- a/docs/execution-status.md +++ b/docs/execution-status.md @@ -16,7 +16,7 @@ The committed implementation now includes: - The determinism workflow includes a Windows x64 preflight lane for core c14n/profile/fingerprint contract tests, while PDFium-backed corpus work remains explicitly skipped unless the pinned runtime is configured on that runner. A static workflow test guards that matrix wiring. - `ethos doc parse` / `ethos fingerprint` PDF execution through a worker process with `max_parse_ms` timeout enforcement, stable error-envelope relay, diagnostics-gated worker stderr, and page-range validation/filtering. - Quantized page/span extraction at the backend boundary, plus a basic deterministic layout pass that assembles paragraph `text_block` elements, fixture-backed alpha heading and flat list-item elements, and simple column reading order for the current born-digital fixtures. Current alpha layout confidence is explicit for heading signals, and below-threshold layout confidence emits deterministic `low_confidence_reading_order` diagnostics instead of staying silent. Fixture validation binds selected `fixture.json` expectations to committed extraction/layout goldens and binds current alpha text/Markdown exports to committed layout output so current read-order, element-type, heading-export, list-item, and export cases fail closed on drift. 
-- An internal layout evaluator scaffold exists at `fixtures/evaluate_layout_alpha.py` and `make layout-evaluator-alpha`. It reads committed `fixture.json`, `extraction.json`, `layout.json`, `text.txt`, and `markdown.md` files, summarizes alpha element-type and subset coverage, and fails closed on missing layout expectations, dangling/invalid warning references, confidence-policy drift, export-golden drift, invalid span expectation metadata, expected page/span-text/font-id drift, expected rotation drift, or drift in fixture-backed reading order / heading / list-item / hyphenation / ligature cases. +- An internal layout evaluator scaffold exists at `fixtures/evaluate_layout_alpha.py` and `make layout-evaluator-alpha`. It reads committed `fixture.json`, `extraction.json`, `layout.json`, `text.txt`, and `markdown.md` files, summarizes alpha element-type and subset coverage, and fails closed on missing layout expectations, dangling/invalid warning references, confidence-policy drift, export-golden drift, invalid span expectation metadata, expected page/span-text/font-id drift, expected rotation drift, or drift in fixture-backed reading order / heading / list-item / hyphenation / ligature cases. PR CI runs the evaluator and has a static workflow guard for that wiring. - Schema/example/profile validation is green through `schemas/validate_examples.py` using `jsonschema` draft 2020-12 validation, including the crop descriptor artifact contract plus referential-integrity and bbox sanity checks outside JSON Schema. 
- `ethos verify` now produces non-empty quote, value, presence, and table-cell verification checks over native Ethos document JSON and synthetic OpenDataLoader-style JSON through `--grounding opendataloader-json`; it also verifies quote/value/presence citations over pinned real OpenDataLoader 2.4.7 JSON, including grounded and ungrounded cases, maps explicit real OpenDataLoader-style row/cell structures to table-cell grounding, and normalizes conservative real-style text/child-container aliases when page/bbox/text data remains explicit. Citation/config inputs are rejected when they drift outside the closed schemas. The public demo harness covers grounded, ungrounded, split-quote, not-found, stale-fingerprint, unsupported non-v1 claim, capability-limited, malformed-citation, malformed OpenDataLoader-style input, and summary-format reject paths. - Verification semantics are now trust-honest at alpha scope: quote containment is explicitly labeled, value/table-cell checks require normalized equality, fingerprint-pinned citations fail closed when source fingerprints are unavailable, and structured capability limits explain why a run is downgraded. 
@@ -53,7 +53,7 @@ Milestone A has an accepted internal Gate Zero decision for roadmap control, so | PDFium loader/runtime checks | Landed: missing/mismatched version, artifact, and runtime library hashes fail deterministically | Release packaging and operator setup path still need hardening | | Real PDF backend | Landed for simple born-digital PDFs: page count, quantized spans, worker execution, timeout, page filtering, and fingerprint path exist | Wider corpus coverage, failure fixtures, memory-limit behavior, quirk log, and Gate Zero run are still missing | | Layout groundwork | Landed: basic paragraph text blocks, fixture-backed alpha heading and flat list-item elements, simple column reading order over quantized spans, explicit alpha heading-confidence values, deterministic below-threshold confidence diagnostics, fixture metadata checks against committed extraction/layout goldens for current read-order and element-type expectations, and alpha text/Markdown export goldens derived from committed layout output | Tables, nested/richer list and heading semantics, broader rotation/quirk handling, and broader confidence dimensions remain future work | -| Layout evaluator scaffold | Landed: deterministic internal evaluator over committed extraction/layout fixture expectations, with heading/list/reading-order/rotation/hyphenation/ligature/font-identity/span-expectation coverage checks, expected page/span-text/font-id checks, expected-spans metadata validation, warning-reference checks, confidence-policy checks, text/Markdown export-golden checks, expectation drift diagnostics, report JSON, Make target, and unit coverage | Broader evaluator dimensions and CI matrix integration remain future work | +| Layout evaluator scaffold | Landed: deterministic internal evaluator over committed extraction/layout fixture expectations, with heading/list/reading-order/rotation/hyphenation/ligature/font-identity/span-expectation coverage checks, expected page/span-text/font-id checks, 
expected-spans metadata validation, warning-reference checks, confidence-policy checks, text/Markdown export-golden checks, expectation drift diagnostics, report JSON, Make target, unit coverage, PR CI wiring, and static CI workflow guard coverage | Broader evaluator dimensions remain future work | | Python surface scaffold | Landed: internal stdlib wrapper over a caller-provided local `ethos doc parse` command, with explicit JSON/Markdown/text methods, page selection passthrough, diagnostics passthrough, timeout handling, command failure reporting, and mocked-command unit coverage | Native binding work, broader API design, and public setup path remain future work | | Font policy groundwork | Partially landed: substitution table and profile policy are present; substitution-table bytes are pinned by the deterministic profile and checked by schema/example validation; absent bundled fallback assets must remain represented by a null fallback-bundle hash; fixture output uses deterministic substitution IDs, and committed embedded-font fixture metadata now binds expected extraction font identity | Bundled fallback asset introduction/hash pinning and broader font/CID validation remain open | | Schema/example validation | Landed: schemas, examples, deterministic profile, referential integrity, and bbox sanity pass the `jsonschema` validation gate | Contract changes still require explicit versioning and compatibility review |