From c58006dddbc9e5d7f733604cf117c9654acc4263 Mon Sep 17 00:00:00 2001 From: docushell-admin Date: Wed, 17 Jun 2026 00:09:48 +0530 Subject: [PATCH] Validate font policy profile artifacts Signed-off-by: docushell-admin --- docs/execution-status.md | 4 +- schemas/font_policy_validation.py | 134 +++++++++++++++++++++++++ schemas/test_font_policy_validation.py | 107 ++++++++++++++++++++ schemas/validate_examples.py | 13 +++ 4 files changed, 256 insertions(+), 2 deletions(-) create mode 100644 schemas/font_policy_validation.py create mode 100644 schemas/test_font_policy_validation.py diff --git a/docs/execution-status.md b/docs/execution-status.md index b87ed09..21edd8b 100644 --- a/docs/execution-status.md +++ b/docs/execution-status.md @@ -17,7 +17,7 @@ The committed implementation now includes: - `ethos doc parse` / `ethos fingerprint` PDF execution through a worker process with `max_parse_ms` timeout enforcement, stable error-envelope relay, diagnostics-gated worker stderr, and page-range validation/filtering. - Quantized page/span extraction at the backend boundary, plus a basic deterministic layout pass that assembles paragraph `text_block` elements, fixture-backed alpha heading and flat list-item elements, and simple column reading order for the current born-digital fixtures. Current alpha layout confidence is explicit for heading signals, and below-threshold layout confidence emits deterministic `low_confidence_reading_order` diagnostics instead of staying silent. Fixture validation binds selected `fixture.json` expectations to committed extraction/layout goldens and binds current alpha text/Markdown exports to committed layout output so current read-order, element-type, heading-export, list-item, and export cases fail closed on drift. - An internal layout evaluator scaffold exists at `fixtures/evaluate_layout_alpha.py` and `make layout-evaluator-alpha`. 
It reads committed `fixture.json`, `extraction.json`, `layout.json`, `text.txt`, and `markdown.md` files, summarizes alpha element-type and subset coverage, and fails closed on missing layout expectations, dangling/invalid warning references, confidence-policy drift, export-golden drift, invalid span expectation metadata, expected page/span-text/font-id drift, expected rotation drift, or drift in fixture-backed reading order / heading / list-item / hyphenation / ligature cases. -- Schema/example/profile validation is green through `schemas/validate_examples.py` using `jsonschema` draft 2020-12 validation, including the crop descriptor artifact contract plus referential-integrity and bbox sanity checks outside JSON Schema. +- Schema/example/profile validation is green through `schemas/validate_examples.py` using `jsonschema` draft 2020-12 validation, including the crop descriptor artifact contract, deterministic-profile font-policy artifact checks, referential-integrity checks, and bbox sanity checks outside JSON Schema. - `ethos verify` now produces non-empty quote, value, presence, and table-cell verification checks over native Ethos document JSON and synthetic OpenDataLoader-style JSON through `--grounding opendataloader-json`; it also verifies quote/value/presence citations over pinned real OpenDataLoader 2.4.7 JSON, including grounded and ungrounded cases, and maps explicit real OpenDataLoader-style row/cell structures to table-cell grounding. Citation/config inputs are rejected when they drift outside the closed schemas. The public demo harness covers grounded, ungrounded, split-quote, not-found, stale-fingerprint, unsupported non-v1 claim, capability-limited, malformed-citation, malformed OpenDataLoader-style input, and summary-format reject paths. 
- Verification semantics are now trust-honest at alpha scope: quote containment is explicitly labeled, value/table-cell checks require normalized equality, fingerprint-pinned citations fail closed when source fingerprints are unavailable, and structured capability limits explain why a run is downgraded. - `make verify-alpha` is the current alpha trust-loop command: it checks native examples, split-quote evidence matching, unsupported non-v1 claim reporting, synthetic OpenDataLoader-style examples, pinned real OpenDataLoader grounded/ungrounded examples, schema validation, verify-alpha case inventory coverage, usage diagnostics for malformed citations and malformed OpenDataLoader-style structures, byte-identical repeated verification reports, byte-identical native crop descriptors, summary diagnostics for an ungrounded native case, and foreign fixture manifest hash binding. @@ -55,7 +55,7 @@ Milestone A has an accepted internal Gate Zero decision for roadmap control, so | Layout groundwork | Landed: basic paragraph text blocks, fixture-backed alpha heading and flat list-item elements, simple column reading order over quantized spans, explicit alpha heading-confidence values, deterministic below-threshold confidence diagnostics, fixture metadata checks against committed extraction/layout goldens for current read-order and element-type expectations, and alpha text/Markdown export goldens derived from committed layout output | Tables, nested/richer list and heading semantics, broader rotation/quirk handling, and broader confidence dimensions remain future work | | Layout evaluator scaffold | Landed: deterministic internal evaluator over committed extraction/layout fixture expectations, with heading/list/reading-order/rotation/hyphenation/ligature/font-identity/span-expectation coverage checks, expected page/span-text/font-id checks, expected-spans metadata validation, warning-reference checks, confidence-policy checks, text/Markdown export-golden checks, expectation 
drift diagnostics, report JSON, Make target, and unit coverage | Broader evaluator dimensions and CI matrix integration remain future work | | Python surface scaffold | Landed: internal stdlib wrapper over a caller-provided local `ethos doc parse` command, with explicit JSON/Markdown/text methods, page selection passthrough, diagnostics passthrough, timeout handling, command failure reporting, and mocked-command unit coverage | Native binding work, broader API design, and public setup path remain future work | -| Font policy groundwork | Partially landed: substitution table and profile policy are present; fixture output uses deterministic substitution IDs, and committed embedded-font fixture metadata now binds expected extraction font identity | Bundled fallback asset hashing and broader font/CID validation remain open | +| Font policy groundwork | Partially landed: substitution table and profile policy are present; substitution-table bytes are pinned by the deterministic profile and checked by schema/example validation; absent bundled fallback assets must remain represented by a null fallback-bundle hash; fixture output uses deterministic substitution IDs, and committed embedded-font fixture metadata now binds expected extraction font identity | Bundled fallback asset introduction/hash pinning and broader font/CID validation remain open | | Schema/example validation | Landed: schemas, examples, deterministic profile, referential integrity, and bbox sanity pass the `jsonschema` validation gate | Contract changes still require explicit versioning and compatibility review | | Trust-layer implementation | Landed: `ethos verify` quote/value/presence/table-cell checks, explicit quote-containment labeling, normalized equality for value/table-cell checks, stale and unverifiable fingerprint handling, unsupported claim reporting, structured capability limits, native Ethos JSON path, ODL-style adapter path with synthetic table/cell mapping and explicit real ODL-style 
row/cell table grounding, pinned real OpenDataLoader 2.4.7 grounded/ungrounded fixtures, foreign fixture manifest hash validation, crop-ref evidence plumbing, stable logical native crop refs, native crop descriptor artifacts, raw BGRA crop rendering in `ethos-pdf`, CLI PNG crop artifact production for bound native source PDFs, same-host rendered crop repeatability check, rendered-crop run comparison helper, strict citation/config input validation, citation input schema, split-quote fixture coverage, explicit unsupported non-v1 claim reporting, OpenDataLoader-style structure diagnostics for malformed bbox and unknown-page references, verify-alpha case inventory checks, and demo fixtures | Still needed: additional adapter hardening against broader real output shapes, future claim-kind expansion outside the current v1 alpha policy, and a decision on whether cross-platform rendered crop artifact equality is worth pursuing after the current macOS/Linux bbox drift finding | | WS-HARNESS readiness | Partially landed: readiness path is green for frozen corpus/hardware and pinned competitors, Gate Zero evidence preflight validates the current `ethos-bench` handoff, and gates fail closed if those records regress | Public-safe comparison report flow, release/package approval, claim-wording approval, and future evidence-refresh workflow still need hardening | diff --git a/schemas/font_policy_validation.py b/schemas/font_policy_validation.py new file mode 100644 index 0000000..6dd7ae2 --- /dev/null +++ b/schemas/font_policy_validation.py @@ -0,0 +1,134 @@ +#!/usr/bin/env python3 +# +# Copyright 2026 The Ethos maintainers +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""Deterministic-profile font policy artifact checks.

Every ``diagnose_*`` function returns a list of human-readable diagnostic
strings; an empty list means the check passed. Nothing here raises for a
policy violation -- callers decide how to report the diagnostics.
"""

from __future__ import annotations

import hashlib
import json
from pathlib import Path, PurePosixPath, PureWindowsPath
from typing import Any

# Length of a hex-encoded SHA-256 digest.
HEX256_LEN = 64
# Repository-relative directory whose presence must agree with the
# ``font_policy.fallback_bundle.sha256`` pin (null <=> directory absent).
FALLBACK_BUNDLE_DIR = Path("crates/ethos-pdf/assets/fonts/liberation")


def diagnose_font_policy(root: Path, profile: dict[str, Any]) -> list[str]:
    """Check the ``font_policy`` section of a parsed profile.

    Args:
        root: Directory that repository-relative artifact paths are joined to.
        profile: Parsed profile document.

    Returns:
        Diagnostics from the substitution-table check followed by those from
        the fallback-bundle check; empty when both pass.
    """
    # A profile that parsed to a non-object (for example a JSON array) has no
    # ``.get``; report it as a diagnostic instead of raising AttributeError.
    if not isinstance(profile, dict):
        return ["profile must be an object"]

    font_policy = profile.get("font_policy")
    if not isinstance(font_policy, dict):
        return ["profile font_policy must be an object"]

    diagnostics: list[str] = []
    diagnostics.extend(diagnose_substitution_table(root, font_policy))
    diagnostics.extend(diagnose_fallback_bundle(root, font_policy))
    return diagnostics


def diagnose_substitution_table(root: Path, font_policy: dict[str, Any]) -> list[str]:
    """Check that the pinned substitution table exists and matches its hash.

    The checks run in order and stop at the first failure, so at most one
    diagnostic is returned.

    Args:
        root: Directory that ``substitution_table.path`` is joined to.
        font_policy: The profile's ``font_policy`` object.

    Returns:
        A single-element list describing the first failed check, or an empty
        list when the file exists and its SHA-256 equals the pinned value.
    """
    substitution = font_policy.get("substitution_table")
    if not isinstance(substitution, dict):
        return ["font_policy.substitution_table must be an object"]

    path_value = substitution.get("path")
    if not isinstance(path_value, str) or not safe_relative_path(path_value):
        return ["font_policy.substitution_table.path must be a safe relative path"]

    sha_value = substitution.get("sha256")
    if not isinstance(sha_value, str) or not is_hex256(sha_value):
        return ["font_policy.substitution_table.sha256 must be lowercase hex sha256"]

    path = root / path_value
    if not path.is_file():
        return [f"font_policy.substitution_table.path missing: {path_value}"]

    actual = sha256_file(path)
    if actual != sha_value:
        return [
            "font_policy.substitution_table.sha256 mismatch "
            f"for {path_value}: expected {sha_value}, got {actual}"
        ]
    return []


def diagnose_fallback_bundle(root: Path, font_policy: dict[str, Any]) -> list[str]:
    """Check that the fallback-bundle pin agrees with the bundle directory.

    A null ``sha256`` is only accepted while ``FALLBACK_BUNDLE_DIR`` does not
    exist under *root*; a non-null ``sha256`` requires the directory to exist,
    contain at least one file, and hash (via ``sha256_directory``) to the
    pinned value. The checks stop at the first failure.

    Args:
        root: Directory that ``FALLBACK_BUNDLE_DIR`` is joined to.
        font_policy: The profile's ``font_policy`` object.

    Returns:
        A single-element list describing the first failed check, or an empty
        list when the pin and the directory agree.
    """
    fallback = font_policy.get("fallback_bundle")
    if not isinstance(fallback, dict):
        return ["font_policy.fallback_bundle must be an object"]

    sha_value = fallback.get("sha256")
    bundle_dir = root / FALLBACK_BUNDLE_DIR
    if sha_value is None:
        # Any filesystem entry at the bundle path (even an empty directory)
        # contradicts a null pin.
        if bundle_dir.exists():
            return [
                "font_policy.fallback_bundle.sha256 is null but "
                f"{FALLBACK_BUNDLE_DIR.as_posix()} exists"
            ]
        return []

    if not isinstance(sha_value, str) or not is_hex256(sha_value):
        return ["font_policy.fallback_bundle.sha256 must be null or lowercase hex sha256"]
    if not bundle_dir.is_dir():
        return [
            "font_policy.fallback_bundle.sha256 is set but "
            f"{FALLBACK_BUNDLE_DIR.as_posix()} is missing"
        ]

    actual = sha256_directory(bundle_dir)
    if actual is None:
        return [f"font fallback bundle has no files: {FALLBACK_BUNDLE_DIR.as_posix()}"]
    if actual != sha_value:
        return [
            "font_policy.fallback_bundle.sha256 mismatch "
            f"for {FALLBACK_BUNDLE_DIR.as_posix()}: expected {sha_value}, got {actual}"
        ]
    return []


def safe_relative_path(value: str) -> bool:
    """Return True when *value* is a normalized, forward-slash relative path.

    The decision is made with pure (non-filesystem) path classes so the result
    is the same on every host OS. A value is rejected when it is empty,
    contains a backslash, is absolute, carries a Windows drive or root,
    contains a ``..`` component, or is not already in normalized form
    (for example ``a//b``, ``./a`` or ``a/``).
    """
    # Backslashes are separators on Windows; refusing them keeps the POSIX
    # component check below meaningful when the path is later joined there.
    if not value or "\\" in value:
        return False

    posix = PurePosixPath(value)
    return (
        not posix.is_absolute()
        # A drive letter ("C:/x") is relative under POSIX rules but would
        # replace the root when joined on Windows.
        and not PureWindowsPath(value).anchor
        and ".." not in posix.parts
        # Round-tripping through the path class must not change the text.
        and str(posix) == value
    )


def is_hex256(value: str) -> bool:
    """Return True when *value* is exactly 64 lowercase hexadecimal characters."""
    return (
        len(value) == HEX256_LEN
        and value == value.lower()
        and all(ch in "0123456789abcdef" for ch in value)
    )


def sha256_file(path: Path) -> str:
    """Return the hex SHA-256 of the file at *path*, read in 1 MiB chunks."""
    digest = hashlib.sha256()
    with path.open("rb") as handle:
        for chunk in iter(lambda: handle.read(1024 * 1024), b""):
            digest.update(chunk)
    return digest.hexdigest()


def sha256_directory(path: Path) -> str | None:
    """Return a single hex SHA-256 covering every file under *path*.

    Each file found recursively is mapped from its POSIX-style path relative
    to *path* to its own SHA-256; that mapping is serialized as compact JSON
    with sorted keys, a trailing newline is appended, and the UTF-8 bytes are
    hashed.

    Returns:
        The hex digest, or ``None`` when the directory contains no files.
    """
    files = sorted(item for item in path.rglob("*") if item.is_file())
    if not files:
        return None
    entries = {item.relative_to(path).as_posix(): sha256_file(item) for item in files}
    payload = json.dumps(entries, ensure_ascii=False, separators=(",", ":"), sort_keys=True)
    return hashlib.sha256(f"{payload}\n".encode("utf-8")).hexdigest()


# diff --git a/schemas/test_font_policy_validation.py b/schemas/test_font_policy_validation.py
# new file mode 100644
# index 0000000..cedae61
# --- /dev/null
# +++ b/schemas/test_font_policy_validation.py
# @@ -0,0 +1,107 @@
#!/usr/bin/env python3
#
# Copyright 2026 The Ethos maintainers
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
+# + +from __future__ import annotations + +import tempfile +import unittest +from pathlib import Path + +import font_policy_validation + + +def write(path: Path, content: str) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(content, encoding="utf-8") + + +class FontPolicyValidationTests(unittest.TestCase): + def with_repo(self): + temp = tempfile.TemporaryDirectory() + root = Path(temp.name) + table = root / "crates/ethos-pdf/assets/font-substitution-table.json" + write(table, '{"schema_version":"1.0.0"}\n') + profile = { + "font_policy": { + "substitution_table": { + "path": "crates/ethos-pdf/assets/font-substitution-table.json", + "sha256": font_policy_validation.sha256_file(table), + }, + "fallback_bundle": { + "name": "liberation", + "license": "OFL-1.1", + "sha256": None, + }, + } + } + return temp, root, profile + + def test_current_null_fallback_contract_passes_when_bundle_is_absent(self) -> None: + temp, root, profile = self.with_repo() + with temp: + self.assertEqual(font_policy_validation.diagnose_font_policy(root, profile), []) + + def test_substitution_table_hash_mismatch_fails_closed(self) -> None: + temp, root, profile = self.with_repo() + with temp: + profile["font_policy"]["substitution_table"]["sha256"] = "0" * 64 + diagnostics = font_policy_validation.diagnose_font_policy(root, profile) + self.assertEqual(len(diagnostics), 1) + self.assertIn("substitution_table.sha256 mismatch", diagnostics[0]) + + def test_unsafe_substitution_table_path_fails_closed(self) -> None: + temp, root, profile = self.with_repo() + with temp: + profile["font_policy"]["substitution_table"]["path"] = "../font-table.json" + diagnostics = font_policy_validation.diagnose_font_policy(root, profile) + self.assertEqual( + diagnostics, + ["font_policy.substitution_table.path must be a safe relative path"], + ) + + def test_null_fallback_hash_fails_when_bundle_exists(self) -> None: + temp, root, profile = self.with_repo() + with temp: + write( + root / 
font_policy_validation.FALLBACK_BUNDLE_DIR / "LiberationSans-Regular.ttf", + "font bytes", + ) + diagnostics = font_policy_validation.diagnose_font_policy(root, profile) + self.assertEqual(len(diagnostics), 1) + self.assertIn("fallback_bundle.sha256 is null", diagnostics[0]) + + def test_fallback_bundle_hash_can_pin_directory_payload(self) -> None: + temp, root, profile = self.with_repo() + with temp: + bundle_dir = root / font_policy_validation.FALLBACK_BUNDLE_DIR + write(bundle_dir / "LiberationSans-Regular.ttf", "font bytes") + profile["font_policy"]["fallback_bundle"]["sha256"] = ( + font_policy_validation.sha256_directory(bundle_dir) + ) + self.assertEqual(font_policy_validation.diagnose_font_policy(root, profile), []) + + def test_set_fallback_hash_requires_bundle_directory(self) -> None: + temp, root, profile = self.with_repo() + with temp: + profile["font_policy"]["fallback_bundle"]["sha256"] = "1" * 64 + diagnostics = font_policy_validation.diagnose_font_policy(root, profile) + self.assertEqual(len(diagnostics), 1) + self.assertIn("fallback_bundle.sha256 is set", diagnostics[0]) + + +if __name__ == "__main__": + unittest.main() diff --git a/schemas/validate_examples.py b/schemas/validate_examples.py index 6a787f6..44d9f9a 100644 --- a/schemas/validate_examples.py +++ b/schemas/validate_examples.py @@ -29,6 +29,8 @@ import sys from pathlib import Path +from font_policy_validation import diagnose_font_policy + try: from jsonschema import Draft202012Validator as Validator DIALECT = "2020-12" @@ -218,6 +220,17 @@ def c14n_line(v) -> str: fail(f"{label} diverges from document example") print("ok example fingerprints coherent across artifacts") +# deterministic profile font-policy artifact checks +profile = json.loads( + (ROOT / "profiles" / "ethos-deterministic-v1.json").read_text(encoding="utf-8") +) +font_policy_diagnostics = diagnose_font_policy(ROOT, profile) +if font_policy_diagnostics: + for diagnostic in font_policy_diagnostics: + fail(diagnostic) +else: 
+ print("ok deterministic profile font policy artifact pins coherent") + # bbox sanity (schema cannot express x0<=x1, y0<=y1) def walk_bboxes(label, node, ctx): if isinstance(node, dict):