From 6dded617cd4a4c6f43d3f453c52d274057881c59 Mon Sep 17 00:00:00 2001 From: docushell-admin Date: Wed, 17 Jun 2026 08:30:10 +0530 Subject: [PATCH] Normalize deterministic font ids Signed-off-by: docushell-admin --- crates/ethos-pdf/src/lib.rs | 49 ++++++++++++++++++++++++++++-- docs/execution-status.md | 2 +- schemas/ethos-document.schema.json | 6 +++- 3 files changed, 52 insertions(+), 5 deletions(-) diff --git a/crates/ethos-pdf/src/lib.rs b/crates/ethos-pdf/src/lib.rs index 38f1591..46331f2 100644 --- a/crates/ethos-pdf/src/lib.rs +++ b/crates/ethos-pdf/src/lib.rs @@ -1900,11 +1900,20 @@ fn union_rects(mut rects: impl Iterator) -> Option { } fn deterministic_font_id(raw_name: &str) -> Option { - let (name, subset) = strip_subset_prefix(raw_name.trim()); - let normalized = normalize_font_name(name)?; + let raw_name = raw_name.trim(); + if raw_name.is_empty() { + return None; + } + let (name, subset) = strip_subset_prefix(raw_name); if subset { - return Some(format!("embedded:{normalized}")); + if let Some(normalized) = normalize_font_name(name) { + if is_safe_font_id_suffix(&normalized) { + return Some(format!("embedded:{normalized}")); + } + } + return Some(hashed_embedded_font_id(name)); } + let normalized = normalize_font_name(name)?; font_substitution(&normalized) .or_else(|| Some(font_substitution_table().default_unresolved_font_id.clone())) } @@ -1946,6 +1955,20 @@ fn normalize_font_name(name: &str) -> Option { (!out.is_empty()).then_some(out) } +fn is_safe_font_id_suffix(name: &str) -> bool { + !name.is_empty() + && name + .bytes() + .all(|byte| byte.is_ascii_alphanumeric() || matches!(byte, b'-' | b'_' | b'.')) +} + +fn hashed_embedded_font_id(name: &str) -> String { + format!( + "embedded:sha256-{}", + ethos_core::c14n::sha256_hex_bytes(name.as_bytes()) + ) +} + fn font_substitution(name: &str) -> Option { font_substitution_table() .mappings @@ -2436,6 +2459,26 @@ mod tests { assert_eq!(deterministic_font_id(" "), None); } + #[test] + fn deterministic_font_ids_keep_embedded_ids_ascii_only() { + let unsafe_unicode = deterministic_font_id("ABCDEF+明朝").unwrap(); + assert_eq!(unsafe_unicode, hashed_embedded_font_id("明朝")); + assert!(unsafe_unicode.is_ascii()); + + let unsafe_punctuation = deterministic_font_id("ABCDEF+Fixture+Font").unwrap(); + assert_eq!(unsafe_punctuation, hashed_embedded_font_id("Fixture+Font")); + assert!(unsafe_punctuation.is_ascii()); + + let separator_only = deterministic_font_id("ABCDEF+///").unwrap(); + assert_eq!(separator_only, hashed_embedded_font_id("///")); + assert!(separator_only.is_ascii()); + + assert_eq!( + deterministic_font_id("明朝").as_deref(), + Some("subst:liberation-sans-regular") + ); + } + #[test] fn font_substitution_table_is_well_formed() { use std::collections::HashSet; diff --git a/docs/execution-status.md b/docs/execution-status.md index ce50fca..77829b5 100644 --- a/docs/execution-status.md +++ b/docs/execution-status.md @@ -55,7 +55,7 @@ Milestone A has an accepted internal Gate Zero decision for roadmap control, so | Layout groundwork | Landed: basic paragraph text blocks, fixture-backed alpha heading and flat list-item elements, simple column reading order over quantized spans, explicit alpha heading-confidence values, deterministic below-threshold confidence diagnostics, fixture metadata checks against committed extraction/layout goldens for current read-order and element-type expectations, and alpha text/Markdown export goldens derived from committed layout output | Tables, nested/richer list and heading semantics, broader rotation/quirk handling, and broader confidence dimensions remain future work | | Layout evaluator scaffold | Landed: deterministic internal evaluator over committed extraction/layout fixture expectations, with heading/list/reading-order/rotation/hyphenation/ligature/font-identity/span-expectation coverage checks, expected page/span-text/font-id checks, expected-spans metadata validation, warning-reference checks, confidence-policy checks, text/Markdown export-golden checks, expectation drift diagnostics, report JSON, Make target, unit coverage, PR CI wiring, and static CI workflow guard coverage | Broader evaluator dimensions remain future work | | Python surface scaffold | Landed: internal stdlib wrapper over a caller-provided local `ethos doc parse` command, with explicit JSON/Markdown/text methods, page selection passthrough, diagnostics passthrough, timeout handling, command failure reporting, and mocked-command unit coverage | Native binding work, broader API design, and public setup path remain future work | -| Font policy groundwork | Partially landed: substitution table and profile policy are present; substitution-table bytes are pinned by the deterministic profile and checked by schema/example validation; absent bundled fallback assets must remain represented by a null fallback-bundle hash; fixture output uses deterministic substitution IDs, and committed embedded-font fixture metadata now binds expected extraction font identity | Bundled fallback asset introduction/hash pinning and broader font/CID validation remain open | +| Font policy groundwork | Partially landed: substitution table and profile policy are present; substitution-table bytes are pinned by the deterministic profile and checked by schema/example validation; absent bundled fallback assets must remain represented by a null fallback-bundle hash; fixture output uses deterministic substitution IDs, committed embedded-font fixture metadata now binds expected extraction font identity, and document schema/font extraction keep emitted font ids inside the deterministic ASCII `embedded:` / `subst:` contract | Bundled fallback asset introduction/hash pinning and broader font/CID validation remain open | | Schema/example validation | Landed: schemas, examples, deterministic profile, referential integrity, and bbox sanity pass the `jsonschema` validation gate | Contract changes still require explicit versioning and compatibility review | | Trust-layer implementation | Landed: `ethos verify` quote/value/presence/table-cell checks, explicit quote-containment labeling, normalized equality for value/table-cell checks, stale and unverifiable fingerprint handling, unsupported claim reporting, structured capability limits, native Ethos JSON path, ODL-style adapter path with synthetic table/cell mapping, explicit real ODL-style row/cell table grounding, conservative real-style text/child-container alias normalization, pinned real OpenDataLoader 2.4.7 grounded/ungrounded fixtures, foreign fixture manifest hash validation, crop-ref evidence plumbing, stable logical native crop refs, native crop descriptor artifacts, raw BGRA crop rendering in `ethos-pdf`, CLI PNG crop artifact production for bound native source PDFs, same-host rendered crop repeatability check, rendered-crop run comparison helper, strict citation/config input validation, citation input schema, split-quote fixture coverage, explicit unsupported non-v1 claim reporting, OpenDataLoader-style structure diagnostics for malformed bbox and unknown-page references, verify-alpha case inventory checks, and demo fixtures | Still needed: additional adapter hardening against broader real output shapes, future claim-kind expansion outside the current v1 alpha policy, and a decision on whether cross-platform rendered crop artifact equality is worth pursuing after the current macOS/Linux bbox drift finding | | WS-HARNESS readiness | Partially landed: readiness path is green for frozen corpus/hardware and pinned competitors, Gate Zero evidence preflight validates the current `ethos-bench` handoff, and gates fail closed if those records regress | Public-safe comparison report flow, release/package approval, claim-wording approval, and future evidence-refresh workflow still need hardening | diff --git a/schemas/ethos-document.schema.json b/schemas/ethos-document.schema.json index 9dc6b3f..7aa7257 100644 --- a/schemas/ethos-document.schema.json +++ b/schemas/ethos-document.schema.json @@ -156,7 +156,11 @@ "bbox": { "$ref": "#/$defs/bbox" }, "origin_locator": { "$ref": "#/$defs/origin_locator" }, "text": { "type": "string" }, - "font_id": { "type": "string", "description": "Resolved font identity under the deterministic font profile (ADR-0003): embedded name or substitution-table key." }, + "font_id": { + "type": "string", + "pattern": "^(embedded|subst):[A-Za-z0-9][A-Za-z0-9._-]*$", + "description": "Resolved ASCII font identity under the deterministic font profile (ADR-0003): embedded name/hash or substitution-table key." + }, "font_size_q": { "type": "integer", "minimum": 0, "description": "Font size in quanta." }, "char_start": { "type": "integer", "minimum": 0, "description": "Char offset range within the owning element's text, when span->element ownership exists." }, "char_end": { "type": "integer", "minimum": 0 },