From 93faca477a03ded8652d79d39dbe04e2d386f6fd Mon Sep 17 00:00:00 2001 From: docushell-admin Date: Tue, 16 Jun 2026 23:33:48 +0530 Subject: [PATCH] Map real ODL table cells Signed-off-by: docushell-admin --- .../grounding/opendataloader-json/README.md | 5 +- .../grounding/opendataloader-json/src/lib.rs | 228 +++++++++++++++++- crates/ethos-cli/tests/verify.rs | 83 +++++++ docs/execution-status.md | 4 +- 4 files changed, 311 insertions(+), 9 deletions(-) diff --git a/adapters/grounding/opendataloader-json/README.md b/adapters/grounding/opendataloader-json/README.md index 66a9b05..9bc0fac 100644 --- a/adapters/grounding/opendataloader-json/README.md +++ b/adapters/grounding/opendataloader-json/README.md @@ -23,8 +23,9 @@ grounding elements. Child containers must use array shapes, and malformed child or non-string `content` values are rejected instead of being silently skipped. Real ODL JSON does not include parser version or page dimensions, so the adapter reports parser version as `unknown` and derives page extents from observed bounding boxes. Coordinate origin remains -unknown. Real ODL table-cell grounding is not exposed yet; row/cell nodes are exposed only -as ordinary grounding elements and table-cell claims remain capability-limited. +unknown. Real ODL-style table nodes with explicit `page number`, `bounding box`, and +`rows[].cells[]` cell page/bbox/content fields are mapped to deterministic grounding tables; +row and column addresses are derived from row/cell order. ## Declared capabilities (honest downgrades) diff --git a/adapters/grounding/opendataloader-json/src/lib.rs b/adapters/grounding/opendataloader-json/src/lib.rs index 0443a02..1c1d290 100644 --- a/adapters/grounding/opendataloader-json/src/lib.rs +++ b/adapters/grounding/opendataloader-json/src/lib.rs @@ -40,8 +40,9 @@ //! [`CoordinateOrigin::Unknown`] until the B-alpha pin verifies ODL's convention — //! capability-driven downgrade, never silent assumption. //! - No spans, no char offsets, no fingerprint, no crops: declared `false`. -//! Table capability is declared only when a `tables` array is present in the -//! documented subset; real ODL table-cell mapping is intentionally not declared yet. +//! Table capability is declared when a `tables` array is present in the +//! documented subset or when real ODL-style `rows[].cells` table structures carry +//! enough explicit page/bbox/text data to map deterministic cells. //! Verification surfaces missing capabilities as `capability_limited` (PRD §5.5). #![forbid(unsafe_code)] @@ -353,8 +354,18 @@ fn parse_real_odl(root: &Value) -> Result { let mut page_extents = vec![[1i64, 1i64]; page_count as usize]; let mut elements = Vec::new(); let mut element_ids = HashSet::new(); + let mut tables = Vec::new(); + let mut table_ids = HashSet::new(); let mut next_synthetic_id = 1u32; + let mut next_synthetic_table_id = 1u32; for kid in kids { + collect_real_tables( + kid, + page_count, + &mut tables, + &mut table_ids, + &mut next_synthetic_table_id, + )?; parse_real_content_element( kid, page_count, @@ -382,8 +393,144 @@ fn parse_real_odl(root: &Value) -> Result { parser_version: "unknown".to_string(), pages, elements, - tables_capable: false, - tables: Vec::new(), + tables_capable: !tables.is_empty(), + tables, + }) +} + +fn collect_real_tables( + node: &Value, + page_count: u32, + tables: &mut Vec, + table_ids: &mut HashSet, + next_synthetic_table_id: &mut u32, +) -> Result<(), AdapterError> { + if real_node_has_table_fields(node) { + let table = parse_real_table(node, page_count, next_synthetic_table_id)?; + if !table_ids.insert(table.id.clone()) { + return Err(err("duplicate real table id")); + } + tables.push(table); + } + for child in real_child_elements(node)? { + collect_real_tables( + child, + page_count, + tables, + table_ids, + next_synthetic_table_id, + )?; + } + Ok(()) +} + +fn real_node_has_table_fields(node: &Value) -> bool { + node.get("rows").is_some() + && (node.get("type").is_some() + || node.get("id").is_some() + || node.get("page number").is_some() + || node.get("bounding box").is_some()) +} + +fn parse_real_table( + node: &Value, + page_count: u32, + next_synthetic_table_id: &mut u32, +) -> Result { + let page_number = u32_field( + node, + "page number", + "missing table page number", + "table page number must fit u32", + )?; + if page_number == 0 || page_number > page_count { + return Err(err("table page number references unknown page")); + } + let bbox = bbox_from( + node.get("bounding box") + .ok_or_else(|| err("missing table bounding box"))?, + )?; + let id = real_table_id(node, next_synthetic_table_id)?; + let rows = node + .get("rows") + .and_then(Value::as_array) + .ok_or_else(|| err("rows must be an array"))?; + let mut cells = Vec::new(); + for (row_index, row) in rows.iter().enumerate() { + let row = row + .as_object() + .ok_or_else(|| err("row must be an object"))?; + let cells_value = row + .get("cells") + .and_then(Value::as_array) + .ok_or_else(|| err("row cells must be an array"))?; + for (col_index, cell) in cells_value.iter().enumerate() { + cells.push(parse_real_table_cell( + cell, + page_count, + page_number, + (row_index + 1) as u32, + (col_index + 1) as u32, + )?); + } + } + Ok(GroundingTable { + id, + page: format!("page-{page_number}"), + bbox, + cells, + }) +} + +fn real_table_id(node: &Value, next_synthetic_table_id: &mut u32) -> Result { + if let Some(raw) = node.get("id") { + if let Some(id) = raw.as_u64() { + let id = u32::try_from(id).map_err(|_| err("table id must fit u32"))?; + return Ok(format!("odl-{id}")); + } + if let Some(id) = raw.as_str().map(str::trim).filter(|id| !id.is_empty()) { + return Ok(format!("odl-{id}")); + } + return Err(err("table id must be integer or non-empty string")); + } + let id = *next_synthetic_table_id; + *next_synthetic_table_id = next_synthetic_table_id.saturating_add(1); + Ok(format!("odl-table-{id}")) +} + +fn parse_real_table_cell( + cell: &Value, + page_count: u32, + table_page_number: u32, + row: u32, + col: u32, +) -> Result { + let page_number = u32_field( + cell, + "page number", + "missing cell page number", + "cell page number must fit u32", + )?; + if page_number == 0 || page_number > page_count { + return Err(err("cell page number references unknown page")); + } + if page_number != table_page_number { + return Err(err("cell page number must match table page number")); + } + let bbox = bbox_from( + cell.get("bounding box") + .ok_or_else(|| err("missing cell bounding box"))?, + )?; + let text = real_content_text(cell)? + .filter(|text| !text.trim().is_empty()) + .ok_or_else(|| err("missing cell content"))?; + Ok(GroundingCell { + row, + col, + row_span: 1, + col_span: 1, + bbox, + text, }) } @@ -901,7 +1048,21 @@ mod tests { assert_eq!(pages[0].height, 20000); assert_eq!(pages[1].width, 25000); assert_eq!(pages[1].height, 12000); - assert!(!src.capabilities().tables); + assert!(src.capabilities().tables); + + let tables = src.tables(); + assert_eq!(tables.len(), 1); + assert_eq!(tables[0].id, "odl-13"); + assert_eq!(tables[0].page, "page-2"); + assert_eq!(tables[0].bbox, [1500, 2000, 25000, 12000]); + assert_eq!(tables[0].cells.len(), 2); + assert_eq!(tables[0].cells[0].row, 1); + assert_eq!(tables[0].cells[0].col, 1); + assert_eq!(tables[0].cells[0].text, "Cell A"); + assert_eq!(tables[0].cells[0].bbox, [2000, 3000, 12000, 6000]); + assert_eq!(tables[0].cells[1].row, 1); + assert_eq!(tables[0].cells[1].col, 2); + assert_eq!(tables[0].cells[1].text, "Cell B"); } #[test] @@ -968,6 +1129,63 @@ mod tests { assert!(src.tables().is_empty()); } + #[test] + fn rejects_malformed_real_table_cells() { + assert_error_contains( + r#"{ + "file name": "table.pdf", + "number of pages": 1, + "kids": [ + { + "type": "table", + "id": 1, + "page number": 1, + "bounding box": [10, 10, 200, 100], + "rows": [ + { + "cells": [ + { + "type": "table_cell", + "page number": 2, + "bounding box": [20, 20, 100, 40], + "content": "Out of range" + } + ] + } + ] + } + ] + }"#, + "cell page number references unknown page", + ); + assert_error_contains( + r#"{ + "file name": "table.pdf", + "number of pages": 1, + "kids": [ + { + "type": "table", + "id": 1, + "page number": 1, + "bounding box": [10, 10, 200, 100], + "rows": [ + { + "cells": [ + { + "type": "table_cell", + "page number": 1, + "bounding box": [20, 20, 100, 40] + } + ] + } + ] + } + ] + }"#, + "missing cell content", + ); + } + #[test] fn centipoint_quantization_matches_core_semantics() { let samples = [ diff --git a/crates/ethos-cli/tests/verify.rs b/crates/ethos-cli/tests/verify.rs index 96a8ac6..a24e694 100644 --- a/crates/ethos-cli/tests/verify.rs +++ b/crates/ethos-cli/tests/verify.rs @@ -1926,6 +1926,89 @@ fn empty_tables_are_not_found_when_table_capability_is_declared() { assert_eq!(report["all_evidence_grounded"], false); } +#[test] +fn real_opendataloader_style_table_cell_claim_grounds() { + let grounding = temp_json( + "real-odl-style-table", + r#"{ + "file name": "table.pdf", + "number of pages": 1, + "kids": [ + { + "type": "table", + "id": 13, + "page number": 1, + "bounding box": [10, 10, 240, 80], + "rows": [ + { + "cells": [ + { + "type": "table_cell", + "page number": 1, + "bounding box": [20, 20, 110, 50], + "content": "Metric" + }, + { + "type": "table_cell", + "page number": 1, + "bounding box": [120, 20, 230, 50], + "content": "$12.4M" + } + ] + } + ] + } + ] + }"#, + ); + let citations = temp_json( + "real-odl-style-table-cell-citations", + r#"{ + "claims": [ + { + "kind": "table_cell", + "text": "$12.4M", + "citation": { + "table_id": "odl-13", + "cell": { + "row": 1, + "col": 2 + } + } + } + ] + }"#, + ); + let report = parse_success(&[ + "verify", + grounding.to_str().unwrap(), + "--grounding", + "opendataloader-json", + "--citations", + citations.to_str().unwrap(), + ]); + + assert_eq!(report["grounding"]["capabilities"]["tables"], true); + assert_eq!( + report["capability_limits"], + serde_json::json!([ + "missing_fingerprint", + "missing_spans", + "missing_char_offsets", + "unknown_coordinate_origin" + ]) + ); + assert_eq!(report["checks"][0]["status"], "grounded"); + assert_eq!(report["checks"][0]["match_method"], "table_cell_lookup"); + assert_eq!(report["checks"][0]["evidence"]["page"], "page-1"); + assert_eq!(report["checks"][0]["evidence"]["text"], "$12.4M"); + assert_eq!( + report["checks"][0]["evidence"]["bbox"], + serde_json::json!([12000, 2000, 23000, 5000]) + ); + assert_eq!(report["all_evidence_grounded"], true); +} + #[test] fn foreign_source_without_fingerprint_blocks_fingerprint_pinned_citations() { let grounding = odl_example(); diff --git a/docs/execution-status.md b/docs/execution-status.md index 3aff4e6..b87ed09 100644 --- a/docs/execution-status.md +++ b/docs/execution-status.md @@ -18,7 +18,7 @@ The committed implementation now includes: - Quantized page/span extraction at the backend boundary, plus a basic deterministic layout pass that assembles paragraph `text_block` elements, fixture-backed alpha heading and flat list-item elements, and simple column reading order for the current born-digital fixtures. Current alpha layout confidence is explicit for heading signals, and below-threshold layout confidence emits deterministic `low_confidence_reading_order` diagnostics instead of staying silent. Fixture validation binds selected `fixture.json` expectations to committed extraction/layout goldens and binds current alpha text/Markdown exports to committed layout output so current read-order, element-type, heading-export, list-item, and export cases fail closed on drift. - An internal layout evaluator scaffold exists at `fixtures/evaluate_layout_alpha.py` and `make layout-evaluator-alpha`. It reads committed `fixture.json`, `extraction.json`, `layout.json`, `text.txt`, and `markdown.md` files, summarizes alpha element-type and subset coverage, and fails closed on missing layout expectations, dangling/invalid warning references, confidence-policy drift, export-golden drift, invalid span expectation metadata, expected page/span-text/font-id drift, expected rotation drift, or drift in fixture-backed reading order / heading / list-item / hyphenation / ligature cases. - Schema/example/profile validation is green through `schemas/validate_examples.py` using `jsonschema` draft 2020-12 validation, including the crop descriptor artifact contract plus referential-integrity and bbox sanity checks outside JSON Schema. -- `ethos verify` now produces non-empty quote, value, presence, and table-cell verification checks over native Ethos document JSON and synthetic OpenDataLoader-style JSON through `--grounding opendataloader-json`; it also verifies quote/value/presence citations over pinned real OpenDataLoader 2.4.7 JSON, including grounded and ungrounded cases. Citation/config inputs are rejected when they drift outside the closed schemas. The public demo harness covers grounded, ungrounded, split-quote, not-found, stale-fingerprint, unsupported non-v1 claim, capability-limited, malformed-citation, malformed OpenDataLoader-style input, and summary-format reject paths. +- `ethos verify` now produces non-empty quote, value, presence, and table-cell verification checks over native Ethos document JSON and synthetic OpenDataLoader-style JSON through `--grounding opendataloader-json`; it also verifies quote/value/presence citations over pinned real OpenDataLoader 2.4.7 JSON, including grounded and ungrounded cases, and maps explicit real OpenDataLoader-style row/cell structures to table-cell grounding. Citation/config inputs are rejected when they drift outside the closed schemas. The public demo harness covers grounded, ungrounded, split-quote, not-found, stale-fingerprint, unsupported non-v1 claim, capability-limited, malformed-citation, malformed OpenDataLoader-style input, and summary-format reject paths. - Verification semantics are now trust-honest at alpha scope: quote containment is explicitly labeled, value/table-cell checks require normalized equality, fingerprint-pinned citations fail closed when source fingerprints are unavailable, and structured capability limits explain why a run is downgraded. - `make verify-alpha` is the current alpha trust-loop command: it checks native examples, split-quote evidence matching, unsupported non-v1 claim reporting, synthetic OpenDataLoader-style examples, pinned real OpenDataLoader grounded/ungrounded examples, schema validation, verify-alpha case inventory coverage, usage diagnostics for malformed citations and malformed OpenDataLoader-style structures, byte-identical repeated verification reports, byte-identical native crop descriptors, summary diagnostics for an ungrounded native case, and foreign fixture manifest hash binding. - An internal Python surface scaffold exists under `python/ethos_pdf`. It shells out to a caller-provided local `ethos` CLI binary for `ethos doc parse` JSON, Markdown, and text output, and has stdlib unit tests that use a fake local command. This is pre-alpha scaffolding for Milestone B API shape work, not a public installation or publication path. @@ -57,7 +57,7 @@ Milestone A has an accepted internal Gate Zero decision for roadmap control, so | Python surface scaffold | Landed: internal stdlib wrapper over a caller-provided local `ethos doc parse` command, with explicit JSON/Markdown/text methods, page selection passthrough, diagnostics passthrough, timeout handling, command failure reporting, and mocked-command unit coverage | Native binding work, broader API design, and public setup path remain future work | | Font policy groundwork | Partially landed: substitution table and profile policy are present; fixture output uses deterministic substitution IDs, and committed embedded-font fixture metadata now binds expected extraction font identity | Bundled fallback asset hashing and broader font/CID validation remain open | | Schema/example validation | Landed: schemas, examples, deterministic profile, referential integrity, and bbox sanity pass the `jsonschema` validation gate | Contract changes still require explicit versioning and compatibility review | -| Trust-layer implementation | Landed: `ethos verify` quote/value/presence/table-cell checks, explicit quote-containment labeling, normalized equality for value/table-cell checks, stale and unverifiable fingerprint handling, unsupported claim reporting, structured capability limits, native Ethos JSON path, ODL-style adapter path with synthetic table/cell mapping, pinned real OpenDataLoader 2.4.7 grounded/ungrounded fixtures, foreign fixture manifest hash validation, crop-ref evidence plumbing, stable logical native crop refs, native crop descriptor artifacts, raw BGRA crop rendering in `ethos-pdf`, CLI PNG crop artifact production for bound native source PDFs, same-host rendered crop repeatability check, rendered-crop run comparison helper, strict citation/config input validation, citation input schema, split-quote fixture coverage, explicit unsupported non-v1 claim reporting, OpenDataLoader-style structure diagnostics for malformed bbox and unknown-page references, verify-alpha case inventory checks, and demo fixtures | Still needed: real OpenDataLoader table-cell grounding, additional adapter hardening against broader real output shapes, future claim-kind expansion outside the current v1 alpha policy, and a decision on whether cross-platform rendered crop artifact equality is worth pursuing after the current macOS/Linux bbox drift finding | +| Trust-layer implementation | Landed: `ethos verify` quote/value/presence/table-cell checks, explicit quote-containment labeling, normalized equality for value/table-cell checks, stale and unverifiable fingerprint handling, unsupported claim reporting, structured capability limits, native Ethos JSON path, ODL-style adapter path with synthetic table/cell mapping and explicit real ODL-style row/cell table grounding, pinned real OpenDataLoader 2.4.7 grounded/ungrounded fixtures, foreign fixture manifest hash validation, crop-ref evidence plumbing, stable logical native crop refs, native crop descriptor artifacts, raw BGRA crop rendering in `ethos-pdf`, CLI PNG crop artifact production for bound native source PDFs, same-host rendered crop repeatability check, rendered-crop run comparison helper, strict citation/config input validation, citation input schema, split-quote fixture coverage, explicit unsupported non-v1 claim reporting, OpenDataLoader-style structure diagnostics for malformed bbox and unknown-page references, verify-alpha case inventory checks, and demo fixtures | Still needed: additional adapter hardening against broader real output shapes, future claim-kind expansion outside the current v1 alpha policy, and a decision on whether cross-platform rendered crop artifact equality is worth pursuing after the current macOS/Linux bbox drift finding | | WS-HARNESS readiness | Partially landed: readiness path is green for frozen corpus/hardware and pinned competitors, Gate Zero evidence preflight validates the current `ethos-bench` handoff, and gates fail closed if those records regress | Public-safe comparison report flow, release/package approval, claim-wording approval, and future evidence-refresh workflow still need hardening | | Determinism workflow | Landed: macOS arm64, Linux x64, and Windows x64 matrix entries run core contract tests; PDFium-backed corpus work stays gated on an explicitly configured pinned runtime; static workflow tests guard the matrix | Windows PDFium runtime provisioning and broader cross-platform corpus validation remain future work |