From f2f6b510f88dc7e62a8603987c26a923479bbcf8 Mon Sep 17 00:00:00 2001 From: docushell-admin Date: Thu, 18 Jun 2026 11:03:57 +0530 Subject: [PATCH] Address Milestone C audit findings Signed-off-by: docushell-admin --- .github/scripts/makefile_guard.py | 46 +++++ .../test_milestone_b_internal_checks.py | 27 +-- .../test_milestone_c_closeout_record.py | 3 +- .../test_milestone_c_internal_checks.py | 27 +-- .github/scripts/test_rag_chunk_alpha.py | 27 +-- .github/scripts/test_security_report_alpha.py | 27 +-- crates/ethos-cli/src/cmd/rag.rs | 140 ++++++++----- crates/ethos-cli/src/cmd/security.rs | 192 ++++++++++++------ crates/ethos-cli/src/main.rs | 24 +-- crates/ethos-cli/tests/rag.rs | 99 ++++++++- crates/ethos-cli/tests/security_report.rs | 98 ++++++++- crates/ethos-cli/tests/verify.rs | 6 +- crates/ethos-core/src/codes.rs | 136 +++++++++++++ ...estone-c-closeout-validation-2026-06-18.md | 3 +- schemas/examples/security-report.example.json | 54 +---- .../security-report.full.example.json | 82 ++++++++ schemas/security_report_validation.py | 6 +- schemas/test_security_report_validation.py | 62 +++++- schemas/validate_examples.py | 121 ++++++++++- 19 files changed, 871 insertions(+), 309 deletions(-) create mode 100644 .github/scripts/makefile_guard.py create mode 100644 schemas/examples/security-report.full.example.json diff --git a/.github/scripts/makefile_guard.py b/.github/scripts/makefile_guard.py new file mode 100644 index 0000000..d3b4a22 --- /dev/null +++ b/.github/scripts/makefile_guard.py @@ -0,0 +1,46 @@ +#!/usr/bin/env python3 +# +# Copyright 2026 The Ethos maintainers +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from __future__ import annotations + +from pathlib import Path + + +ROOT = Path(__file__).resolve().parents[2] +MAKEFILE = ROOT / "Makefile" + + +def makefile_text() -> str: + return MAKEFILE.read_text(encoding="utf-8") + + +def target_block(target: str) -> str: + lines = makefile_text().splitlines() + start = None + for index, line in enumerate(lines): + if line == f"{target}:": + start = index + 1 + break + if start is None: + raise AssertionError(f"{target} target is missing") + + block: list[str] = [] + for line in lines[start:]: + if line and not line.startswith(("\t", " ")): + break + block.append(line) + return "\n".join(block) diff --git a/.github/scripts/test_milestone_b_internal_checks.py b/.github/scripts/test_milestone_b_internal_checks.py index 9ab276a..b9e7881 100644 --- a/.github/scripts/test_milestone_b_internal_checks.py +++ b/.github/scripts/test_milestone_b_internal_checks.py @@ -18,33 +18,8 @@ from __future__ import annotations import unittest -from pathlib import Path - -ROOT = Path(__file__).resolve().parents[2] -MAKEFILE = ROOT / "Makefile" - - -def makefile_text() -> str: - return MAKEFILE.read_text(encoding="utf-8") - - -def target_block(target: str) -> str: - lines = makefile_text().splitlines() - start = None - for index, line in enumerate(lines): - if line == f"{target}:": - start = index + 1 - break - if start is None: - raise AssertionError(f"{target} target is missing") - - block: list[str] = [] - for line in lines[start:]: - if line and not line.startswith(("\t", " ")): - break - block.append(line) - return "\n".join(block) +from makefile_guard import makefile_text, target_block class MilestoneBInternalCheckTests(unittest.TestCase): diff --git a/.github/scripts/test_milestone_c_closeout_record.py b/.github/scripts/test_milestone_c_closeout_record.py index e5d2f30..9428ce3 100644 --- a/.github/scripts/test_milestone_c_closeout_record.py +++ b/.github/scripts/test_milestone_c_closeout_record.py @@ -44,7 +44,8 @@ def test_record_is_indexed(self) -> None: def test_record_names_internal_validation_command(self) -> None: text = record_text() - self.assertIn("Validated `main` HEAD: `4e3adbb`", text) + self.assertIn("Validated source HEAD before this record: `4e3adbb`", text) + self.assertIn("Closeout record commit on `main`: `21d1810`", text) self.assertIn( "make milestone-c-internal-checks PYTHON=/bin/python", text, diff --git a/.github/scripts/test_milestone_c_internal_checks.py b/.github/scripts/test_milestone_c_internal_checks.py index 858b545..77ee803 100644 --- a/.github/scripts/test_milestone_c_internal_checks.py +++ b/.github/scripts/test_milestone_c_internal_checks.py @@ -18,33 +18,8 @@ from __future__ import annotations import unittest -from pathlib import Path - -ROOT = Path(__file__).resolve().parents[2] -MAKEFILE = ROOT / "Makefile" - - -def makefile_text() -> str: - return MAKEFILE.read_text(encoding="utf-8") - - -def target_block(target: str) -> str: - lines = makefile_text().splitlines() - start = None - for index, line in enumerate(lines): - if line == f"{target}:": - start = index + 1 - break - if start is None: - raise AssertionError(f"{target} target is missing") - - block: list[str] = [] - for line in lines[start:]: - if line and not line.startswith(("\t", " ")): - break - block.append(line) - return "\n".join(block) +from makefile_guard import makefile_text, target_block class MilestoneCInternalCheckTests(unittest.TestCase): diff --git a/.github/scripts/test_rag_chunk_alpha.py b/.github/scripts/test_rag_chunk_alpha.py index 68d47af..07a5381 100644 --- a/.github/scripts/test_rag_chunk_alpha.py +++ b/.github/scripts/test_rag_chunk_alpha.py @@ -18,33 +18,8 @@ from __future__ import annotations import unittest -from pathlib import Path - -ROOT = Path(__file__).resolve().parents[2] -MAKEFILE = ROOT / "Makefile" - - -def makefile_text() -> str: - return MAKEFILE.read_text(encoding="utf-8") - - -def target_block(target: str) -> str: - lines = makefile_text().splitlines() - start = None - for index, line in enumerate(lines): - if line == f"{target}:": - start = index + 1 - break - if start is None: - raise AssertionError(f"{target} target is missing") - - block: list[str] = [] - for line in lines[start:]: - if line and not line.startswith(("\t", " ")): - break - block.append(line) - return "\n".join(block) +from makefile_guard import makefile_text, target_block class RagChunkAlphaTests(unittest.TestCase): diff --git a/.github/scripts/test_security_report_alpha.py b/.github/scripts/test_security_report_alpha.py index 1cbfaf4..1db347c 100644 --- a/.github/scripts/test_security_report_alpha.py +++ b/.github/scripts/test_security_report_alpha.py @@ -18,33 +18,8 @@ from __future__ import annotations import unittest -from pathlib import Path - -ROOT = Path(__file__).resolve().parents[2] -MAKEFILE = ROOT / "Makefile" - - -def makefile_text() -> str: - return MAKEFILE.read_text(encoding="utf-8") - - -def target_block(target: str) -> str: - lines = makefile_text().splitlines() - start = None - for index, line in enumerate(lines): - if line == f"{target}:": - start = index + 1 - break - if start is None: - raise AssertionError(f"{target} target is missing") - - block: list[str] = [] - for line in lines[start:]: - if line and not line.startswith(("\t", " ")): - break - block.append(line) - return "\n".join(block) +from makefile_guard import makefile_text, target_block class SecurityReportAlphaTests(unittest.TestCase): diff --git a/crates/ethos-cli/src/cmd/rag.rs b/crates/ethos-cli/src/cmd/rag.rs index 147e2c3..ca93c46 100644 --- a/crates/ethos-cli/src/cmd/rag.rs +++ b/crates/ethos-cli/src/cmd/rag.rs @@ -63,6 +63,27 @@ struct RagChunkRefs<'a> { impl<'a> RagChunkRefs<'a> { fn new(doc: &'a Document) -> Self { + let mut excluded_element_warnings = BTreeMap::new(); + let mut excluded_span_warnings = BTreeMap::new(); + let mut warning_codes = BTreeMap::new(); + for warning in doc + .payload + .security_warnings + .iter() + .chain(doc.payload.parser_warnings.iter()) + { + warning_codes.insert(warning.id.as_str(), warning.code); + if warning.code.excludes_from_default_chunks() { + if let Some(element_ref) = warning.element_ref.as_deref() { + excluded_element_warnings + .insert(element_ref, (warning.id.as_str(), warning.code)); + } + if let Some(span_ref) = warning.span_ref.as_deref() { + excluded_span_warnings.insert(span_ref, (warning.id.as_str(), warning.code)); + } + } + } + Self { page_bounds: doc .payload @@ -96,39 +117,9 @@ impl<'a> RagChunkRefs<'a> { .iter() .map(|element| (element.id.as_str(), element.warning_refs.as_slice())) .collect(), - excluded_element_warnings: doc - .payload - .security_warnings - .iter() - .chain(doc.payload.parser_warnings.iter()) - .filter(|warning| excludes_from_default_chunks(warning.code)) - .filter_map(|warning| { - warning - .element_ref - .as_deref() - .map(|element_ref| (element_ref, (warning.id.as_str(), warning.code))) - }) - .collect(), - excluded_span_warnings: doc - .payload - .security_warnings - .iter() - .chain(doc.payload.parser_warnings.iter()) - .filter(|warning| excludes_from_default_chunks(warning.code)) - .filter_map(|warning| { - warning - .span_ref - .as_deref() - .map(|span_ref| (span_ref, (warning.id.as_str(), warning.code))) - }) - .collect(), - warning_codes: doc - .payload - .security_warnings - .iter() - .chain(doc.payload.parser_warnings.iter()) - .map(|warning| (warning.id.as_str(), warning.code)) - .collect(), + excluded_element_warnings, + excluded_span_warnings, + warning_codes, schema_version: doc.schema_version.as_str(), document_fingerprint: doc.fingerprint.as_str(), source_fingerprint: doc.source.fingerprint.as_str(), @@ -137,16 +128,17 @@ impl<'a> RagChunkRefs<'a> { } } -fn excludes_from_default_chunks(code: WarningCode) -> bool { - matches!( - code, - WarningCode::HiddenTextDetected - | WarningCode::OffPageTextDetected - | WarningCode::LowContrastTextDetected - ) +fn validate_chunk_refs(chunk: &Chunk, refs: &RagChunkRefs<'_>) -> Result<(), Failure> { + validate_chunk_required_refs(chunk)?; + let page_refs = validate_chunk_page_refs(chunk, refs)?; + let mut backed_pages = BTreeSet::new(); + validate_chunk_element_refs(chunk, refs, &page_refs, &mut backed_pages)?; + validate_chunk_bboxes(chunk, refs, &page_refs, &mut backed_pages)?; + validate_backed_page_refs(chunk, &page_refs, &backed_pages)?; + validate_chunk_warning_refs(chunk, refs) } -fn validate_chunk_refs(chunk: &Chunk, refs: &RagChunkRefs<'_>) -> Result<(), Failure> { +fn validate_chunk_required_refs(chunk: &Chunk) -> Result<(), Failure> { if chunk.element_refs.is_empty() { return Err(Failure::Usage(format!( "chunk {} must include at least one element_ref", @@ -165,7 +157,13 @@ fn validate_chunk_refs(chunk: &Chunk, refs: &RagChunkRefs<'_>) -> Result<(), Fai chunk.id ))); } + Ok(()) +} +fn validate_chunk_page_refs<'a>( + chunk: &'a Chunk, + refs: &RagChunkRefs<'a>, +) -> Result, Failure> { let mut page_refs = BTreeSet::new(); for id in &chunk.page_refs { if !refs.page_bounds.contains_key(id.as_str()) { @@ -174,10 +172,23 @@ fn validate_chunk_refs(chunk: &Chunk, refs: &RagChunkRefs<'_>) -> Result<(), Fai chunk.id, id ))); } - page_refs.insert(id.as_str()); + if !page_refs.insert(id.as_str()) { + return Err(Failure::Usage(format!( + "chunk {} has duplicate page_ref {}", + chunk.id, id + ))); + } } + Ok(page_refs) +} - let mut backed_pages = BTreeSet::new(); +fn validate_chunk_element_refs<'a>( + chunk: &'a Chunk, + refs: &RagChunkRefs<'a>, + page_refs: &BTreeSet<&'a str>, + backed_pages: &mut BTreeSet<&'a str>, +) -> Result<(), Failure> { + let mut element_refs = BTreeSet::new(); for id in &chunk.element_refs { let Some(page) = refs.element_pages.get(id.as_str()) else { return Err(Failure::Usage(format!( @@ -191,9 +202,25 @@ fn validate_chunk_refs(chunk: &Chunk, refs: &RagChunkRefs<'_>) -> Result<(), Fai chunk.id, id, page ))); } + if !element_refs.insert(id.as_str()) { + return Err(Failure::Usage(format!( + "chunk {} has duplicate element_ref {}", + chunk.id, id + ))); + } validate_element_default_chunk_warnings(chunk, id, refs)?; backed_pages.insert(*page); } + Ok(()) +} + +fn validate_chunk_bboxes<'a>( + chunk: &'a Chunk, + refs: &RagChunkRefs<'a>, + page_refs: &BTreeSet<&'a str>, + backed_pages: &mut BTreeSet<&'a str>, +) -> Result<(), Failure> { + let mut bboxes = BTreeSet::new(); for (idx, bbox) in chunk.bboxes.iter().enumerate() { let Some(page_bounds) = refs.page_bounds.get(bbox.page.as_str()) else { return Err(Failure::Usage(format!( @@ -220,8 +247,22 @@ fn validate_chunk_refs(chunk: &Chunk, refs: &RagChunkRefs<'_>) -> Result<(), Fai chunk.id, idx, bbox.page ))); } + if !bboxes.insert((bbox.page.as_str(), bbox.bbox.to_array())) { + return Err(Failure::Usage(format!( + "chunk {} bboxes[{}] duplicates an earlier bbox on page {}", + chunk.id, idx, bbox.page + ))); + } backed_pages.insert(bbox.page.as_str()); } + Ok(()) +} + +fn validate_backed_page_refs( + chunk: &Chunk, + page_refs: &BTreeSet<&str>, + backed_pages: &BTreeSet<&str>, +) -> Result<(), Failure> { for id in page_refs { if !backed_pages.contains(id) { return Err(Failure::Usage(format!( @@ -230,6 +271,10 @@ fn validate_chunk_refs(chunk: &Chunk, refs: &RagChunkRefs<'_>) -> Result<(), Fai ))); } } + Ok(()) +} + +fn validate_chunk_warning_refs(chunk: &Chunk, refs: &RagChunkRefs<'_>) -> Result<(), Failure> { for id in &chunk.warning_refs { let Some(code) = refs.warning_codes.get(id.as_str()) else { return Err(Failure::Usage(format!( @@ -237,7 +282,7 @@ fn validate_chunk_refs(chunk: &Chunk, refs: &RagChunkRefs<'_>) -> Result<(), Fai chunk.id, id ))); }; - if excludes_from_default_chunks(*code) { + if code.excludes_from_default_chunks() { return Err(Failure::Usage(format!( "chunk {} references default-excluded warning_ref {} ({})", chunk.id, @@ -261,9 +306,12 @@ fn validate_element_default_chunk_warnings( .flat_map(|warning_refs| warning_refs.iter()) { let Some(code) = refs.warning_codes.get(warning_ref.as_str()) else { - continue; + return Err(Failure::Usage(format!( + "chunk {} element_ref {} references unknown warning_ref {}", + chunk.id, element_ref, warning_ref + ))); }; - if excludes_from_default_chunks(*code) { + if code.excludes_from_default_chunks() { return Err(Failure::Usage(format!( "chunk {} element_ref {} carries default-excluded warning_ref {} ({})", chunk.id, diff --git a/crates/ethos-cli/src/cmd/security.rs b/crates/ethos-cli/src/cmd/security.rs index 534d63a..b8f10f9 100644 --- a/crates/ethos-cli/src/cmd/security.rs +++ b/crates/ethos-cli/src/cmd/security.rs @@ -17,12 +17,13 @@ use std::cmp::Ordering; use std::collections::BTreeMap; -use ethos_core::codes::WarningCode; use ethos_core::error::EthosError; use ethos_core::model::{Document, Element, Page, Span, Warning}; use crate::{read_document, write_output, Failure, SecurityReportArgs}; +const PREVIEW_MAX_CHARS: usize = 120; + pub(crate) fn security_report(args: SecurityReportArgs) -> Result<(), Failure> { let doc = read_document(&args.input)?; let out = security_report_output_bytes(&doc)?; @@ -31,6 +32,23 @@ pub(crate) fn security_report(args: SecurityReportArgs) -> Result<(), Failure> { fn security_report_output_bytes(doc: &Document) -> Result, Failure> { let refs = SecurityReportRefs::new(doc); + let warnings = sorted_security_warnings(doc)?; + let (summary, findings) = security_report_records(&warnings, &refs)?; + let value = security_report_value(doc, summary, findings); + let mut bytes = + ethos_core::c14n::c14n_bytes(&value).map_err(|e| EthosError::internal(e.message))?; + bytes.push(b'\n'); + Ok(bytes) +} + +fn sorted_security_warnings(doc: &Document) -> Result, Failure> { + reject_security_codes_in_parser_warnings(doc)?; + let mut warnings = validate_security_warning_codes(doc)?; + warnings.sort_by(|left, right| warning_order(left, right)); + Ok(warnings) +} + +fn reject_security_codes_in_parser_warnings(doc: &Document) -> Result<(), Failure> { for warning in &doc.payload.parser_warnings { if warning.code.is_security() { return Err(Failure::Usage(format!( @@ -40,6 +58,10 @@ fn security_report_output_bytes(doc: &Document) -> Result, Failure> { ))); } } + Ok(()) +} + +fn validate_security_warning_codes(doc: &Document) -> Result, Failure> { let mut warnings = Vec::with_capacity(doc.payload.security_warnings.len()); for warning in &doc.payload.security_warnings { if !warning.code.is_security() { @@ -51,8 +73,13 @@ fn security_report_output_bytes(doc: &Document) -> Result, Failure> { } warnings.push(warning); } - warnings.sort_by(|left, right| warning_order(left, right)); + Ok(warnings) +} +fn security_report_records( + warnings: &[&Warning], + refs: &SecurityReportRefs<'_>, +) -> Result<(BTreeMap, Vec), Failure> { let mut summary: BTreeMap = BTreeMap::new(); let mut findings = Vec::with_capacity(warnings.len()); for (index, warning) in warnings.iter().enumerate() { @@ -60,10 +87,17 @@ fn security_report_output_bytes(doc: &Document) -> Result, Failure> { *summary .entry(warning.code.as_str().to_string()) .or_insert(0) += 1; - findings.push(finding_record(index, warning, &refs)?); + findings.push(finding_record(index, warning, refs)?); } + Ok((summary, findings)) +} - let value = serde_json::json!({ +fn security_report_value( + doc: &Document, + summary: BTreeMap, + findings: Vec, +) -> serde_json::Value { + serde_json::json!({ "schema_version": doc.schema_version.as_str(), "document_fingerprint": doc.fingerprint.as_str(), "source_fingerprint": doc.source.fingerprint.as_str(), @@ -80,11 +114,7 @@ fn security_report_output_bytes(doc: &Document) -> Result, Failure> { "scripts": [], "links": [], }, - }); - let mut bytes = - ethos_core::c14n::c14n_bytes(&value).map_err(|e| EthosError::internal(e.message))?; - bytes.push(b'\n'); - Ok(bytes) + }) } struct SecurityReportRefs<'a> { @@ -118,32 +148,6 @@ impl<'a> SecurityReportRefs<'a> { } } -fn inventory_backed_warning_code(code: WarningCode) -> bool { - matches!( - code, - WarningCode::AnnotationsPresent - | WarningCode::ExternalLinksPresent - | WarningCode::UnsupportedAnnotation - ) -} - -fn text_backed_warning_code(code: WarningCode) -> bool { - matches!( - code, - WarningCode::HiddenTextDetected - | WarningCode::OffPageTextDetected - | WarningCode::LowContrastTextDetected - ) -} - -fn page_backed_warning_code(code: WarningCode) -> bool { - text_backed_warning_code(code) || matches!(code, WarningCode::ImageOnlyPage) -} - -fn excludes_from_default_chunks(code: WarningCode) -> bool { - text_backed_warning_code(code) -} - fn warning_order(left: &Warning, right: &Warning) -> Ordering { ( left.code.as_str(), @@ -164,7 +168,7 @@ fn warning_order(left: &Warning, right: &Warning) -> Ordering { } fn block_inventory_backed_warning(warning: &Warning) -> Result<(), Failure> { - if inventory_backed_warning_code(warning.code) { + if warning.code.is_inventory_backed_security() { return Err(Failure::Usage(format!( "security report warning {} ({}) requires inventory data not available in canonical document", warning.id, @@ -180,6 +184,7 @@ fn finding_record( refs: &SecurityReportRefs<'_>, ) -> Result { validate_warning_refs(warning, refs)?; + let message = fixed_finding_message(warning)?; let mut finding = serde_json::Map::new(); finding.insert( "id".to_string(), @@ -191,7 +196,7 @@ fn finding_record( ); finding.insert( "message".to_string(), - serde_json::Value::String(warning.message.clone()), + serde_json::Value::String(message.to_string()), ); if let Some(page) = &warning.page { finding.insert("page".to_string(), serde_json::Value::String(page.clone())); @@ -208,45 +213,89 @@ fn finding_record( serde_json::Value::String(span_ref.clone()), ); } - if text_backed_warning_code(warning.code) { - let span_ref = warning - .span_ref - .as_deref() - .expect("text warning refs validated"); - let span = refs - .spans - .get(span_ref) - .expect("text warning span_ref validated"); - let page_ref = warning - .page - .as_deref() - .expect("text warning page validated"); - let page = refs - .pages - .get(page_ref) - .expect("text warning page validated"); - validate_span_bbox(warning, span_ref, span, page)?; - finding.insert("bbox".to_string(), serde_json::json!(span.bbox.to_array())); - finding.insert( - "text_preview".to_string(), - serde_json::Value::String(deterministic_preview(&span.text)), - ); + if warning.code.is_text_backed_security() { + insert_text_backed_fields(&mut finding, warning, refs)?; } finding.insert( "excluded_from_default_chunks".to_string(), - serde_json::Value::Bool(excludes_from_default_chunks(warning.code)), + serde_json::Value::Bool(warning.code.excludes_from_default_chunks()), ); Ok(serde_json::Value::Object(finding)) } +fn fixed_finding_message(warning: &Warning) -> Result<&'static str, Failure> { + let template = warning + .code + .security_report_message_template() + .ok_or_else(|| { + Failure::Usage(format!( + "security report warning {} ({}) is not a security warning code", + warning.id, + warning.code.as_str() + )) + })?; + if warning.message != template { + return Err(Failure::Usage(format!( + "security report warning {} ({}) message must match fixed template", + warning.id, + warning.code.as_str() + ))); + } + Ok(template) +} + +fn insert_text_backed_fields( + finding: &mut serde_json::Map, + warning: &Warning, + refs: &SecurityReportRefs<'_>, +) -> Result<(), Failure> { + let span_ref = warning + .span_ref + .as_deref() + .expect("text warning refs validated"); + let span = refs + .spans + .get(span_ref) + .expect("text warning span_ref validated"); + let page_ref = warning + .page + .as_deref() + .expect("text warning page validated"); + let page = refs + .pages + .get(page_ref) + .expect("text warning page validated"); + validate_span_bbox(warning, span_ref, span, page)?; + finding.insert("bbox".to_string(), serde_json::json!(span.bbox.to_array())); + finding.insert( + "text_preview".to_string(), + serde_json::Value::String(deterministic_preview(&span.text)), + ); + Ok(()) +} + fn validate_warning_refs(warning: &Warning, refs: &SecurityReportRefs<'_>) -> Result<(), Failure> { + reject_unsupported_region_ref(warning)?; + validate_warning_page_ref(warning, refs)?; + let element = validate_warning_element_ref(warning, refs)?; + validate_warning_span_ref(warning, refs, element) +} + +fn reject_unsupported_region_ref(warning: &Warning) -> Result<(), Failure> { if let Some(region_ref) = warning.region_ref.as_deref() { return Err(Failure::Usage(format!( "security report warning {} region_ref {} is unsupported until security report schema supports region_ref", warning.id, region_ref ))); } - if page_backed_warning_code(warning.code) && warning.page.is_none() { + Ok(()) +} + +fn validate_warning_page_ref( + warning: &Warning, + refs: &SecurityReportRefs<'_>, +) -> Result<(), Failure> { + if warning.code.is_page_backed_security() && warning.page.is_none() { return Err(Failure::Usage(format!( "security report warning {} ({}) requires page", warning.id, @@ -261,6 +310,13 @@ fn validate_warning_refs(warning: &Warning, refs: &SecurityReportRefs<'_>) -> Re ))); } } + Ok(()) +} + +fn validate_warning_element_ref<'a>( + warning: &Warning, + refs: &'a SecurityReportRefs<'_>, +) -> Result, Failure> { let element = match warning.element_ref.as_deref() { Some(element_ref) => { let Some(element) = refs.elements.get(element_ref) else { @@ -281,7 +337,15 @@ fn validate_warning_refs(warning: &Warning, refs: &SecurityReportRefs<'_>) -> Re } None => None, }; - if text_backed_warning_code(warning.code) && warning.span_ref.is_none() { + Ok(element) +} + +fn validate_warning_span_ref( + warning: &Warning, + refs: &SecurityReportRefs<'_>, + element: Option<&Element>, +) -> Result<(), Failure> { + if warning.code.is_text_backed_security() && warning.span_ref.is_none() { return Err(Failure::Usage(format!( "security report warning {} ({}) requires span_ref", warning.id, @@ -339,7 +403,7 @@ fn validate_span_bbox( fn deterministic_preview(text: &str) -> String { let mut chars = text.chars(); - let preview: String = chars.by_ref().take(120).collect(); + let preview: String = chars.by_ref().take(PREVIEW_MAX_CHARS).collect(); if chars.next().is_some() { format!("{preview}\u{2026}") } else { diff --git a/crates/ethos-cli/src/main.rs b/crates/ethos-cli/src/main.rs index 4128d82..f76c6cd 100644 --- a/crates/ethos-cli/src/main.rs +++ b/crates/ethos-cli/src/main.rs @@ -14,16 +14,15 @@ * limitations under the License. */ -//! # `ethos` — CLI skeleton (Milestone A, WS-CONTRACTS weeks 3–4) +//! # `ethos` — source-only pre-alpha CLI //! -//! Public command groups follow the public architecture: `ethos doc …`, `ethos rag …`, -//! `ethos verify …`, plus `ethos fingerprint` (PRD §9.1). Exit codes are the public -//! contract from docs/architecture.md: 0 success, 2 usage, 3–12 stable error codes. +//! Current command groups are `ethos doc …`, `ethos rag …`, `ethos security …`, +//! `ethos verify …`, plus `ethos fingerprint`. Exit codes follow the contract from +//! docs/architecture.md: 0 success, 2 usage, 3–12 stable error codes. //! -//! Skeleton status (honest): `doc parse` is wired through the backend boundary and fails -//! with a stable code until WS-ENGINE lands PDFium; `rag chunk` and `fingerprint` are -//! fully functional over canonical JSON; `verify` runs literal quote/value, -//! presence, and table-cell checks over native Ethos JSON and ODL-style JSON. +//! Current status (honest): `doc parse` is wired through the backend boundary; `rag chunk`, +//! `security report`, and `fingerprint` operate over canonical JSON; `verify` runs literal +//! quote/value, presence, and table-cell checks over native Ethos JSON and ODL-style JSON. mod assembly; mod cmd; @@ -367,11 +366,10 @@ pub(crate) fn read_file_limited(path: &Path, max_bytes: u64) -> Result, pub(crate) fn read_document(path: &Path) -> Result { let bytes = read_file_limited(path, default_max_input_bytes())?; - let doc: Document = serde_json::from_slice(&bytes).map_err(|_| { - Failure::Usage( - "input is not a canonical ethos document (schema urn:ethos:schema:document:1)" - .to_string(), - ) + let doc: Document = serde_json::from_slice(&bytes).map_err(|error| { + Failure::Usage(format!( + "input is not a canonical ethos document (schema urn:ethos:schema:document:1): {error}" + )) })?; doc.verify_integrity().map_err(|error| { Failure::Usage(format!( diff --git a/crates/ethos-cli/tests/rag.rs b/crates/ethos-cli/tests/rag.rs index 854e58c..18018f0 100644 --- a/crates/ethos-cli/tests/rag.rs +++ b/crates/ethos-cli/tests/rag.rs @@ -16,10 +16,10 @@ use std::path::{Path, PathBuf}; use std::process::{Command, Output}; -use std::time::{SystemTime, UNIX_EPOCH}; use ethos_core::model::Document; use serde_json::Value; +use tempfile::TempDir; fn ethos_bin() -> &'static str { env!("CARGO_BIN_EXE_ethos") @@ -49,17 +49,25 @@ fn json_file(path: impl AsRef) -> Value { serde_json::from_slice(&bytes).expect("JSON fixture parses") } -fn temp_json(name: &str, json: &str) -> PathBuf { - let nanos = SystemTime::now() - .duration_since(UNIX_EPOCH) - .expect("clock after unix epoch") - .as_nanos(); - let path = std::env::temp_dir().join(format!("ethos-{name}-{nanos}.json")); +struct TempJson { + _dir: TempDir, + path: PathBuf, +} + +impl TempJson { + fn to_str(&self) -> Option<&str> { + self.path.to_str() + } +} + +fn temp_json(name: &str, json: &str) -> TempJson { + let dir = tempfile::tempdir().expect("temp dir is writable"); + let path = dir.path().join(format!("{name}.json")); std::fs::write(&path, json).expect("temp JSON is writable"); - path + TempJson { _dir: dir, path } } -fn document_with_mutated_chunk(name: &str, mutate: impl FnOnce(&mut Value)) -> PathBuf { +fn document_with_mutated_chunk(name: &str, mutate: impl FnOnce(&mut Value)) -> TempJson { let mut doc = json_file(document_example()); mutate(&mut doc); @@ -171,6 +179,60 @@ fn rag_chunk_rejects_empty_chunk_bboxes() { .contains("chunk c000001 must include at least one bbox")); } +#[test] +fn rag_chunk_rejects_duplicate_chunk_refs() { + type ChunkMutator = fn(&mut Value); + type DuplicateRefCase = (&'static str, ChunkMutator, &'static str); + + let cases: [DuplicateRefCase; 3] = [ + ( + "duplicate-element-ref", + |doc: &mut Value| { + let element_ref = doc["payload"]["chunks"][0]["element_refs"][0].clone(); + doc["payload"]["chunks"][0]["element_refs"] + .as_array_mut() + .expect("fixture chunk element_refs are an array") + .push(element_ref); + }, + "chunk c000001 has duplicate element_ref", + ), + ( + "duplicate-page-ref", + |doc: &mut Value| { + let page_ref = doc["payload"]["chunks"][0]["page_refs"][0].clone(); + doc["payload"]["chunks"][0]["page_refs"] + .as_array_mut() + .expect("fixture chunk page_refs are an array") + .push(page_ref); + }, + "chunk c000001 has duplicate page_ref", + ), + ( + "duplicate-bbox", + |doc: &mut Value| { + let bbox = doc["payload"]["chunks"][0]["bboxes"][0].clone(); + doc["payload"]["chunks"][0]["bboxes"] + .as_array_mut() + .expect("fixture chunk bboxes are an array") + .push(bbox); + }, + "duplicates an earlier bbox on page", + ), + ]; + for (name, mutate, expected) in cases { + let document = document_with_mutated_chunk(name, mutate); + let output = run_ethos(&["rag", "chunk", document.to_str().unwrap()]); + + assert_eq!(output.status.code(), Some(2), "{name}"); + assert_eq!(output.stdout, b"", "{name}"); + assert!( + String::from_utf8_lossy(&output.stderr).contains(expected), + "{name}: {}", + String::from_utf8_lossy(&output.stderr) + ); + } +} + #[test] fn rag_chunk_rejects_chunk_bbox_page_not_listed_in_page_refs() { let mut expected_page_id = String::new(); @@ -319,6 +381,25 @@ fn rag_chunk_rejects_unknown_chunk_warning_ref() { .contains("chunk c000001 references unknown warning_ref w999999")); } +#[test] +fn rag_chunk_rejects_unknown_element_warning_ref() { + let mut expected_element_id = String::new(); + let document = document_with_mutated_chunk("stale-element-warning-ref-document", |doc| { + expected_element_id = doc["payload"]["chunks"][0]["element_refs"][0] + .as_str() + .expect("fixture chunk element_ref is a string") + .to_string(); + doc["payload"]["elements"][0]["warning_refs"] = serde_json::json!(["w999999"]); + }); + let output = run_ethos(&["rag", "chunk", document.to_str().unwrap()]); + + assert_eq!(output.status.code(), Some(2)); + assert_eq!(output.stdout, b""); + assert!(String::from_utf8_lossy(&output.stderr).contains(&format!( + "chunk c000001 element_ref {expected_element_id} references unknown warning_ref w999999" + ))); +} + #[test] fn rag_chunk_rejects_default_excluded_chunk_warning_refs() { for code in [ diff --git a/crates/ethos-cli/tests/security_report.rs b/crates/ethos-cli/tests/security_report.rs index 1e3086d..a89b032 100644 --- a/crates/ethos-cli/tests/security_report.rs +++ b/crates/ethos-cli/tests/security_report.rs @@ -20,6 +20,7 @@ use std::time::{SystemTime, UNIX_EPOCH}; use ethos_core::model::Document; use serde_json::Value; +use tempfile::TempDir; fn ethos_bin() -> &'static str { env!("CARGO_BIN_EXE_ethos") @@ -33,6 +34,10 @@ fn document_example() -> PathBuf { repo_root().join("schemas/examples/document.example.json") } +fn security_report_example() -> PathBuf { + repo_root().join("schemas/examples/security-report.example.json") +} + fn run_ethos(args: &[&str]) -> Output { Command::new(ethos_bin()) .args(args) @@ -53,13 +58,25 @@ fn json_file(path: impl AsRef) -> Value { serde_json::from_slice(&bytes).expect("JSON fixture parses") } -fn temp_json(name: &str, json: &str) -> PathBuf { - let path = temp_path(name, "json"); +struct TempJson { + _dir: TempDir, + path: PathBuf, +} + +impl TempJson { + fn to_str(&self) -> Option<&str> { + self.path.to_str() + } +} + +fn temp_json(name: &str, json: &str) -> TempJson { + let dir = tempfile::tempdir().expect("temp dir is writable"); + let path = dir.path().join(format!("{name}.json")); std::fs::write(&path, json).expect("temp JSON is writable"); - path + TempJson { _dir: dir, path } } -fn document_with_mutated_warning(name: &str, mutate: impl FnOnce(&mut Value)) -> PathBuf { +fn document_with_mutated_warning(name: &str, mutate: impl FnOnce(&mut Value)) -> TempJson { let mut doc = json_file(document_example()); mutate(&mut doc); @@ -72,6 +89,23 @@ fn document_with_mutated_warning(name: &str, mutate: impl FnOnce(&mut Value)) -> ) } +#[test] +fn security_report_matches_schema_example_json() { + let output = run_ethos(&["security", "report", document_example().to_str().unwrap()]); + + assert!( + output.status.success(), + "ethos security report failed\nstatus: {:?}\nstderr:\n{}", + output.status.code(), + String::from_utf8_lossy(&output.stderr) + ); + assert_eq!(output.stderr, b""); + + let actual: Value = serde_json::from_slice(&output.stdout).expect("report JSON parses"); + let expected = json_file(security_report_example()); + assert_eq!(actual, expected); +} + #[test] fn security_report_derives_text_backed_warning_from_document() { let output = run_ethos(&["security", "report", document_example().to_str().unwrap()]); @@ -170,6 +204,62 @@ fn security_report_output_is_byte_identical_across_runs() { assert_eq!(first.stdout, second.stdout); } +#[test] +fn security_report_rejects_warning_message_template_drift() { + let document = document_with_mutated_warning("security-warning-message-drift", |doc| { + doc["payload"]["security_warnings"][0]["message"] = + serde_json::json!("hidden text from local path /tmp/input.pdf"); + }); + + let output = run_ethos(&["security", "report", document.to_str().unwrap()]); + + assert_eq!(output.status.code(), Some(2)); + assert_eq!(output.stdout, b""); + assert!(String::from_utf8_lossy(&output.stderr).contains( + "security report warning w0001 (hidden_text_detected) message must match fixed template" + )); +} + +#[test] +fn security_report_orders_multiple_text_backed_findings_deterministically() { + let document = document_with_mutated_warning("multiple-text-backed-security-warnings", |doc| { + doc["payload"]["spans"][1]["text"] = serde_json::json!("x".repeat(121)); + doc["payload"]["security_warnings"] + .as_array_mut() + .expect("fixture security_warnings are an array") + .push(serde_json::json!({ + "id": "w0003", + "code": "low_contrast_text_detected", + "message": "low-contrast text detected: excluded from default chunks", + "page": "p0001", + "span_ref": "s000002" + })); + }); + + let output = run_ethos(&["security", "report", document.to_str().unwrap()]); + + assert!( + output.status.success(), + "ethos security report failed\nstatus: {:?}\nstderr:\n{}", + output.status.code(), + String::from_utf8_lossy(&output.stderr) + ); + assert_eq!(output.stderr, b""); + let report: Value = serde_json::from_slice(&output.stdout).expect("report JSON parses"); + assert_eq!(report["findings"].as_array().unwrap().len(), 2); + assert_eq!(report["summary"]["hidden_text_detected"], 1); + assert_eq!(report["summary"]["low_contrast_text_detected"], 1); + assert_eq!(report["findings"][0]["id"], "f0001"); + assert_eq!(report["findings"][0]["code"], "hidden_text_detected"); + assert_eq!(report["findings"][1]["id"], "f0002"); + assert_eq!(report["findings"][1]["code"], "low_contrast_text_detected"); + assert_eq!(report["findings"][1]["span_ref"], "s000002"); + assert_eq!( + report["findings"][1]["text_preview"], + format!("{}\u{2026}", "x".repeat(120)) + ); +} + #[test] fn security_report_rejects_inventory_backed_warning_without_inventory_source() { for code in [ diff --git a/crates/ethos-cli/tests/verify.rs b/crates/ethos-cli/tests/verify.rs index d2aa780..83c9647 100644 --- a/crates/ethos-cli/tests/verify.rs +++ b/crates/ethos-cli/tests/verify.rs @@ -691,9 +691,9 @@ fn malformed_native_document_is_usage_error() { assert_eq!(output.status.code(), Some(2)); assert_eq!(output.stdout, b""); - assert!( - String::from_utf8_lossy(&output.stderr).contains("input is not a canonical ethos document") - ); + let stderr = String::from_utf8_lossy(&output.stderr); + assert!(stderr.contains("input is not a canonical ethos document")); + assert!(stderr.contains("missing field `schema_version`")); } #[test] diff --git a/crates/ethos-core/src/codes.rs b/crates/ethos-core/src/codes.rs index ea40b73..678799b 100644 --- a/crates/ethos-core/src/codes.rs +++ b/crates/ethos-core/src/codes.rs @@ -95,6 +95,57 @@ impl WarningCode { | WarningCode::ImageOnlyPage ) } + + /// Security-report codes that currently require inventory data not present in the + /// canonical document warning lane. + pub fn is_inventory_backed_security(self) -> bool { + matches!( + self, + WarningCode::AnnotationsPresent + | WarningCode::ExternalLinksPresent + | WarningCode::UnsupportedAnnotation + ) + } + + /// Security-report codes backed by text spans and excluded from default chunks. + pub fn is_text_backed_security(self) -> bool { + matches!( + self, + WarningCode::HiddenTextDetected + | WarningCode::OffPageTextDetected + | WarningCode::LowContrastTextDetected + ) + } + + /// Security-report codes that require a page reference. + pub fn is_page_backed_security(self) -> bool { + self.is_text_backed_security() || matches!(self, WarningCode::ImageOnlyPage) + } + + /// Warning codes that must never appear in default chunk artifacts. + pub fn excludes_from_default_chunks(self) -> bool { + self.is_text_backed_security() + } + + /// Fixed message template for security-report findings. + pub fn security_report_message_template(self) -> Option<&'static str> { + match self { + WarningCode::HiddenTextDetected => { + Some("hidden text detected: excluded from default chunks") + } + WarningCode::OffPageTextDetected => { + Some("off-page text detected: excluded from default chunks") + } + WarningCode::LowContrastTextDetected => { + Some("low-contrast text detected: excluded from default chunks") + } + WarningCode::AnnotationsPresent => Some("annotations present on page"), + WarningCode::ExternalLinksPresent => Some("external links present on page"), + WarningCode::UnsupportedAnnotation => Some("unsupported annotation ignored"), + WarningCode::ImageOnlyPage => Some("image-only page"), + _ => None, + } + } } impl core::fmt::Display for WarningCode { @@ -137,4 +188,89 @@ mod tests { ] ); } + + #[test] + fn security_report_warning_policy_matches_contract() { + let inventory_backed: Vec<_> = WarningCode::ALL + .iter() + .filter(|c| c.is_inventory_backed_security()) + .map(|c| c.as_str()) + .collect(); + assert_eq!( + inventory_backed, + vec![ + "annotations_present", + "external_links_present", + "unsupported_annotation", + ] + ); + + let text_backed: Vec<_> = WarningCode::ALL + .iter() + .filter(|c| c.is_text_backed_security()) + .map(|c| c.as_str()) + .collect(); + assert_eq!( + text_backed, + vec![ + "hidden_text_detected", + "off_page_text_detected", + "low_contrast_text_detected", + ] + ); + + let page_backed: Vec<_> = WarningCode::ALL + .iter() + .filter(|c| c.is_page_backed_security()) + .map(|c| c.as_str()) + .collect(); + assert_eq!( + page_backed, + vec![ + "hidden_text_detected", + "off_page_text_detected", + "low_contrast_text_detected", + "image_only_page", + ] + ); + + let default_chunk_excluded: Vec<_> = WarningCode::ALL + .iter() + .filter(|c| c.excludes_from_default_chunks()) + .map(|c| c.as_str()) + .collect(); + assert_eq!(default_chunk_excluded, text_backed); + } + + #[test] + fn security_report_message_templates_match_contract() { + let templates: Vec<_> = WarningCode::ALL + .iter() + .filter_map(|c| { + c.security_report_message_template() + .map(|template| (c.as_str(), template)) + }) + .collect(); + assert_eq!( + templates, + vec![ + ( + "hidden_text_detected", + "hidden text detected: excluded from default chunks", + ), + ( + "off_page_text_detected", + "off-page text detected: excluded from default chunks", + ), + ( + "low_contrast_text_detected", + "low-contrast text detected: excluded from default chunks", + ), + ("annotations_present", "annotations present on page"), + ("external_links_present", "external links present on page"), + ("image_only_page", "image-only page"), + ("unsupported_annotation", "unsupported annotation ignored"), + ] + ); + } } diff --git a/docs/validation/milestone-c-closeout-validation-2026-06-18.md b/docs/validation/milestone-c-closeout-validation-2026-06-18.md index 68e4db5..029b01a 100644 --- a/docs/validation/milestone-c-closeout-validation-2026-06-18.md +++ b/docs/validation/milestone-c-closeout-validation-2026-06-18.md @@ -17,7 +17,8 @@ unchanged**. ## Subject - Repository: `docushell/ethos` -- Validated `main` HEAD: `4e3adbb` +- Validated source HEAD before this record: `4e3adbb` +- Closeout record commit on `main`: `21d1810` - Scope: tracked source tree, committed schemas/examples, `ethos rag chunk`, current `ethos security report` source-only artifact checks, and Make/static guard wiring - Excluded: public benchmark-result publication, release artifacts, package publication, diff --git a/schemas/examples/security-report.example.json b/schemas/examples/security-report.example.json index ed80df9..b2a7b41 100644 --- a/schemas/examples/security-report.example.json +++ b/schemas/examples/security-report.example.json @@ -7,9 +7,7 @@ "sha256": "d6145b9210845db39ad592ea549788432b52a649778c9947f5b2d91173e38070" }, "summary": { - "hidden_text_detected": 1, - "annotations_present": 1, - "external_links_present": 1 + "hidden_text_detected": 1 }, "findings": [ { @@ -26,57 +24,13 @@ ], "text_preview": "internal-draft-do-not-cite", "excluded_from_default_chunks": true - }, - { - "id": "f0002", - "code": "annotations_present", - "message": "annotations present on page", - "page": "p0001", - "excluded_from_default_chunks": false - }, - { - "id": "f0003", - "code": "external_links_present", - "message": "external links present on page", - "page": "p0001", - "excluded_from_default_chunks": false } ], "inventories": { - "annotations": [ - { - "page": "p0001", - "kind": "link", - "bbox": [ - 7200, - 10100, - 20000, - 11500 - ], - "supported": true - } - ], - "actions": [ - { - "kind": "uri", - "page": "p0001", - "target": "https://example.com/q3" - } - ], + "annotations": [], + "actions": [], "attachments": [], "scripts": [], - "links": [ - { - "page": "p0001", - "uri": "https://example.com/q3", - "external": true, - "bbox": [ - 7200, - 10100, - 20000, - 11500 - ] - } - ] + "links": [] } } diff --git a/schemas/examples/security-report.full.example.json b/schemas/examples/security-report.full.example.json new file mode 100644 index 0000000..ed80df9 --- /dev/null +++ b/schemas/examples/security-report.full.example.json @@ -0,0 +1,82 @@ +{ + "schema_version": "1.0.0", + "document_fingerprint": "sha256:b5d30710d0c25cc38d8dec924ecaf57ae4f81276dd5dc14d75cb3b5b6bde62d3", + "source_fingerprint": "sha256:5f70bf18a086007016e948b04aed3b82103a36bea41755b6cddfaf10ace3c6ef", + "profile": { + "id": "ethos-deterministic-v1", + "sha256": "d6145b9210845db39ad592ea549788432b52a649778c9947f5b2d91173e38070" + }, + "summary": { + "hidden_text_detected": 1, + "annotations_present": 1, + "external_links_present": 1 + }, + "findings": [ + { + "id": "f0001", + "code": "hidden_text_detected", + "message": "hidden text detected: excluded from default chunks", + "page": "p0001", + "span_ref": "s000003", + "bbox": [ + 100, + 79100, + 6000, + 79200 + ], + "text_preview": "internal-draft-do-not-cite", + "excluded_from_default_chunks": true + }, + { + "id": "f0002", + "code": "annotations_present", + "message": "annotations present on page", + "page": "p0001", + "excluded_from_default_chunks": false + }, + { + "id": "f0003", + "code": "external_links_present", + "message": "external links present on page", + "page": "p0001", + "excluded_from_default_chunks": false + } + ], + "inventories": { + "annotations": [ + { + "page": "p0001", + "kind": "link", + "bbox": [ + 7200, + 10100, + 20000, + 11500 + ], + "supported": true + } + ], + "actions": [ + { + "kind": "uri", + "page": "p0001", + "target": "https://example.com/q3" + } + ], + "attachments": [], + "scripts": [], + "links": [ + { + "page": "p0001", + "uri": "https://example.com/q3", + "external": true, + "bbox": [ + 7200, + 10100, + 20000, + 11500 + ] + } + ] + } +} diff --git a/schemas/security_report_validation.py b/schemas/security_report_validation.py index fcfd837..d9295fb 100644 --- a/schemas/security_report_validation.py +++ b/schemas/security_report_validation.py @@ -48,6 +48,8 @@ TEXT_BACKED_FINDING_CODES = DEFAULT_CHUNK_EXCLUDED_CODES +PREVIEW_MAX_CHARS = 120 + FINDING_MESSAGE_TEMPLATES = { "hidden_text_detected": "hidden text detected: excluded from default chunks", "off_page_text_detected": "off-page text detected: excluded from default chunks", @@ -1074,9 +1076,9 @@ def check_text_backed_finding(finding, refs, ctx, item_ctx, diagnostics): def deterministic_preview(text): - if len(text) <= 120: + if len(text) <= PREVIEW_MAX_CHARS: return text - return text[:120] + "\u2026" + return text[:PREVIEW_MAX_CHARS] + "\u2026" def is_json_integer(value): diff --git a/schemas/test_security_report_validation.py b/schemas/test_security_report_validation.py index bd3ac10..56c09e3 100644 --- a/schemas/test_security_report_validation.py +++ b/schemas/test_security_report_validation.py @@ -23,6 +23,7 @@ from pathlib import Path from security_report_validation import ( + DEFAULT_CHUNK_EXCLUDED_CODES, FINDING_MESSAGE_TEMPLATES, SECURITY_WARNING_CODES, diagnose_security_report_example, @@ -33,17 +34,76 @@ EXAMPLES = ROOT / "examples" +def schema(path: str) -> dict: + return json.loads((ROOT / path).read_text()) + + +def find_enum_with_value(node, value: str) -> set[str]: + if isinstance(node, dict): + enum = node.get("enum") + if isinstance(enum, list) and value in enum: + return set(enum) + for child in node.values(): + found = find_enum_with_value(child, value) + if found: + return found + if isinstance(node, list): + for child in node: + found = find_enum_with_value(child, value) + if found: + return found + return set() + + class SecurityReportValidationTests(unittest.TestCase): def setUp(self) -> None: self.document = json.loads((EXAMPLES / "document.example.json").read_text()) - self.report = json.loads((EXAMPLES / "security-report.example.json").read_text()) + self.source_report = json.loads( + (EXAMPLES / "security-report.example.json").read_text() + ) + self.report = json.loads( + (EXAMPLES / "security-report.full.example.json").read_text() + ) def test_current_examples_are_coherent(self) -> None: + self.assertEqual( + diagnose_security_report_example(self.document, self.source_report), + [], + ) self.assertEqual(diagnose_security_report_example(self.document, self.report), []) def test_all_security_warning_codes_have_fixed_message_templates(self) -> None: self.assertEqual(set(FINDING_MESSAGE_TEMPLATES), SECURITY_WARNING_CODES) + def test_warning_policy_constants_match_schema_enums(self) -> None: + document_warning_codes = find_enum_with_value( + schema("ethos-document.schema.json"), + "hidden_text_detected", + ) + self.assertTrue(document_warning_codes) + self.assertEqual(SECURITY_WARNING_CODES - document_warning_codes, set()) + + security_summary_codes = set( + schema("ethos-security-report.schema.json")["properties"]["summary"][ + "patternProperties" + ] + ) + security_summary_codes = { + code + for pattern in security_summary_codes + for code in pattern.strip("^$()").split("|") + } + self.assertEqual(security_summary_codes, SECURITY_WARNING_CODES) + + chunk_warning_codes = set( + schema("ethos-chunks.schema.json")["properties"]["warnings"]["items"]["enum"] + ) + self.assertEqual( + chunk_warning_codes, + document_warning_codes - DEFAULT_CHUNK_EXCLUDED_CODES, + ) + self.assertEqual(chunk_warning_codes & DEFAULT_CHUNK_EXCLUDED_CODES, set()) + def test_schema_version_must_match_document(self) -> None: report = copy.deepcopy(self.report) report["schema_version"] = "1.0.1" diff --git a/schemas/validate_examples.py b/schemas/validate_examples.py index 0812647..da2b287 100644 --- a/schemas/validate_examples.py +++ b/schemas/validate_examples.py @@ -30,7 +30,14 @@ from pathlib import Path from font_policy_validation import diagnose_font_policy -from security_report_validation import diagnose_security_report_example +from security_report_validation import ( + DEFAULT_CHUNK_EXCLUDED_CODES, + FINDING_MESSAGE_TEMPLATES, + INVENTORY_BACKED_FINDING_CODES, + TEXT_BACKED_FINDING_CODES, + diagnose_security_report_example, + deterministic_preview, +) from table_model_validation import diagnose_table_model try: @@ -63,7 +70,10 @@ EXAMPLES / "citations.example.json", EXAMPLES / "citations-array.example.json", ]), - ("ethos-security-report.schema.json", [EXAMPLES / "security-report.example.json"]), + ("ethos-security-report.schema.json", [ + EXAMPLES / "security-report.example.json", + EXAMPLES / "security-report.full.example.json", + ]), ("ethos-verification-report.schema.json", [ EXAMPLES / "verification-report.example.json", EXAMPLES / "verification-report-negative.example.json", @@ -188,17 +198,16 @@ def c14n_line(v) -> str: return json.dumps(v, separators=(",", ":"), sort_keys=True, ensure_ascii=False) +def c14n_value(v) -> str: + return json.dumps(v, separators=(",", ":"), sort_keys=True, ensure_ascii=False) + + wcodes = {w["id"]: w["code"] for w in p["security_warnings"] + p["parser_warnings"]} -default_chunk_excluded_warning_codes = { - "hidden_text_detected", - "off_page_text_detected", - "low_contrast_text_detected", -} expected_lines = [] for ch in p["chunks"]: for warning_ref in ch.get("warning_refs", []): code = wcodes[warning_ref] - if code in default_chunk_excluded_warning_codes: + if code in DEFAULT_CHUNK_EXCLUDED_CODES: fail( "document.example.json: " f"chunk {ch['id']} references default-excluded warning_ref {warning_ref} ({code})" @@ -227,13 +236,96 @@ def c14n_line(v) -> str: else: print(f"ok chunks.example.jsonl derivable from document example ({len(actual_lines)} chunks)") +spans = {span["id"]: span for span in p["spans"]} +security_warnings = sorted( + p["security_warnings"], + key=lambda warning: ( + warning["code"], + warning.get("page", ""), + warning.get("element_ref", ""), + warning.get("span_ref", ""), + warning.get("region_ref", ""), + warning.get("message", ""), + ), +) +summary = {} +findings = [] +for index, warning in enumerate(security_warnings, 1): + if warning["code"] not in FINDING_MESSAGE_TEMPLATES: + continue + if warning["code"] in INVENTORY_BACKED_FINDING_CODES: + fail( + "document.example.json: security-report.example.json cannot derive " + f"inventory-backed warning {warning['id']} ({warning['code']}) " + "from source-only document warnings" + ) + continue + summary[warning["code"]] = summary.get(warning["code"], 0) + 1 + finding = { + "id": f"f{index:04}", + "code": warning["code"], + "message": warning["message"], + "excluded_from_default_chunks": warning["code"] in DEFAULT_CHUNK_EXCLUDED_CODES, + } + for field in ("page", "element_ref", "span_ref"): + if field in warning: + finding[field] = warning[field] + if warning["code"] in TEXT_BACKED_FINDING_CODES: + span_ref = warning.get("span_ref") + if span_ref is None or span_ref not in spans: + fail( + "document.example.json: security-report.example.json cannot derive " + f"text-backed warning {warning['id']} without a valid span_ref" + ) + else: + finding["bbox"] = spans[span_ref]["bbox"] + finding["text_preview"] = deterministic_preview(spans[span_ref]["text"]) + findings.append(finding) + +expected_security_report = { + "schema_version": doc["schema_version"], + "document_fingerprint": doc["fingerprint"], + "source_fingerprint": doc["source"]["fingerprint"], + "profile": { + "id": doc["profile"]["id"], + "sha256": doc["profile"]["sha256"], + }, + "summary": summary, + "findings": findings, + "inventories": { + "annotations": [], + "actions": [], + "attachments": [], + "scripts": [], + "links": [], + }, +} +actual_security_report = json.loads( + (EXAMPLES / "security-report.example.json").read_text(encoding="utf-8") +) +if c14n_value(expected_security_report) != c14n_value(actual_security_report): + fail( + "security-report.example.json is not derivable from document.example.json " + f"({len(actual_security_report.get('findings', []))} findings vs " + f"{len(findings)} derived)" + ) +else: + finding_label = "finding" if len(findings) == 1 else "findings" + print( + "ok security-report.example.json derivable from document example " + f"({len(findings)} {finding_label})" + ) + # fingerprint coherence across example artifacts -sec = json.loads((EXAMPLES / "security-report.example.json").read_text(encoding="utf-8")) +sec = actual_security_report +sec_full = json.loads((EXAMPLES / "security-report.full.example.json").read_text(encoding="utf-8")) ver = json.loads((EXAMPLES / "verification-report.example.json").read_text(encoding="utf-8")) crop = json.loads((EXAMPLES / "crop-descriptor.example.json").read_text(encoding="utf-8")) for label, got in [ ("security-report.document_fingerprint", sec["document_fingerprint"]), ("security-report.source_fingerprint", sec["source_fingerprint"]), + ("security-report-full.document_fingerprint", sec_full["document_fingerprint"]), + ("security-report-full.source_fingerprint", sec_full["source_fingerprint"]), ("verification-report.document_fingerprint", ver["document_fingerprint"]), ("crop-descriptor.document_fingerprint", crop["document_fingerprint"]), ]: @@ -243,11 +335,18 @@ def c14n_line(v) -> str: print("ok example fingerprints coherent across artifacts") security_report_diagnostics = diagnose_security_report_example(doc, sec) -if security_report_diagnostics: +security_report_full_diagnostics = diagnose_security_report_example( + doc, + sec_full, + ctx="security-report.full.example.json", +) +if security_report_diagnostics or security_report_full_diagnostics: for diagnostic in security_report_diagnostics: fail(diagnostic) + for diagnostic in security_report_full_diagnostics: + fail(diagnostic) else: - print("ok security report example findings are grounded in document example") + print("ok security report examples findings are grounded in document example") # deterministic profile font-policy artifact checks profile = json.loads(