From 7d5c6cee0cc31907e85fc732df79e3f2a72dfd59 Mon Sep 17 00:00:00 2001 From: docushell-admin Date: Wed, 17 Jun 2026 14:02:37 +0530 Subject: [PATCH] Guard security report example findings Signed-off-by: docushell-admin --- .github/scripts/test_ci_workflow.py | 1 + .../test_milestone_b_internal_checks.py | 1 + .github/workflows/ci.yml | 1 + Makefile | 1 + schemas/security_report_validation.py | 110 ++++++++++++++++++ schemas/test_security_report_validation.py | 110 ++++++++++++++++++ schemas/validate_examples.py | 8 ++ 7 files changed, 232 insertions(+) create mode 100644 schemas/security_report_validation.py create mode 100644 schemas/test_security_report_validation.py diff --git a/.github/scripts/test_ci_workflow.py b/.github/scripts/test_ci_workflow.py index bbc0a53..f104562 100644 --- a/.github/scripts/test_ci_workflow.py +++ b/.github/scripts/test_ci_workflow.py @@ -42,6 +42,7 @@ def test_schema_job_installs_jsonschema_and_validates_examples(self) -> None: self.assertIn('pip install "jsonschema>=4.18"', text) self.assertIn("python3 schemas/validate_examples.py", text) + self.assertIn("python3 schemas/test_security_report_validation.py", text) self.assertIn("python3 schemas/test_table_model_validation.py", text) def test_ci_workflow_guard_is_run_by_ci(self) -> None: diff --git a/.github/scripts/test_milestone_b_internal_checks.py b/.github/scripts/test_milestone_b_internal_checks.py index 387828a..9ab276a 100644 --- a/.github/scripts/test_milestone_b_internal_checks.py +++ b/.github/scripts/test_milestone_b_internal_checks.py @@ -61,6 +61,7 @@ def test_target_composes_current_internal_gates(self) -> None: "$(PYTHON) fixtures/validate_fixtures.py", "$(PYTHON) fixtures/test_validate_fixtures.py", "$(PYTHON) schemas/test_font_policy_validation.py", + "$(PYTHON) schemas/test_security_report_validation.py", "$(PYTHON) .github/scripts/test_execution_status.py", "$(PYTHON) .github/scripts/test_roadmap_status.py", "$(PYTHON) 
.github/scripts/test_milestone_b_closeout_record.py", diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index a220cb1..95d064f 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -105,6 +105,7 @@ jobs: python-version: "3.12" - run: pip install "jsonschema>=4.18" - run: python3 schemas/validate_examples.py + - run: python3 schemas/test_security_report_validation.py - run: python3 schemas/test_table_model_validation.py - name: Gate Zero result schema validation run: | diff --git a/Makefile b/Makefile index 6217698..49164af 100644 --- a/Makefile +++ b/Makefile @@ -58,6 +58,7 @@ milestone-b-internal-checks: $(PYTHON) fixtures/validate_fixtures.py $(PYTHON) fixtures/test_validate_fixtures.py $(PYTHON) schemas/test_font_policy_validation.py + $(PYTHON) schemas/test_security_report_validation.py $(PYTHON) .github/scripts/test_execution_status.py $(PYTHON) .github/scripts/test_roadmap_status.py $(PYTHON) .github/scripts/test_milestone_b_closeout_record.py diff --git a/schemas/security_report_validation.py b/schemas/security_report_validation.py new file mode 100644 index 0000000..2c03e50 --- /dev/null +++ b/schemas/security_report_validation.py @@ -0,0 +1,110 @@ +#!/usr/bin/env python3 +# +# Copyright 2026 The Ethos maintainers +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
#

"""Security-report example validation helpers.

The checks here compare a security-report example with the document example
it accompanies: every reportable warning in the document payload must be
mirrored by a report finding, and the report summary must hold the per-code
count of those warnings.
"""

from __future__ import annotations

from collections import Counter


# Warning codes that are expected to appear as security-report findings.
REPORTABLE_WARNING_CODES = {
    "hidden_text_detected",
    "off_page_text_detected",
    "low_contrast_text_detected",
    "annotations_present",
    "external_links_present",
    "unsupported_annotation",
    "image_only_page",
}

# Codes whose projected finding carries excluded_from_default_chunks=True.
DEFAULT_CHUNK_EXCLUDED_CODES = {
    "hidden_text_detected",
    "off_page_text_detected",
    "low_contrast_text_detected",
}

# Payload keys whose warning lists are inspected, in reporting order.
_WARNING_LIST_KEYS = ("security_warnings", "parser_warnings")

# Location keys copied into a projection only when the source object has them.
_OPTIONAL_REF_KEYS = ("page", "element_ref", "span_ref")


def diagnose_security_report_example(
    document: object,
    report: object,
    ctx: str = "security-report.example.json",
) -> list[str]:
    """Return diagnostics for a report that is not grounded in *document*.

    Args:
        document: Parsed document example. Warnings are read from
            ``payload.security_warnings`` and ``payload.parser_warnings``;
            a non-dict document or payload contributes no warnings.
        report: Parsed security-report example providing ``findings`` (a
            list) and ``summary`` (a dict). A non-dict report is treated as
            having no findings and an empty summary.
        ctx: Prefix used in every diagnostic message.

    Returns:
        A list of human-readable diagnostics; empty when the report is
        coherent with the document. A malformed ``findings`` or ``summary``
        yields a single diagnostic and stops further checks.
    """
    warnings = _collect_warnings(document)

    findings = report.get("findings") if isinstance(report, dict) else []
    if not isinstance(findings, list):
        return [f"{ctx}: findings must be an array"]
    summary = report.get("summary") if isinstance(report, dict) else {}
    if not isinstance(summary, dict):
        return [f"{ctx}: summary must be an object"]

    expected_findings = [
        projected_warning_finding(warning)
        for warning in warnings
        if _is_reportable(warning)
    ]
    actual_findings = [
        project_report_finding(finding)
        for finding in findings
        if isinstance(finding, dict)
    ]

    # Each expected projection must equal some report finding projection.
    diagnostics = [
        f"{ctx}: missing warning-derived finding for {expected['code']}"
        for expected in expected_findings
        if expected not in actual_findings
    ]

    # Single pass over the expected findings instead of one rescan per code.
    expected_counts = Counter(finding["code"] for finding in expected_findings)
    for code in sorted(expected_counts):
        expected_count = expected_counts[code]
        if summary.get(code) != expected_count:
            diagnostics.append(
                f"{ctx}: summary.{code} must be {expected_count} "
                "for warning-derived findings"
            )

    return diagnostics


def projected_warning_finding(warning):
    """Project a document warning onto the finding shape it should produce.

    The projection keeps ``code`` and ``message``, derives
    ``excluded_from_default_chunks`` from ``DEFAULT_CHUNK_EXCLUDED_CODES``,
    and copies ``page``/``element_ref``/``span_ref`` only when present.
    """
    code = warning.get("code")
    # Guard with isinstance so an unhashable code cannot break the set lookup.
    excluded = isinstance(code, str) and code in DEFAULT_CHUNK_EXCLUDED_CODES
    return _project(warning, excluded)


def project_report_finding(finding):
    """Project a report finding onto the fields compared against warnings.

    Unlike the warning projection, ``excluded_from_default_chunks`` is taken
    verbatim from the finding (``None`` when the key is absent).
    """
    return _project(finding, finding.get("excluded_from_default_chunks"))


def _collect_warnings(document):
    """Gather security and parser warnings, skipping absent or non-list values."""
    payload = document.get("payload") if isinstance(document, dict) else None
    if not isinstance(payload, dict):
        return []
    collected = []
    for key in _WARNING_LIST_KEYS:
        entries = payload.get(key)
        # A null or scalar value means "no warnings" rather than a crash.
        if isinstance(entries, (list, tuple)):
            collected.extend(entries)
    return collected


def _is_reportable(warning):
    """Return True when *warning* is a dict carrying a reportable string code."""
    if not isinstance(warning, dict):
        return False
    code = warning.get("code")
    return isinstance(code, str) and code in REPORTABLE_WARNING_CODES


def _project(entry, excluded_from_default_chunks):
    """Build the comparable projection shared by warnings and findings."""
    projected = {
        "code": entry.get("code"),
        "message": entry.get("message"),
        "excluded_from_default_chunks": excluded_from_default_chunks,
    }
    projected.update(
        {key: entry[key] for key in _OPTIONAL_REF_KEYS if key in entry}
    )
    return projected
#

"""Tests for the security-report example validation helpers."""

from __future__ import annotations

import copy
import json
import unittest
from pathlib import Path

from security_report_validation import diagnose_security_report_example


ROOT = Path(__file__).resolve().parent
EXAMPLES = ROOT / "examples"

HIDDEN_TEXT = "hidden_text_detected"
MISSING_HIDDEN_TEXT = (
    "security-report.example.json: missing warning-derived finding for "
    "hidden_text_detected"
)


def _load_example(name: str):
    """Parse one JSON example, decoding explicitly as UTF-8.

    The explicit encoding keeps the result independent of the platform's
    locale default.
    """
    return json.loads((EXAMPLES / name).read_text(encoding="utf-8"))


class SecurityReportValidationTests(unittest.TestCase):
    """Exercise diagnose_security_report_example against the shipped examples."""

    def setUp(self) -> None:
        self.document = _load_example("document.example.json")
        self.report = _load_example("security-report.example.json")

    def _hidden_text_finding(self, report):
        """Return the report's hidden-text finding, located by code.

        Looking the finding up by code keeps the mutation tests independent
        of the order of findings in the example file.
        """
        for finding in report["findings"]:
            if finding.get("code") == HIDDEN_TEXT:
                return finding
        self.fail("example report has no hidden_text_detected finding")

    def test_current_examples_are_coherent(self) -> None:
        self.assertEqual(
            diagnose_security_report_example(self.document, self.report), []
        )

    def test_warning_derived_summary_must_match_document_warning_count(self) -> None:
        report = copy.deepcopy(self.report)
        report["summary"][HIDDEN_TEXT] = 2

        diagnostics = diagnose_security_report_example(self.document, report)

        self.assertIn(
            "security-report.example.json: summary.hidden_text_detected must be 1 "
            "for warning-derived findings",
            diagnostics,
        )

    def test_document_security_warnings_must_have_matching_findings(self) -> None:
        report = copy.deepcopy(self.report)
        report["findings"] = [
            finding
            for finding in report["findings"]
            if finding["code"] != HIDDEN_TEXT
        ]
        report["summary"].pop(HIDDEN_TEXT)

        diagnostics = diagnose_security_report_example(self.document, report)

        self.assertIn(MISSING_HIDDEN_TEXT, diagnostics)

    def test_warning_refs_must_match_report_finding_projection(self) -> None:
        report = copy.deepcopy(self.report)
        self._hidden_text_finding(report)["span_ref"] = "s999999"

        diagnostics = diagnose_security_report_example(self.document, report)

        self.assertIn(MISSING_HIDDEN_TEXT, diagnostics)

    def test_default_excluded_warning_codes_must_be_flagged(self) -> None:
        report = copy.deepcopy(self.report)
        self._hidden_text_finding(report)["excluded_from_default_chunks"] = False

        diagnostics = diagnose_security_report_example(self.document, report)

        self.assertIn(MISSING_HIDDEN_TEXT, diagnostics)

    def test_reportable_parser_warning_codes_are_included_when_present(self) -> None:
        document = copy.deepcopy(self.document)
        # setdefault tolerates an example payload without parser warnings.
        document["payload"].setdefault("parser_warnings", []).append(
            {
                "id": "w0099",
                "code": "image_only_page",
                "message": "image-only page requires OCR",
                "page": "p0001",
            }
        )

        diagnostics = diagnose_security_report_example(document, self.report)

        self.assertIn(
            "security-report.example.json: missing warning-derived finding for image_only_page",
            diagnostics,
        )


if __name__ == "__main__":
    unittest.main()