Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/scripts/test_ci_workflow.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ def test_schema_job_installs_jsonschema_and_validates_examples(self) -> None:

self.assertIn('pip install "jsonschema>=4.18"', text)
self.assertIn("python3 schemas/validate_examples.py", text)
self.assertIn("python3 schemas/test_security_report_validation.py", text)
self.assertIn("python3 schemas/test_table_model_validation.py", text)

def test_ci_workflow_guard_is_run_by_ci(self) -> None:
Expand Down
1 change: 1 addition & 0 deletions .github/scripts/test_milestone_b_internal_checks.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ def test_target_composes_current_internal_gates(self) -> None:
"$(PYTHON) fixtures/validate_fixtures.py",
"$(PYTHON) fixtures/test_validate_fixtures.py",
"$(PYTHON) schemas/test_font_policy_validation.py",
"$(PYTHON) schemas/test_security_report_validation.py",
"$(PYTHON) .github/scripts/test_execution_status.py",
"$(PYTHON) .github/scripts/test_roadmap_status.py",
"$(PYTHON) .github/scripts/test_milestone_b_closeout_record.py",
Expand Down
1 change: 1 addition & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,7 @@ jobs:
python-version: "3.12"
- run: pip install "jsonschema>=4.18"
- run: python3 schemas/validate_examples.py
- run: python3 schemas/test_security_report_validation.py
- run: python3 schemas/test_table_model_validation.py
- name: Gate Zero result schema validation
run: |
Expand Down
1 change: 1 addition & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ milestone-b-internal-checks:
$(PYTHON) fixtures/validate_fixtures.py
$(PYTHON) fixtures/test_validate_fixtures.py
$(PYTHON) schemas/test_font_policy_validation.py
$(PYTHON) schemas/test_security_report_validation.py
$(PYTHON) .github/scripts/test_execution_status.py
$(PYTHON) .github/scripts/test_roadmap_status.py
$(PYTHON) .github/scripts/test_milestone_b_closeout_record.py
Expand Down
110 changes: 110 additions & 0 deletions schemas/security_report_validation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
#!/usr/bin/env python3
#
# Copyright 2026 The Ethos maintainers
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""Security-report example validation helpers."""

from __future__ import annotations

from collections import Counter


# Warning codes that must surface as findings in the security report.
# Document warnings carrying any other code are ignored by the coherence check.
REPORTABLE_WARNING_CODES = {
    "hidden_text_detected",
    "off_page_text_detected",
    "low_contrast_text_detected",
    "annotations_present",
    "external_links_present",
    "unsupported_annotation",
    "image_only_page",
}

# Subset of codes whose projected finding is expected to carry
# ``excluded_from_default_chunks = True``; every other code projects to False.
DEFAULT_CHUNK_EXCLUDED_CODES = {
    "hidden_text_detected",
    "off_page_text_detected",
    "low_contrast_text_detected",
}


def diagnose_security_report_example(
    document,
    report,
    ctx: str = "security-report.example.json",
) -> list[str]:
    """Check that a security-report example is grounded in a document example.

    Reportable warnings are the dict entries of ``payload.security_warnings``
    and ``payload.parser_warnings`` whose ``code`` is in
    ``REPORTABLE_WARNING_CODES``.  For each of them the report must contain a
    finding with an identical projection (code, message, chunk-exclusion flag
    and any page/element/span reference), and ``summary[code]`` must equal the
    number of reportable warnings carrying that code.

    Args:
        document: Parsed document example.  Anything that is not a dict, or
            whose payload is not a dict, contributes no warnings.
        report: Parsed security-report example.  A non-dict report is treated
            as having no findings and an empty summary.
        ctx: Label prefixed to every diagnostic message.

    Returns:
        Human-readable diagnostics; an empty list means the two examples are
        coherent.  A malformed ``findings`` or ``summary`` short-circuits with
        a single diagnostic.
    """
    payload = document.get("payload") if isinstance(document, dict) else {}
    warnings = []
    if isinstance(payload, dict):
        for key in ("security_warnings", "parser_warnings"):
            entries = payload.get(key)
            # A JSON null (or any other non-array value) must not crash the
            # validator; it simply contributes no warnings.
            if isinstance(entries, (list, tuple)):
                warnings.extend(entries)

    findings = report.get("findings") if isinstance(report, dict) else []
    if not isinstance(findings, list):
        return [f"{ctx}: findings must be an array"]
    summary = report.get("summary") if isinstance(report, dict) else {}
    if not isinstance(summary, dict):
        return [f"{ctx}: summary must be an object"]

    expected_findings = [
        projected_warning_finding(warning)
        for warning in warnings
        if isinstance(warning, dict)
        # Only string codes can be reportable; checking the type first also
        # keeps an unhashable code from raising inside the set lookup.
        and isinstance(warning.get("code"), str)
        and warning.get("code") in REPORTABLE_WARNING_CODES
    ]
    actual_findings = [
        project_report_finding(finding)
        for finding in findings
        if isinstance(finding, dict)
    ]

    diagnostics = [
        f"{ctx}: missing warning-derived finding for {expected['code']}"
        for expected in expected_findings
        if expected not in actual_findings
    ]

    # Summary counts are reported in code order so the output is deterministic.
    expected_counts = Counter(expected["code"] for expected in expected_findings)
    for code, expected_count in sorted(expected_counts.items()):
        if summary.get(code) != expected_count:
            diagnostics.append(
                f"{ctx}: summary.{code} must be {expected_count} "
                "for warning-derived findings"
            )

    return diagnostics


def projected_warning_finding(warning):
    """Build the finding projection a report should contain for *warning*.

    The projection keeps the warning's code and message, derives
    ``excluded_from_default_chunks`` from ``DEFAULT_CHUNK_EXCLUDED_CODES``,
    and copies ``page``, ``element_ref`` and ``span_ref`` only when the
    warning actually has those keys.
    """
    code = warning.get("code")
    projection = {
        "code": code,
        "message": warning.get("message"),
        "excluded_from_default_chunks": code in DEFAULT_CHUNK_EXCLUDED_CODES,
    }
    # Absent location keys stay absent so they compare equal to a finding
    # that omits them as well.
    projection.update(
        (key, warning[key])
        for key in ("page", "element_ref", "span_ref")
        if key in warning
    )
    return projection


def project_report_finding(finding):
    """Reduce a report finding to the fields compared against warnings.

    Keeps ``code``, ``message`` and ``excluded_from_default_chunks`` (each
    ``None`` when missing) and copies ``page``, ``element_ref`` and
    ``span_ref`` only when present; every other key is dropped.
    """
    location = {
        key: finding[key]
        for key in ("page", "element_ref", "span_ref")
        if key in finding
    }
    return {
        "code": finding.get("code"),
        "message": finding.get("message"),
        "excluded_from_default_chunks": finding.get("excluded_from_default_chunks"),
        **location,
    }
110 changes: 110 additions & 0 deletions schemas/test_security_report_validation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
#!/usr/bin/env python3
#
# Copyright 2026 The Ethos maintainers
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from __future__ import annotations

import copy
import json
import unittest
from pathlib import Path

from security_report_validation import diagnose_security_report_example


# Directory holding this test module; the example fixtures sit in a
# subdirectory next to it.
ROOT = Path(__file__).resolve().parent
EXAMPLES = ROOT.joinpath("examples")


class SecurityReportValidationTests(unittest.TestCase):
    """Exercise ``diagnose_security_report_example`` on the checked-in examples.

    Each test starts from the example document and security report, breaks
    one aspect of their agreement, and asserts the matching diagnostic.
    """

    def setUp(self) -> None:
        # Read with an explicit encoding so the result does not depend on the
        # platform's default locale.
        self.document = json.loads(
            (EXAMPLES / "document.example.json").read_text(encoding="utf-8")
        )
        self.report = json.loads(
            (EXAMPLES / "security-report.example.json").read_text(encoding="utf-8")
        )

    def _finding_with_code(self, report, code):
        """Return the first finding in *report* whose code equals *code*.

        Looking the finding up by code keeps the tests independent of the
        order in which the example report lists its findings.
        """
        for finding in report["findings"]:
            if finding["code"] == code:
                return finding
        self.fail(f"example report has no {code} finding")

    def test_current_examples_are_coherent(self) -> None:
        self.assertEqual(diagnose_security_report_example(self.document, self.report), [])

    def test_warning_derived_summary_must_match_document_warning_count(self) -> None:
        report = copy.deepcopy(self.report)
        report["summary"]["hidden_text_detected"] = 2

        diagnostics = diagnose_security_report_example(self.document, report)

        self.assertIn(
            "security-report.example.json: summary.hidden_text_detected must be 1 "
            "for warning-derived findings",
            diagnostics,
        )

    def test_document_security_warnings_must_have_matching_findings(self) -> None:
        report = copy.deepcopy(self.report)
        report["findings"] = [
            finding
            for finding in report["findings"]
            if finding["code"] != "hidden_text_detected"
        ]
        report["summary"].pop("hidden_text_detected")

        diagnostics = diagnose_security_report_example(self.document, report)

        self.assertIn(
            "security-report.example.json: missing warning-derived finding for hidden_text_detected",
            diagnostics,
        )

    def test_warning_refs_must_match_report_finding_projection(self) -> None:
        report = copy.deepcopy(self.report)
        self._finding_with_code(report, "hidden_text_detected")["span_ref"] = "s999999"

        diagnostics = diagnose_security_report_example(self.document, report)

        self.assertIn(
            "security-report.example.json: missing warning-derived finding for hidden_text_detected",
            diagnostics,
        )

    def test_default_excluded_warning_codes_must_be_flagged(self) -> None:
        report = copy.deepcopy(self.report)
        self._finding_with_code(report, "hidden_text_detected")[
            "excluded_from_default_chunks"
        ] = False

        diagnostics = diagnose_security_report_example(self.document, report)

        self.assertIn(
            "security-report.example.json: missing warning-derived finding for hidden_text_detected",
            diagnostics,
        )

    def test_reportable_parser_warning_codes_are_included_when_present(self) -> None:
        document = copy.deepcopy(self.document)
        # setdefault keeps the test valid even if the example document omits
        # the parser_warnings array.
        document["payload"].setdefault("parser_warnings", []).append(
            {
                "id": "w0099",
                "code": "image_only_page",
                "message": "image-only page requires OCR",
                "page": "p0001",
            }
        )

        diagnostics = diagnose_security_report_example(document, self.report)

        self.assertIn(
            "security-report.example.json: missing warning-derived finding for image_only_page",
            diagnostics,
        )


# Run the suite when the file is executed as a script; the CI workflow and
# the Makefile target both invoke it by path with the Python interpreter.
if __name__ == "__main__":
    unittest.main()
8 changes: 8 additions & 0 deletions schemas/validate_examples.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
from pathlib import Path

from font_policy_validation import diagnose_font_policy
from security_report_validation import diagnose_security_report_example
from table_model_validation import diagnose_table_model

try:
Expand Down Expand Up @@ -241,6 +242,13 @@ def c14n_line(v) -> str:
fail(f"{label} diverges from document example")
print("ok example fingerprints coherent across artifacts")

security_report_diagnostics = diagnose_security_report_example(doc, sec)
if security_report_diagnostics:
for diagnostic in security_report_diagnostics:
fail(diagnostic)
else:
print("ok security report example findings are grounded in document example")

# deterministic profile font-policy artifact checks
profile = json.loads(
(ROOT / "profiles" / "ethos-deterministic-v1.json").read_text(encoding="utf-8")
Expand Down
Loading