Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
202 changes: 202 additions & 0 deletions fixtures/test_validate_fixtures.py
Original file line number Diff line number Diff line change
Expand Up @@ -208,5 +208,207 @@ def cited_cell() -> dict[str, list[str]]:
return {"span_refs": ["s000001"], "element_refs": ["e000001"]}


class FixtureValidatorRegionRefTests(unittest.TestCase):
    """Tests for the validator's region cross-reference checks.

    Each test feeds a small in-memory document to the validator, captures
    what it prints to stdout, and inspects ``VALIDATOR.failures`` afterwards.
    """

    def setUp(self) -> None:
        # Start every test from a zero failure count.
        VALIDATOR.failures = 0

    def tearDown(self) -> None:
        # Leave the shared counter clean for whatever runs next.
        VALIDATOR.failures = 0

    def test_region_refs_reject_unknown_pages(self) -> None:
        failures, output = self.validate_regions(
            [
                {
                    "id": "r0001",
                    "page": "p9999",
                    "bbox": [10, 20, 30, 40],
                    "kind": "unknown",
                }
            ],
        )

        self.assertEqual(failures, 1)
        self.assertIn(
            "extraction.json regions[0] references unknown page 'p9999'",
            output,
        )

    def test_region_refs_reject_malformed_bbox(self) -> None:
        failures, output = self.validate_regions(
            [
                {
                    "id": "r0001",
                    "page": "p0001",
                    "bbox": [30, 20, 10, 40],
                    "kind": "unknown",
                }
            ],
        )

        self.assertEqual(failures, 1)
        self.assertIn(
            "extraction.json regions[0].bbox must satisfy x0<=x1 and y0<=y1",
            output,
        )

    def test_region_refs_reject_non_four_integer_bbox(self) -> None:
        cases = [
            [10, 20, 30],
            [10, 20, 30, 40.5],
            [10, True, 30, 40],
        ]
        for bbox in cases:
            with self.subTest(bbox=bbox):
                # setUp only runs once per test method, so reset per case.
                VALIDATOR.failures = 0
                failures, output = self.validate_regions(
                    [
                        {
                            "id": "r0001",
                            "page": "p0001",
                            "bbox": bbox,
                            "kind": "unknown",
                        }
                    ],
                )

                self.assertEqual(failures, 1)
                self.assertIn(
                    "extraction.json regions[0].bbox must be a four-integer array",
                    output,
                )

    def test_region_refs_reject_bbox_outside_page_bounds(self) -> None:
        failures, output = self.validate_regions(
            [
                {
                    "id": "r0001",
                    "page": "p0001",
                    "bbox": [10, 20, 1001, 40],
                    "kind": "unknown",
                }
            ],
        )

        self.assertEqual(failures, 1)
        self.assertIn(
            "extraction.json regions[0].bbox must stay within page bounds",
            output,
        )

    def test_region_refs_reject_unknown_warning_refs(self) -> None:
        failures, output = self.validate_regions(
            [
                {
                    "id": "r0001",
                    "page": "p0001",
                    "bbox": [10, 20, 30, 40],
                    "kind": "unknown",
                    "warning_refs": ["w9999"],
                }
            ],
        )

        self.assertEqual(failures, 1)
        self.assertIn(
            "extraction.json regions[0] references unknown warning 'w9999'",
            output,
        )

    def test_region_refs_reject_warning_refs_when_no_warnings_exist(self) -> None:
        # An explicit empty warnings list must be honoured by the helper
        # rather than replaced with the default single warning.
        failures, output = self.validate_regions(
            [
                {
                    "id": "r0001",
                    "page": "p0001",
                    "bbox": [10, 20, 30, 40],
                    "kind": "unknown",
                    "warning_refs": ["w0001"],
                }
            ],
            [],
        )

        self.assertEqual(failures, 1)
        self.assertIn(
            "extraction.json regions[0] references unknown warning 'w0001'",
            output,
        )

    def test_warning_refs_reject_unknown_region_refs(self) -> None:
        failures, output = self.validate_regions(
            [
                {
                    "id": "r0001",
                    "page": "p0001",
                    "bbox": [10, 20, 30, 40],
                    "kind": "unknown",
                }
            ],
            [{"id": "w0001", "region_ref": "r9999"}],
        )

        self.assertEqual(failures, 1)
        self.assertIn(
            "extraction.json warnings[0] references unknown region 'r9999'",
            output,
        )

    def test_region_refs_reject_duplicate_region_ids(self) -> None:
        failures, output = self.validate_regions(
            [
                {
                    "id": "r0001",
                    "page": "p0001",
                    "bbox": [10, 20, 30, 40],
                    "kind": "unknown",
                },
                {
                    "id": "r0001",
                    "page": "p0001",
                    "bbox": [50, 60, 70, 80],
                    "kind": "unknown",
                },
            ],
        )

        self.assertEqual(failures, 1)
        self.assertIn("extraction.json regions[1].id duplicates 'r0001'", output)

    def test_layout_element_region_refs_reject_unknown_regions(self) -> None:
        failures, output = self.validate_layout_region_refs(
            {"elements": [{"id": "e000001", "region_ref": "r9999"}], "warnings": []},
        )

        self.assertEqual(failures, 1)
        self.assertIn("layout.json elements[0] references unknown region 'r9999'", output)

    def test_layout_warning_region_refs_reject_unknown_regions(self) -> None:
        failures, output = self.validate_layout_region_refs(
            {"elements": [], "warnings": [{"id": "w0001", "region_ref": "r9999"}]},
        )

        self.assertEqual(failures, 1)
        self.assertIn("layout.json warnings[0] references unknown region 'r9999'", output)

    def test_valid_region_refs_are_accepted(self) -> None:
        failures, output = self.validate_regions(
            [
                {
                    "id": "r0001",
                    "page": "p0001",
                    "bbox": [10, 20, 30, 40],
                    "kind": "unknown",
                    "warning_refs": ["w0001"],
                }
            ],
            [{"id": "w0001", "region_ref": "r0001"}],
        )

        self.assertEqual(failures, 0)
        self.assertEqual(output, "")

    def validate_regions(self, regions, warnings=None) -> tuple[int, str]:
        """Validate ``regions`` against a single 1000x1000 page ``p0001``.

        Args:
            regions: Region entries to place in the extraction document.
            warnings: Warning entries for the document. When omitted (``None``)
                a single warning ``w0001`` is supplied; an explicit list,
                including an empty one, is used as given.

        Returns:
            The validator's failure count and everything it wrote to stdout.
        """
        # Compare against None rather than relying on truthiness so callers
        # can request a document with no warnings at all.
        if warnings is None:
            warnings = [{"id": "w0001"}]
        extraction = {
            "pages": [{"id": "p0001", "width": 1000, "height": 1000}],
            "regions": regions,
            "warnings": warnings,
        }
        output = io.StringIO()
        with contextlib.redirect_stdout(output):
            VALIDATOR.validate_extraction_region_refs("extraction.json", extraction)
        return VALIDATOR.failures, output.getvalue()

    def validate_layout_region_refs(self, layout) -> tuple[int, str]:
        """Validate ``layout`` against an extraction holding region ``r0001``.

        Returns:
            The validator's failure count and everything it wrote to stdout.
        """
        extraction = {
            "regions": [
                {"id": "r0001", "page": "p0001", "bbox": [10, 20, 30, 40]}
            ],
        }
        output = io.StringIO()
        with contextlib.redirect_stdout(output):
            VALIDATOR.validate_layout_region_refs("layout.json", layout, extraction)
        return VALIDATOR.failures, output.getvalue()


# Run this module's test cases when the file is executed directly.
if __name__ == "__main__":
    unittest.main()
136 changes: 136 additions & 0 deletions fixtures/validate_fixtures.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,8 @@ def validate_golden_file(path: Path, stage: str, keys: set[str]):
fail(f"{ctx} {key} must be an array")
validate_projection_items(ctx, "pages", golden.get("pages"), required=True)
validate_projection_items(ctx, "spans", golden.get("spans"), required=True)
validate_projection_items(ctx, "regions", golden.get("regions"), required=False)
validate_extraction_region_refs(ctx, golden)
elif stage == "layout":
if not isinstance(golden.get("elements"), list):
fail(f"{ctx} elements must be an array")
Expand Down Expand Up @@ -358,6 +360,135 @@ def validate_table_refs(ctx: str, tables, extraction, layout) -> None:
fail(f"{cell_ctx} in table {table_id} must cite span_refs or element_refs")


def validate_extraction_region_refs(ctx: str, extraction) -> None:
    """Check the region cross-references inside an extraction document.

    Problems are reported through ``fail``; nothing is returned. The checks:

    * every ``regions`` entry is an object, and no region id is repeated;
    * each region's ``page`` is a non-empty string naming a listed page;
    * each region's ``bbox`` is well formed (delegated to ``validate_bbox``)
      and, when the page has usable integer dimensions, inside the page;
    * each region's ``warning_refs`` name warnings present in the document;
    * each warning's ``region_ref`` (when present) is a non-empty string
      naming a region present in the document.

    Args:
        ctx: Prefix used in every failure message.
        extraction: The parsed extraction document. Anything other than a
            dict is ignored, and ``pages``/``regions``/``warnings`` values
            that are not lists are treated as empty.
    """
    if not isinstance(extraction, dict):
        return

    def section(key):
        # A missing or wrongly-typed section is treated as an empty list.
        value = extraction.get(key)
        return value if isinstance(value, list) else []

    def is_plain_int(value) -> bool:
        # bool is a subclass of int, so it has to be excluded explicitly.
        return isinstance(value, int) and not isinstance(value, bool)

    pages = section("pages")
    regions = section("regions")
    warnings = section("warnings")

    # Track which pages exist separately from which pages have usable
    # dimensions. A listed page with non-integer width/height must not be
    # reported as an unknown page; only the bounds check is skipped for it.
    page_ids = set()
    page_dims = {}
    for page in pages:
        if not isinstance(page, dict) or not isinstance(page.get("id"), str):
            continue
        page_ids.add(page["id"])
        width = page.get("width")
        height = page.get("height")
        if is_plain_int(width) and is_plain_int(height):
            page_dims[page["id"]] = (width, height)

    # First pass: collect region ids so that warnings and later regions can
    # be checked against the complete set.
    region_ids = set()
    for region_index, region in enumerate(regions):
        region_ctx = f"{ctx} regions[{region_index}]"
        if not isinstance(region, dict):
            fail(f"{region_ctx} must be an object")
            continue
        region_id = region.get("id")
        if isinstance(region_id, str) and region_id:
            if region_id in region_ids:
                fail(f"{region_ctx}.id duplicates '{region_id}'")
            region_ids.add(region_id)

    warning_ids = {
        warning.get("id")
        for warning in warnings
        if isinstance(warning, dict) and isinstance(warning.get("id"), str)
    }

    # Second pass: validate each region's page, bbox and warning references.
    for region_index, region in enumerate(regions):
        region_ctx = f"{ctx} regions[{region_index}]"
        if not isinstance(region, dict):
            continue  # already reported in the first pass
        page = region.get("page")
        page_dims_for_region = None
        if not isinstance(page, str) or not page:
            fail(f"{region_ctx}.page must be a non-empty string")
        elif page not in page_ids:
            fail(f"{region_ctx} references unknown page '{page}'")
        else:
            # None when the page exists but its dimensions are unusable.
            page_dims_for_region = page_dims.get(page)

        validate_bbox(region.get("bbox"), region_ctx, page_dims_for_region)
        for ref in string_ref_array(
            region.get("warning_refs", []), f"{region_ctx}.warning_refs"
        ):
            if ref not in warning_ids:
                fail(f"{region_ctx} references unknown warning '{ref}'")

    for warning_index, warning in enumerate(warnings):
        warning_ctx = f"{ctx} warnings[{warning_index}]"
        if not isinstance(warning, dict) or "region_ref" not in warning:
            continue
        region_ref = warning.get("region_ref")
        if not isinstance(region_ref, str) or not region_ref:
            fail(f"{warning_ctx}.region_ref must be a non-empty string")
        elif region_ref not in region_ids:
            fail(f"{warning_ctx} references unknown region '{region_ref}'")


def validate_layout_region_refs(ctx: str, layout, extraction) -> None:
    """Check that layout ``region_ref`` values name regions in the extraction.

    Every object in the layout's ``elements`` and ``warnings`` lists that
    carries a ``region_ref`` key is inspected. The value must be a non-empty
    string and must match the id of a region in ``extraction["regions"]``.
    Problems are reported through ``fail``.

    Args:
        ctx: Prefix used in every failure message.
        layout: The parsed layout document.
        extraction: The parsed extraction document that supplies region ids.

    Nothing is checked unless both documents are dicts; sections that are
    not lists are treated as empty.
    """
    if not (isinstance(layout, dict) and isinstance(extraction, dict)):
        return

    extraction_regions = extraction.get("regions")
    if not isinstance(extraction_regions, list):
        extraction_regions = []

    known_region_ids = set()
    for entry in extraction_regions:
        if isinstance(entry, dict) and isinstance(entry.get("id"), str):
            known_region_ids.add(entry.get("id"))

    # Elements are handled before warnings so failures keep their order.
    for section_name in ("elements", "warnings"):
        items = layout.get(section_name)
        if not isinstance(items, list):
            continue
        for position, item in enumerate(items):
            if not isinstance(item, dict) or "region_ref" not in item:
                continue
            item_ctx = f"{ctx} {section_name}[{position}]"
            target = item.get("region_ref")
            if isinstance(target, str) and target:
                if target not in known_region_ids:
                    fail(f"{item_ctx} references unknown region '{target}'")
            else:
                fail(f"{item_ctx}.region_ref must be a non-empty string")


def validate_bbox(value, ctx: str, page_dims=None) -> None:
    """Validate a ``[x0, y0, x1, y1]`` bounding box, reporting via ``fail``.

    At most one problem is reported, and the checks run in this order:

    1. ``value`` is a list of exactly four integers (booleans are rejected);
    2. ``x0 <= x1`` and ``y0 <= y1``;
    3. when ``page_dims`` is given, the box lies within ``0..width`` and
       ``0..height``.

    Args:
        value: The candidate bbox.
        ctx: Prefix used in the failure message.
        page_dims: Optional ``(width, height)`` pair; when ``None`` the
            bounds check is skipped.
    """
    is_four_ints = (
        isinstance(value, list)
        and len(value) == 4
        # bool is a subclass of int, so it is excluded explicitly.
        and all(
            isinstance(coord, int) and not isinstance(coord, bool)
            for coord in value
        )
    )
    if not is_four_ints:
        fail(f"{ctx}.bbox must be a four-integer array")
        return

    x_lo, y_lo, x_hi, y_hi = value
    if x_lo > x_hi or y_lo > y_hi:
        fail(f"{ctx}.bbox must satisfy x0<=x1 and y0<=y1")
        return

    if page_dims is not None:
        page_width, page_height = page_dims
        exceeds_page = (
            x_lo < 0 or y_lo < 0 or x_hi > page_width or y_hi > page_height
        )
        if exceeds_page:
            fail(f"{ctx}.bbox must stay within page bounds")


def string_ref_array(value, ctx: str) -> list[str]:
if not isinstance(value, list):
fail(f"{ctx} must be an array")
Expand Down Expand Up @@ -853,6 +984,11 @@ def validate_stage_expectations(metadata_path: Path, metadata, extraction, layou
extraction_golden,
layout_golden,
)
validate_layout_region_refs(
str((fixture_dir / "layout.json").relative_to(ROOT)),
layout_golden,
extraction_golden,
)
validate_table_goldens(
fixture_dir,
metadata,
Expand Down
Loading