From 6ba05cf60f617e71724ff9e2c2320c9675018b5c Mon Sep 17 00:00:00 2001 From: Phyo Pyae Sone Han Date: Fri, 29 May 2026 17:04:52 +0800 Subject: [PATCH 1/3] feat: don't merge tables separated by a section heading MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two tables that share a column schema but belong to different sections were being stitched into one because header similarity alone drove the merge. A genuine page-split continuation has nothing but page furniture between its fragments, so a section heading between two tables is a reliable "separate tables" signal. The docling adapter now computes a per-table TableMeta.content_before in reading order, and both merge paths consult it: _classify_sequential_pair (pass 1) and should_force_orphan_merge (pass 2). Gated by the new MultiPageConfig.block_on_intervening_content (default True). Furniture handling so legitimate continuations still merge: - Only section_header/title nodes block. Paragraphs, list items, captions, footnotes and figures are ignored — real PDFs scatter those between fragments of a single continued table. - A heading that recurs near-identically (Jaccard >= 0.8) on another page is treated as a running header (a repeated banner, or a journal name docling labels page_header on one page and section_header on the next), not a boundary. Fixes over-eager merging of same-schema per-section tables, e.g. an insurance policy's eight Prestige|Elite|Classic benefit grids collapsing into one. Tests: 5 merger-level guard tests + 3 adapter-level tests building real DoclingDocuments. Full suite 158 passed; the existing continuation fixtures (repeated-header, headerless-continuation, orphan-pair, inconsistent-header-detection) are unaffected. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- CHANGELOG.md | 24 ++++ pyproject.toml | 2 +- src/table_stitcher/adapters/docling.py | 181 ++++++++++++++++++++++++ src/table_stitcher/merger.py | 16 +++ src/table_stitcher/models.py | 20 +++ tests/test_intervening_content_guard.py | 128 +++++++++++++++++ tests/test_merger.py | 60 ++++++++ 7 files changed, 430 insertions(+), 1 deletion(-) create mode 100644 tests/test_intervening_content_guard.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 36ad5bf..a713eef 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,30 @@ the project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html ## [Unreleased] +## [0.4.0] — 2026-05-29 + +### Added + +- **Intervening-content guard** (`block_on_intervening_content`, default `True`). + Two tables that share a column schema but belong to different sections — a + heading sits between them in reading order — are no longer stitched into one. + A genuine page-split continuation has nothing but page furniture between its + fragments, so a section heading between them is a reliable "separate tables" + signal. The docling adapter computes a per-table `TableMeta.content_before` + signal; both merge paths (`_classify_sequential_pair` and + `should_force_orphan_merge`) consult it. + - Running headers mislabeled as headings (e.g. a journal banner labeled + `page_header` on one page and `section_header` on another, or a repeated + "Summary of benefits" banner above every page of a multi-page table) are + detected as furniture via near-identical (Jaccard ≥ 0.8) recurrence across + pages, so they do not block legitimate continuations. + - Only `section_header`/`title` nodes block; plain paragraphs, list items, + captions, footnotes and figures are deliberately ignored, since real PDFs + routinely scatter those between fragments of a single continued table. + - Fixes over-eager merging of same-schema per-section tables (e.g. 
an + insurance policy's eight `Prestige | Elite | Classic` benefit grids being + collapsed into one). + ## [0.3.0] — 2026-05-06 ### Fixed diff --git a/pyproject.toml b/pyproject.toml index b85f782..4479943 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "table-stitcher" -version = "0.3.0" +version = "0.4.0" description = "Reassemble tables split across page boundaries in PDF extraction" readme = "README.md" license = "MIT" diff --git a/src/table_stitcher/adapters/docling.py b/src/table_stitcher/adapters/docling.py index 60e9c9b..c040ec5 100644 --- a/src/table_stitcher/adapters/docling.py +++ b/src/table_stitcher/adapters/docling.py @@ -19,6 +19,7 @@ from ..merger import ( first_row_has_number, is_numeric_like_colnames, + jaccard, normalize_col_name, tokenize, ) @@ -556,6 +557,176 @@ def _get_ref_pointer(ref_obj: Any) -> str: # ------------------------------------------------------------------- +# ------------------------------------------------------------------- +# Intervening-content detection (reading-order furniture filtering) +# +# Philosophy: a table split across a page break has nothing but page +# furniture (running headers/footers, page numbers) between its fragments. +# If a section boundary — a heading or title that is not a running header — +# sits between two fragments in reading order, they are separate +# tables that merely share a column schema. These helpers classify the body +# nodes between consecutive tables so the merger can refuse such merges. +# ------------------------------------------------------------------- + +# Only a structural section boundary reliably means "these are separate +# tables."
Plain prose, list items, captions, footnotes and figures all turn +# up *between* fragments of genuine continuations on real-world PDFs (cell text +# extracted as body nodes, interleaved multi-column reading order, repeated +# legends), so blocking on them regresses legitimate merges. A new heading +# between two tables, by contrast, is a clean separator. +_BLOCKING_LABELS = {"section_header", "title"} + +_CONTINUATION_RE = re.compile(r"\bcont(?:inued|inuation|'?d|\.)?\b", re.IGNORECASE) + + +def _label_str(item: Any) -> str: + """Normalize a docling item label (enum or str) to a lowercase string.""" + lab = getattr(item, "label", None) + return str(getattr(lab, "value", lab) or "").lower() + + +def _norm_text(item: Any) -> str: + """Whitespace-collapsed lowercase text of an item.""" + return " ".join(str(getattr(item, "text", "") or "").split()).lower() + + +def _flatten_body_refs(doc: Any) -> list[str]: + """Return body reference pointers in reading order (DFS through groups).""" + seq: list[str] = [] + groups = getattr(doc, "groups", []) or [] + seen_groups: set[int] = set() + + def walk(node: Any) -> None: + children = getattr(node, "children", None) or [] + for child in children: + ref = _get_ref_pointer(child) + if not ref: + continue + if ref.startswith("#/groups/"): + try: + gi = int(ref.split("/")[-1]) + except ValueError: + continue + # Guard against malformed self-referential group cycles. 
+ if gi in seen_groups or not (0 <= gi < len(groups)): + continue + seen_groups.add(gi) + walk(groups[gi]) + else: + seq.append(ref) + + body = getattr(doc, "body", None) + if body is not None: + walk(body) + return seq + + +def _resolve_ref(doc: Any, ref: str) -> Optional[Any]: + """Resolve a ``#/kind/N`` pointer to its item, or None.""" + try: + _, kind, n_str = ref.split("/") + n = int(n_str) + except (ValueError, AttributeError): + return None + coll = { + "texts": getattr(doc, "texts", None), + "tables": getattr(doc, "tables", None), + "pictures": getattr(doc, "pictures", None), + }.get(kind) + if not coll or n >= len(coll): + return None + return coll[n] + + +def _detect_running_furniture(doc: Any, cfg: MultiPageConfig) -> set[str]: + """ + Identify headings that are actually running headers, not section boundaries. + + Only headings (``_BLOCKING_LABELS``) can block a merge, so only headings + need to be exempted. A heading is a running header when near-identical text + appears on a *different* page — e.g. a repeated ``Summary of benefits`` + banner above every page of a multi-page table, or a journal name that one + page labels ``page_header`` and another mislabels ``section_header`` (the + parser inconsistency this protects against). + + Similarity uses symmetric Jaccard with a high threshold (near-duplicate). + A real, unique heading such as ``38e - Trip postponement`` has no + near-duplicate on another page, so it correctly stays a blocker — unlike a + looser containment metric, which a short subset string (a TOC entry, say) + would spuriously satisfy. + """ + texts = getattr(doc, "texts", []) or [] + # (ref, tokens, page, label) for every text node — the comparison pool. 
+ nodes: list[tuple[str, set, Any, str]] = [] + for i, item in enumerate(texts): + prov = getattr(item, "prov", None) or [] + page = getattr(prov[0], "page_no", None) if prov else None + nodes.append((f"#/texts/{i}", tokenize(_norm_text(item)), page, _label_str(item))) + + furniture: set[str] = set() + for ref, toks, page, label in nodes: + if label not in _BLOCKING_LABELS or not toks or page is None: + continue + for ref2, toks2, page2, _ in nodes: + if ref2 == ref or page2 is None or page2 == page or not toks2: + continue + if jaccard(toks, toks2) >= 0.8: + furniture.add(ref) + break + return furniture + + +def _ref_is_blocking(doc: Any, ref: str, furniture: set[str], cfg: MultiPageConfig) -> bool: + """Whether a body node between two tables marks a real table boundary. + + Only a non-furniture section heading qualifies (see ``_BLOCKING_LABELS``). + Figures, list items and plain paragraphs are deliberately ignored: on real + PDFs they routinely appear between fragments of a single continued table. + """ + if ref in furniture or not ref.startswith("#/texts/"): + return False + item = _resolve_ref(doc, ref) + if item is None: + return False + if _label_str(item) not in _BLOCKING_LABELS: + return False + text = str(getattr(item, "text", "") or "") + if not text.strip(): + return False + # A "(continued)" heading marks a continuation, not a new table. + if _CONTINUATION_RE.search(text): + return False + return True + + +def _compute_content_before(doc: Any, cfg: MultiPageConfig) -> dict[int, bool]: + """ + For each table (keyed by docling index), whether substantive body content + separates it from the previous table in reading order. Tables absent from + the body reading order (orphans) are omitted, disabling the guard for them. 
+ """ + seq = _flatten_body_refs(doc) + if not seq: + return {} + furniture = _detect_running_furniture(doc, cfg) + + result: dict[int, bool] = {} + prev_table_seen = False + blocking_seen = False + for ref in seq: + if ref.startswith("#/tables/"): + try: + t_idx = int(ref.split("/")[-1]) + except ValueError: + continue + result[t_idx] = prev_table_seen and blocking_seen + prev_table_seen = True + blocking_seen = False + elif not blocking_seen and _ref_is_blocking(doc, ref, furniture, cfg): + blocking_seen = True + return result + + class DoclingAdapter: """ Table-stitcher adapter for Docling (docling-core). @@ -569,6 +740,15 @@ def extract(self, doc: DoclingDocument, cfg: MultiPageConfig) -> list[TableMeta] total = len(doc.tables) skipped = 0 + # Reading-order map: which tables have substantive body content before + # them (used by the merger's intervening-content guard). Computed once. + content_before_map: dict[int, bool] = {} + if cfg.block_on_intervening_content: + try: + content_before_map = _compute_content_before(doc, cfg) + except Exception as e: # never let the guard break extraction + log.warning(f"Intervening-content detection failed: {e}") + for idx, table in enumerate(doc.tables): try: df = _grid_to_dataframe(table, doc) @@ -640,6 +820,7 @@ def extract(self, doc: DoclingDocument, cfg: MultiPageConfig) -> list[TableMeta] row_count=df.shape[0], continuation_content=continuation_content, is_headerless=is_headerless, + content_before=content_before_map.get(idx), ) ) diff --git a/src/table_stitcher/merger.py b/src/table_stitcher/merger.py index a9a6704..6298461 100644 --- a/src/table_stitcher/merger.py +++ b/src/table_stitcher/merger.py @@ -312,6 +312,11 @@ def should_force_orphan_merge(h: TableMeta, d: TableMeta, cfg: MultiPageConfig) return False, "" if abs(h.width - d.width) > cfg.max_width_difference: return False, "" + # Intervening-content guard — sibling of the one in _classify_sequential_pair. 
+ # Pass 2 reaches this without going through that function, so the guard must + # be repeated here: a heading before the data fragment means a new table. + if cfg.block_on_intervening_content and d.content_before: + return False, "content_between_tables" layout = layout_suggests_continuation(h, d, cfg) if h.is_header_orphan and d.is_data_orphan: @@ -705,6 +710,17 @@ def _classify_sequential_pair( if page_gap < 1 or page_gap > cfg.max_page_gap: return False, "page_gap_out_of_range", False, [] + # --- Intervening-content guard --- + # A genuine page-split continuation has nothing but page furniture between + # its fragments. Substantive body content (in practice a section heading or + # title; paragraphs, list items and figures are ignored) between tA and tB + # means they are separate tables that merely share a column schema, not one + # table split across a page break. The adapter computes this in reading order + # (furniture, captions and footnotes are already filtered out); ``None`` means + # position unknown, so we defer to the other signals rather than block. + if cfg.block_on_intervening_content and tB.content_before: + return False, "content_between_tables", False, [] + # --- Spillover (checked before width guards since spillover can cross # width boundaries legitimately: 1-col fragment follows N-col table) --- if is_spillover_fragment(tA, tB, cfg): diff --git a/src/table_stitcher/models.py b/src/table_stitcher/models.py index 0fba16c..4a427be 100644 --- a/src/table_stitcher/models.py +++ b/src/table_stitcher/models.py @@ -92,6 +92,19 @@ class MultiPageConfig: The structural signal (1 col following N cols) is strong enough for most cases. """ + # --- Intervening-Content Guard --- + block_on_intervening_content: bool = True + """ + If True, refuse to merge two fragments when a section heading or title + (not a plain paragraph, list item, or figure) appears between them in + reading order.
A genuine page-split continuation has nothing between its + fragments except page furniture (running headers/footers, page numbers), + which is filtered out, as are table-attached captions/footnotes and + ``(continued)`` markers. Requires an adapter that populates + ``TableMeta.content_before``; when it is left ``None`` the guard is a + no-op, so non-docling adapters are unaffected. + """ + # --- Cell Stitching --- stitch_separator: str = "\n" """Character(s) used to join split cell content.""" @@ -118,6 +131,13 @@ class TableMeta: row_count: int continuation_content: list[dict] = field(default_factory=list) is_headerless: bool = False + content_before: Optional[bool] = None + """ + Whether substantive (non-furniture) body content immediately precedes this + fragment in reading order, since the previous table fragment. ``None`` means + the adapter could not place the table in reading order (e.g. an orphan + table), in which case the intervening-content guard is skipped for it. + """ @dataclass diff --git a/tests/test_intervening_content_guard.py b/tests/test_intervening_content_guard.py new file mode 100644 index 0000000..70bc807 --- /dev/null +++ b/tests/test_intervening_content_guard.py @@ -0,0 +1,128 @@ +""" +Adapter-level tests for the intervening-content guard. + +These build real ``DoclingDocument`` objects (via the docling-core builder API) +so they exercise the *producer* side — ``_detect_running_furniture`` and +``_compute_content_before`` in the docling adapter — not just the merger's +consumption of ``content_before``. + +Intent (why these matter): +- Two tables that share a column schema but belong to *different sections* + (a heading sits between them) must NOT be stitched into one. This is the + GreatEastern COVID-endorsement bug: eight per-benefit plan grids, all with a + ``Prestige | Elite | Classic`` header, were merged across page breaks. 
+- A genuine continuation whose only intervening content is a *running header* + (the same heading repeated atop each page) must STILL merge. +""" + +from docling_core.types.doc import ( + BoundingBox, + CoordOrigin, + DocItemLabel, + DoclingDocument, + ProvenanceItem, + Size, + TableCell, + TableData, +) + +from table_stitcher import stitch_tables +from table_stitcher.adapters.docling import DoclingAdapter, _compute_content_before +from table_stitcher.models import MultiPageConfig + +PLAN_HEADER = ["", "Prestige plan", "Elite plan", "Classic plan"] + + +def _table_data(header, rows): + cells, grid = [], [] + hrow = [] + for j, h in enumerate(header): + c = TableCell( + text=str(h), row_span=1, col_span=1, column_header=True, row_header=False, + start_row_offset_idx=0, end_row_offset_idx=1, + start_col_offset_idx=j, end_col_offset_idx=j + 1, + ) + hrow.append(c) + cells.append(c) + grid.append(hrow) + for i, row in enumerate(rows): + grow = [] + for j, v in enumerate(row): + c = TableCell( + text=str(v), row_span=1, col_span=1, column_header=False, row_header=False, + start_row_offset_idx=i + 1, end_row_offset_idx=i + 2, + start_col_offset_idx=j, end_col_offset_idx=j + 1, + ) + grow.append(c) + cells.append(c) + grid.append(grow) + return TableData(num_rows=len(rows) + 1, num_cols=len(header), table_cells=cells, grid=grid) + + +def _prov(page, top): + return ProvenanceItem( + page_no=page, + bbox=BoundingBox(l=50, t=top, r=550, b=top + 18, coord_origin=CoordOrigin.TOPLEFT), + charspan=(0, 0), + ) + + +def _new_doc(pages=2): + doc = DoclingDocument(name="synthetic") + for p in range(1, pages + 1): + doc.add_page(page_no=p, size=Size(width=600, height=800)) + return doc + + +def _is_blanked(table): + """A satellite merged away by inject() becomes num_rows=0 with empty prov.""" + return (getattr(table.data, "num_rows", 0) or 0) == 0 and not (table.prov or []) + + +def test_heading_between_same_schema_tables_blocks_merge(): + """The endorsement bug: a unique heading 
between two plan grids => separate.""" + doc = _new_doc() + doc.add_table(data=_table_data(PLAN_HEADER, [["Repatriation", "S$5,000", "S$5,000", "S$5,000"]]), + prov=_prov(1, 600)) + doc.add_text(label=DocItemLabel.SECTION_HEADER, text="38d - Trip cancellation", prov=_prov(2, 80)) + doc.add_text(label=DocItemLabel.TEXT, text="Cover under section 15 is extended ...", prov=_prov(2, 110)) + doc.add_table(data=_table_data(PLAN_HEADER, [["Trip cancellation", "S$8,000", "S$5,000", "S$3,000"]]), + prov=_prov(2, 300)) + + cmap = _compute_content_before(doc, MultiPageConfig()) + assert cmap.get(1) is True, "heading before the 2nd table should be detected as a boundary" + + stitch_tables(doc) + assert not _is_blanked(doc.tables[0]) and not _is_blanked(doc.tables[1]), \ + "tables in different sections must not be merged" + + +def test_running_header_between_fragments_still_merges(): + """A repeated heading atop each page is furniture, not a boundary => merge.""" + doc = _new_doc() + doc.add_text(label=DocItemLabel.SECTION_HEADER, text="Summary of benefits", prov=_prov(1, 60)) + doc.add_table(data=_table_data(PLAN_HEADER, [["Death", "100%", "100%", "100%"]]), + prov=_prov(1, 600)) + # Same heading repeated at the top of page 2 — a running header. 
+ doc.add_text(label=DocItemLabel.SECTION_HEADER, text="Summary of benefits", prov=_prov(2, 60)) + doc.add_table(data=_table_data(PLAN_HEADER, [["Disability", "100%", "100%", "100%"]]), + prov=_prov(2, 90)) + + cmap = _compute_content_before(doc, MultiPageConfig()) + assert cmap.get(1) is False, "a repeated running header must not count as a boundary" + + stitch_tables(doc) + assert _is_blanked(doc.tables[1]), "a true continuation should still be stitched" + + +def test_guard_disabled_restores_legacy_merge(): + """With the flag off, the heading no longer blocks (legacy behaviour).""" + doc = _new_doc() + doc.add_table(data=_table_data(PLAN_HEADER, [["Repatriation", "S$5,000", "S$5,000", "S$5,000"]]), + prov=_prov(1, 600)) + doc.add_text(label=DocItemLabel.SECTION_HEADER, text="38d - Trip cancellation", prov=_prov(2, 80)) + doc.add_table(data=_table_data(PLAN_HEADER, [["Trip cancellation", "S$8,000", "S$5,000", "S$3,000"]]), + prov=_prov(2, 300)) + + stitch_tables(doc, config=MultiPageConfig(block_on_intervening_content=False)) + assert _is_blanked(doc.tables[1]), "legacy header-similarity merge should still happen when disabled" diff --git a/tests/test_merger.py b/tests/test_merger.py index bcb73fa..0ef5a18 100644 --- a/tests/test_merger.py +++ b/tests/test_merger.py @@ -30,6 +30,7 @@ def _make_meta( is_headerless: bool = False, vert_top: float = None, vert_bottom: float = None, + content_before: bool = None, ) -> TableMeta: """Build a minimal TableMeta for testing.""" return TableMeta( @@ -51,6 +52,7 @@ def _make_meta( numeric_like_cols=is_numeric_like_colnames([str(c) for c in df.columns]), row_count=df.shape[0], is_headerless=is_headerless, + content_before=content_before, ) @@ -789,3 +791,61 @@ def test_duplicate_column_names_do_not_break_folding(self): assert out.shape == (1, 4) # Continuation folded into the 4th column (Notes, by positional match). 
assert out.iloc[0, 3] == "first\nsecond line" + + +# --------------------------------------------------------------------------- +# Intervening-content guard: a paragraph/heading between two same-schema +# tables means they are separate tables, not one split across a page break. +# --------------------------------------------------------------------------- + + +class TestInterveningContentGuard: + """``content_before`` blocks merges that header similarity would allow.""" + + @staticmethod + def _benefit(idx, page, label, content_before=None): + # Single-row plan grid with the identical Prestige/Elite/Classic header + # shared by every COVID endorsement benefit table — header Jaccard 1.0. + df = pd.DataFrame( + [[label, "S$8,000", "S$5,000", "S$3,000"]], + columns=["", "Prestige plan", "Elite plan", "Classic plan"], + ) + return _make_meta(idx, df, start_page=page, content_before=content_before) + + def test_identical_headers_merge_without_guard_signal(self): + # Baseline: with content_before=None the strict-header path still merges + # (this is the over-eager behaviour the guard is designed to stop). + a = self._benefit(0, 1, "Trip cancellation") + b = self._benefit(1, 2, "Trip postponement") + logical = merge_multipage_tables([a, b], MultiPageConfig()) + assert len(logical) == 1 + assert "header_similarity_strict" in logical[0].merge_reason + + def test_content_between_blocks_merge(self): + # A heading/paragraph sits before the second fragment -> stay separate. + a = self._benefit(0, 1, "Trip cancellation") + b = self._benefit(1, 2, "Trip postponement", content_before=True) + logical = merge_multipage_tables([a, b], MultiPageConfig()) + assert len(logical) == 2 + + def test_furniture_only_gap_still_merges(self): + # content_before=False means only furniture (running header) sat between + # the fragments -> a genuine continuation, still merged. 
+ a = self._benefit(0, 1, "Summary part 1") + b = self._benefit(1, 2, "Summary part 2", content_before=False) + logical = merge_multipage_tables([a, b], MultiPageConfig()) + assert len(logical) == 1 + + def test_flag_disables_guard(self): + a = self._benefit(0, 1, "Trip cancellation") + b = self._benefit(1, 2, "Trip postponement", content_before=True) + cfg = MultiPageConfig(block_on_intervening_content=False) + logical = merge_multipage_tables([a, b], cfg) + assert len(logical) == 1 + + def test_none_is_backward_compatible(self): + # Adapters that don't populate the field (None) must not be affected. + a = self._benefit(0, 1, "Trip cancellation") + b = self._benefit(1, 2, "Trip postponement", content_before=None) + logical = merge_multipage_tables([a, b], MultiPageConfig()) + assert len(logical) == 1 From f3e5cc55cb663b6060db256b1822d7b746347026 Mon Sep 17 00:00:00 2001 From: Phyo Pyae Sone Han Date: Fri, 29 May 2026 17:55:05 +0800 Subject: [PATCH 2/3] style: drop unused DoclingAdapter import in guard test (ruff F401) Co-Authored-By: Claude Opus 4.8 (1M context) --- tests/test_intervening_content_guard.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_intervening_content_guard.py b/tests/test_intervening_content_guard.py index 70bc807..400afe4 100644 --- a/tests/test_intervening_content_guard.py +++ b/tests/test_intervening_content_guard.py @@ -27,7 +27,7 @@ ) from table_stitcher import stitch_tables -from table_stitcher.adapters.docling import DoclingAdapter, _compute_content_before +from table_stitcher.adapters.docling import _compute_content_before from table_stitcher.models import MultiPageConfig PLAN_HEADER = ["", "Prestige plan", "Elite plan", "Classic plan"] From 12401f40db64eacecefb261c38c787805446bc8b Mon Sep 17 00:00:00 2001 From: Phyo Pyae Sone Han Date: Fri, 29 May 2026 18:10:44 +0800 Subject: [PATCH 3/3] style: ruff-format the guard test file Co-Authored-By: Claude Opus 4.8 (1M context) --- 
tests/test_intervening_content_guard.py | 77 +++++++++++++++++-------- 1 file changed, 54 insertions(+), 23 deletions(-) diff --git a/tests/test_intervening_content_guard.py b/tests/test_intervening_content_guard.py index 400afe4..802288d 100644 --- a/tests/test_intervening_content_guard.py +++ b/tests/test_intervening_content_guard.py @@ -38,9 +38,15 @@ def _table_data(header, rows): hrow = [] for j, h in enumerate(header): c = TableCell( - text=str(h), row_span=1, col_span=1, column_header=True, row_header=False, - start_row_offset_idx=0, end_row_offset_idx=1, - start_col_offset_idx=j, end_col_offset_idx=j + 1, + text=str(h), + row_span=1, + col_span=1, + column_header=True, + row_header=False, + start_row_offset_idx=0, + end_row_offset_idx=1, + start_col_offset_idx=j, + end_col_offset_idx=j + 1, ) hrow.append(c) cells.append(c) @@ -49,9 +55,15 @@ def _table_data(header, rows): grow = [] for j, v in enumerate(row): c = TableCell( - text=str(v), row_span=1, col_span=1, column_header=False, row_header=False, - start_row_offset_idx=i + 1, end_row_offset_idx=i + 2, - start_col_offset_idx=j, end_col_offset_idx=j + 1, + text=str(v), + row_span=1, + col_span=1, + column_header=False, + row_header=False, + start_row_offset_idx=i + 1, + end_row_offset_idx=i + 2, + start_col_offset_idx=j, + end_col_offset_idx=j + 1, ) grow.append(c) cells.append(c) @@ -82,31 +94,42 @@ def _is_blanked(table): def test_heading_between_same_schema_tables_blocks_merge(): """The endorsement bug: a unique heading between two plan grids => separate.""" doc = _new_doc() - doc.add_table(data=_table_data(PLAN_HEADER, [["Repatriation", "S$5,000", "S$5,000", "S$5,000"]]), - prov=_prov(1, 600)) - doc.add_text(label=DocItemLabel.SECTION_HEADER, text="38d - Trip cancellation", prov=_prov(2, 80)) - doc.add_text(label=DocItemLabel.TEXT, text="Cover under section 15 is extended ...", prov=_prov(2, 110)) - doc.add_table(data=_table_data(PLAN_HEADER, [["Trip cancellation", "S$8,000", "S$5,000", "S$3,000"]]), 
- prov=_prov(2, 300)) + doc.add_table( + data=_table_data(PLAN_HEADER, [["Repatriation", "S$5,000", "S$5,000", "S$5,000"]]), + prov=_prov(1, 600), + ) + doc.add_text( + label=DocItemLabel.SECTION_HEADER, text="38d - Trip cancellation", prov=_prov(2, 80) + ) + doc.add_text( + label=DocItemLabel.TEXT, text="Cover under section 15 is extended ...", prov=_prov(2, 110) + ) + doc.add_table( + data=_table_data(PLAN_HEADER, [["Trip cancellation", "S$8,000", "S$5,000", "S$3,000"]]), + prov=_prov(2, 300), + ) cmap = _compute_content_before(doc, MultiPageConfig()) assert cmap.get(1) is True, "heading before the 2nd table should be detected as a boundary" stitch_tables(doc) - assert not _is_blanked(doc.tables[0]) and not _is_blanked(doc.tables[1]), \ + assert not _is_blanked(doc.tables[0]) and not _is_blanked(doc.tables[1]), ( "tables in different sections must not be merged" + ) def test_running_header_between_fragments_still_merges(): """A repeated heading atop each page is furniture, not a boundary => merge.""" doc = _new_doc() doc.add_text(label=DocItemLabel.SECTION_HEADER, text="Summary of benefits", prov=_prov(1, 60)) - doc.add_table(data=_table_data(PLAN_HEADER, [["Death", "100%", "100%", "100%"]]), - prov=_prov(1, 600)) + doc.add_table( + data=_table_data(PLAN_HEADER, [["Death", "100%", "100%", "100%"]]), prov=_prov(1, 600) + ) # Same heading repeated at the top of page 2 — a running header. 
doc.add_text(label=DocItemLabel.SECTION_HEADER, text="Summary of benefits", prov=_prov(2, 60)) - doc.add_table(data=_table_data(PLAN_HEADER, [["Disability", "100%", "100%", "100%"]]), - prov=_prov(2, 90)) + doc.add_table( + data=_table_data(PLAN_HEADER, [["Disability", "100%", "100%", "100%"]]), prov=_prov(2, 90) + ) cmap = _compute_content_before(doc, MultiPageConfig()) assert cmap.get(1) is False, "a repeated running header must not count as a boundary" @@ -118,11 +141,19 @@ def test_running_header_between_fragments_still_merges(): def test_guard_disabled_restores_legacy_merge(): """With the flag off, the heading no longer blocks (legacy behaviour).""" doc = _new_doc() - doc.add_table(data=_table_data(PLAN_HEADER, [["Repatriation", "S$5,000", "S$5,000", "S$5,000"]]), - prov=_prov(1, 600)) - doc.add_text(label=DocItemLabel.SECTION_HEADER, text="38d - Trip cancellation", prov=_prov(2, 80)) - doc.add_table(data=_table_data(PLAN_HEADER, [["Trip cancellation", "S$8,000", "S$5,000", "S$3,000"]]), - prov=_prov(2, 300)) + doc.add_table( + data=_table_data(PLAN_HEADER, [["Repatriation", "S$5,000", "S$5,000", "S$5,000"]]), + prov=_prov(1, 600), + ) + doc.add_text( + label=DocItemLabel.SECTION_HEADER, text="38d - Trip cancellation", prov=_prov(2, 80) + ) + doc.add_table( + data=_table_data(PLAN_HEADER, [["Trip cancellation", "S$8,000", "S$5,000", "S$3,000"]]), + prov=_prov(2, 300), + ) stitch_tables(doc, config=MultiPageConfig(block_on_intervening_content=False)) - assert _is_blanked(doc.tables[1]), "legacy header-similarity merge should still happen when disabled" + assert _is_blanked(doc.tables[1]), ( + "legacy header-similarity merge should still happen when disabled" + )