From 6ba05cf60f617e71724ff9e2c2320c9675018b5c Mon Sep 17 00:00:00 2001
From: Phyo Pyae Sone Han <phyohan1234@gmail.com>
Date: Fri, 29 May 2026 17:04:52 +0800
Subject: [PATCH 1/3] feat: don't merge tables separated by a section heading
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two tables that share a column schema but belong to different sections were
being stitched into one because header similarity alone drove the merge. A
genuine page-split continuation has nothing but page furniture between its
fragments, so a section heading between two tables is a reliable "separate
tables" signal.

The docling adapter now computes a per-table TableMeta.content_before in
reading order, and both merge paths consult it: _classify_sequential_pair
(pass 1) and should_force_orphan_merge (pass 2). Gated by the new
MultiPageConfig.block_on_intervening_content (default True).

Furniture handling so legitimate continuations still merge:
- Only section_header/title nodes block. Paragraphs, list items, captions,
  footnotes and figures are ignored — real PDFs scatter those between
  fragments of a single continued table.
- A heading that recurs near-identically (Jaccard >= 0.8) on another page is
  treated as a running header (a repeated banner, or a journal name docling
  labels page_header on one page and section_header on the next), not a
  boundary.

Fixes over-eager merging of same-schema per-section tables, e.g. an insurance
policy's eight Prestige|Elite|Classic benefit grids collapsing into one.

Tests: 5 merger-level guard tests + 3 adapter-level tests building real
DoclingDocuments. Full suite 158 passed; the existing continuation fixtures
(repeated-header, headerless-continuation, orphan-pair,
inconsistent-header-detection) are unaffected.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 CHANGELOG.md                            |  24 ++++
 pyproject.toml                          |   2 +-
 src/table_stitcher/adapters/docling.py  | 181 ++++++++++++++++++++++++
 src/table_stitcher/merger.py            |  16 +++
 src/table_stitcher/models.py            |  20 +++
 tests/test_intervening_content_guard.py | 128 +++++++++++++++++
 tests/test_merger.py                    |  60 ++++++++
 7 files changed, 430 insertions(+), 1 deletion(-)
 create mode 100644 tests/test_intervening_content_guard.py

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 36ad5bf..a713eef 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,30 @@ the project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html
 
 ## [Unreleased]
 
+## [0.4.0] — 2026-05-29
+
+### Added
+
+- **Intervening-content guard** (`block_on_intervening_content`, default `True`).
+  Two tables that share a column schema but belong to different sections — a
+  heading sits between them in reading order — are no longer stitched into one.
+  A genuine page-split continuation has nothing but page furniture between its
+  fragments, so a section heading between them is a reliable "separate tables"
+  signal. The docling adapter computes a per-table `TableMeta.content_before`
+  signal; both merge paths (`_classify_sequential_pair` and
+  `should_force_orphan_merge`) consult it.
+  - Running headers mislabeled as headings (e.g. a journal banner labeled
+    `page_header` on one page and `section_header` on another, or a repeated
+    "Summary of benefits" banner above every page of a multi-page table) are
+    detected as furniture via near-identical (Jaccard ≥ 0.8) recurrence across
+    pages, so they do not block legitimate continuations.
+  - Only `section_header`/`title` nodes block; plain paragraphs, list items,
+    captions, footnotes and figures are deliberately ignored, since real PDFs
+    routinely scatter those between fragments of a single continued table.
+  - Fixes over-eager merging of same-schema per-section tables (e.g. an
+    insurance policy's eight `Prestige | Elite | Classic` benefit grids being
+    collapsed into one).
+
 ## [0.3.0] — 2026-05-06
 
 ### Fixed
diff --git a/pyproject.toml b/pyproject.toml
index b85f782..4479943 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "table-stitcher"
-version = "0.3.0"
+version = "0.4.0"
 description = "Reassemble tables split across page boundaries in PDF extraction"
 readme = "README.md"
 license = "MIT"
diff --git a/src/table_stitcher/adapters/docling.py b/src/table_stitcher/adapters/docling.py
index 60e9c9b..c040ec5 100644
--- a/src/table_stitcher/adapters/docling.py
+++ b/src/table_stitcher/adapters/docling.py
@@ -19,6 +19,7 @@
 from ..merger import (
     first_row_has_number,
     is_numeric_like_colnames,
+    jaccard,
     normalize_col_name,
     tokenize,
 )
@@ -556,6 +557,176 @@ def _get_ref_pointer(ref_obj: Any) -> str:
 # -------------------------------------------------------------------
 
 
+# -------------------------------------------------------------------
+# Intervening-content detection (reading-order furniture filtering)
+#
+# Philosophy: a table split across a page break has nothing but page
+# furniture (running headers/footers, page numbers) between its fragments.
+# If a real section heading (one that is not itself a running header) sits
+# between two fragments in reading order, they are separate tables that
+# merely share a column schema. These helpers classify the body nodes
+# between consecutive tables so the merger can refuse such merges.
+# -------------------------------------------------------------------
+
+# Only a structural section boundary reliably means "these are separate
+# tables." Plain prose, list items, captions, footnotes and figures all turn
+# up *between* fragments of genuine continuations on real-world PDFs (cell text
+# extracted as body nodes, interleaved multi-column reading order, repeated
+# legends), so blocking on them regresses legitimate merges. A new heading
+# between two tables, by contrast, is a clean separator.
+_BLOCKING_LABELS = {"section_header", "title"}
+
+_CONTINUATION_RE = re.compile(r"\bcont(?:inued|inuation|'?d|\.)?\b", re.IGNORECASE)
+
+
+def _label_str(item: Any) -> str:
+    """Normalize a docling item label (enum or str) to a lowercase string."""
+    lab = getattr(item, "label", None)
+    return str(getattr(lab, "value", lab) or "").lower()
+
+
+def _norm_text(item: Any) -> str:
+    """Whitespace-collapsed lowercase text of an item."""
+    return " ".join(str(getattr(item, "text", "") or "").split()).lower()
+
+
+def _flatten_body_refs(doc: Any) -> list[str]:
+    """Return body reference pointers in reading order (DFS through groups)."""
+    seq: list[str] = []
+    groups = getattr(doc, "groups", []) or []
+    seen_groups: set[int] = set()
+
+    def walk(node: Any) -> None:
+        children = getattr(node, "children", None) or []
+        for child in children:
+            ref = _get_ref_pointer(child)
+            if not ref:
+                continue
+            if ref.startswith("#/groups/"):
+                try:
+                    gi = int(ref.split("/")[-1])
+                except ValueError:
+                    continue
+                # Guard against malformed self-referential group cycles.
+                if gi in seen_groups or not (0 <= gi < len(groups)):
+                    continue
+                seen_groups.add(gi)
+                walk(groups[gi])
+            else:
+                seq.append(ref)
+
+    body = getattr(doc, "body", None)
+    if body is not None:
+        walk(body)
+    return seq
+
+
+def _resolve_ref(doc: Any, ref: str) -> Optional[Any]:
+    """Resolve a ``#/kind/N`` pointer to its item, or None."""
+    try:
+        _, kind, n_str = ref.split("/")
+        n = int(n_str)
+    except (ValueError, AttributeError):
+        return None
+    coll = {
+        "texts": getattr(doc, "texts", None),
+        "tables": getattr(doc, "tables", None),
+        "pictures": getattr(doc, "pictures", None),
+    }.get(kind)
+    if not coll or n >= len(coll):
+        return None
+    return coll[n]
+
+
+def _detect_running_furniture(doc: Any, cfg: MultiPageConfig) -> set[str]:
+    """
+    Identify headings that are actually running headers, not section boundaries.
+
+    Only headings (``_BLOCKING_LABELS``) can block a merge, so only headings
+    need to be exempted. A heading is a running header when near-identical text
+    appears on a *different* page — e.g. a repeated ``Summary of benefits``
+    banner above every page of a multi-page table, or a journal name that one
+    page labels ``page_header`` and another mislabels ``section_header`` (the
+    parser inconsistency this protects against).
+
+    Similarity uses symmetric Jaccard with a high threshold (near-duplicate).
+    A real, unique heading such as ``38e - Trip postponement`` has no
+    near-duplicate on another page, so it correctly stays a blocker — unlike a
+    looser containment metric, which a short subset string (a TOC entry, say)
+    would spuriously satisfy.
+    """
+    texts = getattr(doc, "texts", []) or []
+    # (ref, tokens, page, label) for every text node — the comparison pool.
+    nodes: list[tuple[str, set, Any, str]] = []
+    for i, item in enumerate(texts):
+        prov = getattr(item, "prov", None) or []
+        page = getattr(prov[0], "page_no", None) if prov else None
+        nodes.append((f"#/texts/{i}", tokenize(_norm_text(item)), page, _label_str(item)))
+
+    furniture: set[str] = set()
+    for ref, toks, page, label in nodes:
+        if label not in _BLOCKING_LABELS or not toks or page is None:
+            continue
+        for ref2, toks2, page2, _ in nodes:
+            if ref2 == ref or page2 is None or page2 == page or not toks2:
+                continue
+            if jaccard(toks, toks2) >= 0.8:
+                furniture.add(ref)
+                break
+    return furniture
+
+
+def _ref_is_blocking(doc: Any, ref: str, furniture: set[str], cfg: MultiPageConfig) -> bool:
+    """Whether a body node between two tables marks a real table boundary.
+
+    Only a non-furniture section heading qualifies (see ``_BLOCKING_LABELS``).
+    Figures, list items and plain paragraphs are deliberately ignored: on real
+    PDFs they routinely appear between fragments of a single continued table.
+    """
+    if ref in furniture or not ref.startswith("#/texts/"):
+        return False
+    item = _resolve_ref(doc, ref)
+    if item is None:
+        return False
+    if _label_str(item) not in _BLOCKING_LABELS:
+        return False
+    text = str(getattr(item, "text", "") or "")
+    if not text.strip():
+        return False
+    # A "(continued)" heading marks a continuation, not a new table.
+    if _CONTINUATION_RE.search(text):
+        return False
+    return True
+
+
+def _compute_content_before(doc: Any, cfg: MultiPageConfig) -> dict[int, bool]:
+    """
+    For each table (keyed by docling index), whether substantive body content
+    separates it from the previous table in reading order. Tables absent from
+    the body reading order (orphans) are omitted, disabling the guard for them.
+    """
+    seq = _flatten_body_refs(doc)
+    if not seq:
+        return {}
+    furniture = _detect_running_furniture(doc, cfg)
+
+    result: dict[int, bool] = {}
+    prev_table_seen = False
+    blocking_seen = False
+    for ref in seq:
+        if ref.startswith("#/tables/"):
+            try:
+                t_idx = int(ref.split("/")[-1])
+            except ValueError:
+                continue
+            result[t_idx] = prev_table_seen and blocking_seen
+            prev_table_seen = True
+            blocking_seen = False
+        elif not blocking_seen and _ref_is_blocking(doc, ref, furniture, cfg):
+            blocking_seen = True
+    return result
+
+
 class DoclingAdapter:
     """
     Table-stitcher adapter for Docling (docling-core).
@@ -569,6 +740,15 @@ def extract(self, doc: DoclingDocument, cfg: MultiPageConfig) -> list[TableMeta]
         total = len(doc.tables)
         skipped = 0
 
+        # Reading-order map: which tables follow a blocking section heading
+        # (used by the merger's intervening-content guard). Computed once.
+        content_before_map: dict[int, bool] = {}
+        if cfg.block_on_intervening_content:
+            try:
+                content_before_map = _compute_content_before(doc, cfg)
+            except Exception as e:  # never let the guard break extraction
+                log.warning(f"Intervening-content detection failed: {e}")
+
         for idx, table in enumerate(doc.tables):
             try:
                 df = _grid_to_dataframe(table, doc)
@@ -640,6 +820,7 @@ def extract(self, doc: DoclingDocument, cfg: MultiPageConfig) -> list[TableMeta]
                     row_count=df.shape[0],
                     continuation_content=continuation_content,
                     is_headerless=is_headerless,
+                    content_before=content_before_map.get(idx),
                 )
             )
 
diff --git a/src/table_stitcher/merger.py b/src/table_stitcher/merger.py
index a9a6704..6298461 100644
--- a/src/table_stitcher/merger.py
+++ b/src/table_stitcher/merger.py
@@ -312,6 +312,11 @@ def should_force_orphan_merge(h: TableMeta, d: TableMeta, cfg: MultiPageConfig)
         return False, ""
     if abs(h.width - d.width) > cfg.max_width_difference:
         return False, ""
+    # Intervening-content guard — sibling of the one in _classify_sequential_pair.
+    # Pass 2 reaches this without going through that function, so the guard must
+    # be repeated here: a heading before the data fragment means a new table.
+    if cfg.block_on_intervening_content and d.content_before:
+        return False, "content_between_tables"
 
     layout = layout_suggests_continuation(h, d, cfg)
     if h.is_header_orphan and d.is_data_orphan:
@@ -705,6 +710,17 @@ def _classify_sequential_pair(
     if page_gap < 1 or page_gap > cfg.max_page_gap:
         return False, "page_gap_out_of_range", False, []
 
+    # --- Intervening-content guard ---
+    # A genuine page-split continuation has nothing but page furniture between
+    # its fragments. A section heading between tA and tB means they are
+    # separate tables that merely share a column schema, not one table split
+    # across a page break. The adapter computes this in reading order (running
+    # headers and "(continued)" headings are already filtered out; paragraphs,
+    # list items and figures never block); ``None`` means position unknown, so
+    # we defer to the other signals rather than block.
+    if cfg.block_on_intervening_content and tB.content_before:
+        return False, "content_between_tables", False, []
+
     # --- Spillover (checked before width guards since spillover can cross
     # width boundaries legitimately: 1-col fragment follows N-col table) ---
     if is_spillover_fragment(tA, tB, cfg):
diff --git a/src/table_stitcher/models.py b/src/table_stitcher/models.py
index 0fba16c..4a427be 100644
--- a/src/table_stitcher/models.py
+++ b/src/table_stitcher/models.py
@@ -92,6 +92,19 @@ class MultiPageConfig:
     The structural signal (1 col following N cols) is strong enough for most cases.
     """
 
+    # --- Intervening-Content Guard ---
+    block_on_intervening_content: bool = True
+    """
+    If True, refuse to merge two fragments when a section heading or title
+    appears between them in reading order. Paragraphs, list items, captions,
+    footnotes and figures never block, because real PDFs scatter them between
+    fragments of a single continued table. Headings that recur on another
+    page (running headers) and ``(continued)`` headings are treated as page
+    furniture and ignored. Requires an adapter that populates
+    ``TableMeta.content_before``; when it is left ``None`` the guard is a
+    no-op, so non-docling adapters are unaffected.
+    """
+
     # --- Cell Stitching ---
     stitch_separator: str = "\n"
     """Character(s) used to join split cell content."""
@@ -118,6 +131,13 @@ class TableMeta:
     row_count: int
     continuation_content: list[dict] = field(default_factory=list)
     is_headerless: bool = False
+    content_before: Optional[bool] = None
+    """
+    Whether a section heading (one that is not a running header) sits between
+    the previous table fragment and this one in reading order. ``None`` means
+    the adapter did not determine this (e.g. an orphan table absent from the
+    reading order), in which case the intervening-content guard is skipped.
+    """
 
 
 @dataclass
diff --git a/tests/test_intervening_content_guard.py b/tests/test_intervening_content_guard.py
new file mode 100644
index 0000000..70bc807
--- /dev/null
+++ b/tests/test_intervening_content_guard.py
@@ -0,0 +1,128 @@
+"""
+Adapter-level tests for the intervening-content guard.
+
+These build real ``DoclingDocument`` objects (via the docling-core builder API)
+so they exercise the *producer* side — ``_detect_running_furniture`` and
+``_compute_content_before`` in the docling adapter — not just the merger's
+consumption of ``content_before``.
+
+Intent (why these matter):
+- Two tables that share a column schema but belong to *different sections*
+  (a heading sits between them) must NOT be stitched into one. This is the
+  GreatEastern COVID-endorsement bug: eight per-benefit plan grids, all with a
+  ``Prestige | Elite | Classic`` header, were merged across page breaks.
+- A genuine continuation whose only intervening content is a *running header*
+  (the same heading repeated atop each page) must STILL merge.
+"""
+
+from docling_core.types.doc import (
+    BoundingBox,
+    CoordOrigin,
+    DocItemLabel,
+    DoclingDocument,
+    ProvenanceItem,
+    Size,
+    TableCell,
+    TableData,
+)
+
+from table_stitcher import stitch_tables
+from table_stitcher.adapters.docling import DoclingAdapter, _compute_content_before
+from table_stitcher.models import MultiPageConfig
+
+PLAN_HEADER = ["", "Prestige plan", "Elite plan", "Classic plan"]
+
+
+def _table_data(header, rows):
+    cells, grid = [], []
+    hrow = []
+    for j, h in enumerate(header):
+        c = TableCell(
+            text=str(h), row_span=1, col_span=1, column_header=True, row_header=False,
+            start_row_offset_idx=0, end_row_offset_idx=1,
+            start_col_offset_idx=j, end_col_offset_idx=j + 1,
+        )
+        hrow.append(c)
+        cells.append(c)
+    grid.append(hrow)
+    for i, row in enumerate(rows):
+        grow = []
+        for j, v in enumerate(row):
+            c = TableCell(
+                text=str(v), row_span=1, col_span=1, column_header=False, row_header=False,
+                start_row_offset_idx=i + 1, end_row_offset_idx=i + 2,
+                start_col_offset_idx=j, end_col_offset_idx=j + 1,
+            )
+            grow.append(c)
+            cells.append(c)
+        grid.append(grow)
+    return TableData(num_rows=len(rows) + 1, num_cols=len(header), table_cells=cells, grid=grid)
+
+
+def _prov(page, top):
+    return ProvenanceItem(
+        page_no=page,
+        bbox=BoundingBox(l=50, t=top, r=550, b=top + 18, coord_origin=CoordOrigin.TOPLEFT),
+        charspan=(0, 0),
+    )
+
+
+def _new_doc(pages=2):
+    doc = DoclingDocument(name="synthetic")
+    for p in range(1, pages + 1):
+        doc.add_page(page_no=p, size=Size(width=600, height=800))
+    return doc
+
+
+def _is_blanked(table):
+    """A satellite merged away by inject() becomes num_rows=0 with empty prov."""
+    return (getattr(table.data, "num_rows", 0) or 0) == 0 and not (table.prov or [])
+
+
+def test_heading_between_same_schema_tables_blocks_merge():
+    """The endorsement bug: a unique heading between two plan grids => separate."""
+    doc = _new_doc()
+    doc.add_table(data=_table_data(PLAN_HEADER, [["Repatriation", "S$5,000", "S$5,000", "S$5,000"]]),
+                  prov=_prov(1, 600))
+    doc.add_text(label=DocItemLabel.SECTION_HEADER, text="38d - Trip cancellation", prov=_prov(2, 80))
+    doc.add_text(label=DocItemLabel.TEXT, text="Cover under section 15 is extended ...", prov=_prov(2, 110))
+    doc.add_table(data=_table_data(PLAN_HEADER, [["Trip cancellation", "S$8,000", "S$5,000", "S$3,000"]]),
+                  prov=_prov(2, 300))
+
+    cmap = _compute_content_before(doc, MultiPageConfig())
+    assert cmap.get(1) is True, "heading before the 2nd table should be detected as a boundary"
+
+    stitch_tables(doc)
+    assert not _is_blanked(doc.tables[0]) and not _is_blanked(doc.tables[1]), \
+        "tables in different sections must not be merged"
+
+
+def test_running_header_between_fragments_still_merges():
+    """A repeated heading atop each page is furniture, not a boundary => merge."""
+    doc = _new_doc()
+    doc.add_text(label=DocItemLabel.SECTION_HEADER, text="Summary of benefits", prov=_prov(1, 60))
+    doc.add_table(data=_table_data(PLAN_HEADER, [["Death", "100%", "100%", "100%"]]),
+                  prov=_prov(1, 600))
+    # Same heading repeated at the top of page 2 — a running header.
+    doc.add_text(label=DocItemLabel.SECTION_HEADER, text="Summary of benefits", prov=_prov(2, 60))
+    doc.add_table(data=_table_data(PLAN_HEADER, [["Disability", "100%", "100%", "100%"]]),
+                  prov=_prov(2, 90))
+
+    cmap = _compute_content_before(doc, MultiPageConfig())
+    assert cmap.get(1) is False, "a repeated running header must not count as a boundary"
+
+    stitch_tables(doc)
+    assert _is_blanked(doc.tables[1]), "a true continuation should still be stitched"
+
+
+def test_guard_disabled_restores_legacy_merge():
+    """With the flag off, the heading no longer blocks (legacy behaviour)."""
+    doc = _new_doc()
+    doc.add_table(data=_table_data(PLAN_HEADER, [["Repatriation", "S$5,000", "S$5,000", "S$5,000"]]),
+                  prov=_prov(1, 600))
+    doc.add_text(label=DocItemLabel.SECTION_HEADER, text="38d - Trip cancellation", prov=_prov(2, 80))
+    doc.add_table(data=_table_data(PLAN_HEADER, [["Trip cancellation", "S$8,000", "S$5,000", "S$3,000"]]),
+                  prov=_prov(2, 300))
+
+    stitch_tables(doc, config=MultiPageConfig(block_on_intervening_content=False))
+    assert _is_blanked(doc.tables[1]), "legacy header-similarity merge should still happen when disabled"
diff --git a/tests/test_merger.py b/tests/test_merger.py
index bcb73fa..0ef5a18 100644
--- a/tests/test_merger.py
+++ b/tests/test_merger.py
@@ -30,6 +30,7 @@ def _make_meta(
     is_headerless: bool = False,
     vert_top: float = None,
     vert_bottom: float = None,
+    content_before: bool = None,
 ) -> TableMeta:
     """Build a minimal TableMeta for testing."""
     return TableMeta(
@@ -51,6 +52,7 @@ def _make_meta(
         numeric_like_cols=is_numeric_like_colnames([str(c) for c in df.columns]),
         row_count=df.shape[0],
         is_headerless=is_headerless,
+        content_before=content_before,
     )
 
 
@@ -789,3 +791,61 @@ def test_duplicate_column_names_do_not_break_folding(self):
         assert out.shape == (1, 4)
         # Continuation folded into the 4th column (Notes, by positional match).
         assert out.iloc[0, 3] == "first\nsecond line"
+
+
+# ---------------------------------------------------------------------------
+# Intervening-content guard: a section heading between two same-schema
+# tables means they are separate tables, not one split across a page break.
+# ---------------------------------------------------------------------------
+
+
+class TestInterveningContentGuard:
+    """``content_before`` blocks merges that header similarity would allow."""
+
+    @staticmethod
+    def _benefit(idx, page, label, content_before=None):
+        # Single-row plan grid with the identical Prestige/Elite/Classic header
+        # shared by every COVID endorsement benefit table — header Jaccard 1.0.
+        df = pd.DataFrame(
+            [[label, "S$8,000", "S$5,000", "S$3,000"]],
+            columns=["", "Prestige plan", "Elite plan", "Classic plan"],
+        )
+        return _make_meta(idx, df, start_page=page, content_before=content_before)
+
+    def test_identical_headers_merge_without_guard_signal(self):
+        # Baseline: with content_before=None the strict-header path still merges
+        # (this is the over-eager behaviour the guard is designed to stop).
+        a = self._benefit(0, 1, "Trip cancellation")
+        b = self._benefit(1, 2, "Trip postponement")
+        logical = merge_multipage_tables([a, b], MultiPageConfig())
+        assert len(logical) == 1
+        assert "header_similarity_strict" in logical[0].merge_reason
+
+    def test_content_between_blocks_merge(self):
+        # A section heading sits before the second fragment -> stay separate.
+        a = self._benefit(0, 1, "Trip cancellation")
+        b = self._benefit(1, 2, "Trip postponement", content_before=True)
+        logical = merge_multipage_tables([a, b], MultiPageConfig())
+        assert len(logical) == 2
+
+    def test_furniture_only_gap_still_merges(self):
+        # content_before=False means only furniture (running header) sat between
+        # the fragments -> a genuine continuation, still merged.
+        a = self._benefit(0, 1, "Summary part 1")
+        b = self._benefit(1, 2, "Summary part 2", content_before=False)
+        logical = merge_multipage_tables([a, b], MultiPageConfig())
+        assert len(logical) == 1
+
+    def test_flag_disables_guard(self):
+        a = self._benefit(0, 1, "Trip cancellation")
+        b = self._benefit(1, 2, "Trip postponement", content_before=True)
+        cfg = MultiPageConfig(block_on_intervening_content=False)
+        logical = merge_multipage_tables([a, b], cfg)
+        assert len(logical) == 1
+
+    def test_none_is_backward_compatible(self):
+        # Adapters that don't populate the field (None) must not be affected.
+        a = self._benefit(0, 1, "Trip cancellation")
+        b = self._benefit(1, 2, "Trip postponement", content_before=None)
+        logical = merge_multipage_tables([a, b], MultiPageConfig())
+        assert len(logical) == 1

From f3e5cc55cb663b6060db256b1822d7b746347026 Mon Sep 17 00:00:00 2001
From: Phyo Pyae Sone Han <phyohan1234@gmail.com>
Date: Fri, 29 May 2026 17:55:05 +0800
Subject: [PATCH 2/3] style: drop unused DoclingAdapter import in guard test
 (ruff F401)

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 tests/test_intervening_content_guard.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_intervening_content_guard.py b/tests/test_intervening_content_guard.py
index 70bc807..400afe4 100644
--- a/tests/test_intervening_content_guard.py
+++ b/tests/test_intervening_content_guard.py
@@ -27,7 +27,7 @@
 )
 
 from table_stitcher import stitch_tables
-from table_stitcher.adapters.docling import DoclingAdapter, _compute_content_before
+from table_stitcher.adapters.docling import _compute_content_before
 from table_stitcher.models import MultiPageConfig
 
 PLAN_HEADER = ["", "Prestige plan", "Elite plan", "Classic plan"]

From 12401f40db64eacecefb261c38c787805446bc8b Mon Sep 17 00:00:00 2001
From: Phyo Pyae Sone Han <phyohan1234@gmail.com>
Date: Fri, 29 May 2026 18:10:44 +0800
Subject: [PATCH 3/3] style: ruff-format the guard test file

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 tests/test_intervening_content_guard.py | 77 +++++++++++++++++--------
 1 file changed, 54 insertions(+), 23 deletions(-)

diff --git a/tests/test_intervening_content_guard.py b/tests/test_intervening_content_guard.py
index 400afe4..802288d 100644
--- a/tests/test_intervening_content_guard.py
+++ b/tests/test_intervening_content_guard.py
@@ -38,9 +38,15 @@ def _table_data(header, rows):
     hrow = []
     for j, h in enumerate(header):
         c = TableCell(
-            text=str(h), row_span=1, col_span=1, column_header=True, row_header=False,
-            start_row_offset_idx=0, end_row_offset_idx=1,
-            start_col_offset_idx=j, end_col_offset_idx=j + 1,
+            text=str(h),
+            row_span=1,
+            col_span=1,
+            column_header=True,
+            row_header=False,
+            start_row_offset_idx=0,
+            end_row_offset_idx=1,
+            start_col_offset_idx=j,
+            end_col_offset_idx=j + 1,
         )
         hrow.append(c)
         cells.append(c)
@@ -49,9 +55,15 @@ def _table_data(header, rows):
         grow = []
         for j, v in enumerate(row):
             c = TableCell(
-                text=str(v), row_span=1, col_span=1, column_header=False, row_header=False,
-                start_row_offset_idx=i + 1, end_row_offset_idx=i + 2,
-                start_col_offset_idx=j, end_col_offset_idx=j + 1,
+                text=str(v),
+                row_span=1,
+                col_span=1,
+                column_header=False,
+                row_header=False,
+                start_row_offset_idx=i + 1,
+                end_row_offset_idx=i + 2,
+                start_col_offset_idx=j,
+                end_col_offset_idx=j + 1,
             )
             grow.append(c)
             cells.append(c)
@@ -82,31 +94,42 @@ def _is_blanked(table):
 def test_heading_between_same_schema_tables_blocks_merge():
     """The endorsement bug: a unique heading between two plan grids => separate."""
     doc = _new_doc()
-    doc.add_table(data=_table_data(PLAN_HEADER, [["Repatriation", "S$5,000", "S$5,000", "S$5,000"]]),
-                  prov=_prov(1, 600))
-    doc.add_text(label=DocItemLabel.SECTION_HEADER, text="38d - Trip cancellation", prov=_prov(2, 80))
-    doc.add_text(label=DocItemLabel.TEXT, text="Cover under section 15 is extended ...", prov=_prov(2, 110))
-    doc.add_table(data=_table_data(PLAN_HEADER, [["Trip cancellation", "S$8,000", "S$5,000", "S$3,000"]]),
-                  prov=_prov(2, 300))
+    doc.add_table(
+        data=_table_data(PLAN_HEADER, [["Repatriation", "S$5,000", "S$5,000", "S$5,000"]]),
+        prov=_prov(1, 600),
+    )
+    doc.add_text(
+        label=DocItemLabel.SECTION_HEADER, text="38d - Trip cancellation", prov=_prov(2, 80)
+    )
+    doc.add_text(
+        label=DocItemLabel.TEXT, text="Cover under section 15 is extended ...", prov=_prov(2, 110)
+    )
+    doc.add_table(
+        data=_table_data(PLAN_HEADER, [["Trip cancellation", "S$8,000", "S$5,000", "S$3,000"]]),
+        prov=_prov(2, 300),
+    )
 
     cmap = _compute_content_before(doc, MultiPageConfig())
     assert cmap.get(1) is True, "heading before the 2nd table should be detected as a boundary"
 
     stitch_tables(doc)
-    assert not _is_blanked(doc.tables[0]) and not _is_blanked(doc.tables[1]), \
+    assert not _is_blanked(doc.tables[0]) and not _is_blanked(doc.tables[1]), (
         "tables in different sections must not be merged"
+    )
 
 
 def test_running_header_between_fragments_still_merges():
     """A repeated heading atop each page is furniture, not a boundary => merge."""
     doc = _new_doc()
     doc.add_text(label=DocItemLabel.SECTION_HEADER, text="Summary of benefits", prov=_prov(1, 60))
-    doc.add_table(data=_table_data(PLAN_HEADER, [["Death", "100%", "100%", "100%"]]),
-                  prov=_prov(1, 600))
+    doc.add_table(
+        data=_table_data(PLAN_HEADER, [["Death", "100%", "100%", "100%"]]), prov=_prov(1, 600)
+    )
     # Same heading repeated at the top of page 2 — a running header.
     doc.add_text(label=DocItemLabel.SECTION_HEADER, text="Summary of benefits", prov=_prov(2, 60))
-    doc.add_table(data=_table_data(PLAN_HEADER, [["Disability", "100%", "100%", "100%"]]),
-                  prov=_prov(2, 90))
+    doc.add_table(
+        data=_table_data(PLAN_HEADER, [["Disability", "100%", "100%", "100%"]]), prov=_prov(2, 90)
+    )
 
     cmap = _compute_content_before(doc, MultiPageConfig())
     assert cmap.get(1) is False, "a repeated running header must not count as a boundary"
@@ -118,11 +141,19 @@ def test_running_header_between_fragments_still_merges():
 def test_guard_disabled_restores_legacy_merge():
     """With the flag off, the heading no longer blocks (legacy behaviour)."""
     doc = _new_doc()
-    doc.add_table(data=_table_data(PLAN_HEADER, [["Repatriation", "S$5,000", "S$5,000", "S$5,000"]]),
-                  prov=_prov(1, 600))
-    doc.add_text(label=DocItemLabel.SECTION_HEADER, text="38d - Trip cancellation", prov=_prov(2, 80))
-    doc.add_table(data=_table_data(PLAN_HEADER, [["Trip cancellation", "S$8,000", "S$5,000", "S$3,000"]]),
-                  prov=_prov(2, 300))
+    doc.add_table(
+        data=_table_data(PLAN_HEADER, [["Repatriation", "S$5,000", "S$5,000", "S$5,000"]]),
+        prov=_prov(1, 600),
+    )
+    doc.add_text(
+        label=DocItemLabel.SECTION_HEADER, text="38d - Trip cancellation", prov=_prov(2, 80)
+    )
+    doc.add_table(
+        data=_table_data(PLAN_HEADER, [["Trip cancellation", "S$8,000", "S$5,000", "S$3,000"]]),
+        prov=_prov(2, 300),
+    )
 
     stitch_tables(doc, config=MultiPageConfig(block_on_intervening_content=False))
-    assert _is_blanked(doc.tables[1]), "legacy header-similarity merge should still happen when disabled"
+    assert _is_blanked(doc.tables[1]), (
+        "legacy header-similarity merge should still happen when disabled"
+    )