From 56a2cf6916d59b3e8c8258dcf94ab0b2f55bc1f1 Mon Sep 17 00:00:00 2001
From: maish <maish@pebbleroad.com>
Date: Thu, 11 Jun 2026 12:02:23 +0800
Subject: [PATCH 1/2] Fix reprinted continuation-page headers appended as data
 rows on multi-page merge

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 CHANGELOG.md                                  | 15 ++++
 src/table_stitcher/adapters/docling.py        | 56 +++++++++++++--
 tests/integration/conftest.py                 | 10 ++-
 .../15-page-druglist.corp.expected.yaml       |  3 +
 .../covid-misc-labs-4pg.pt2.expected.yaml     |  3 +
 .../retirement-portfolio.corp.expected.yaml   |  3 +
 tests/test_docling_adapter.py                 | 71 +++++++++++++++++++
 7 files changed, 155 insertions(+), 6 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index ae74ebd..9077af5 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,21 @@ the project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html
 
 ## [Unreleased]
 
+### Fixed
+
+- **Reprinted continuation-page headers appended as data rows on multi-page
+  merge** (`adapters/docling.py`). When a table's column header is reprinted at
+  the top of each page — especially a multi-row (hierarchical) header — the
+  repeated header rows survived the merge as bogus data rows, misaligning the
+  stitched table. Injection now drops a body row when it is *both* flagged
+  `column_header` by Docling *and* a tokenized match (Jaccard ≥ 0.6) for the
+  reconstructed header block. Both signals are required: the flag alone is
+  unreliable (Docling over-flags rowspan/continuation *data* rows as headers),
+  and the tokenized comparison is punctuation-agnostic, so per-cell OCR drift
+  such as `(S$)` vs `($$)` is tolerated without any threshold tuning. The merged
+  DataFrame (`lt.df`) is unchanged; only the injected document is de-duplicated.
+  A `debug` log reports each dropped row.
+
 ## [0.4.2] — 2026-06-08
 
 ### Fixed
diff --git a/src/table_stitcher/adapters/docling.py b/src/table_stitcher/adapters/docling.py
index aa865b2..608d358 100644
--- a/src/table_stitcher/adapters/docling.py
+++ b/src/table_stitcher/adapters/docling.py
@@ -499,6 +499,42 @@ def _reemit_body_row(
     return grid_row, distinct
 
 
+# Jaccard threshold for recognizing a body row as a reprinted continuation
+# header. Combined with Docling's column_header flag — both must hold — so it
+# stays conservative. Matches the merger's header_sim_strict default; the
+# punctuation-agnostic tokenizer makes it tolerant of per-cell OCR drift such
+# as "(S$)" vs "($$)".
+_REPEATED_HEADER_SIM = 0.6
+
+
+def _row_token_set(cells: list[TableCell]) -> set:
+    """Union of tokenized cell text for a grid row (duplicate span cells fold in)."""
+    toks: set = set()
+    for c in cells:
+        if c:
+            toks |= tokenize(getattr(c, "text", "") or "")
+    return toks
+
+
+def _is_reprinted_header(orig_row: list[TableCell], header_sigs: list[set]) -> bool:
+    """True if ``orig_row`` is a reprinted continuation header to drop from the body.
+
+    The column header is reprinted at the top of each continuation page; on a
+    multi-row header those rows survive the merge as bogus data rows. They are
+    dropped only when BOTH signals agree: Docling flagged the row
+    ``column_header`` AND it is a tokenized match for one of the reconstructed
+    header-block rows. The flag alone is unreliable — Docling over-flags
+    rowspan/continuation *data* rows as headers — so the content match guards
+    against deleting real data.
+    """
+    if not any(getattr(c, "column_header", False) for c in orig_row if c):
+        return False
+    toks = _row_token_set(orig_row)
+    if not toks:
+        return False
+    return any(jaccard(toks, sig) >= _REPEATED_HEADER_SIM for sig in header_sigs)
+
+
 def _dataframe_to_docling_data(
     df: pd.DataFrame,
     original_data: Optional[TableData] = None,
@@ -587,12 +623,15 @@ def _dataframe_to_docling_data(
 
     # --- Build data rows from merged DataFrame ---
     # Index member fragments' original body rows so spanning cells survive the
-    # round-trip (see _index_member_body_rows).
+    # round-trip (see _index_member_rows).
     body_index = _index_member_rows(member_data) if member_data else {}
+    # Tokenized signatures of the reconstructed header block, for dropping
+    # reprinted continuation headers (see _is_reprinted_header).
+    header_sigs = [s for s in (_row_token_set(h) for h in orig_header_rows) if s]
 
-    for i, (_, row) in enumerate(df.iterrows()):
-        table_row_idx = num_header_rows + i
-
+    emitted = 0
+    for _, row in df.iterrows():
+        table_row_idx = num_header_rows + emitted
         row_vals = ["" if (pd.isna(v) or v is None) else str(v) for v in row]
 
         # Re-emit untouched rows from their original grid cells (preserves
@@ -601,9 +640,15 @@ def _dataframe_to_docling_data(
         bucket = body_index.get(tuple(row_vals))
         if bucket:
             orig_row = bucket.pop(0)
+            if header_sigs and _is_reprinted_header(orig_row, header_sigs):
+                # Reprinted header from a continuation page — already present as
+                # the header block; drop it instead of duplicating into the body.
+                log.debug("Dropped reprinted continuation header row from merged body.")
+                continue
             grid_row, distinct = _reemit_body_row(orig_row, table_row_idx, has_row_headers)
             grid.append(grid_row)
             table_cells.extend(distinct)
+            emitted += 1
             continue
 
         grid_row: list[TableCell] = []
@@ -625,8 +670,9 @@ def _dataframe_to_docling_data(
             table_cells.append(cell)
 
         grid.append(grid_row)
+        emitted += 1
 
-    num_total_rows = num_header_rows + len(df)
+    num_total_rows = num_header_rows + emitted
 
     return TableData(num_rows=num_total_rows, num_cols=num_cols, table_cells=table_cells, grid=grid)
 
diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py
index 4a637c7..9814d97 100644
--- a/tests/integration/conftest.py
+++ b/tests/integration/conftest.py
@@ -333,7 +333,15 @@ def assert_public_stitch_injects_docling_doc(
         ctx = f"public stitch for members={members}, pages={exp['pages']}"
 
         assert getattr(anchor.data, "num_rows", 0) > 0, f"{ctx}: anchor has no data"
-        if "shape" in exp:
+        if "injected_rows" in exp:
+            # Exact injected row count. Used when injection legitimately differs
+            # from the parser-neutral shape — e.g. reprinted continuation-page
+            # headers are dropped from the body (they remain in the merged
+            # DataFrame but are not duplicated into the stitched document).
+            assert anchor.data.num_rows == exp["injected_rows"], (
+                f"{ctx}: injected rows {anchor.data.num_rows} != expected {exp['injected_rows']}"
+            )
+        elif "shape" in exp:
             # +1 or more for header rows; this guards that merged data was injected.
             assert anchor.data.num_rows >= exp["shape"][0] + 1, (
                 f"{ctx}: anchor rows {anchor.data.num_rows} do not contain merged body"
diff --git a/tests/integration/fixtures/headerless-continuation/15-page-druglist.corp.expected.yaml b/tests/integration/fixtures/headerless-continuation/15-page-druglist.corp.expected.yaml
index 236468f..3d2e6ad 100644
--- a/tests/integration/fixtures/headerless-continuation/15-page-druglist.corp.expected.yaml
+++ b/tests/integration/fixtures/headerless-continuation/15-page-druglist.corp.expected.yaml
@@ -37,6 +37,9 @@ logical_tables:
   shape:
   - 500
   - 3
+  # Injection drops the reprinted header block (title + column names) that the
+  # merged DataFrame still carries as its first two rows: header(2) + body(500 - 2 = 498) = 500.
+  injected_rows: 500
   columns:
   - Column_0
   - Column_1
diff --git a/tests/integration/fixtures/inconsistent-header-detection/covid-misc-labs-4pg.pt2.expected.yaml b/tests/integration/fixtures/inconsistent-header-detection/covid-misc-labs-4pg.pt2.expected.yaml
index 3152576..bf2c85b 100644
--- a/tests/integration/fixtures/inconsistent-header-detection/covid-misc-labs-4pg.pt2.expected.yaml
+++ b/tests/integration/fixtures/inconsistent-header-detection/covid-misc-labs-4pg.pt2.expected.yaml
@@ -73,6 +73,9 @@ logical_tables:
   shape:
   - 46
   - 6
+  # Injection drops the reprinted column header from the continuation page:
+  # header(1) + body(46 - 1 = 45) = 46.
+  injected_rows: 46
   columns:
   - Column_0
   - Column_1
diff --git a/tests/integration/fixtures/inconsistent-header-detection/retirement-portfolio.corp.expected.yaml b/tests/integration/fixtures/inconsistent-header-detection/retirement-portfolio.corp.expected.yaml
index ad42664..60b6971 100644
--- a/tests/integration/fixtures/inconsistent-header-detection/retirement-portfolio.corp.expected.yaml
+++ b/tests/integration/fixtures/inconsistent-header-detection/retirement-portfolio.corp.expected.yaml
@@ -23,6 +23,9 @@ logical_tables:
   shape:
   - 145
   - 3
+  # Injection drops the reprinted column header from the continuation pages:
+  # header(1) + body(145 - 1 = 144) = 145.
+  injected_rows: 145
   columns:
   - Contribution Type
   - Investment Name
diff --git a/tests/test_docling_adapter.py b/tests/test_docling_adapter.py
index cc7182c..5c7229d 100644
--- a/tests/test_docling_adapter.py
+++ b/tests/test_docling_adapter.py
@@ -388,6 +388,77 @@ def test_without_member_data_falls_back_to_flat(self):
         assert all(cell.col_span == 1 for cell in td.grid[1])
 
 
+class TestReprintedHeaderDedup:
+    """Reprinted continuation-page headers are dropped from the injected body,
+    but column_header-flagged rows that don't match the header (Docling
+    over-flagging rowspan/continuation data) are kept.
+    """
+
+    @staticmethod
+    def _cell(text, r, c, *, header):
+        return TableCell(
+            text=text,
+            row_span=1,
+            col_span=1,
+            column_header=header,
+            row_header=False,
+            start_row_offset_idx=r,
+            end_row_offset_idx=r + 1,
+            start_col_offset_idx=c,
+            end_col_offset_idx=c + 1,
+        )
+
+    def _table(self, rows: list) -> TableData:
+        grid = []
+        flat = []
+        for r, (cells_text, is_header) in enumerate(rows):
+            grid_row = [self._cell(t, r, c, header=is_header) for c, t in enumerate(cells_text)]
+            grid.append(grid_row)
+            flat.extend(grid_row)
+        return TableData(num_rows=len(rows), num_cols=len(rows[0][0]), table_cells=flat, grid=grid)
+
+    def test_drops_reprinted_header_keeps_misflagged_data(self):
+        # Anchor: 1-row header "(S$)" + one data row.
+        anchor = self._table([(["SECTION", "LIMIT (S$)"], True), (["Death", "100"], False)])
+        # Continuation: header reprinted with OCR drift "($$)", a data row, and
+        # a row Docling wrongly flagged column_header (real data, distinct text).
+        satellite = self._table(
+            [
+                (["SECTION", "LIMIT ($$)"], True),
+                (["Injury", "50"], False),
+                (["Sub-limit per accident", "999"], True),
+            ]
+        )
+        # Merged DataFrame: the merger concatenates everything, including the
+        # reprinted header and the mis-flagged row.
+        merged_df = pd.DataFrame(
+            [
+                ["Death", "100"],
+                ["SECTION", "LIMIT ($$)"],
+                ["Injury", "50"],
+                ["Sub-limit per accident", "999"],
+            ],
+            columns=["Column_0", "Column_1"],
+        )
+
+        td = _dataframe_to_docling_data(
+            merged_df, original_data=anchor, member_data=[anchor, satellite]
+        )
+
+        body = [r for r in td.grid if not any(getattr(c, "column_header", False) for c in r if c)]
+        body_text = " ".join(str(c.text) for r in body for c in r if c)
+
+        # Reprinted header (drifted) dropped from the body...
+        assert "($$)" not in body_text
+        # ...but the header block still carries the anchor's "(S$)".
+        assert any("(S$)" in (c.text or "") for r in td.grid for c in r if c)
+        # Real data preserved, including the column_header-mis-flagged row.
+        assert "Death" in body_text and "Injury" in body_text
+        assert "Sub-limit per accident" in body_text and "999" in body_text
+        # 1 header row + 3 body rows (one reprinted header removed from 4).
+        assert td.num_rows == 4
+
+
 class TestAdapterProtocol:
     """Verify DoclingAdapter satisfies the protocol."""
 

From 20752abba3a1f199124537cff0660f07422ab747 Mon Sep 17 00:00:00 2001
From: maish <maish@pebbleroad.com>
Date: Thu, 11 Jun 2026 12:02:39 +0800
Subject: [PATCH 2/2] chore: release 0.4.3

---
 CHANGELOG.md   | 2 ++
 pyproject.toml | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 9077af5..71f37d5 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,8 @@ the project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html
 
 ## [Unreleased]
 
+## [0.4.3] — 2026-06-11
+
 ### Fixed
 
 - **Reprinted continuation-page headers appended as data rows on multi-page
diff --git a/pyproject.toml b/pyproject.toml
index 29bbd94..058603d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "table-stitcher"
-version = "0.4.2"
+version = "0.4.3"
 description = "Reassemble tables split across page boundaries in PDF extraction"
 readme = "README.md"
 license = "MIT"