From 56a2cf6916d59b3e8c8258dcf94ab0b2f55bc1f1 Mon Sep 17 00:00:00 2001 From: maish Date: Thu, 11 Jun 2026 12:02:23 +0800 Subject: [PATCH 1/2] Fix reprinted continuation-page headers appended as data rows on multi-page merge Co-Authored-By: Claude Opus 4.8 (1M context) --- CHANGELOG.md | 15 ++++ src/table_stitcher/adapters/docling.py | 56 +++++++++++++-- tests/integration/conftest.py | 10 ++- .../15-page-druglist.corp.expected.yaml | 3 + .../covid-misc-labs-4pg.pt2.expected.yaml | 3 + .../retirement-portfolio.corp.expected.yaml | 3 + tests/test_docling_adapter.py | 71 +++++++++++++++++++ 7 files changed, 155 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ae74ebd..9077af5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,21 @@ the project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html ## [Unreleased] +### Fixed + +- **Reprinted continuation-page headers appended as data rows on multi-page + merge** (`adapters/docling.py`). When a table's column header is reprinted at + the top of each page — especially a multi-row (hierarchical) header — the + repeated header rows survived the merge as bogus data rows, misaligning the + stitched table. Injection now drops a body row when it is *both* flagged + `column_header` by Docling *and* a tokenized match (Jaccard ≥ 0.6) for the + reconstructed header block. Both signals are required: the flag alone is + unreliable (Docling over-flags rowspan/continuation *data* rows as headers), + and the tokenized comparison is punctuation-agnostic, so per-cell OCR drift + such as `(S$)` vs `($$)` is tolerated without any threshold tuning. The merged + DataFrame (`lt.df`) is unchanged; only the injected document is de-duplicated. + A `debug` log reports each dropped row. 
+ ## [0.4.2] — 2026-06-08 ### Fixed diff --git a/src/table_stitcher/adapters/docling.py b/src/table_stitcher/adapters/docling.py index aa865b2..608d358 100644 --- a/src/table_stitcher/adapters/docling.py +++ b/src/table_stitcher/adapters/docling.py @@ -499,6 +499,42 @@ def _reemit_body_row( return grid_row, distinct +# Jaccard threshold for recognizing a body row as a reprinted continuation +# header. Combined with Docling's column_header flag — both must hold — so it +# stays conservative. Matches the merger's header_sim_strict default; the +# punctuation-agnostic tokenizer makes it tolerant of per-cell OCR drift such +# as "(S$)" vs "($$)". +_REPEATED_HEADER_SIM = 0.6 + + +def _row_token_set(cells: list[TableCell]) -> set: + """Union of tokenized cell text for a grid row (duplicate span cells fold in).""" + toks: set = set() + for c in cells: + if c: + toks |= tokenize(getattr(c, "text", "") or "") + return toks + + +def _is_reprinted_header(orig_row: list[TableCell], header_sigs: list[set]) -> bool: + """True if ``orig_row`` is a reprinted continuation header to drop from the body. + + Docling reprints the column header at the top of each continuation page; on + a multi-row header those rows survive the merge as bogus data rows. They are + dropped only when BOTH signals agree: Docling flagged the row + ``column_header`` AND it is a tokenized match for one of the reconstructed + header-block rows. The flag alone is unreliable — Docling over-flags + rowspan/continuation *data* rows as headers — so the content match guards + against deleting real data. 
+ """ + if not any(getattr(c, "column_header", False) for c in orig_row if c): + return False + toks = _row_token_set(orig_row) + if not toks: + return False + return any(jaccard(toks, sig) >= _REPEATED_HEADER_SIM for sig in header_sigs) + + def _dataframe_to_docling_data( df: pd.DataFrame, original_data: Optional[TableData] = None, @@ -587,12 +623,15 @@ def _dataframe_to_docling_data( # --- Build data rows from merged DataFrame --- # Index member fragments' original body rows so spanning cells survive the - # round-trip (see _index_member_body_rows). + # round-trip (see _index_member_rows). body_index = _index_member_rows(member_data) if member_data else {} + # Tokenized signatures of the reconstructed header block, for dropping + # reprinted continuation headers (see _is_reprinted_header). + header_sigs = [s for s in (_row_token_set(h) for h in orig_header_rows) if s] - for i, (_, row) in enumerate(df.iterrows()): - table_row_idx = num_header_rows + i - + emitted = 0 + for _, row in df.iterrows(): + table_row_idx = num_header_rows + emitted row_vals = ["" if (pd.isna(v) or v is None) else str(v) for v in row] # Re-emit untouched rows from their original grid cells (preserves @@ -601,9 +640,15 @@ def _dataframe_to_docling_data( bucket = body_index.get(tuple(row_vals)) if bucket: orig_row = bucket.pop(0) + if header_sigs and _is_reprinted_header(orig_row, header_sigs): + # Reprinted header from a continuation page — already present as + # the header block; drop it instead of duplicating into the body. 
+ log.debug("Dropped reprinted continuation header row from merged body.") + continue grid_row, distinct = _reemit_body_row(orig_row, table_row_idx, has_row_headers) grid.append(grid_row) table_cells.extend(distinct) + emitted += 1 continue grid_row: list[TableCell] = [] @@ -625,8 +670,9 @@ def _dataframe_to_docling_data( table_cells.append(cell) grid.append(grid_row) + emitted += 1 - num_total_rows = num_header_rows + len(df) + num_total_rows = num_header_rows + emitted return TableData(num_rows=num_total_rows, num_cols=num_cols, table_cells=table_cells, grid=grid) diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index 4a637c7..9814d97 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -333,7 +333,15 @@ def assert_public_stitch_injects_docling_doc( ctx = f"public stitch for members={members}, pages={exp['pages']}" assert getattr(anchor.data, "num_rows", 0) > 0, f"{ctx}: anchor has no data" - if "shape" in exp: + if "injected_rows" in exp: + # Exact injected row count. Used when injection legitimately differs + # from the parser-neutral shape — e.g. reprinted continuation-page + # headers are dropped from the body (they remain in the merged + # DataFrame but are not duplicated into the stitched document). + assert anchor.data.num_rows == exp["injected_rows"], ( + f"{ctx}: injected rows {anchor.data.num_rows} != expected {exp['injected_rows']}" + ) + elif "shape" in exp: # +1 or more for header rows; this guards that merged data was injected. 
assert anchor.data.num_rows >= exp["shape"][0] + 1, ( f"{ctx}: anchor rows {anchor.data.num_rows} do not contain merged body" diff --git a/tests/integration/fixtures/headerless-continuation/15-page-druglist.corp.expected.yaml b/tests/integration/fixtures/headerless-continuation/15-page-druglist.corp.expected.yaml index 236468f..3d2e6ad 100644 --- a/tests/integration/fixtures/headerless-continuation/15-page-druglist.corp.expected.yaml +++ b/tests/integration/fixtures/headerless-continuation/15-page-druglist.corp.expected.yaml @@ -37,6 +37,9 @@ logical_tables: shape: - 500 - 3 + # Injection drops the reprinted header block (title + column names) that the + # merged DataFrame still carries as its first two rows: header(2) + body(500 - 2 = 498) = 500. + injected_rows: 500 columns: - Column_0 - Column_1 diff --git a/tests/integration/fixtures/inconsistent-header-detection/covid-misc-labs-4pg.pt2.expected.yaml b/tests/integration/fixtures/inconsistent-header-detection/covid-misc-labs-4pg.pt2.expected.yaml index 3152576..bf2c85b 100644 --- a/tests/integration/fixtures/inconsistent-header-detection/covid-misc-labs-4pg.pt2.expected.yaml +++ b/tests/integration/fixtures/inconsistent-header-detection/covid-misc-labs-4pg.pt2.expected.yaml @@ -73,6 +73,9 @@ logical_tables: shape: - 46 - 6 + # Injection drops the reprinted column header from the continuation page: + # header(1) + body(46 - 1 = 45) = 46.
+ injected_rows: 46 columns: - Column_0 - Column_1 diff --git a/tests/integration/fixtures/inconsistent-header-detection/retirement-portfolio.corp.expected.yaml b/tests/integration/fixtures/inconsistent-header-detection/retirement-portfolio.corp.expected.yaml index ad42664..60b6971 100644 --- a/tests/integration/fixtures/inconsistent-header-detection/retirement-portfolio.corp.expected.yaml +++ b/tests/integration/fixtures/inconsistent-header-detection/retirement-portfolio.corp.expected.yaml @@ -23,6 +23,9 @@ logical_tables: shape: - 145 - 3 + # Injection drops the reprinted column header from the continuation pages: + # header(1) + body(145 - 1 = 144) = 145. + injected_rows: 145 columns: - Contribution Type - Investment Name diff --git a/tests/test_docling_adapter.py b/tests/test_docling_adapter.py index cc7182c..5c7229d 100644 --- a/tests/test_docling_adapter.py +++ b/tests/test_docling_adapter.py @@ -388,6 +388,77 @@ def test_without_member_data_falls_back_to_flat(self): assert all(cell.col_span == 1 for cell in td.grid[1]) +class TestReprintedHeaderDedup: + """Reprinted continuation-page headers are dropped from the injected body, + but column_header-flagged rows that don't match the header (Docling + over-flagging rowspan/continuation data) are kept.
+ """ + + @staticmethod + def _cell(text, r, c, *, header): + return TableCell( + text=text, + row_span=1, + col_span=1, + column_header=header, + row_header=False, + start_row_offset_idx=r, + end_row_offset_idx=r + 1, + start_col_offset_idx=c, + end_col_offset_idx=c + 1, + ) + + def _table(self, rows: list) -> TableData: + grid = [] + flat = [] + for r, (cells_text, is_header) in enumerate(rows): + grid_row = [self._cell(t, r, c, header=is_header) for c, t in enumerate(cells_text)] + grid.append(grid_row) + flat.extend(grid_row) + return TableData(num_rows=len(rows), num_cols=len(rows[0][0]), table_cells=flat, grid=grid) + + def test_drops_reprinted_header_keeps_misflagged_data(self): + # Anchor: 1-row header "(S$)" + one data row. + anchor = self._table([(["SECTION", "LIMIT (S$)"], True), (["Death", "100"], False)]) + # Continuation: header reprinted with OCR drift "($$)", a data row, and + # a row Docling wrongly flagged column_header (real data, distinct text). + satellite = self._table( + [ + (["SECTION", "LIMIT ($$)"], True), + (["Injury", "50"], False), + (["Sub-limit per accident", "999"], True), + ] + ) + # Merged DataFrame: the merger concatenates everything, including the + # reprinted header and the mis-flagged row. + merged_df = pd.DataFrame( + [ + ["Death", "100"], + ["SECTION", "LIMIT ($$)"], + ["Injury", "50"], + ["Sub-limit per accident", "999"], + ], + columns=["Column_0", "Column_1"], + ) + + td = _dataframe_to_docling_data( + merged_df, original_data=anchor, member_data=[anchor, satellite] + ) + + body = [r for r in td.grid if not any(getattr(c, "column_header", False) for c in r if c)] + body_text = " ".join(str(c.text) for r in body for c in r if c) + + # Reprinted header (drifted) dropped from the body... + assert "($$)" not in body_text + # ...but the header block still carries the anchor's "(S$)". + assert any("(S$)" in (c.text or "") for r in td.grid for c in r if c) + # Real data preserved, including the column_header-mis-flagged row. 
+ assert "Death" in body_text and "Injury" in body_text + assert "Sub-limit per accident" in body_text and "999" in body_text + # 1 header row + 3 body rows (one reprinted header removed from 4). + assert td.num_rows == 4 + + class TestAdapterProtocol: """Verify DoclingAdapter satisfies the protocol.""" From 20752abba3a1f199124537cff0660f07422ab747 Mon Sep 17 00:00:00 2001 From: maish Date: Thu, 11 Jun 2026 12:02:39 +0800 Subject: [PATCH 2/2] chore: release 0.4.3 --- CHANGELOG.md | 2 ++ pyproject.toml | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9077af5..71f37d5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,8 @@ the project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html ## [Unreleased] +## [0.4.3] — 2026-06-11 + ### Fixed - **Reprinted continuation-page headers appended as data rows on multi-page diff --git a/pyproject.toml b/pyproject.toml index 29bbd94..058603d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "table-stitcher" -version = "0.4.2" +version = "0.4.3" description = "Reassemble tables split across page boundaries in PDF extraction" readme = "README.md" license = "MIT"