Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,23 @@ the project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html

## [Unreleased]

## [0.4.3] — 2026-06-11

### Fixed

- **Reprinted continuation-page headers appended as data rows on multi-page
merge** (`adapters/docling.py`). When a table's column header is reprinted at
the top of each page — especially a multi-row (hierarchical) header — the
repeated header rows survived the merge as bogus data rows, misaligning the
stitched table. Injection now drops a body row when it is *both* flagged
`column_header` by Docling *and* a tokenized match (Jaccard ≥ 0.6) for the
reconstructed header block. Both signals are required: the flag alone is
unreliable (Docling over-flags rowspan/continuation *data* rows as headers),
and the tokenized comparison is punctuation-agnostic, so per-cell OCR drift
such as `(S$)` vs `($$)` is tolerated without any threshold tuning. The merged
DataFrame (`lt.df`) is unchanged; only the injected document is de-duplicated.
A `debug` log reports each dropped row.

## [0.4.2] — 2026-06-08

### Fixed
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "hatchling.build"

[project]
name = "table-stitcher"
version = "0.4.2"
version = "0.4.3"
description = "Reassemble tables split across page boundaries in PDF extraction"
readme = "README.md"
license = "MIT"
Expand Down
56 changes: 51 additions & 5 deletions src/table_stitcher/adapters/docling.py
Original file line number Diff line number Diff line change
Expand Up @@ -499,6 +499,42 @@ def _reemit_body_row(
return grid_row, distinct


# Minimum Jaccard similarity (compared with ``>=``) between a body row's token
# set and a reconstructed header row's token set for the body row to be
# recognized as a reprinted continuation header. Combined with Docling's
# column_header flag — both must hold — so it stays conservative. Matches the
# merger's header_sim_strict default; the punctuation-agnostic tokenizer makes
# it tolerant of per-cell OCR drift such as "(S$)" vs "($$)".
_REPEATED_HEADER_SIM = 0.6


def _row_token_set(cells: list[TableCell]) -> set:
    """Collect the tokens of every cell in one grid row into a single set.

    Falsy entries in ``cells`` are skipped, and a cell whose ``text`` attribute
    is missing or falsy is tokenized as the empty string. Because the result is
    a set, a spanning cell that occupies several grid positions contributes its
    tokens only once.
    """
    row_tokens: set = set()
    cell_texts = (getattr(cell, "text", "") or "" for cell in cells if cell)
    for cell_text in cell_texts:
        row_tokens |= tokenize(cell_text)
    return row_tokens


def _is_reprinted_header(orig_row: list[TableCell], header_sigs: list[set]) -> bool:
    """Decide whether ``orig_row`` is a reprinted header to drop from the body.

    When a column header is reprinted at the top of a continuation page, the
    repeated row(s) would otherwise be carried into the merged body as bogus
    data rows. A row is reported as a reprint only when two signals agree:

    1. at least one of its cells carries a truthy ``column_header`` flag, and
    2. its token set reaches ``_REPEATED_HEADER_SIM`` Jaccard similarity with
       at least one signature in ``header_sigs``.

    The flag is not trusted on its own because Docling over-flags
    rowspan/continuation *data* rows as headers; the content match is what
    protects real data from being deleted. A row with no tokens at all is
    never treated as a reprint.
    """
    # Signal 1: some cell in the row must be flagged as a column header.
    for cell in orig_row:
        if cell and getattr(cell, "column_header", False):
            break
    else:
        return False

    row_tokens = _row_token_set(orig_row)
    if not row_tokens:
        return False

    # Signal 2: the row's content must match one of the header-block rows.
    for signature in header_sigs:
        if jaccard(row_tokens, signature) >= _REPEATED_HEADER_SIM:
            return True
    return False


def _dataframe_to_docling_data(
df: pd.DataFrame,
original_data: Optional[TableData] = None,
Expand Down Expand Up @@ -587,12 +623,15 @@ def _dataframe_to_docling_data(

# --- Build data rows from merged DataFrame ---
# Index member fragments' original body rows so spanning cells survive the
# round-trip (see _index_member_body_rows).
# round-trip (see _index_member_rows).
body_index = _index_member_rows(member_data) if member_data else {}
# Tokenized signatures of the reconstructed header block, for dropping
# reprinted continuation headers (see _is_reprinted_header).
header_sigs = [s for s in (_row_token_set(h) for h in orig_header_rows) if s]

for i, (_, row) in enumerate(df.iterrows()):
table_row_idx = num_header_rows + i

emitted = 0
for _, row in df.iterrows():
table_row_idx = num_header_rows + emitted
row_vals = ["" if (pd.isna(v) or v is None) else str(v) for v in row]

# Re-emit untouched rows from their original grid cells (preserves
Expand All @@ -601,9 +640,15 @@ def _dataframe_to_docling_data(
bucket = body_index.get(tuple(row_vals))
if bucket:
orig_row = bucket.pop(0)
if header_sigs and _is_reprinted_header(orig_row, header_sigs):
# Reprinted header from a continuation page — already present as
# the header block; drop it instead of duplicating into the body.
log.debug("Dropped reprinted continuation header row from merged body.")
continue
grid_row, distinct = _reemit_body_row(orig_row, table_row_idx, has_row_headers)
grid.append(grid_row)
table_cells.extend(distinct)
emitted += 1
continue

grid_row: list[TableCell] = []
Expand All @@ -625,8 +670,9 @@ def _dataframe_to_docling_data(
table_cells.append(cell)

grid.append(grid_row)
emitted += 1

num_total_rows = num_header_rows + len(df)
num_total_rows = num_header_rows + emitted

return TableData(num_rows=num_total_rows, num_cols=num_cols, table_cells=table_cells, grid=grid)

Expand Down
10 changes: 9 additions & 1 deletion tests/integration/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -333,7 +333,15 @@ def assert_public_stitch_injects_docling_doc(
ctx = f"public stitch for members={members}, pages={exp['pages']}"

assert getattr(anchor.data, "num_rows", 0) > 0, f"{ctx}: anchor has no data"
if "shape" in exp:
if "injected_rows" in exp:
# Exact injected row count. Used when injection legitimately differs
# from the parser-neutral shape — e.g. reprinted continuation-page
# headers are dropped from the body (they remain in the merged
# DataFrame but are not duplicated into the stitched document).
assert anchor.data.num_rows == exp["injected_rows"], (
f"{ctx}: injected rows {anchor.data.num_rows} != expected {exp['injected_rows']}"
)
elif "shape" in exp:
# +1 or more for header rows; this guards that merged data was injected.
assert anchor.data.num_rows >= exp["shape"][0] + 1, (
f"{ctx}: anchor rows {anchor.data.num_rows} do not contain merged body"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,9 @@ logical_tables:
shape:
- 500
- 3
# Injection drops the reprinted header block (title + column names) that the
# merged DataFrame still carries as its first two rows: 500 merged rows - 2
# dropped = 498 body rows, plus the 2-row header block = 500 injected rows.
injected_rows: 500
columns:
- Column_0
- Column_1
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,9 @@ logical_tables:
shape:
- 46
- 6
# Injection drops the reprinted column header from the continuation page:
# 46 merged rows - 1 dropped = 45 body rows, plus the 1-row header = 46 injected rows.
injected_rows: 46
columns:
- Column_0
- Column_1
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,9 @@ logical_tables:
shape:
- 145
- 3
# Injection drops the reprinted column header from the continuation pages:
# 145 merged rows - 1 dropped = 144 body rows, plus the 1-row header = 145 injected rows.
injected_rows: 145
columns:
- Contribution Type
- Investment Name
Expand Down
71 changes: 71 additions & 0 deletions tests/test_docling_adapter.py
Original file line number Diff line number Diff line change
Expand Up @@ -388,6 +388,77 @@ def test_without_member_data_falls_back_to_flat(self):
assert all(cell.col_span == 1 for cell in td.grid[1])


class TestReprintedHeaderDedup:
    """Injection de-duplicates reprinted continuation-page headers.

    A body row is dropped only when it is flagged ``column_header`` *and* its
    text matches the header block. Rows that merely carry the flag (Docling
    over-flagging rowspan/continuation data) must survive.
    """

    @staticmethod
    def _cell(text, r, c, *, header):
        """Build a 1x1 ``TableCell`` at grid position (``r``, ``c``)."""
        row_end = r + 1
        col_end = c + 1
        return TableCell(
            text=text,
            row_span=1,
            col_span=1,
            column_header=header,
            row_header=False,
            start_row_offset_idx=r,
            end_row_offset_idx=row_end,
            start_col_offset_idx=c,
            end_col_offset_idx=col_end,
        )

    def _table(self, rows: list) -> TableData:
        """Build a ``TableData`` from ``(cell_texts, is_header)`` row specs."""
        grid = [
            [
                self._cell(cell_text, row_idx, col_idx, header=is_header)
                for col_idx, cell_text in enumerate(cell_texts)
            ]
            for row_idx, (cell_texts, is_header) in enumerate(rows)
        ]
        flat = [cell for grid_row in grid for cell in grid_row]
        return TableData(num_rows=len(rows), num_cols=len(rows[0][0]), table_cells=flat, grid=grid)

    def test_drops_reprinted_header_keeps_misflagged_data(self):
        # Anchor page: a one-row header spelled "(S$)" followed by one data row.
        anchor_rows = [
            (["SECTION", "LIMIT (S$)"], True),
            (["Death", "100"], False),
        ]
        # Continuation page: the header reprinted with OCR drift "($$)", one
        # data row, and a real data row that Docling wrongly flagged as
        # column_header (its text is unrelated to the header).
        satellite_rows = [
            (["SECTION", "LIMIT ($$)"], True),
            (["Injury", "50"], False),
            (["Sub-limit per accident", "999"], True),
        ]
        anchor = self._table(anchor_rows)
        satellite = self._table(satellite_rows)

        # The merger concatenates every body row, so the merged DataFrame still
        # contains both the reprinted header and the mis-flagged row.
        merged_records = [
            ["Death", "100"],
            ["SECTION", "LIMIT ($$)"],
            ["Injury", "50"],
            ["Sub-limit per accident", "999"],
        ]
        merged_df = pd.DataFrame(merged_records, columns=["Column_0", "Column_1"])

        td = _dataframe_to_docling_data(
            merged_df, original_data=anchor, member_data=[anchor, satellite]
        )

        # Body rows are the grid rows in which no cell is flagged column_header.
        body = []
        for grid_row in td.grid:
            if any(getattr(cell, "column_header", False) for cell in grid_row if cell):
                continue
            body.append(grid_row)
        body_text = " ".join(str(cell.text) for grid_row in body for cell in grid_row if cell)

        # The drifted, reprinted header is gone from the body...
        assert "($$)" not in body_text
        # ...while the header block still carries the anchor's "(S$)".
        assert any("(S$)" in (cell.text or "") for grid_row in td.grid for cell in grid_row if cell)
        # Real data survives, including the row mis-flagged as column_header.
        assert "Death" in body_text
        assert "Injury" in body_text
        assert "Sub-limit per accident" in body_text
        assert "999" in body_text
        # 1 header row + 3 body rows (one reprinted header removed from 4).
        assert td.num_rows == 4


class TestAdapterProtocol:
"""Verify DoclingAdapter satisfies the protocol."""

Expand Down
Loading