Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,21 @@ the project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html

## [Unreleased]

## [0.4.1] — 2026-06-08

### Fixed

- **Spanning body cells duplicated across columns on multi-page merge**
(`adapters/docling.py`). Docling repeats a `col_span=N` cell's text across
every column it covers; the merge round-trip rebuilt those as `N` separate
`col_span=1` cells, leaking a full-width description into every value column
and displacing the real values (a repeated `col_span` header behaved the same
way). Injection now matches each merged row back to its source grid row and
re-emits the original spans; rows the merger transformed (stitched
continuations, folded overflow) fall back to the flat 1x1 rebuild. Rows are
matched on their full text-vector; spans are then taken from the original cell
metadata, never inferred from equal adjacent values, so coincidentally-equal
adjacent values (e.g. two plan columns sharing a cap) stay separate cells.

## [0.4.0] — 2026-05-29

### Added
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "hatchling.build"

[project]
name = "table-stitcher"
version = "0.4.0"
version = "0.4.1"
description = "Reassemble tables split across page boundaries in PDF extraction"
readme = "README.md"
license = "MIT"
Expand Down
120 changes: 114 additions & 6 deletions src/table_stitcher/adapters/docling.py
Original file line number Diff line number Diff line change
Expand Up @@ -419,9 +419,90 @@ def _extract_original_header_rows(
return header_rows, header_cells


def _index_member_rows(
    member_data: list[Optional[TableData]],
) -> dict[tuple, list[list[TableCell]]]:
    """
    Build a lookup from a grid row's text-vector to the original grid rows.

    For every fragment in ``member_data`` and every row of its ``grid``, the
    key is a tuple holding one string per grid position: the cell's ``text``,
    or ``""`` when the cell is falsy or its text is missing/``None``. Because a
    spanning cell occupies every grid position it covers, its text appears once
    per covered column in the key — the same shape the merged DataFrame row has
    when ``_dataframe_to_docling_data`` looks it up, which is what allows the
    original col_span structure to be recovered instead of flattening every
    cell to 1x1.

    No row is filtered by role, so header rows are indexed alongside body rows;
    a repeated header that shows up in the merged DataFrame as a body row can
    therefore still find its original spanning cell.

    Args:
        member_data: Original ``TableData`` of each fragment. Falsy entries and
            entries with an empty/missing ``grid`` are skipped, as are empty
            rows.

    Returns:
        A plain dict mapping each text-vector to the list of grid rows that
        produced it, in encounter order. Each value is a list because the same
        row text can legitimately occur more than once; rows sharing a key are
        assumed (not verified here) to share the same span structure.
    """
    rows_by_text: dict[tuple, list[list[TableCell]]] = {}

    for fragment in member_data:
        # Nothing to index for a missing fragment or one without a grid.
        if not (fragment and fragment.grid):
            continue

        for grid_row in fragment.grid:
            if not grid_row:
                continue

            texts = []
            for cell in grid_row:
                if cell:
                    texts.append(getattr(cell, "text", "") or "")
                else:
                    texts.append("")
            text_key = tuple(texts)

            if text_key not in rows_by_text:
                rows_by_text[text_key] = []
            rows_by_text[text_key].append(grid_row)

    return rows_by_text


def _reemit_body_row(
    orig_row: list[TableCell], table_row_idx: int, has_row_headers: bool
) -> tuple[list[TableCell], list[TableCell]]:
    """
    Rebuild one original grid row as a body row located at ``table_row_idx``.

    Each cell that originates at its own grid position (its
    ``start_col_offset_idx`` equals the position) is copied into a fresh
    ``TableCell`` that keeps the original text and ``col_span`` but is pinned
    to the new row offset and marked as a non-header column cell. Positions
    covered by a span that started further left reuse the very same new cell
    object, following the grid convention of repeating a spanning cell across
    the columns it covers.

    ``row_span`` is always written as 1: the rebuilt grid holds one logical
    row per merged DataFrame row, so a multi-row body span cannot be carried
    over without desynchronizing the grid.

    Args:
        orig_row: The original grid row; entries may be ``None``.
        table_row_idx: Row offset the re-emitted cells are placed at.
        has_row_headers: When true, the cell at column 0 is flagged as a row
            header regardless of its original flag.

    Returns:
        ``(grid_row, distinct_cells)`` — ``grid_row`` has one entry per input
        position (``None`` where no cell could be placed) and
        ``distinct_cells`` holds each newly created origin cell exactly once,
        suitable for ``table_cells``.
    """
    rebuilt: list[Optional[TableCell]] = []
    origins: list[TableCell] = []

    for col, source in enumerate(orig_row):
        if source is None:
            rebuilt.append(None)
            continue

        origin_col = getattr(source, "start_col_offset_idx", col)
        if origin_col != col:
            # Continuation of a span that began to the left: share the cell
            # already emitted at its origin column (None if that column has
            # not been emitted, i.e. the offset points at or past this one).
            shared = rebuilt[origin_col] if origin_col < len(rebuilt) else None
            rebuilt.append(shared)
            continue

        span = getattr(source, "col_span", 1) or 1
        is_row_header = (col == 0 and has_row_headers) or bool(
            getattr(source, "row_header", False)
        )
        fresh = TableCell(
            text=getattr(source, "text", "") or "",
            row_span=1,
            col_span=span,
            column_header=False,
            row_header=is_row_header,
            start_row_offset_idx=table_row_idx,
            end_row_offset_idx=table_row_idx + 1,
            start_col_offset_idx=col,
            end_col_offset_idx=col + span,
        )
        origins.append(fresh)
        rebuilt.append(fresh)

    return rebuilt, origins


def _dataframe_to_docling_data(
df: pd.DataFrame,
original_data: Optional[TableData] = None,
member_data: Optional[list[Optional[TableData]]] = None,
) -> TableData:
"""
Converts a pandas DataFrame back into Docling's TableData structure.
Expand All @@ -431,6 +512,12 @@ def _dataframe_to_docling_data(
are preserved exactly. Only the data rows are rebuilt from the DataFrame.
This prevents the lossy roundtrip that would flatten complex headers into
simple 1x1 cells.

When ``member_data`` (the original ``TableData`` of every fragment in the
logical table) is provided, body rows the merger left untouched are
re-emitted from their original grid cells, preserving col_span. Rows the
merger transformed (stitched continuations, folded overflow) fall back to a
flat 1x1 rebuild from the DataFrame.
"""
if df.empty:
cols = list(df.columns) if len(df.columns) > 0 else ["Column_0"]
Expand Down Expand Up @@ -499,16 +586,28 @@ def _dataframe_to_docling_data(
break

# --- Build data rows from merged DataFrame ---
# Index member fragments' original grid rows (header rows included) so
# spanning cells survive the round-trip (see _index_member_rows).
body_index = _index_member_rows(member_data) if member_data else {}

for i, (_, row) in enumerate(df.iterrows()):
grid_row: list[TableCell] = []
table_row_idx = num_header_rows + i

for j, val in enumerate(row):
if pd.isna(val) or val is None:
text_val = ""
else:
text_val = str(val)
row_vals = ["" if (pd.isna(v) or v is None) else str(v) for v in row]

# Re-emit untouched rows from their original grid cells (preserves
# col_span); only matches when widths align, so coincidentally-equal
# adjacent values are never fused.
bucket = body_index.get(tuple(row_vals))
if bucket:
orig_row = bucket.pop(0)
grid_row, distinct = _reemit_body_row(orig_row, table_row_idx, has_row_headers)
grid.append(grid_row)
table_cells.extend(distinct)
continue

grid_row: list[TableCell] = []
for j, text_val in enumerate(row_vals):
row_header = j == 0 and has_row_headers

cell = TableCell(
Expand Down Expand Up @@ -899,9 +998,18 @@ def restore_snapshots():

original_data = getattr(anchor_table, "data", None)

# Original TableData of every fragment, captured in
# table_snapshots before any mutation, so injection can recover
# each untouched row's col_span (see _index_member_rows).
member_data = [
table_snapshots[m]["data"] for m in lt.members if m in table_snapshots
]

anchor_table.data = _dataframe_to_docling_data(
lt.df,
original_data=original_data,
member_data=member_data,
)

for satellite_idx in lt.members[1:]:
Expand Down
88 changes: 88 additions & 0 deletions tests/test_docling_adapter.py
Original file line number Diff line number Diff line change
Expand Up @@ -300,6 +300,94 @@ def test_single_row_header_original_still_reused(self):
assert td.num_rows == 3 # 1 header + 2 data


class TestBodySpanPreservation:
    """Round-tripping a merged table must keep body col_span cells intact.

    A spanning cell's text is repeated in every column it covers, so rebuilding
    the grid purely from the DataFrame would turn one ``col_span=N`` body cell
    into N separate ``col_span=1`` copies — pushing a full-width description
    into every value column. Supplying the fragments' original TableData as
    ``member_data`` lets the rebuild re-emit the original spans instead.
    """

    @staticmethod
    def _cell(text, r, c, *, col_span=1, header=False):
        """Build a one-row-tall TableCell at grid position (r, c)."""
        offsets = {
            "start_row_offset_idx": r,
            "end_row_offset_idx": r + 1,
            "start_col_offset_idx": c,
            "end_col_offset_idx": c + col_span,
        }
        return TableCell(
            text=text,
            row_span=1,
            col_span=col_span,
            column_header=header,
            row_header=False,
            **offsets,
        )

    def _fragment(self) -> TableData:
        """Return a 3-column fragment with four rows.

        Row 0 is a flat header, row 1 a single ``col_span=3`` description,
        row 2 an ordinary data row, and row 3 a data row whose two value
        columns happen to hold the same text while being separate cells.
        """
        make = self._cell

        header_row = [
            make(label, 0, col, header=True)
            for col, label in enumerate(("Section", "Plan A", "Plan B"))
        ]

        # One spanning cell object occupies all three grid positions.
        banner = make("See important notes below", 1, 0, col_span=3)
        banner_row = [banner] * 3

        data_row = [make(text, 2, col) for col, text in enumerate(("1", "100", "200"))]

        # Equal text in columns 1 and 2, but two distinct cell objects.
        shared_cap_row = [
            make(text, 3, col) for col, text in enumerate(("2", "150", "150"))
        ]

        return TableData(
            num_rows=4,
            num_cols=3,
            table_cells=[*header_row, banner, *data_row, *shared_cap_row],
            grid=[header_row, banner_row, data_row, shared_cap_row],
        )

    def test_body_colspan_preserved_not_duplicated(self):
        source = self._fragment()
        banner_text = "See important notes below"

        # Shape the DataFrame the way the grid -> DataFrame step would: the
        # spanning description appears once per covered column.
        frame = pd.DataFrame(
            [
                [banner_text, banner_text, banner_text],
                ["1", "100", "200"],
                ["2", "150", "150"],
            ],
            columns=["Section", "Plan A", "Plan B"],
        )

        rebuilt = _dataframe_to_docling_data(
            frame, original_data=source, member_data=[source]
        )

        # The description row holds a single col_span=3 origin cell that is
        # repeated by identity across the row, not three look-alike cells.
        banner_row = rebuilt.grid[1]
        origin = banner_row[0]
        assert origin.col_span == 3
        assert origin.text == banner_text
        assert banner_row[1] is origin and banner_row[2] is origin
        banner_cells = [cell for cell in rebuilt.table_cells if cell.text == banner_text]
        assert len(banner_cells) == 1

        # Equal-valued neighbours remain two independent col_span=1 cells.
        cap_row = rebuilt.grid[3]
        left, right = cap_row[1], cap_row[2]
        assert left.text == right.text == "150"
        assert left.col_span == 1 and right.col_span == 1
        assert left is not right

    def test_without_member_data_falls_back_to_flat(self):
        """Omitting member_data keeps the earlier flat 1x1 body rebuild."""
        source = self._fragment()
        frame = pd.DataFrame(
            [["See important notes below"] * 3],
            columns=["Section", "Plan A", "Plan B"],
        )

        rebuilt = _dataframe_to_docling_data(frame, original_data=source)

        assert all(cell.col_span == 1 for cell in rebuilt.grid[1])


class TestAdapterProtocol:
"""Verify DoclingAdapter satisfies the protocol."""

Expand Down
Loading