diff --git a/CHANGELOG.md b/CHANGELOG.md index a713eef..6a78408 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,21 @@ the project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html ## [Unreleased] +## [0.4.1] — 2026-06-08 + +### Fixed + +- **Spanning body cells duplicated across columns on multi-page merge** + (`adapters/docling.py`). Docling repeats a `col_span=N` cell's text across + every column it covers; the merge round-trip rebuilt those as `N` separate + `col_span=1` cells, leaking a full-width description into every value column + and displacing the real values (a repeated `col_span` header behaved the same + way). Injection now matches each merged row back to its source grid row and + re-emits the original spans; rows the merger transformed (stitched + continuations, folded overflow) fall back to the flat 1x1 rebuild. The match + uses the original span metadata, never value equality, so coincidentally-equal + adjacent values (e.g. two plan columns sharing a cap) stay separate cells. + ## [0.4.0] — 2026-05-29 ### Added diff --git a/pyproject.toml b/pyproject.toml index 4479943..43d47a6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "table-stitcher" -version = "0.4.0" +version = "0.4.1" description = "Reassemble tables split across page boundaries in PDF extraction" readme = "README.md" license = "MIT" diff --git a/src/table_stitcher/adapters/docling.py b/src/table_stitcher/adapters/docling.py index c040ec5..aa865b2 100644 --- a/src/table_stitcher/adapters/docling.py +++ b/src/table_stitcher/adapters/docling.py @@ -419,9 +419,90 @@ def _extract_original_header_rows( return header_rows, header_cells +def _index_member_rows( + member_data: list[Optional[TableData]], +) -> dict[tuple, list[list[TableCell]]]: + """ + Index every member fragment's original grid rows by their expanded + text-vector, for col_span reconstruction during injection. 
+ + Docling repeats a spanning cell's text across each column it covers, so the + text-vector of an original grid row equals the DataFrame row that + ``_grid_to_dataframe`` produced for it (for any row the merger left + untouched). Keying on that vector lets ``_dataframe_to_docling_data`` + recover the original col_span structure instead of flattening every cell to + 1x1 — which duplicates a spanning cell into every column it covered on a + multi-page merge. + + All rows are indexed, including header rows: a satellite fragment's repeated + header (e.g. a ``col_span=6`` banner) arrives in the merged DataFrame as a + body row, and must match its original spanning cell rather than duplicate + across the value columns. (The anchor's own header rows are reconstructed + separately and never looked up here.) Values are buckets because a row can + legitimately repeat; identical text-vectors imply identical span structure, + so any occurrence is interchangeable. + """ + index: dict[tuple, list[list[TableCell]]] = {} + for data in member_data: + if not data or not data.grid: + continue + for row in data.grid: + if not row: + continue + key = tuple((getattr(c, "text", "") or "") if c else "" for c in row) + index.setdefault(key, []).append(row) + return index + + +def _reemit_body_row( + orig_row: list[TableCell], table_row_idx: int, has_row_headers: bool +) -> tuple[list[TableCell], list[TableCell]]: + """ + Re-emit an original grid body row at a new row offset, preserving col_span. + + Returns ``(grid_row, distinct_cells)`` where ``grid_row`` repeats each + spanning cell across the columns it covers (Docling grid convention) and + ``distinct_cells`` lists each origin cell once (for ``table_cells``). + + row_span is intentionally clamped to 1: the merged DataFrame represents one + logical row per grid row, so a multi-row body span cannot be expressed + without desynchronizing the rebuilt grid. (Body row_spans are rare; col_span + is the case that corrupts multi-page merges.) 
+ """ + grid_row: list[Optional[TableCell]] = [] + distinct: list[TableCell] = [] + for c_idx, cell in enumerate(orig_row): + if cell is None: + grid_row.append(None) + continue + start_col = getattr(cell, "start_col_offset_idx", c_idx) + if start_col == c_idx: + col_span = getattr(cell, "col_span", 1) or 1 + new_cell = TableCell( + text=getattr(cell, "text", "") or "", + row_span=1, + col_span=col_span, + column_header=False, + row_header=(c_idx == 0 and has_row_headers) + or bool(getattr(cell, "row_header", False)), + start_row_offset_idx=table_row_idx, + end_row_offset_idx=table_row_idx + 1, + start_col_offset_idx=c_idx, + end_col_offset_idx=c_idx + col_span, + ) + distinct.append(new_cell) + grid_row.append(new_cell) + else: + # Continuation column of a span originating to the left: repeat the + # same cell object, which was already appended at ``start_col``. + grid_row.append(grid_row[start_col] if start_col < len(grid_row) else None) + return grid_row, distinct + + def _dataframe_to_docling_data( df: pd.DataFrame, original_data: Optional[TableData] = None, + member_data: Optional[list[Optional[TableData]]] = None, ) -> TableData: """ Converts a pandas DataFrame back into Docling's TableData structure. @@ -431,6 +512,12 @@ def _dataframe_to_docling_data( are preserved exactly. Only the data rows are rebuilt from the DataFrame. This prevents the lossy roundtrip that would flatten complex headers into simple 1x1 cells. + + When ``member_data`` (the original ``TableData`` of every fragment in the + logical table) is provided, body rows the merger left untouched are + re-emitted from their original grid cells, preserving col_span. Rows the + merger transformed (stitched continuations, folded overflow) fall back to a + flat 1x1 rebuild from the DataFrame. 
+ """ if df.empty: cols = list(df.columns) if len(df.columns) > 0 else ["Column_0"] @@ -499,16 +586,28 @@ def _dataframe_to_docling_data( break # --- Build data rows from merged DataFrame --- + # Index member fragments' original body rows so spanning cells survive the + # round-trip (see _index_member_rows). + body_index = _index_member_rows(member_data) if member_data else {} + for i, (_, row) in enumerate(df.iterrows()): - grid_row: list[TableCell] = [] table_row_idx = num_header_rows + i - for j, val in enumerate(row): - if pd.isna(val) or val is None: - text_val = "" - else: - text_val = str(val) + row_vals = ["" if (pd.isna(v) or v is None) else str(v) for v in row] + + # Re-emit untouched rows from their original grid cells (preserves + # col_span); only matches when widths align, so coincidentally-equal + # adjacent values are never fused. + bucket = body_index.get(tuple(row_vals)) + if bucket: + orig_row = bucket.pop(0) + grid_row, distinct = _reemit_body_row(orig_row, table_row_idx, has_row_headers) + grid.append(grid_row) + table_cells.extend(distinct) + continue + grid_row: list[TableCell] = [] + for j, text_val in enumerate(row_vals): row_header = j == 0 and has_row_headers cell = TableCell( @@ -899,9 +998,18 @@ def restore_snapshots(): original_data = getattr(anchor_table, "data", None) + # Original TableData of every fragment, captured in + # table_snapshots before any mutation, so injection can recover + # each untouched body row's col_span (see + # _index_member_rows). 
+ member_data = [ + table_snapshots[m]["data"] for m in lt.members if m in table_snapshots + ] + anchor_table.data = _dataframe_to_docling_data( lt.df, original_data=original_data, + member_data=member_data, ) for satellite_idx in lt.members[1:]: diff --git a/tests/test_docling_adapter.py b/tests/test_docling_adapter.py index 554239c..cc7182c 100644 --- a/tests/test_docling_adapter.py +++ b/tests/test_docling_adapter.py @@ -300,6 +300,94 @@ def test_single_row_header_original_still_reused(self): assert td.num_rows == 3 # 1 header + 2 data +class TestBodySpanPreservation: + """Body col_span cells must survive the merge round-trip, not duplicate. + + Docling repeats a spanning cell's text across every column it covers, so a + naive grid -> DataFrame -> grid rebuild flattens a ``col_span=N`` body cell + into N duplicate ``col_span=1`` cells — leaking a full-width description + into every value column and displacing the real values. Passing the member + fragments' original TableData lets injection re-emit the original spans. + """ + + @staticmethod + def _cell(text, r, c, *, col_span=1, header=False): + return TableCell( + text=text, + row_span=1, + col_span=col_span, + column_header=header, + row_header=False, + start_row_offset_idx=r, + end_row_offset_idx=r + 1, + start_col_offset_idx=c, + end_col_offset_idx=c + col_span, + ) + + def _fragment(self) -> TableData: + """3-col fragment: flat header, a col_span=3 description row, a data row, + and a row with coincidentally-equal adjacent values (separate cells).""" + c = self._cell + # Header row 0 + h = [ + c("Section", 0, 0, header=True), + c("Plan A", 0, 1, header=True), + c("Plan B", 0, 2, header=True), + ] + # Row 1: description spanning all 3 cols — grid repeats the same object. + desc = c("See important notes below", 1, 0, col_span=3) + r1 = [desc, desc, desc] + # Row 2: ordinary data row. + r2 = [c("1", 2, 0), c("100", 2, 1), c("200", 2, 2)] + # Row 3: two value columns share a cap value, but are SEPARATE cells. 
+ r3 = [c("2", 3, 0), c("150", 3, 1), c("150", 3, 2)] + grid = [h, r1, r2, r3] + flat = h + [desc, r2[0], r2[1], r2[2], r3[0], r3[1], r3[2]] + return TableData(num_rows=4, num_cols=3, table_cells=flat, grid=grid) + + def test_body_colspan_preserved_not_duplicated(self): + original = self._fragment() + # The DataFrame as _grid_to_dataframe would produce it: the spanning + # description duplicated across all three columns. + merged_df = pd.DataFrame( + [ + ["See important notes below"] * 3, + ["1", "100", "200"], + ["2", "150", "150"], + ], + columns=["Section", "Plan A", "Plan B"], + ) + + td = _dataframe_to_docling_data(merged_df, original_data=original, member_data=[original]) + + # Description row: one origin cell with col_span=3, repeated across the + # grid row — NOT three distinct duplicated cells. + desc_row = td.grid[1] + assert desc_row[0].col_span == 3 + assert desc_row[0].text == "See important notes below" + assert desc_row[1] is desc_row[0] and desc_row[2] is desc_row[0] + distinct_desc = [ + cell for cell in td.table_cells if cell.text == "See important notes below" + ] + assert len(distinct_desc) == 1 + + # Coincidentally-equal values stay as two separate col_span=1 cells. + last_row = td.grid[3] + assert last_row[1].text == last_row[2].text == "150" + assert last_row[1].col_span == 1 and last_row[2].col_span == 1 + assert last_row[1] is not last_row[2] + + def test_without_member_data_falls_back_to_flat(self): + """No member_data -> previous behaviour: flat 1x1 body cells.""" + original = self._fragment() + merged_df = pd.DataFrame( + [["See important notes below"] * 3], + columns=["Section", "Plan A", "Plan B"], + ) + td = _dataframe_to_docling_data(merged_df, original_data=original) + assert all(cell.col_span == 1 for cell in td.grid[1]) + + class TestAdapterProtocol: """Verify DoclingAdapter satisfies the protocol."""