PebbleRoad · maish · Jun 8, 2026 · Jun 8, 2026 · Jun 8, 2026
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -7,6 +7,21 @@ the project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html
 
 ## [Unreleased]
 
+## [0.4.1] — 2026-06-08
+
+### Fixed
+
+- **Spanning body cells duplicated across columns on multi-page merge**
+  (`adapters/docling.py`). Docling repeats a `col_span=N` cell's text across
+  every column it covers; the merge round-trip rebuilt those as `N` separate
+  `col_span=1` cells, leaking a full-width description into every value column
+  and displacing the real values (a repeated `col_span` header behaved the same
+  way). Injection now matches each merged row back to its source grid row and
+  re-emits the original spans; rows the merger transformed (stitched
+  continuations, folded overflow) fall back to the flat 1x1 rebuild. Spans come
+  from the original metadata, not value equality, so coincidentally-equal
+  adjacent values (e.g. two plan columns sharing a cap) stay separate cells.
+
 ## [0.4.0] — 2026-05-29
 
 ### Added

diff --git a/pyproject.toml b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "table-stitcher"
-version = "0.4.0"
+version = "0.4.1"
 description = "Reassemble tables split across page boundaries in PDF extraction"
 readme = "README.md"
 license = "MIT"

diff --git a/src/table_stitcher/adapters/docling.py b/src/table_stitcher/adapters/docling.py
@@ -419,9 +419,90 @@ def _extract_original_header_rows(
     return header_rows, header_cells
 
 
+def _index_member_rows(
+    member_data: list[Optional[TableData]],
+) -> dict[tuple, list[list[TableCell]]]:
+    """
+    Index every member fragment's original grid rows by their expanded
+    text-vector, for col_span reconstruction during injection.
+
+    Docling repeats a spanning cell's text across each column it covers, so the
+    text-vector of an original grid row equals the DataFrame row that
+    ``_grid_to_dataframe`` produced for it (for any row the merger left
+    untouched). Keying on that vector lets ``_dataframe_to_docling_data``
+    recover the original col_span structure instead of flattening every cell to
+    1x1 — which duplicates a spanning cell into every column it covered on a
+    multi-page merge.
+
+    All rows are indexed, including header rows: a satellite fragment's repeated
+    header (e.g. a ``col_span=6`` banner) arrives in the merged DataFrame as a
+    body row, and must match its original spanning cell rather than duplicate
+    across the value columns. (The anchor's own header rows are reconstructed
+    separately and never looked up here.) Values are buckets because a row can
+    legitimately repeat; identical text-vectors are assumed to share one span
+    structure, so any occurrence is treated as interchangeable.
+    """
+    index: dict[tuple, list[list[TableCell]]] = {}
+    for data in member_data:
+        if not data or not data.grid:
+            continue
+        for row in data.grid:
+            if not row:
+                continue
+            key = tuple((getattr(c, "text", "") or "") if c else "" for c in row)
+            index.setdefault(key, []).append(row)
+    return index
+
+
+def _reemit_body_row(
+    orig_row: list[TableCell], table_row_idx: int, has_row_headers: bool
+) -> tuple[list[TableCell], list[TableCell]]:
+    """
+    Re-emit an original grid body row at a new row offset, preserving col_span.
+
+    Returns ``(grid_row, distinct_cells)`` where ``grid_row`` repeats each
+    spanning cell across the columns it covers (Docling grid convention) and
+    ``distinct_cells`` lists each origin cell once (for ``table_cells``).
+
+    row_span is intentionally clamped to 1: the merged DataFrame represents one
+    logical row per grid row, so a multi-row body span cannot be expressed
+    without desynchronizing the rebuilt grid. (Body row_spans are rare; col_span
+    is the case that corrupts multi-page merges.)
+    """
+    grid_row: list[Optional[TableCell]] = []
+    distinct: list[TableCell] = []
+    for c_idx, cell in enumerate(orig_row):
+        if cell is None:
+            grid_row.append(None)
+            continue
+        start_col = getattr(cell, "start_col_offset_idx", c_idx)
+        if start_col == c_idx:
+            col_span = getattr(cell, "col_span", 1) or 1
+            new_cell = TableCell(
+                text=getattr(cell, "text", "") or "",
+                row_span=1,
+                col_span=col_span,
+                column_header=False,
+                row_header=(c_idx == 0 and has_row_headers)
+                or bool(getattr(cell, "row_header", False)),
+                start_row_offset_idx=table_row_idx,
+                end_row_offset_idx=table_row_idx + 1,
+                start_col_offset_idx=c_idx,
+                end_col_offset_idx=c_idx + col_span,
+            )
+            distinct.append(new_cell)
+            grid_row.append(new_cell)
+        else:
+            # Continuation column of a span originating to the left: repeat the
+            # same cell object, which was already appended at ``start_col``.
+            grid_row.append(grid_row[start_col] if start_col < len(grid_row) else None)
+    return grid_row, distinct
+
+
 def _dataframe_to_docling_data(
     df: pd.DataFrame,
     original_data: Optional[TableData] = None,
+    member_data: Optional[list[Optional[TableData]]] = None,
 ) -> TableData:
     """
     Converts a pandas DataFrame back into Docling's TableData structure.
@@ -431,6 +512,12 @@ def _dataframe_to_docling_data(
     are preserved exactly.  Only the data rows are rebuilt from the DataFrame.
     This prevents the lossy roundtrip that would flatten complex headers into
     simple 1x1 cells.
+
+    When ``member_data`` (the original ``TableData`` of every fragment in the
+    logical table) is provided, body rows the merger left untouched are
+    re-emitted from their original grid cells, preserving col_span. Rows the
+    merger transformed (stitched continuations, folded overflow) fall back to a
+    flat 1x1 rebuild from the DataFrame.
     """
     if df.empty:
         cols = list(df.columns) if len(df.columns) > 0 else ["Column_0"]
@@ -499,16 +586,28 @@ def _dataframe_to_docling_data(
                     break
 
     # --- Build data rows from merged DataFrame ---
+    # Index member fragments' original body rows so spanning cells survive the
+    # round-trip (see _index_member_rows).
+    body_index = _index_member_rows(member_data) if member_data else {}
+
     for i, (_, row) in enumerate(df.iterrows()):
-        grid_row: list[TableCell] = []
         table_row_idx = num_header_rows + i
 
-        for j, val in enumerate(row):
-            if pd.isna(val) or val is None:
-                text_val = ""
-            else:
-                text_val = str(val)
+        row_vals = ["" if (pd.isna(v) or v is None) else str(v) for v in row]
+
+        # Re-emit untouched rows from their original grid cells (preserves
+        # col_span). Spans come from the source cells' own metadata, never from
+        # comparing neighbours, so coincidentally-equal values are never fused.
+        bucket = body_index.get(tuple(row_vals))
+        if bucket:
+            orig_row = bucket.pop(0)
+            grid_row, distinct = _reemit_body_row(orig_row, table_row_idx, has_row_headers)
+            grid.append(grid_row)
+            table_cells.extend(distinct)
+            continue
 
+        grid_row: list[TableCell] = []
+        for j, text_val in enumerate(row_vals):
             row_header = j == 0 and has_row_headers
 
             cell = TableCell(
@@ -899,9 +998,18 @@ def restore_snapshots():
 
                 original_data = getattr(anchor_table, "data", None)
 
+                # Original TableData of every fragment, captured in
+                # table_snapshots before any mutation, so injection can recover
+                # each untouched body row's col_span (see
+                # _index_member_rows).
+                member_data = [
+                    table_snapshots[m]["data"] for m in lt.members if m in table_snapshots
+                ]
+
                 anchor_table.data = _dataframe_to_docling_data(
                     lt.df,
                     original_data=original_data,
+                    member_data=member_data,
                 )
 
                 for satellite_idx in lt.members[1:]:

diff --git a/tests/test_docling_adapter.py b/tests/test_docling_adapter.py
@@ -300,6 +300,94 @@ def test_single_row_header_original_still_reused(self):
         assert td.num_rows == 3  # 1 header + 2 data
 
 
+class TestBodySpanPreservation:
+    """Body col_span cells must survive the merge round-trip, not duplicate.
+
+    Docling repeats a spanning cell's text across every column it covers, so a
+    naive grid -> DataFrame -> grid rebuild flattens a ``col_span=N`` body cell
+    into N duplicate ``col_span=1`` cells — leaking a full-width description
+    into every value column and displacing the real values. Passing the member
+    fragments' original TableData lets injection re-emit the original spans.
+    """
+
+    @staticmethod
+    def _cell(text, r, c, *, col_span=1, header=False):
+        return TableCell(
+            text=text,
+            row_span=1,
+            col_span=col_span,
+            column_header=header,
+            row_header=False,
+            start_row_offset_idx=r,
+            end_row_offset_idx=r + 1,
+            start_col_offset_idx=c,
+            end_col_offset_idx=c + col_span,
+        )
+
+    def _fragment(self) -> TableData:
+        """3-col fragment: flat header, a col_span=3 description row, a data row,
+        and a row with coincidentally-equal adjacent values (separate cells)."""
+        c = self._cell
+        # Header row 0
+        h = [
+            c("Section", 0, 0, header=True),
+            c("Plan A", 0, 1, header=True),
+            c("Plan B", 0, 2, header=True),
+        ]
+        # Row 1: description spanning all 3 cols — grid repeats the same object.
+        desc = c("See important notes below", 1, 0, col_span=3)
+        r1 = [desc, desc, desc]
+        # Row 2: ordinary data row.
+        r2 = [c("1", 2, 0), c("100", 2, 1), c("200", 2, 2)]
+        # Row 3: two value columns share a cap value, but are SEPARATE cells.
+        r3 = [c("2", 3, 0), c("150", 3, 1), c("150", 3, 2)]
+        grid = [h, r1, r2, r3]
+        flat = h + [desc, r2[0], r2[1], r2[2], r3[0], r3[1], r3[2]]
+        return TableData(num_rows=4, num_cols=3, table_cells=flat, grid=grid)
+
+    def test_body_colspan_preserved_not_duplicated(self):
+        original = self._fragment()
+        # The DataFrame as _grid_to_dataframe would produce it: the spanning
+        # description duplicated across all three columns.
+        merged_df = pd.DataFrame(
+            [
+                ["See important notes below"] * 3,
+                ["1", "100", "200"],
+                ["2", "150", "150"],
+            ],
+            columns=["Section", "Plan A", "Plan B"],
+        )
+
+        td = _dataframe_to_docling_data(merged_df, original_data=original, member_data=[original])
+
+        # Description row: one origin cell with col_span=3, repeated across the
+        # grid row — NOT three distinct duplicated cells.
+        desc_row = td.grid[1]
+        assert desc_row[0].col_span == 3
+        assert desc_row[0].text == "See important notes below"
+        assert desc_row[1] is desc_row[0] and desc_row[2] is desc_row[0]
+        distinct_desc = [
+            cell for cell in td.table_cells if cell.text == "See important notes below"
+        ]
+        assert len(distinct_desc) == 1
+
+        # Coincidentally-equal values stay as two separate col_span=1 cells.
+        last_row = td.grid[3]
+        assert last_row[1].text == last_row[2].text == "150"
+        assert last_row[1].col_span == 1 and last_row[2].col_span == 1
+        assert last_row[1] is not last_row[2]
+
+    def test_without_member_data_falls_back_to_flat(self):
+        """No member_data -> previous behaviour: flat 1x1 body cells."""
+        original = self._fragment()
+        merged_df = pd.DataFrame(
+            [["See important notes below"] * 3],
+            columns=["Section", "Plan A", "Plan B"],
+        )
+        td = _dataframe_to_docling_data(merged_df, original_data=original)
+        assert all(cell.col_span == 1 for cell in td.grid[1])
+
+
 class TestAdapterProtocol:
     """Verify DoclingAdapter satisfies the protocol."""