Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 24 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,30 @@ the project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html

## [Unreleased]

## [0.4.0] — 2026-05-29

### Added

- **Intervening-content guard** (`block_on_intervening_content`, default `True`).
Two tables that share a column schema but belong to different sections — a
heading sits between them in reading order — are no longer stitched into one.
A genuine page-split continuation has nothing but page furniture between its
fragments, so a section heading between them is a reliable "separate tables"
signal. The docling adapter computes a per-table `TableMeta.content_before`
signal; both merge paths (`_classify_sequential_pair` and
`should_force_orphan_merge`) consult it.
- Running headers mislabeled as headings (e.g. a journal banner labeled
`page_header` on one page and `section_header` on another, or a repeated
"Summary of benefits" banner above every page of a multi-page table) are
detected as furniture via near-identical (Jaccard ≥ 0.8) recurrence across
pages, so they do not block legitimate continuations.
- Only `section_header`/`title` nodes block; plain paragraphs, list items,
captions, footnotes and figures are deliberately ignored, since real PDFs
routinely scatter those between fragments of a single continued table.
- Fixes over-eager merging of same-schema per-section tables (e.g. an
insurance policy's eight `Prestige | Elite | Classic` benefit grids being
collapsed into one).

## [0.3.0] — 2026-05-06

### Fixed
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "hatchling.build"

[project]
name = "table-stitcher"
version = "0.3.0"
version = "0.4.0"
description = "Reassemble tables split across page boundaries in PDF extraction"
readme = "README.md"
license = "MIT"
Expand Down
181 changes: 181 additions & 0 deletions src/table_stitcher/adapters/docling.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
from ..merger import (
first_row_has_number,
is_numeric_like_colnames,
jaccard,
normalize_col_name,
tokenize,
)
Expand Down Expand Up @@ -556,6 +557,176 @@ def _get_ref_pointer(ref_obj: Any) -> str:
# -------------------------------------------------------------------


# -------------------------------------------------------------------
# Intervening-content detection (reading-order furniture filtering)
#
# Philosophy: a table split across a page break has nothing but page
# furniture (running headers/footers, page numbers) between its fragments.
# If *substantive* body content — a heading, paragraph, list item, or real
# figure — sits between two fragments in reading order, they are separate
# tables that merely share a column schema. These helpers classify the body
# nodes between consecutive tables so the merger can refuse such merges.
# -------------------------------------------------------------------

# Only a structural section boundary reliably means "these are separate
# tables." Plain prose, list items, captions, footnotes and figures all turn
# up *between* fragments of genuine continuations on real-world PDFs (cell text
# extracted as body nodes, interleaved multi-column reading order, repeated
# legends), so blocking on them regresses legitimate merges. A new heading
# between two tables, by contrast, is a clean separator.
_BLOCKING_LABELS = {"section_header", "title"}

_CONTINUATION_RE = re.compile(r"\bcont(?:inued|inuation|'?d|\.)?\b", re.IGNORECASE)


def _label_str(item: Any) -> str:
"""Normalize a docling item label (enum or str) to a lowercase string."""
lab = getattr(item, "label", None)
return str(getattr(lab, "value", lab) or "").lower()


def _norm_text(item: Any) -> str:
"""Whitespace-collapsed lowercase text of an item."""
return " ".join(str(getattr(item, "text", "") or "").split()).lower()


def _flatten_body_refs(doc: Any) -> list[str]:
"""Return body reference pointers in reading order (DFS through groups)."""
seq: list[str] = []
groups = getattr(doc, "groups", []) or []
seen_groups: set[int] = set()

def walk(node: Any) -> None:
children = getattr(node, "children", None) or []
for child in children:
ref = _get_ref_pointer(child)
if not ref:
continue
if ref.startswith("#/groups/"):
try:
gi = int(ref.split("/")[-1])
except ValueError:
continue
# Guard against malformed self-referential group cycles.
if gi in seen_groups or not (0 <= gi < len(groups)):
continue
seen_groups.add(gi)
walk(groups[gi])
else:
seq.append(ref)

body = getattr(doc, "body", None)
if body is not None:
walk(body)
return seq


def _resolve_ref(doc: Any, ref: str) -> Optional[Any]:
"""Resolve a ``#/kind/N`` pointer to its item, or None."""
try:
_, kind, n_str = ref.split("/")
n = int(n_str)
except (ValueError, AttributeError):
return None
coll = {
"texts": getattr(doc, "texts", None),
"tables": getattr(doc, "tables", None),
"pictures": getattr(doc, "pictures", None),
}.get(kind)
if not coll or n >= len(coll):
return None
return coll[n]


def _detect_running_furniture(doc: Any, cfg: MultiPageConfig) -> set[str]:
"""
Identify headings that are actually running headers, not section boundaries.

Only headings (``_BLOCKING_LABELS``) can block a merge, so only headings
need to be exempted. A heading is a running header when near-identical text
appears on a *different* page — e.g. a repeated ``Summary of benefits``
banner above every page of a multi-page table, or a journal name that one
page labels ``page_header`` and another mislabels ``section_header`` (the
parser inconsistency this protects against).

Similarity uses symmetric Jaccard with a high threshold (near-duplicate).
A real, unique heading such as ``38e - Trip postponement`` has no
near-duplicate on another page, so it correctly stays a blocker — unlike a
looser containment metric, which a short subset string (a TOC entry, say)
would spuriously satisfy.
"""
texts = getattr(doc, "texts", []) or []
# (ref, tokens, page, label) for every text node — the comparison pool.
nodes: list[tuple[str, set, Any, str]] = []
for i, item in enumerate(texts):
prov = getattr(item, "prov", None) or []
page = getattr(prov[0], "page_no", None) if prov else None
nodes.append((f"#/texts/{i}", tokenize(_norm_text(item)), page, _label_str(item)))

furniture: set[str] = set()
for ref, toks, page, label in nodes:
if label not in _BLOCKING_LABELS or not toks or page is None:
continue
for ref2, toks2, page2, _ in nodes:
if ref2 == ref or page2 is None or page2 == page or not toks2:
continue
if jaccard(toks, toks2) >= 0.8:
furniture.add(ref)
break
return furniture


def _ref_is_blocking(doc: Any, ref: str, furniture: set[str], cfg: MultiPageConfig) -> bool:
"""Whether a body node between two tables marks a real table boundary.

Only a non-furniture section heading qualifies (see ``_BLOCKING_LABELS``).
Figures, list items and plain paragraphs are deliberately ignored: on real
PDFs they routinely appear between fragments of a single continued table.
"""
if ref in furniture or not ref.startswith("#/texts/"):
return False
item = _resolve_ref(doc, ref)
if item is None:
return False
if _label_str(item) not in _BLOCKING_LABELS:
return False
text = str(getattr(item, "text", "") or "")
if not text.strip():
return False
# A "(continued)" heading marks a continuation, not a new table.
if _CONTINUATION_RE.search(text):
return False
return True


def _compute_content_before(doc: Any, cfg: MultiPageConfig) -> dict[int, bool]:
"""
For each table (keyed by docling index), whether substantive body content
separates it from the previous table in reading order. Tables absent from
the body reading order (orphans) are omitted, disabling the guard for them.
"""
seq = _flatten_body_refs(doc)
if not seq:
return {}
furniture = _detect_running_furniture(doc, cfg)

result: dict[int, bool] = {}
prev_table_seen = False
blocking_seen = False
for ref in seq:
if ref.startswith("#/tables/"):
try:
t_idx = int(ref.split("/")[-1])
except ValueError:
continue
result[t_idx] = prev_table_seen and blocking_seen
prev_table_seen = True
blocking_seen = False
elif not blocking_seen and _ref_is_blocking(doc, ref, furniture, cfg):
blocking_seen = True
return result


class DoclingAdapter:
"""
Table-stitcher adapter for Docling (docling-core).
Expand All @@ -569,6 +740,15 @@ def extract(self, doc: DoclingDocument, cfg: MultiPageConfig) -> list[TableMeta]
total = len(doc.tables)
skipped = 0

# Reading-order map: which tables have substantive body content before
# them (used by the merger's intervening-content guard). Computed once.
content_before_map: dict[int, bool] = {}
if cfg.block_on_intervening_content:
try:
content_before_map = _compute_content_before(doc, cfg)
except Exception as e: # never let the guard break extraction
log.warning(f"Intervening-content detection failed: {e}")

for idx, table in enumerate(doc.tables):
try:
df = _grid_to_dataframe(table, doc)
Expand Down Expand Up @@ -640,6 +820,7 @@ def extract(self, doc: DoclingDocument, cfg: MultiPageConfig) -> list[TableMeta]
row_count=df.shape[0],
continuation_content=continuation_content,
is_headerless=is_headerless,
content_before=content_before_map.get(idx),
)
)

Expand Down
16 changes: 16 additions & 0 deletions src/table_stitcher/merger.py
Original file line number Diff line number Diff line change
Expand Up @@ -312,6 +312,11 @@ def should_force_orphan_merge(h: TableMeta, d: TableMeta, cfg: MultiPageConfig)
return False, ""
if abs(h.width - d.width) > cfg.max_width_difference:
return False, ""
# Intervening-content guard — sibling of the one in _classify_sequential_pair.
# Pass 2 reaches this without going through that function, so the guard must
# be repeated here: a heading before the data fragment means a new table.
if cfg.block_on_intervening_content and d.content_before:
return False, "content_between_tables"

layout = layout_suggests_continuation(h, d, cfg)
if h.is_header_orphan and d.is_data_orphan:
Expand Down Expand Up @@ -705,6 +710,17 @@ def _classify_sequential_pair(
if page_gap < 1 or page_gap > cfg.max_page_gap:
return False, "page_gap_out_of_range", False, []

# --- Intervening-content guard ---
# A genuine page-split continuation has nothing but page furniture between
# its fragments. Substantive body content (a heading, paragraph, list item,
# or figure) between tA and tB means they are separate tables that merely
# share a column schema, not one table split across a page break. The
# adapter computes this in reading order (furniture, captions and footnotes
# are already filtered out); ``None`` means position unknown, so we defer to
# the other signals rather than block.
if cfg.block_on_intervening_content and tB.content_before:
return False, "content_between_tables", False, []

# --- Spillover (checked before width guards since spillover can cross
# width boundaries legitimately: 1-col fragment follows N-col table) ---
if is_spillover_fragment(tA, tB, cfg):
Expand Down
20 changes: 20 additions & 0 deletions src/table_stitcher/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,19 @@ class MultiPageConfig:
The structural signal (1 col following N cols) is strong enough for most cases.
"""

# --- Intervening-Content Guard ---
block_on_intervening_content: bool = True
"""
If True, refuse to merge two fragments when substantive body content
(a heading, paragraph, list item, or figure) appears between them in
reading order. A genuine page-split continuation has nothing between its
fragments except page furniture (running headers/footers, page numbers),
which is filtered out, as are table-attached captions/footnotes and
``(continued)`` markers. Requires an adapter that populates
``TableMeta.content_before``; when it is left ``None`` the guard is a
no-op, so non-docling adapters are unaffected.
"""

# --- Cell Stitching ---
stitch_separator: str = "\n"
"""Character(s) used to join split cell content."""
Expand All @@ -118,6 +131,13 @@ class TableMeta:
row_count: int
continuation_content: list[dict] = field(default_factory=list)
is_headerless: bool = False
content_before: Optional[bool] = None
"""
Whether substantive (non-furniture) body content immediately precedes this
fragment in reading order, since the previous table fragment. ``None`` means
the adapter could not place the table in reading order (e.g. an orphan
table), in which case the intervening-content guard is skipped for it.
"""


@dataclass
Expand Down
Loading
Loading