diff --git a/extradocx/src/extradocx/__init__.py b/extradocx/src/extradocx/__init__.py index 4756a411..be2df86e 100644 --- a/extradocx/src/extradocx/__init__.py +++ b/extradocx/src/extradocx/__init__.py @@ -16,9 +16,22 @@ json_str = to_json(doc) # full-fidelity JSON with XPath pointers md_str = to_markdown(doc) # GFM markdown + +Markdown round-trip:: + + from extradocx import DocxParser, to_markdown, parse_markdown, diff + + doc = DocxParser("report.docx").parse() + md = to_markdown(doc) + # ... user edits md ... + edited_doc = parse_markdown(edited_md) + ops = diff(doc, edited_doc) # list of DiffOp """ +from extradocx.docx_apply import apply_ops +from extradocx.md_diff import diff +from extradocx.md_parser import parse_markdown from extradocx.parser import DocxParser from extradocx.serializers import to_json, to_markdown -__all__ = ["DocxParser", "to_json", "to_markdown"] +__all__ = ["DocxParser", "to_json", "to_markdown", "parse_markdown", "diff", "apply_ops"] diff --git a/extradocx/src/extradocx/diff_ops.py b/extradocx/src/extradocx/diff_ops.py new file mode 100644 index 00000000..8a5420dc --- /dev/null +++ b/extradocx/src/extradocx/diff_ops.py @@ -0,0 +1,201 @@ +""" +Diff operation types for markdown AST diffing. + +Each operation references a node in the **base** AST (via its xpath or index +path) and describes how the user intended to edit the markdown. + +The eventual goal (not in scope here) is to project these operations back +onto the original DOCX document. 
# ---------------------------------------------------------------------------
# Block-level operations
# ---------------------------------------------------------------------------


@dataclass
class InsertBlock:
    """Insertion of a block that exists only in the derived AST.

    ``position`` is the block's index in the derived children list and
    ``block`` is the complete derived AST node.

    ``after_xpath`` names the **last base block that precedes the insertion
    point**: the new block belongs immediately after the DOCX element with
    that xpath.  An empty string means "insert at the very beginning of the
    parent container".
    """

    position: int
    block: BlockNode
    after_xpath: str = ""

    def __repr__(self) -> str:
        # Show only the node type; the block itself is not printed.
        return "InsertBlock(position={}, block_type={})".format(
            self.position, type(self.block).__name__
        )


@dataclass
class DeleteBlock:
    """Removal of a block that exists only in the base AST.

    ``base_index`` is the deleted block's index in the base document's
    children list; ``base_xpath`` is its xpath (for traceability back to the
    DOCX).
    """

    base_index: int
    base_xpath: str

    def __repr__(self) -> str:
        return "DeleteBlock(base_index={}, xpath={!r})".format(
            self.base_index, self.base_xpath
        )


@dataclass
class ReplaceHeading:
    """Change of a heading's level and/or inline content.

    ``base_index``/``base_xpath`` locate the base node; ``new_level`` and
    ``new_children`` describe the desired state.
    """

    base_index: int
    base_xpath: str
    old_level: int
    new_level: int
    old_text: str
    new_text: str
    new_children: list[InlineNode] = field(default_factory=list)


@dataclass
class ReplaceParagraph:
    """Change of a paragraph's inline content."""

    base_index: int
    base_xpath: str
    old_text: str
    new_text: str
    new_children: list[InlineNode] = field(default_factory=list)


@dataclass
class ReplaceCodeBlock:
    """Change of a code block's content and/or language."""

    base_index: int
    base_xpath: str
    old_code: str
    new_code: str
    old_language: str
    new_language: str


@dataclass
class ReplaceTable:
    """Change of table content; carries the rows of the derived table."""

    base_index: int
    base_xpath: str
    new_rows: list  # list of TableRow from the derived AST


@dataclass
class ReplaceList:
    """Change of a bullet or ordered list (items added, removed, or edited).

    ``item_ops`` holds the per-item changes and ``new_items`` the complete
    derived items list.
    """

    base_index: int
    base_xpath: str
    list_type: str  # "bullet" or "ordered"
    item_ops: list[ListItemOp] = field(default_factory=list)
    new_items: list = field(default_factory=list)  # list of ListItem


@dataclass
class ReplaceBlockQuote:
    """Change of block quote content, expressed as nested operations."""

    base_index: int
    base_xpath: str
    inner_ops: list[DiffOp] = field(default_factory=list)


# ---------------------------------------------------------------------------
# List-item level operations (nested within ReplaceList)
# ---------------------------------------------------------------------------


@dataclass
class InsertListItem:
    """Insertion of a new list item."""

    position: int
    item: object  # ListItem


@dataclass
class DeleteListItem:
    """Removal of an existing list item."""

    base_item_index: int
    base_xpath: str


@dataclass
class ReplaceListItem:
    """Change of a list item's text."""

    base_item_index: int
    base_xpath: str
    old_text: str
    new_text: str


ListItemOp = Union[InsertListItem, DeleteListItem, ReplaceListItem]


# ---------------------------------------------------------------------------
# Union of all diff operations
# ---------------------------------------------------------------------------

DiffOp = Union[
    InsertBlock,
    DeleteBlock,
    ReplaceHeading,
    ReplaceParagraph,
    ReplaceCodeBlock,
    ReplaceTable,
    ReplaceList,
    ReplaceBlockQuote,
]
# ---------------------------------------------------------------------------
# XML namespace constants
# ---------------------------------------------------------------------------

_W_URI = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
_XML_URI = "http://www.w3.org/XML/1998/namespace"
W = f"{{{_W_URI}}}"
XML = f"{{{_XML_URI}}}"

_NS = {
    "w": _W_URI,
    "r": "http://schemas.openxmlformats.org/officeDocument/2006/relationships",
    "wp": "http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing",
    "a": "http://schemas.openxmlformats.org/drawingml/2006/main",
    "mc": "http://schemas.openxmlformats.org/markup-compatibility/2006",
    "xml": _XML_URI,
}

# Register namespaces so ElementTree serialises them with their usual prefixes
# instead of generated ns0/ns1 ones.
for _pfx, _uri in _NS.items():
    ET.register_namespace(_pfx, _uri)
ET.register_namespace("w14", "http://schemas.microsoft.com/office/word/2010/wordml")
ET.register_namespace("r", "http://schemas.openxmlformats.org/officeDocument/2006/relationships")
ET.register_namespace("wpc", "http://schemas.microsoft.com/office/word/2010/wordprocessingCanvas")
ET.register_namespace("ct", "http://schemas.openxmlformats.org/package/2006/content-types")
ET.register_namespace("dcterms", "http://purl.org/dc/terms/")
ET.register_namespace("dc", "http://purl.org/dc/elements/1.1/")
ET.register_namespace(
    "cp", "http://schemas.openxmlformats.org/package/2006/metadata/core-properties"
)
ET.register_namespace(
    "ep",
    "http://schemas.openxmlformats.org/officeDocument/2006/extended-properties",
)

# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------


def apply_ops(
    docx_path: Union[Path, str],
    ops: list[DiffOp],
    output_path: Union[Path, str],
    base_children: list[BlockNode] | None = None,
) -> None:
    """Apply *ops* to the DOCX at *docx_path*, writing the result to *output_path*.

    Parameters
    ----------
    docx_path:
        Source .docx file.
    ops:
        List of DiffOp from ``md_diff.diff()``.  All xpaths carried by the
        ops are interpreted against the *unmodified* source document.
    output_path:
        Destination .docx file (can equal docx_path for in-place edit; the
        source archive is read completely before anything is written).
    base_children:
        Optional list of BlockNodes from the base Document.  Currently
        unused; accepted for diagnostics and backward compatibility.

    Raises
    ------
    ValueError
        If the archive has no (or an empty) ``word/document.xml``, or the
        document has no ``w:body`` element.
    """
    docx_path = Path(docx_path)
    output_path = Path(output_path)

    # Read every archive member up front so in-place edits are safe.
    with zipfile.ZipFile(docx_path, "r") as zf:
        zip_info_map: dict[str, zipfile.ZipInfo] = {info.filename: info for info in zf.infolist()}
        file_map: dict[str, bytes] = {name: zf.read(name) for name in zf.namelist()}

    doc_xml = file_map.get("word/document.xml", b"")
    if not doc_xml:
        raise ValueError("No word/document.xml found in the DOCX archive")

    root = ET.fromstring(doc_xml)
    body = root.find(f"{W}body")
    if body is None:
        raise ValueError("No w:body element found in word/document.xml")

    _apply_ops_to_tree(root, body, ops)

    # ET emits no declaration for encoding="unicode", so prepend one explicitly.
    # NOTE(review): ElementTree only re-declares namespaces that are used by
    # element/attribute names; attributes that mention prefixes by value
    # (e.g. mc:Ignorable) may need extra care — verify with real documents.
    xml_decl = '<?xml version="1.0" encoding="UTF-8" standalone="yes"?>\n'
    new_doc_xml = ET.tostring(root, encoding="unicode", xml_declaration=False)
    new_doc_bytes = (xml_decl + new_doc_xml).encode("utf-8")

    # Rebuild the archive in memory, swapping in the new document part.
    buf = io.BytesIO()
    with zipfile.ZipFile(buf, "w", zipfile.ZIP_DEFLATED) as out_zf:
        for name, data in file_map.items():
            info = zip_info_map[name]
            new_info = zipfile.ZipInfo(filename=info.filename, date_time=info.date_time)
            new_info.compress_type = zipfile.ZIP_DEFLATED
            payload = new_doc_bytes if name == "word/document.xml" else data
            out_zf.writestr(new_info, payload)

    output_path.write_bytes(buf.getvalue())


def _apply_ops_to_tree(root: ET.Element, body: ET.Element, ops: list[DiffOp]) -> None:
    """Apply *ops* to the parsed tree in an order that keeps base xpaths valid.

    Every xpath refers to the original document, but inserting or removing an
    element shifts the per-tag indices of its later siblings.  Therefore:

    1. Insert anchors and delete targets are resolved to elements *before*
       anything is modified; afterwards they are addressed by identity.
    2. Content-only replaces run first (they never change structure).
    3. ``ReplaceList`` / ``ReplaceBlockQuote`` may add or remove paragraphs,
       so they run from the end of the document towards the start; a change
       then only shifts elements whose ops were already applied.
    4. Inserts run in descending ``position`` order, each placed directly
       after its anchor, so several inserts sharing one anchor end up in
       ascending order.  An insert whose anchor cannot be located is placed
       before the trailing ``w:sectPr`` instead.
    5. Deletes run last so an anchor that is itself deleted is still present
       when the insert that references it is applied.
    """
    inserts = [op for op in ops if isinstance(op, InsertBlock)]
    deletes = [op for op in ops if isinstance(op, DeleteBlock)]
    others = [op for op in ops if not isinstance(op, (InsertBlock, DeleteBlock))]
    nested = [op for op in others if isinstance(op, (ReplaceList, ReplaceBlockQuote))]
    flat = [op for op in others if not isinstance(op, (ReplaceList, ReplaceBlockQuote))]

    # Step 1: resolve against the pristine tree.
    insert_anchors = [
        _find_by_xpath(root, op.after_xpath) if op.after_xpath else None for op in inserts
    ]
    delete_targets = [_find_by_xpath(root, op.base_xpath) for op in deletes]

    # Steps 2 and 3: replaces.
    for op in flat:
        _apply_op(root, body, op)
    for op in sorted(nested, key=lambda nested_op: nested_op.base_index, reverse=True):
        _apply_op(root, body, op)

    # Step 4: inserts.
    unanchored: list[list[ET.Element]] = []
    order = sorted(range(len(inserts)), key=lambda i: inserts[i].position, reverse=True)
    for i in order:
        op, anchor = inserts[i], insert_anchors[i]
        new_elements = _block_to_xml_elements(op.block)
        if not new_elements:
            continue
        if not op.after_xpath:
            # No preceding base block: the new block opens the body.
            for offset, el in enumerate(new_elements):
                body.insert(offset, el)
            continue
        parent = _find_parent(root, anchor) if anchor is not None else None
        if parent is None:
            unanchored.append(new_elements)
            continue
        ref_idx = _body_child_index(parent, anchor)
        for offset, el in enumerate(new_elements):
            parent.insert(ref_idx + 1 + offset, el)
    # Collected in descending order; append in ascending order.
    for new_elements in reversed(unanchored):
        _insert_before_sectpr(body, new_elements)

    # Step 5: deletes.
    for target in delete_targets:
        if target is None:
            continue
        parent = _find_parent(root, target)
        if parent is not None:
            parent.remove(target)


# ---------------------------------------------------------------------------
# Operation dispatch
# ---------------------------------------------------------------------------


def _apply_op(root: ET.Element, body: ET.Element, op: DiffOp) -> None:
    """Dispatch a single DiffOp to the appropriate handler.

    Op types without a handler are ignored.
    """
    if isinstance(op, ReplaceParagraph):
        _apply_replace_paragraph(root, op)
    elif isinstance(op, ReplaceHeading):
        _apply_replace_heading(root, op)
    elif isinstance(op, ReplaceCodeBlock):
        _apply_replace_codeblock(root, op)
    elif isinstance(op, DeleteBlock):
        _apply_delete_block(root, op)
    elif isinstance(op, InsertBlock):
        _apply_insert_block(root, body, op)
    elif isinstance(op, ReplaceTable):
        _apply_replace_table(root, op)
    elif isinstance(op, ReplaceList):
        _apply_replace_list(root, op)
    elif isinstance(op, ReplaceBlockQuote):
        _apply_replace_blockquote(root, op)


# ---------------------------------------------------------------------------
# XPath resolution
# ---------------------------------------------------------------------------

_XPATH_PART_RE = re.compile(r"(\w+):(\w+)\[(\d+)\]")


def _find_by_xpath(root: ET.Element, xpath: str) -> ET.Element | None:
    """Resolve a ``/w:document[1]/...`` XPath starting at the document root.

    The xpath uses per-tag counting: ``w:p[3]`` means the 3rd ``w:p`` child,
    not the 3rd child overall.  The first segment is taken to be *root*
    itself and is not checked.  Returns ``None`` for an empty xpath, a
    malformed segment, an unknown prefix, or a missing element.
    """
    if not xpath:
        return None
    current = root
    for part in xpath.strip("/").split("/")[1:]:
        m = _XPATH_PART_RE.match(part)
        if not m:
            return None
        prefix, local, idx = m.group(1), m.group(2), int(m.group(3))
        tag = f"{{{_NS.get(prefix, '')}}}{local}"
        same_tag = [child for child in current if child.tag == tag]
        if idx < 1 or idx > len(same_tag):
            return None
        current = same_tag[idx - 1]
    return current


def _find_parent(root: ET.Element, target: ET.Element) -> ET.Element | None:
    """Walk the tree under *root* and return the element whose child is *target*."""
    for parent in root.iter():
        if any(child is target for child in parent):
            return parent
    return None


def _body_child_index(body: ET.Element, element: ET.Element) -> int:
    """Return the index of *element* among direct children of *body*, or -1."""
    for i, child in enumerate(body):
        if child is element:
            return i
    return -1
def _apply_replace_paragraph(root: ET.Element, op: ReplaceParagraph) -> None:
    """Replace the inline content of the paragraph at ``op.base_xpath``.

    Does nothing when the xpath cannot be resolved.
    """
    para = _find_by_xpath(root, op.base_xpath)
    if para is None:
        return
    _replace_inline_content(para, op.new_children)


def _apply_replace_heading(root: ET.Element, op: ReplaceHeading) -> None:
    """Update the style (on level change) and inline content of a heading.

    Does nothing when the xpath cannot be resolved.
    """
    para = _find_by_xpath(root, op.base_xpath)
    if para is None:
        return

    # Only touch the style when the level really changed to a valid level.
    if op.old_level != op.new_level and op.new_level > 0:
        _set_para_style(para, f"Heading{op.new_level}")

    _replace_inline_content(para, op.new_children)


def _apply_replace_codeblock(root: ET.Element, op: ReplaceCodeBlock) -> None:
    """Replace the text of a code-block paragraph, keeping its ``w:pPr``.

    A newline inside ``w:t`` is not a line break in WordprocessingML, so the
    new code is written as one run holding a ``w:t`` per line with ``w:br``
    elements between them.  Does nothing when the xpath cannot be resolved.
    """
    para = _find_by_xpath(root, op.base_xpath)
    if para is None:
        return
    # Drop the existing runs (paragraph properties stay), then add the code.
    _replace_inline_content(para, [])
    para.append(_code_text_run(op.new_code))


def _code_text_run(code: str) -> ET.Element:
    """Build a single ``w:r`` holding *code*, one ``w:t`` per line."""
    run = ET.Element(f"{W}r")
    # A single trailing newline would otherwise render as an extra blank line.
    if code.endswith("\n"):
        code = code[:-1]
    for line_no, line in enumerate(code.split("\n")):
        if line_no:
            ET.SubElement(run, f"{W}br")
        t = ET.SubElement(run, f"{W}t")
        t.text = line
        # Leading/trailing whitespace (e.g. indentation) needs xml:space.
        if line != line.strip():
            t.set(f"{XML}space", "preserve")
    return run


def _set_para_style(para: ET.Element, style_id: str) -> None:
    """Set or update the paragraph style in ``w:pPr/w:pStyle``.

    A missing ``w:pPr`` is created as the first child of *para* and a missing
    ``w:pStyle`` as the first child of ``w:pPr``.
    """
    ppr = para.find(f"{W}pPr")
    if ppr is None:
        ppr = ET.Element(f"{W}pPr")
        para.insert(0, ppr)

    pstyle = ppr.find(f"{W}pStyle")
    if pstyle is None:
        pstyle = ET.Element(f"{W}pStyle")
        # pStyle must be the first child of pPr.
        ppr.insert(0, pstyle)

    pstyle.set(f"{W}val", style_id)


def _replace_inline_content(para: ET.Element, inlines: list[InlineNode]) -> None:
    """Remove the run-level children of *para* and append *inlines* instead.

    Removed are direct children tagged ``w:r``, ``w:hyperlink``, ``w:ins``,
    ``w:del``, ``w:bookmarkStart`` and ``w:bookmarkEnd``; everything else
    (notably ``w:pPr``) is preserved.
    """
    run_level_tags = {
        f"{W}r",
        f"{W}hyperlink",
        f"{W}ins",
        f"{W}del",
        f"{W}bookmarkStart",
        f"{W}bookmarkEnd",
    }
    # Iterate over a snapshot: removing while iterating would skip children.
    for child in list(para):
        if child.tag in run_level_tags:
            para.remove(child)

    for run_el in _inlines_to_xml(inlines):
        para.append(run_el)
def _apply_delete_block(root: ET.Element, op: DeleteBlock) -> None:
    """Detach the element addressed by ``op.base_xpath`` from its parent.

    Silently does nothing when the xpath is empty, cannot be resolved, or the
    resolved element has no parent below *root*.
    """
    target = _find_by_xpath(root, op.base_xpath) if op.base_xpath else None
    if target is None:
        return
    parent = _find_parent(root, target)
    if parent is not None:
        parent.remove(target)


# ---------------------------------------------------------------------------
# Insert block
# ---------------------------------------------------------------------------


def _apply_insert_block(root: ET.Element, body: ET.Element, op: InsertBlock) -> None:
    """Insert the XML for ``op.block`` right after the ``op.after_xpath`` element.

    With an empty ``after_xpath`` the new elements go to the very start of
    *body*.  If the anchor (or its parent) cannot be located, they are placed
    at the end of *body*, before a trailing ``w:sectPr``.
    """
    new_elements = _block_to_xml_elements(op.block)
    if not new_elements:
        return

    if not op.after_xpath:
        for offset, el in enumerate(new_elements):
            body.insert(offset, el)
        return

    after_el = _find_by_xpath(root, op.after_xpath)
    parent = _find_parent(root, after_el) if after_el is not None else None
    ref_idx = _body_child_index(parent, after_el) if parent is not None else -1
    if ref_idx == -1:
        # Anchor unusable: fall back to the end of the body.
        _insert_before_sectpr(body, new_elements)
        return

    for slot, el in enumerate(new_elements, start=ref_idx + 1):
        parent.insert(slot, el)


def _insert_before_sectpr(body: ET.Element, elements: list[ET.Element]) -> None:
    """Add *elements* at the end of *body*, ahead of the last ``w:sectPr`` if any."""
    children = list(body)
    sectpr_slots = [i for i, child in enumerate(children) if child.tag == f"{W}sectPr"]
    # Section properties have to stay last, so new content goes before them.
    insert_idx = sectpr_slots[-1] if sectpr_slots else len(children)
    for offset, el in enumerate(elements):
        body.insert(insert_idx + offset, el)
# ---------------------------------------------------------------------------
# Replace table
# ---------------------------------------------------------------------------


def _apply_replace_table(root: ET.Element, op: ReplaceTable) -> None:
    """Update the text of existing table cells from ``op.new_rows``.

    Only rows and cells that already exist are updated; surplus new rows or
    cells are ignored.  For each cell, the inline content of all new
    ``Paragraph``/``Heading`` children is written into the cell's *first*
    paragraph; cells without a paragraph are left alone.
    """
    tbl = _find_by_xpath(root, op.base_xpath)
    if tbl is None:
        return

    existing_rows = [child for child in tbl if child.tag == f"{W}tr"]
    # zip() stops at the shorter side, so no rows/cells are ever added.
    for existing_tr, new_row in zip(existing_rows, op.new_rows):
        existing_cells = [child for child in existing_tr if child.tag == f"{W}tc"]
        for existing_tc, new_cell in zip(existing_cells, new_row.cells):
            cell_paras = [child for child in existing_tc if child.tag == f"{W}p"]
            if not cell_paras:
                continue
            new_inlines: list[InlineNode] = []
            for child_block in new_cell.children:
                if isinstance(child_block, (Paragraph, Heading)):
                    new_inlines.extend(child_block.children)
            _replace_inline_content(cell_paras[0], new_inlines)


# ---------------------------------------------------------------------------
# Replace list
# ---------------------------------------------------------------------------


def _apply_replace_list(root: ET.Element, op: ReplaceList) -> None:
    """Apply per-item ops within a list.

    List items in DOCX are individual ``w:p`` elements; the ``base_xpath`` on
    each item op points to the specific ``w:p``.

    All item xpaths are resolved before the tree is changed, because removing
    or inserting a paragraph shifts the per-tag indices of its later siblings.
    Replacements are applied first, then insertions, then deletions, so the
    paragraph used as the cloning template is still attached while items are
    inserted.

    NOTE(review): inserted items are always placed directly after the
    paragraph at ``op.base_xpath``; ``InsertListItem.position`` is used only
    to keep several inserted items in ascending order.
    """
    replacements: list[tuple[ET.Element, str]] = []
    removals: list[ET.Element] = []
    insertions: list[InsertListItem] = []

    for item_op in op.item_ops:
        if isinstance(item_op, ReplaceListItem):
            para = _find_by_xpath(root, item_op.base_xpath)
            if para is not None:
                replacements.append((para, item_op.new_text))
        elif isinstance(item_op, DeleteListItem):
            # An empty xpath resolves to None and is skipped.
            target = _find_by_xpath(root, item_op.base_xpath)
            if target is not None:
                removals.append(target)
        elif isinstance(item_op, InsertListItem):
            if isinstance(item_op.item, ListItem):
                insertions.append(item_op)

    for para, new_text in replacements:
        _replace_inline_content(para, [TextRun(text=new_text, xpath="")])

    # Each clone lands directly after the template, so insert back-to-front.
    for item_op in sorted(insertions, key=lambda ins: ins.position, reverse=True):
        _insert_list_item(root, op.base_xpath, item_op.item)

    for target in removals:
        parent = _find_parent(root, target)
        if parent is not None:
            parent.remove(target)


def _insert_list_item(
    root: ET.Element,
    list_xpath: str,
    new_item: ListItem,
) -> None:
    """Insert a new list item ``w:p`` by cloning the paragraph at *list_xpath*.

    The clone keeps the template's paragraph properties and receives the
    inline nodes of *new_item*'s ``Paragraph`` children unchanged, so run
    formatting and the original spacing between runs are retained.  The clone
    is inserted immediately after the template.  Does nothing when the
    template or its parent cannot be found.
    """
    list_el = _find_by_xpath(root, list_xpath)
    if list_el is None:
        return
    parent = _find_parent(root, list_el)
    if parent is None:
        return

    template = copy.deepcopy(list_el)
    item_inlines = [
        inline
        for child_block in new_item.children
        if isinstance(child_block, Paragraph)
        for inline in child_block.children
    ]
    _replace_inline_content(template, item_inlines)

    ref_idx = _body_child_index(parent, list_el)
    if ref_idx >= 0:
        parent.insert(ref_idx + 1, template)
def _apply_replace_blockquote(root: ET.Element, op: ReplaceBlockQuote) -> None:
    """Apply each of ``op.inner_ops`` through the normal op dispatcher."""
    body = root.find(f"{W}body")
    if body is None:
        return
    for inner_op in op.inner_ops:
        _apply_op(root, body, inner_op)


# ---------------------------------------------------------------------------
# XML element creation helpers
# ---------------------------------------------------------------------------


def _block_to_xml_elements(block: BlockNode) -> list[ET.Element]:
    """Convert an AST block node to zero or more ``w:p`` elements.

    Tables and unknown block types yield an empty list.
    """
    if isinstance(block, Paragraph):
        return [_make_para_element(block.children, style_id="")]
    if isinstance(block, Heading):
        return [_make_para_element(block.children, style_id=f"Heading{block.level}")]
    if isinstance(block, CodeBlock):
        return [_make_code_para_element(block)]
    if isinstance(block, ThematicBreak):
        # No rule formatting is produced: the break becomes an empty,
        # unstyled paragraph.
        return [_make_para_element([], style_id="")]
    if isinstance(block, BulletList):
        return _make_list_elements(block.items, ordered=False)
    if isinstance(block, OrderedList):
        return _make_list_elements(block.items, ordered=True)
    if isinstance(block, Table):
        # Table insertion is not implemented.
        return []
    if isinstance(block, BlockQuote):
        # Only paragraph children of the quote are emitted.
        return [
            _make_para_element(inner.children, style_id="Quote")
            for inner in block.children
            if isinstance(inner, Paragraph)
        ]
    return []


def _make_para_element(inlines: list[InlineNode], style_id: str) -> ET.Element:
    """Create a ``w:p`` element with the given inline content and style.

    ``w:pPr``/``w:pStyle`` are only added when *style_id* is non-empty.
    """
    para = ET.Element(f"{W}p")

    if style_id:
        ppr = ET.SubElement(para, f"{W}pPr")
        pstyle = ET.SubElement(ppr, f"{W}pStyle")
        pstyle.set(f"{W}val", style_id)

    for run_el in _inlines_to_xml(inlines):
        para.append(run_el)

    return para


def _make_code_para_element(block: CodeBlock) -> ET.Element:
    """Create a ``w:p`` element for a code block with monospace font.

    Each code line becomes its own Courier New run; every run after the first
    starts with a ``w:br`` so the lines stay on separate lines (a run
    boundary alone does not break the line).
    """
    para = ET.Element(f"{W}p")

    ppr = ET.SubElement(para, f"{W}pPr")
    pstyle = ET.SubElement(ppr, f"{W}pStyle")
    pstyle.set(f"{W}val", "Code")

    code = block.code
    # A single trailing newline would otherwise render as an extra blank line.
    if code.endswith("\n"):
        code = code[:-1]

    for line_no, line in enumerate(code.split("\n")):
        run = ET.SubElement(para, f"{W}r")
        rpr = ET.SubElement(run, f"{W}rPr")
        fonts = ET.SubElement(rpr, f"{W}rFonts")
        fonts.set(f"{W}ascii", "Courier New")
        fonts.set(f"{W}hAnsi", "Courier New")
        if line_no:
            ET.SubElement(run, f"{W}br")
        t = ET.SubElement(run, f"{W}t")
        t.text = line
        # Indentation and trailing whitespace need xml:space="preserve".
        if line != line.strip():
            t.set(f"{XML}space", "preserve")

    return para


def _make_list_elements(items: list[ListItem], *, ordered: bool) -> list[ET.Element]:
    """Create one ``w:p`` per list item, styled ``ListNumber`` or ``ListBullet``.

    Only the paragraph style is set; no ``w:numPr`` is written.  The inline
    content of all ``Paragraph`` children of an item is concatenated into
    that item's single paragraph.
    """
    style_id = "ListNumber" if ordered else "ListBullet"
    elements: list[ET.Element] = []
    for item in items:
        inlines: list[InlineNode] = []
        for child in item.children:
            if isinstance(child, Paragraph):
                inlines.extend(child.children)
        elements.append(_make_para_element(inlines, style_id=style_id))
    return elements


def _inlines_to_xml(inlines: list[InlineNode]) -> list[ET.Element]:
    """Convert inline AST nodes to a list of ``w:r`` elements.

    ``TextRun`` becomes a formatted run, ``Link`` a plain-text run built from
    its direct ``TextRun`` children (no relationship is created),
    ``LineBreak`` a run holding a text-wrapping ``w:br``.  Images and any
    other node types are skipped.
    """
    result: list[ET.Element] = []
    for node in inlines:
        if isinstance(node, TextRun):
            result.append(_make_run_element(node))
        elif isinstance(node, Link):
            link_text = "".join(
                child.text for child in node.children if isinstance(child, TextRun)
            )
            if link_text:
                result.append(_make_run_element(TextRun(text=link_text, xpath="")))
        elif isinstance(node, LineBreak):
            run = ET.Element(f"{W}r")
            br = ET.SubElement(run, f"{W}br")
            br.set(f"{W}type", "textWrapping")
            result.append(run)
        # Image: skipped on purpose (cannot be recreated from markdown).
    return result


def _make_run_element(run: TextRun) -> ET.Element:
    """Convert a TextRun AST node to a ``w:r`` XML element.

    ``w:rPr`` is only created when at least one formatting flag is set.  Its
    children are emitted in the order the WordprocessingML schema expects
    (``rFonts``, ``b``, ``i``, ``strike``, ``u``).
    """
    r = ET.Element(f"{W}r")

    if run.bold or run.italic or run.underline or run.strikethrough or run.code:
        rpr = ET.SubElement(r, f"{W}rPr")
        if run.code:
            fonts = ET.SubElement(rpr, f"{W}rFonts")
            fonts.set(f"{W}ascii", "Courier New")
            fonts.set(f"{W}hAnsi", "Courier New")
        if run.bold:
            ET.SubElement(rpr, f"{W}b")
        if run.italic:
            ET.SubElement(rpr, f"{W}i")
        if run.strikethrough:
            ET.SubElement(rpr, f"{W}strike")
        if run.underline:
            u = ET.SubElement(rpr, f"{W}u")
            u.set(f"{W}val", "single")

    t = ET.SubElement(r, f"{W}t")
    t.text = run.text
    # xml:space="preserve" is needed when text starts/ends with any whitespace
    # (spaces, tabs, ...), otherwise that whitespace may be dropped.
    if run.text and run.text != run.text.strip():
        t.set(f"{XML}space", "preserve")

    return r
# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------


def diff(base: Document, derived: Document) -> list[DiffOp]:
    """Compute the edit operations that turn ``base`` into ``derived``.

    ``base`` is the original AST (from DOCX, with xpaths); ``derived`` is the
    AST parsed from the user-edited markdown.  The top-level blocks are
    aligned first and the alignment is then converted into ``DiffOp`` items.
    """
    old_blocks = base.children
    new_blocks = derived.children
    return _alignment_to_ops(old_blocks, new_blocks, _align_blocks(old_blocks, new_blocks))


# ---------------------------------------------------------------------------
# Block alignment (DP)
# ---------------------------------------------------------------------------

# Cost constants
_PARA_COST_PER_CHAR = 2.0
_TABLE_CELL_COST = 10.0
_FIXED_COST = 20.0
_MIN_SIMILARITY = 0.3


@dataclass
class _BlockAlignment:
    """Outcome of aligning a base block sequence with a derived one."""

    matches: list[tuple[int, int]]  # (base_idx, derived_idx) pairs
    base_deletes: list[int]  # base indices with no derived match
    derived_inserts: list[int]  # derived indices with no base match


def _align_blocks(base: list[BlockNode], derived: list[BlockNode]) -> _BlockAlignment:
    """Align two block sequences with a minimum-cost dynamic program.

    ``cost[r][c]`` is the cheapest way to align the first ``r`` base blocks
    with the first ``c`` derived blocks.  On equal cost a match wins over a
    delete, and a delete wins over an insert.
    """
    MATCH, DELETE, INSERT = 0, 1, 2
    n_base, n_derived = len(base), len(derived)

    cost = [[math.inf] * (n_derived + 1) for _ in range(n_base + 1)]
    step = [[MATCH] * (n_derived + 1) for _ in range(n_base + 1)]

    # Borders: only deletes down the first column, only inserts along the
    # first row.
    cost[0][0] = 0.0
    for r, old in enumerate(base, start=1):
        cost[r][0] = cost[r - 1][0] + _delete_cost(old)
        step[r][0] = DELETE
    for c, new in enumerate(derived, start=1):
        cost[0][c] = cost[0][c - 1] + _insert_cost(new)
        step[0][c] = INSERT

    for r, old in enumerate(base, start=1):
        for c, new in enumerate(derived, start=1):
            drop = cost[r - 1][c] + _delete_cost(old)
            add = cost[r][c - 1] + _insert_cost(new)
            keep = math.inf
            if _matchable(old, new):
                keep = cost[r - 1][c - 1] + _edit_cost(old, new)

            if keep <= drop and keep <= add:
                cost[r][c], step[r][c] = keep, MATCH
            elif drop <= add:
                cost[r][c], step[r][c] = drop, DELETE
            else:
                cost[r][c], step[r][c] = add, INSERT

    # Walk back from the bottom-right corner; results come out back-to-front.
    pairs: list[tuple[int, int]] = []
    dropped: list[int] = []
    added: list[int] = []
    r, c = n_base, n_derived
    while r > 0 or c > 0:
        move = step[r][c]
        if r > 0 and c > 0 and move == MATCH:
            r, c = r - 1, c - 1
            pairs.append((r, c))
        elif r > 0 and move == DELETE:
            r -= 1
            dropped.append(r)
        else:
            c -= 1
            added.append(c)

    return _BlockAlignment(
        matches=pairs[::-1],
        base_deletes=dropped[::-1],
        derived_inserts=added[::-1],
    )


# ---------------------------------------------------------------------------
# Cost functions
# ---------------------------------------------------------------------------


def _block_text(block: BlockNode) -> str:
    """Return the plain text of *block*, used for similarity comparison.

    Container blocks (lists, tables, block quotes) join the text of their
    child blocks with single spaces; a thematic break yields ``"---"`` and
    unknown block types yield an empty string.
    """
    if isinstance(block, (Paragraph, Heading)):
        return _inlines_text(block.children)
    if isinstance(block, CodeBlock):
        return block.code
    if isinstance(block, (BulletList, OrderedList)):
        return " ".join(
            _block_text(child) for item in block.items for child in item.children
        )
    if isinstance(block, Table):
        return " ".join(
            _block_text(child)
            for row in block.rows
            for cell in row.cells
            for child in cell.children
        )
    if isinstance(block, BlockQuote):
        return " ".join(_block_text(child) for child in block.children)
    if isinstance(block, ThematicBreak):
        return "---"
    return ""


def _inlines_text(inlines: list[InlineNode]) -> str:
    """Concatenate the text of inline nodes, descending into ``children``.

    Nodes that are neither a ``TextRun`` nor have a ``children`` attribute
    contribute nothing.
    """

    def text_of(node: InlineNode) -> str:
        if isinstance(node, TextRun):
            return node.text
        if hasattr(node, "children"):
            return _inlines_text(node.children)
        return ""

    return "".join(text_of(node) for node in inlines)
tokens_a and not tokens_b: + return 1.0 + if not tokens_a or not tokens_b: + return 0.0 + intersection = tokens_a & tokens_b + union = tokens_a | tokens_b + return len(intersection) / len(union) + + +def _block_kind(block: BlockNode) -> str: + """Return a coarse kind string for matchability gating.""" + if isinstance(block, Heading): + return "heading" + elif isinstance(block, Paragraph): + return "paragraph" + elif isinstance(block, CodeBlock): + return "code_block" + elif isinstance(block, BulletList): + return "bullet_list" + elif isinstance(block, OrderedList): + return "ordered_list" + elif isinstance(block, Table): + return "table" + elif isinstance(block, BlockQuote): + return "block_quote" + elif isinstance(block, ThematicBreak): + return "thematic_break" + return "other" + + +def _matchable(base: BlockNode, derived: BlockNode) -> bool: + """Can these two blocks be matched (same kind + sufficient similarity)?""" + bk = _block_kind(base) + dk = _block_kind(derived) + + # Headings and paragraphs can cross-match (a heading can become a paragraph + # and vice versa) — but with a higher cost. 
+ text_kinds = {"heading", "paragraph"} + if bk in text_kinds and dk in text_kinds: + sim = _word_jaccard(_block_text(base), _block_text(derived)) + return sim >= _MIN_SIMILARITY + + if bk != dk: + return False + + if bk == "thematic_break": + return True + + sim = _word_jaccard(_block_text(base), _block_text(derived)) + return sim >= _MIN_SIMILARITY + + +def _delete_cost(block: BlockNode) -> float: + text = _block_text(block) + if isinstance(block, Table): + n_cells = sum(len(r.cells) for r in block.rows) + return n_cells * _TABLE_CELL_COST + if isinstance(block, ThematicBreak): + return _FIXED_COST + return max(len(text) * _PARA_COST_PER_CHAR, _FIXED_COST) + + +def _insert_cost(block: BlockNode) -> float: + return _delete_cost(block) + + +def _edit_cost(base: BlockNode, derived: BlockNode) -> float: + """Estimated cost of transforming base into derived.""" + text_b = _block_text(base) + text_d = _block_text(derived) + + # Exact match — zero cost + if text_b == text_d: + # But check structural properties too + if isinstance(base, Heading) and isinstance(derived, Heading): + if base.level != derived.level: + return 1.0 # tiny cost for level change + return 0.0 + if type(base) is type(derived): + return 0.0 + return 1.0 # kind change but same text (e.g. 
paragraph ↔ heading) + + sim = _word_jaccard(text_b, text_d) + max_len = max(len(text_b), len(text_d), 1) + return (1.0 - sim) * max_len + + +# --------------------------------------------------------------------------- +# Convert alignment to operations +# --------------------------------------------------------------------------- + + +def _alignment_to_ops( + base: list[BlockNode], + derived: list[BlockNode], + alignment: _BlockAlignment, +) -> list[DiffOp]: + """Convert a block alignment into a list of DiffOp.""" + ops: list[DiffOp] = [] + + # Deletions (iterate in reverse index order so positions are stable) + for bi in reversed(alignment.base_deletes): + ops.append( + DeleteBlock( + base_index=bi, + base_xpath=getattr(base[bi], "xpath", ""), + ) + ) + + # Build a lookup: derived_index → xpath of the last base element before it. + # Sorted by derived index so we can scan forward. + _sorted_matches = sorted(alignment.matches, key=lambda m: m[1]) # sort by derived idx + + def _after_xpath_for(di: int) -> str: + """Return the xpath of the last matched base element with derived_idx < di.""" + prior = "" + for bi, mdj in _sorted_matches: + if mdj < di: + prior = getattr(base[bi], "xpath", "") + else: + break + return prior + + # Insertions + for di in alignment.derived_inserts: + ops.append( + InsertBlock(position=di, block=derived[di], after_xpath=_after_xpath_for(di)) + ) + + # Matched pairs — emit replace ops only if content changed + for bi, di in alignment.matches: + block_ops = _diff_matched_blocks(base[bi], derived[di], bi) + ops.extend(block_ops) + + # Sort: deletes first (reversed), then replaces/inserts by position + # This gives a predictable ordering for consumers. 
+ def _sort_key(op: DiffOp) -> tuple[int, int]: + if isinstance(op, DeleteBlock): + return (0, op.base_index) + if isinstance(op, InsertBlock): + return (2, op.position) + # Replace ops + idx = getattr(op, "base_index", 0) + return (1, idx) + + ops.sort(key=_sort_key) + return ops + + +def _diff_matched_blocks(base: BlockNode, derived: BlockNode, base_index: int) -> list[DiffOp]: + """Diff a matched pair of blocks. Returns empty list if identical.""" + # Heading + if isinstance(base, Heading) and isinstance(derived, Heading): + return _diff_heading(base, derived, base_index) + + # Heading ↔ Paragraph (kind change) + if isinstance(base, Heading) and isinstance(derived, Paragraph): + new_text = _inlines_text(derived.children) + old_text = _inlines_text(base.children) + if old_text == new_text: + return [] + return [ + ReplaceParagraph( + base_index=base_index, + base_xpath=base.xpath, + old_text=old_text, + new_text=new_text, + new_children=derived.children, + ) + ] + + if isinstance(base, Paragraph) and isinstance(derived, Heading): + old_text = _inlines_text(base.children) + new_text = _inlines_text(derived.children) + return [ + ReplaceHeading( + base_index=base_index, + base_xpath=base.xpath, + old_level=0, + new_level=derived.level, + old_text=old_text, + new_text=new_text, + new_children=derived.children, + ) + ] + + # Paragraph + if isinstance(base, Paragraph) and isinstance(derived, Paragraph): + return _diff_paragraph(base, derived, base_index) + + # CodeBlock + if isinstance(base, CodeBlock) and isinstance(derived, CodeBlock): + return _diff_codeblock(base, derived, base_index) + + # Table + if isinstance(base, Table) and isinstance(derived, Table): + return _diff_table(base, derived, base_index) + + # Lists + if isinstance(base, BulletList) and isinstance(derived, BulletList): + return _diff_list(base.items, derived.items, base_index, base.xpath, "bullet") + if isinstance(base, OrderedList) and isinstance(derived, OrderedList): + return 
_diff_list(base.items, derived.items, base_index, base.xpath, "ordered") + + # BlockQuote + if isinstance(base, BlockQuote) and isinstance(derived, BlockQuote): + return _diff_blockquote(base, derived, base_index) + + # ThematicBreak — no content to diff + if isinstance(base, ThematicBreak) and isinstance(derived, ThematicBreak): + return [] + + return [] + + +# --------------------------------------------------------------------------- +# Per-block diff helpers +# --------------------------------------------------------------------------- + + +def _diff_heading(base: Heading, derived: Heading, base_index: int) -> list[DiffOp]: + old_text = _inlines_text(base.children) + new_text = _inlines_text(derived.children) + if base.level == derived.level and old_text == new_text: + # Check inline formatting too + if _inlines_equal(base.children, derived.children): + return [] + return [ + ReplaceHeading( + base_index=base_index, + base_xpath=base.xpath, + old_level=base.level, + new_level=derived.level, + old_text=old_text, + new_text=new_text, + new_children=derived.children, + ) + ] + + +def _diff_paragraph(base: Paragraph, derived: Paragraph, base_index: int) -> list[DiffOp]: + old_text = _inlines_text(base.children) + new_text = _inlines_text(derived.children) + if old_text == new_text and _inlines_equal(base.children, derived.children): + return [] + return [ + ReplaceParagraph( + base_index=base_index, + base_xpath=base.xpath, + old_text=old_text, + new_text=new_text, + new_children=derived.children, + ) + ] + + +def _diff_codeblock(base: CodeBlock, derived: CodeBlock, base_index: int) -> list[DiffOp]: + if base.code == derived.code and base.language == derived.language: + return [] + return [ + ReplaceCodeBlock( + base_index=base_index, + base_xpath=base.xpath, + old_code=base.code, + new_code=derived.code, + old_language=base.language, + new_language=derived.language, + ) + ] + + +def _diff_table(base: Table, derived: Table, base_index: int) -> list[DiffOp]: + # 
Compare cell text grids + def _cell_grid(tbl: Table) -> list[list[str]]: + grid = [] + for row in tbl.rows: + row_texts = [] + for cell in row.cells: + text = " ".join(_block_text(c) for c in cell.children) + row_texts.append(text) + grid.append(row_texts) + return grid + + bg = _cell_grid(base) + dg = _cell_grid(derived) + if bg == dg: + return [] + + return [ + ReplaceTable( + base_index=base_index, + base_xpath=base.xpath, + new_rows=derived.rows, + ) + ] + + +def _diff_list( + base_items: list[ListItem], + derived_items: list[ListItem], + base_index: int, + base_xpath: str, + list_type: str, +) -> list[DiffOp]: + """Diff two lists using item-level DP alignment.""" + alignment = _align_list_items(base_items, derived_items) + + # Check if anything actually changed + if ( + not alignment.base_deletes + and not alignment.derived_inserts + and all( + _item_text(base_items[bi]) == _item_text(derived_items[di]) + for bi, di in alignment.matches + ) + ): + return [] + + item_ops: list[ListItemOp] = [] + + for bi in reversed(alignment.base_deletes): + item_ops.append( + DeleteListItem( + base_item_index=bi, + base_xpath=base_items[bi].xpath, + ) + ) + + for di in alignment.derived_inserts: + item_ops.append(InsertListItem(position=di, item=derived_items[di])) + + for bi, di in alignment.matches: + old_text = _item_text(base_items[bi]) + new_text = _item_text(derived_items[di]) + if old_text != new_text: + item_ops.append( + ReplaceListItem( + base_item_index=bi, + base_xpath=base_items[bi].xpath, + old_text=old_text, + new_text=new_text, + ) + ) + + if not item_ops: + return [] + + return [ + ReplaceList( + base_index=base_index, + base_xpath=base_xpath, + list_type=list_type, + item_ops=item_ops, + new_items=derived_items, + ) + ] + + +def _item_text(item: ListItem) -> str: + parts = [] + for child in item.children: + parts.append(_block_text(child)) + return " ".join(parts) + + +def _align_list_items(base: list[ListItem], derived: list[ListItem]) -> _BlockAlignment: + 
"""Simple DP alignment for list items (same algorithm as blocks).""" + m = len(base) + n = len(derived) + INF = math.inf + + dp = [[INF] * (n + 1) for _ in range(m + 1)] + choice = [[0] * (n + 1) for _ in range(m + 1)] + dp[0][0] = 0.0 + + for i in range(1, m + 1): + dp[i][0] = dp[i - 1][0] + _FIXED_COST + choice[i][0] = 1 + for j in range(1, n + 1): + dp[0][j] = dp[0][j - 1] + _FIXED_COST + choice[0][j] = 2 + + for i in range(1, m + 1): + for j in range(1, n + 1): + bt = _item_text(base[i - 1]) + dt = _item_text(derived[j - 1]) + sim = _word_jaccard(bt, dt) + + del_cost = dp[i - 1][j] + _FIXED_COST + ins_cost = dp[i][j - 1] + _FIXED_COST + match_cost = INF + if sim >= _MIN_SIMILARITY: + if bt == dt: + match_cost = dp[i - 1][j - 1] + else: + match_cost = dp[i - 1][j - 1] + (1.0 - sim) * max(len(bt), len(dt), 1) + + best = min(match_cost, del_cost, ins_cost) + dp[i][j] = best + if best == match_cost: + choice[i][j] = 0 + elif best == del_cost: + choice[i][j] = 1 + else: + choice[i][j] = 2 + + matches: list[tuple[int, int]] = [] + base_deletes: list[int] = [] + derived_inserts: list[int] = [] + i, j = m, n + while i > 0 or j > 0: + if i > 0 and j > 0 and choice[i][j] == 0: + matches.append((i - 1, j - 1)) + i -= 1 + j -= 1 + elif i > 0 and choice[i][j] == 1: + base_deletes.append(i - 1) + i -= 1 + else: + derived_inserts.append(j - 1) + j -= 1 + + matches.reverse() + base_deletes.reverse() + derived_inserts.reverse() + + return _BlockAlignment( + matches=matches, + base_deletes=base_deletes, + derived_inserts=derived_inserts, + ) + + +def _diff_blockquote(base: BlockQuote, derived: BlockQuote, base_index: int) -> list[DiffOp]: + """Recursively diff block quote contents.""" + inner_alignment = _align_blocks(base.children, derived.children) + inner_ops = _alignment_to_ops(base.children, derived.children, inner_alignment) + if not inner_ops: + return [] + return [ + ReplaceBlockQuote( + base_index=base_index, + base_xpath=base.xpath, + inner_ops=inner_ops, + ) + ] + + 
+# --------------------------------------------------------------------------- +# Inline comparison +# --------------------------------------------------------------------------- + + +def _inlines_equal(a: list[InlineNode], b: list[InlineNode]) -> bool: + """Check if two inline node lists are structurally equal (ignoring xpath).""" + if len(a) != len(b): + return False + for x, y in zip(a, b): + if type(x) is not type(y): + return False + if isinstance(x, TextRun) and isinstance(y, TextRun): + if ( + x.text != y.text + or x.bold != y.bold + or x.italic != y.italic + or x.underline != y.underline + or x.strikethrough != y.strikethrough + or x.code != y.code + or x.superscript != y.superscript + or x.subscript != y.subscript + ): + return False + elif hasattr(x, "children") and hasattr(y, "children"): + if not _inlines_equal(x.children, y.children): + return False + return True diff --git a/extradocx/src/extradocx/md_parser.py b/extradocx/src/extradocx/md_parser.py new file mode 100644 index 00000000..e03bfaf1 --- /dev/null +++ b/extradocx/src/extradocx/md_parser.py @@ -0,0 +1,407 @@ +""" +GFM Markdown → AST parser. + +Parses GFM markdown text back into the same AST node types produced by the +DOCX parser (`ast_nodes.py`). Nodes created here carry **no** xpath — the +xpath field is left empty because these nodes originate from markdown, not +from a DOCX XML tree. + +The parser is deliberately simple: it handles the GFM subset that the +markdown serializer can produce (ATX headings, emphasis, strong, strikethrough, +code spans, fenced code blocks, bullet/ordered lists, pipe tables, block +quotes, thematic breaks, links, images). 
+ +Public API: + + parse_markdown(text: str) -> Document +""" + +from __future__ import annotations + +import re + +from extradocx.ast_nodes import ( + BlockNode, + BlockQuote, + BulletList, + CodeBlock, + Document, + Heading, + Image, + InlineNode, + LineBreak, + Link, + ListItem, + OrderedList, + Paragraph, + Table, + TableCell, + TableRow, + TextRun, + ThematicBreak, +) + +# --------------------------------------------------------------------------- +# Public API +# --------------------------------------------------------------------------- + + +def parse_markdown(text: str) -> Document: + """Parse a GFM markdown string into a Document AST.""" + lines = text.split("\n") + children = _parse_blocks(lines, 0, len(lines)) + return Document(children=children) + + +# --------------------------------------------------------------------------- +# Block-level parsing +# --------------------------------------------------------------------------- + +# Patterns +_ATX_HEADING_RE = re.compile(r"^(#{1,6})\s+(.*?)(?:\s+#+\s*)?$") +_THEMATIC_BREAK_RE = re.compile(r"^(?:---|\*\*\*|___)\s*$") +_FENCE_RE = re.compile(r"^(`{3,}|~{3,})(.*)$") +_BULLET_RE = re.compile(r"^(\s*)[-*+]\s+(.*)") +_ORDERED_RE = re.compile(r"^(\s*)(\d+)\.\s+(.*)") +_BLOCKQUOTE_RE = re.compile(r"^>\s?(.*)") +_TABLE_SEP_RE = re.compile(r"^\|[\s\-:|]+\|$") +_TABLE_ROW_RE = re.compile(r"^\|(.+)\|$") + + +def _parse_blocks(lines: list[str], start: int, end: int) -> list[BlockNode]: + """Parse lines[start:end] into a list of block nodes.""" + blocks: list[BlockNode] = [] + i = start + while i < end: + line = lines[i] + + # Blank line — skip + if not line.strip(): + i += 1 + continue + + # Thematic break + if _THEMATIC_BREAK_RE.match(line): + blocks.append(ThematicBreak()) + i += 1 + continue + + # ATX heading + m = _ATX_HEADING_RE.match(line) + if m: + level = len(m.group(1)) + inlines = _parse_inlines(m.group(2)) + blocks.append(Heading(level=level, children=inlines)) + i += 1 + continue + + # Fenced code block + 
m = _FENCE_RE.match(line) + if m: + fence_char = m.group(1)[0] + fence_len = len(m.group(1)) + language = m.group(2).strip() + code_lines: list[str] = [] + i += 1 + while i < end: + close_m = re.match(rf"^{re.escape(fence_char)}{{{fence_len},}}$", lines[i]) + if close_m: + i += 1 + break + code_lines.append(lines[i]) + i += 1 + blocks.append(CodeBlock(code="\n".join(code_lines), language=language)) + continue + + # Block quote + if _BLOCKQUOTE_RE.match(line): + bq_lines: list[str] = [] + while i < end: + bq_m = _BLOCKQUOTE_RE.match(lines[i]) + if bq_m: + bq_lines.append(bq_m.group(1)) + i += 1 + else: + break + inner = _parse_blocks(bq_lines, 0, len(bq_lines)) + blocks.append(BlockQuote(children=inner)) + continue + + # Bullet list + if _BULLET_RE.match(line): + items, i = _parse_list_items(lines, i, end, ordered=False) + blocks.append(BulletList(items=items)) + continue + + # Ordered list + if _ORDERED_RE.match(line): + items, i = _parse_list_items(lines, i, end, ordered=True) + # Extract start number from the first item + m_start = _ORDERED_RE.match(line) + start_num = int(m_start.group(2)) if m_start else 1 + blocks.append(OrderedList(items=items, start=start_num)) + continue + + # Table (pipe table) + if _TABLE_ROW_RE.match(line): + tbl, i = _parse_table(lines, i, end) + if tbl is not None: + blocks.append(tbl) + else: + # Not a valid table — treat as paragraph + inlines = _parse_inlines(line) + if inlines: + blocks.append(Paragraph(children=inlines)) + i += 1 + continue + + # Plain paragraph + para_lines: list[str] = [] + while i < end and lines[i].strip(): + # Stop at block-level constructs + if _ATX_HEADING_RE.match(lines[i]): + break + if _THEMATIC_BREAK_RE.match(lines[i]): + break + if _FENCE_RE.match(lines[i]): + break + if _BLOCKQUOTE_RE.match(lines[i]): + break + if _BULLET_RE.match(lines[i]): + break + if _ORDERED_RE.match(lines[i]): + break + if _TABLE_ROW_RE.match(lines[i]): + break + para_lines.append(lines[i]) + i += 1 + if para_lines: + 
text_content = " ".join(para_lines) + inlines = _parse_inlines(text_content) + if inlines: + blocks.append(Paragraph(children=inlines)) + + return blocks + + +def _parse_list_items( + lines: list[str], start: int, end: int, *, ordered: bool +) -> tuple[list[ListItem], int]: + """Parse consecutive list items. Returns (items, next_line_index).""" + items: list[ListItem] = [] + pattern = _ORDERED_RE if ordered else _BULLET_RE + i = start + + while i < end: + m = pattern.match(lines[i]) + if not m: + break + + indent = len(m.group(1)) + depth = indent // 2 + if ordered: + first_line_text = m.group(3) + else: + first_line_text = m.group(2) + + # Collect continuation lines for this item + item_lines = [first_line_text] + i += 1 + # Continuation lines are indented more than the bullet + while i < end and lines[i].strip(): + # Check if next line is a new list item at same or lower depth + next_m = pattern.match(lines[i]) + if next_m: + break + # Check for other bullet type starting a new list + other_pattern = _BULLET_RE if ordered else _ORDERED_RE + if other_pattern.match(lines[i]): + break + item_lines.append(lines[i].strip()) + i += 1 + + # Parse the item content as blocks + item_text = " ".join(item_lines) + children: list[BlockNode] = [] + if item_text: + inlines = _parse_inlines(item_text) + if inlines: + children.append(Paragraph(children=inlines)) + items.append(ListItem(children=children, depth=depth)) + + return items, i + + +def _parse_table(lines: list[str], start: int, end: int) -> tuple[Table | None, int]: + """Parse a GFM pipe table starting at `start`. 
Returns (Table, next_line) or (None, start).""" + # Need at least header row + separator + if start + 1 >= end: + return None, start + + header_line = lines[start] + sep_line = lines[start + 1] + + if not _TABLE_ROW_RE.match(header_line): + return None, start + if not _TABLE_SEP_RE.match(sep_line): + return None, start + + rows: list[TableRow] = [] + + # Parse header row + header_cells = _split_table_row(header_line) + header_row = TableRow( + cells=[ + TableCell(children=[Paragraph(children=_parse_inlines(c))], is_header=True) + for c in header_cells + ], + is_header=True, + ) + rows.append(header_row) + + # Parse data rows + i = start + 2 + while i < end: + if not _TABLE_ROW_RE.match(lines[i]): + break + cell_texts = _split_table_row(lines[i]) + data_row = TableRow( + cells=[TableCell(children=[Paragraph(children=_parse_inlines(c))]) for c in cell_texts], + ) + rows.append(data_row) + i += 1 + + return Table(rows=rows), i + + +def _split_table_row(line: str) -> list[str]: + """Split a pipe-table row into cell text strings.""" + # Strip outer pipes and split + inner = line.strip() + if inner.startswith("|"): + inner = inner[1:] + if inner.endswith("|"): + inner = inner[:-1] + # Split on unescaped pipes + parts: list[str] = [] + current: list[str] = [] + escaped = False + for ch in inner: + if escaped: + current.append(ch) + escaped = False + elif ch == "\\": + escaped = True + current.append(ch) + elif ch == "|": + parts.append("".join(current).strip()) + current = [] + else: + current.append(ch) + parts.append("".join(current).strip()) + return parts + + +# --------------------------------------------------------------------------- +# Inline-level parsing +# --------------------------------------------------------------------------- + +# Inline patterns — order matters for greedy matching +_INLINE_PATTERNS: list[tuple[str, re.Pattern[str]]] = [ + # Image must come before link (![...] 
vs [...]) + ("image", re.compile(r"!\[([^\]]*)\]\(([^)]+)\)")), + # Link + ("link", re.compile(r"\[([^\]]*)\]\(([^)]*?)(?:\s+\"([^\"]*)\")?\)")), + # Code span (double backtick) + ("code2", re.compile(r"``\s(.+?)\s``")), + # Code span (single backtick) + ("code1", re.compile(r"`([^`]+)`")), + # Bold + italic + ("bold_italic", re.compile(r"\*\*\*(.+?)\*\*\*")), + # Bold + ("bold", re.compile(r"\*\*(.+?)\*\*")), + # Strikethrough + ("strike", re.compile(r"~~(.+?)~~")), + # Italic + ("italic", re.compile(r"\*(.+?)\*")), + # Hard line break (two spaces + newline) — rare in single-line context + ("linebreak", re.compile(r" \n")), +] + + +def _parse_inlines(text: str) -> list[InlineNode]: + """Parse inline markdown into a list of InlineNode.""" + if not text: + return [] + return _parse_inlines_recursive(text) + + +def _parse_inlines_recursive(text: str) -> list[InlineNode]: + """Recursively parse inline elements, finding the earliest match.""" + if not text: + return [] + + # Find the earliest matching pattern + best_match = None + best_kind = "" + best_start = len(text) + + for kind, pattern in _INLINE_PATTERNS: + m = pattern.search(text) + if m and m.start() < best_start: + best_match = m + best_kind = kind + best_start = m.start() + + if best_match is None: + # No inline markup — everything is plain text + return [TextRun(text=_unescape(text), xpath="")] if text else [] + + result: list[InlineNode] = [] + + # Text before the match + before = text[: best_match.start()] + if before: + result.append(TextRun(text=_unescape(before), xpath="")) + + # The matched element + if best_kind == "image": + result.append(Image(alt=best_match.group(1), src=best_match.group(2))) + elif best_kind == "link": + link_text = best_match.group(1) + href = best_match.group(2) + title = best_match.group(3) or "" + children = _parse_inlines_recursive(link_text) + result.append(Link(href=href, title=title, children=children)) + elif best_kind in ("code1", "code2"): + 
result.append(TextRun(text=best_match.group(1), xpath="", code=True)) + elif best_kind == "bold_italic": + inner = _unescape(best_match.group(1)) + result.append(TextRun(text=inner, xpath="", bold=True, italic=True)) + elif best_kind == "bold": + inner = _unescape(best_match.group(1)) + result.append(TextRun(text=inner, xpath="", bold=True)) + elif best_kind == "strike": + inner = _unescape(best_match.group(1)) + result.append(TextRun(text=inner, xpath="", strikethrough=True)) + elif best_kind == "italic": + inner = _unescape(best_match.group(1)) + result.append(TextRun(text=inner, xpath="", italic=True)) + elif best_kind == "linebreak": + result.append(LineBreak()) + + # Text after the match + after = text[best_match.end() :] + if after: + result.extend(_parse_inlines_recursive(after)) + + return result + + +# GFM escape sequences +_UNESCAPE_RE = re.compile(r"\\([\\`*_{}\[\]()|])") + + +def _unescape(text: str) -> str: + """Remove GFM backslash escapes.""" + return _UNESCAPE_RE.sub(r"\1", text) diff --git a/extradocx/testdata/e2e_fixtures/BEFORE_test_report.docx b/extradocx/testdata/e2e_fixtures/BEFORE_test_report.docx new file mode 100644 index 00000000..8b0cdb69 Binary files /dev/null and b/extradocx/testdata/e2e_fixtures/BEFORE_test_report.docx differ diff --git a/extradocx/testdata/e2e_fixtures/add_bold_formatting.docx b/extradocx/testdata/e2e_fixtures/add_bold_formatting.docx new file mode 100644 index 00000000..70691fff Binary files /dev/null and b/extradocx/testdata/e2e_fixtures/add_bold_formatting.docx differ diff --git a/extradocx/testdata/e2e_fixtures/add_inline_code.docx b/extradocx/testdata/e2e_fixtures/add_inline_code.docx new file mode 100644 index 00000000..70691fff Binary files /dev/null and b/extradocx/testdata/e2e_fixtures/add_inline_code.docx differ diff --git a/extradocx/testdata/e2e_fixtures/add_italic_in_heading.docx b/extradocx/testdata/e2e_fixtures/add_italic_in_heading.docx new file mode 100644 index 00000000..d7c3e802 Binary files 
/dev/null and b/extradocx/testdata/e2e_fixtures/add_italic_in_heading.docx differ diff --git a/extradocx/testdata/e2e_fixtures/add_link.docx b/extradocx/testdata/e2e_fixtures/add_link.docx new file mode 100644 index 00000000..70691fff Binary files /dev/null and b/extradocx/testdata/e2e_fixtures/add_link.docx differ diff --git a/extradocx/testdata/e2e_fixtures/add_strikethrough.docx b/extradocx/testdata/e2e_fixtures/add_strikethrough.docx new file mode 100644 index 00000000..85c1743f Binary files /dev/null and b/extradocx/testdata/e2e_fixtures/add_strikethrough.docx differ diff --git a/extradocx/testdata/e2e_fixtures/bold_italic_paragraph_edit.docx b/extradocx/testdata/e2e_fixtures/bold_italic_paragraph_edit.docx new file mode 100644 index 00000000..102d9268 Binary files /dev/null and b/extradocx/testdata/e2e_fixtures/bold_italic_paragraph_edit.docx differ diff --git a/extradocx/testdata/e2e_fixtures/bullet_list_item_text_change.docx b/extradocx/testdata/e2e_fixtures/bullet_list_item_text_change.docx new file mode 100644 index 00000000..ab123bcc Binary files /dev/null and b/extradocx/testdata/e2e_fixtures/bullet_list_item_text_change.docx differ diff --git a/extradocx/testdata/e2e_fixtures/chapter_restructure.docx b/extradocx/testdata/e2e_fixtures/chapter_restructure.docx new file mode 100644 index 00000000..694a94b3 Binary files /dev/null and b/extradocx/testdata/e2e_fixtures/chapter_restructure.docx differ diff --git a/extradocx/testdata/e2e_fixtures/delete_h3_heading.docx b/extradocx/testdata/e2e_fixtures/delete_h3_heading.docx new file mode 100644 index 00000000..a5b62936 Binary files /dev/null and b/extradocx/testdata/e2e_fixtures/delete_h3_heading.docx differ diff --git a/extradocx/testdata/e2e_fixtures/delete_list_item.docx b/extradocx/testdata/e2e_fixtures/delete_list_item.docx new file mode 100644 index 00000000..b2fd43a6 Binary files /dev/null and b/extradocx/testdata/e2e_fixtures/delete_list_item.docx differ diff --git 
a/extradocx/testdata/e2e_fixtures/delete_paragraph.docx b/extradocx/testdata/e2e_fixtures/delete_paragraph.docx new file mode 100644 index 00000000..fc37dbd6 Binary files /dev/null and b/extradocx/testdata/e2e_fixtures/delete_paragraph.docx differ diff --git a/extradocx/testdata/e2e_fixtures/h1_text_change.docx b/extradocx/testdata/e2e_fixtures/h1_text_change.docx new file mode 100644 index 00000000..7f987e9b Binary files /dev/null and b/extradocx/testdata/e2e_fixtures/h1_text_change.docx differ diff --git a/extradocx/testdata/e2e_fixtures/h1_to_h2_level_change.docx b/extradocx/testdata/e2e_fixtures/h1_to_h2_level_change.docx new file mode 100644 index 00000000..739adb93 Binary files /dev/null and b/extradocx/testdata/e2e_fixtures/h1_to_h2_level_change.docx differ diff --git a/extradocx/testdata/e2e_fixtures/h2_text_change.docx b/extradocx/testdata/e2e_fixtures/h2_text_change.docx new file mode 100644 index 00000000..aa1af121 Binary files /dev/null and b/extradocx/testdata/e2e_fixtures/h2_text_change.docx differ diff --git a/extradocx/testdata/e2e_fixtures/h2_to_h3_level_change.docx b/extradocx/testdata/e2e_fixtures/h2_to_h3_level_change.docx new file mode 100644 index 00000000..a52c0d76 Binary files /dev/null and b/extradocx/testdata/e2e_fixtures/h2_to_h3_level_change.docx differ diff --git a/extradocx/testdata/e2e_fixtures/h3_text_change.docx b/extradocx/testdata/e2e_fixtures/h3_text_change.docx new file mode 100644 index 00000000..cfee6e93 Binary files /dev/null and b/extradocx/testdata/e2e_fixtures/h3_text_change.docx differ diff --git a/extradocx/testdata/e2e_fixtures/insert_heading.docx b/extradocx/testdata/e2e_fixtures/insert_heading.docx new file mode 100644 index 00000000..bea21edd Binary files /dev/null and b/extradocx/testdata/e2e_fixtures/insert_heading.docx differ diff --git a/extradocx/testdata/e2e_fixtures/insert_list_item.docx b/extradocx/testdata/e2e_fixtures/insert_list_item.docx new file mode 100644 index 00000000..34b2872b Binary files /dev/null 
and b/extradocx/testdata/e2e_fixtures/insert_list_item.docx differ diff --git a/extradocx/testdata/e2e_fixtures/insert_paragraph.docx b/extradocx/testdata/e2e_fixtures/insert_paragraph.docx new file mode 100644 index 00000000..38ab0449 Binary files /dev/null and b/extradocx/testdata/e2e_fixtures/insert_paragraph.docx differ diff --git a/extradocx/testdata/e2e_fixtures/multi_edit_chapter1.docx b/extradocx/testdata/e2e_fixtures/multi_edit_chapter1.docx new file mode 100644 index 00000000..75df09b5 Binary files /dev/null and b/extradocx/testdata/e2e_fixtures/multi_edit_chapter1.docx differ diff --git a/extradocx/testdata/e2e_fixtures/ordered_item_with_code.docx b/extradocx/testdata/e2e_fixtures/ordered_item_with_code.docx new file mode 100644 index 00000000..70691fff Binary files /dev/null and b/extradocx/testdata/e2e_fixtures/ordered_item_with_code.docx differ diff --git a/extradocx/testdata/e2e_fixtures/ordered_list_item_change.docx b/extradocx/testdata/e2e_fixtures/ordered_list_item_change.docx new file mode 100644 index 00000000..96fbceaa Binary files /dev/null and b/extradocx/testdata/e2e_fixtures/ordered_list_item_change.docx differ diff --git a/extradocx/testdata/e2e_fixtures/paragraph_text_replace.docx b/extradocx/testdata/e2e_fixtures/paragraph_text_replace.docx new file mode 100644 index 00000000..9a674246 Binary files /dev/null and b/extradocx/testdata/e2e_fixtures/paragraph_text_replace.docx differ diff --git a/extradocx/testdata/e2e_fixtures/table_cell_edit.docx b/extradocx/testdata/e2e_fixtures/table_cell_edit.docx new file mode 100644 index 00000000..1974a370 Binary files /dev/null and b/extradocx/testdata/e2e_fixtures/table_cell_edit.docx differ diff --git a/extradocx/testdata/e2e_fixtures/table_header_edit.docx b/extradocx/testdata/e2e_fixtures/table_header_edit.docx new file mode 100644 index 00000000..2b8ebb52 Binary files /dev/null and b/extradocx/testdata/e2e_fixtures/table_header_edit.docx differ diff --git 
a/extradocx/testdata/e2e_fixtures/two_bullet_items_changed.docx b/extradocx/testdata/e2e_fixtures/two_bullet_items_changed.docx new file mode 100644 index 00000000..1a242b77 Binary files /dev/null and b/extradocx/testdata/e2e_fixtures/two_bullet_items_changed.docx differ diff --git a/extradocx/tests/test_e2e.py b/extradocx/tests/test_e2e.py new file mode 100644 index 00000000..75b84b72 --- /dev/null +++ b/extradocx/tests/test_e2e.py @@ -0,0 +1,556 @@ +""" +End-to-end tests: DOCX → markdown → edit → apply → pandoc verify. + +Each test covers one or more markdown features: + - Heading text change + - Heading level change (h1→h2, h2→h3) + - Paragraph text edit + - Bold / italic / strikethrough formatting + - Table cell edit + - Bullet list item edit + - Ordered list item edit + - List item deletion + - List item insertion + - Paragraph deletion + - Paragraph insertion + - Code block (simulated via a code-style edit) + - Block quote (round-trip) + +Workflow for every scenario: + 1. Parse test_report.docx → base AST + 2. Serialize to markdown → base_md + 3. Edit base_md → edited_md + 4. parse_markdown(edited_md) → derived AST + 5. diff(base, derived) → ops + 6. apply_ops(docx, ops, output_docx) + 7. pandoc output_docx --to=gfm → verify assertions + 8. Save output to testdata/e2e_fixtures/{scenario_name}.docx for manual review + +The original (before) is always testdata/test_report.docx.
+""" + +from __future__ import annotations + +import pathlib +import re +import shutil +import subprocess + +import pytest + +from extradocx import DocxParser, apply_ops, diff, parse_markdown, to_markdown + +TESTDATA = pathlib.Path(__file__).parent.parent / "testdata" +REPORT_DOCX = TESTDATA / "test_report.docx" +FIXTURES_DIR = TESTDATA / "e2e_fixtures" + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _pandoc(docx_path: pathlib.Path) -> str: + """Run pandoc on *docx_path* and return GFM markdown output.""" + result = subprocess.run( + ["pandoc", str(docx_path), "--to=gfm"], + capture_output=True, + text=True, + check=True, + ) + return result.stdout + + +def _apply_and_verify( + base_md: str, + edited_md: str, + doc: object, + scenario_name: str, + tmp_path: pathlib.Path, +) -> str: + """Full pipeline: diff → apply → pandoc. Returns pandoc output.""" + reparsed = parse_markdown(edited_md) + ops = diff(doc, reparsed) # type: ignore[arg-type] + + out_path = tmp_path / f"{scenario_name}.docx" + apply_ops(REPORT_DOCX, ops, out_path, base_children=doc.children) # type: ignore[attr-defined] + + # Save to fixtures dir for manual review + fixture_path = FIXTURES_DIR / f"{scenario_name}.docx" + shutil.copy(out_path, fixture_path) + + return _pandoc(out_path) + + +@pytest.fixture(scope="module") +def doc(): + return DocxParser(REPORT_DOCX).parse() + + +@pytest.fixture(scope="module") +def base_md(doc): + return to_markdown(doc) + + +# --------------------------------------------------------------------------- +# Scenario 1: Heading text change (h1) +# --------------------------------------------------------------------------- + + +class TestHeadingTextChange: + """Change the text of an h1 heading.""" + + def test_h1_text_changed(self, doc, base_md, tmp_path): + edited = base_md.replace( + "# Chapter 1: Introduction to Software Engineering", + 
"# Chapter 1: Getting Started", + ) + pandoc_out = _apply_and_verify(base_md, edited, doc, "h1_text_change", tmp_path) + + assert "Chapter 1: Getting Started" in pandoc_out + assert "Chapter 1: Introduction to Software Engineering" not in pandoc_out + + def test_h2_text_changed(self, doc, base_md, tmp_path): + edited = base_md.replace("## 1.1 Overview", "## 1.1 Introduction Overview") + pandoc_out = _apply_and_verify(base_md, edited, doc, "h2_text_change", tmp_path) + + assert "1.1 Introduction Overview" in pandoc_out + assert "1.1 Overview" not in pandoc_out or "Introduction Overview" in pandoc_out + + def test_h3_text_changed(self, doc, base_md, tmp_path): + edited = base_md.replace("### 2.1.1 Interviews", "### 2.1.1 Interview Techniques") + pandoc_out = _apply_and_verify(base_md, edited, doc, "h3_text_change", tmp_path) + + assert "Interview Techniques" in pandoc_out + + +# --------------------------------------------------------------------------- +# Scenario 2: Heading level change +# --------------------------------------------------------------------------- + + +class TestHeadingLevelChange: + """Promote or demote a heading level.""" + + def test_h2_to_h3(self, doc, base_md, tmp_path): + """Demote ## 1.2 Historical Context to ### 1.2 Historical Context.""" + edited = base_md.replace("## 1.2 Historical Context", "### 1.2 Historical Context") + pandoc_out = _apply_and_verify(base_md, edited, doc, "h2_to_h3_level_change", tmp_path) + + # The heading should appear at h3 level + assert "1.2 Historical Context" in pandoc_out + # pandoc GFM output uses ### for h3 + assert re.search(r"###\s+1\.2 Historical Context", pandoc_out) + + def test_h1_to_h2(self, doc, base_md, tmp_path): + """Demote # Chapter 2 to ## Chapter 2.""" + edited = base_md.replace( + "# Chapter 2: Requirements Engineering", + "## Chapter 2: Requirements Engineering", + ) + pandoc_out = _apply_and_verify(base_md, edited, doc, "h1_to_h2_level_change", tmp_path) + + assert "Chapter 2: Requirements 
Engineering" in pandoc_out + assert re.search(r"##\s+Chapter 2: Requirements Engineering", pandoc_out) + + +# --------------------------------------------------------------------------- +# Scenario 3: Paragraph text edit +# --------------------------------------------------------------------------- + + +class TestParagraphTextEdit: + """Edit body paragraph text.""" + + def test_paragraph_text_replaced(self, doc, base_md, tmp_path): + """Replace a bullet list item text.""" + edited = base_md.replace( + "- 1960s: Birth of structured programming", + "- 1960s: Origins of structured programming", + ) + pandoc_out = _apply_and_verify(base_md, edited, doc, "paragraph_text_replace", tmp_path) + + assert "Origins of structured programming" in pandoc_out + assert "Birth of structured programming" not in pandoc_out + + def test_mixed_bold_italic_paragraph_edited(self, doc, base_md, tmp_path): + """Edit a paragraph that contains bold and italic runs.""" + old_line = ( + "**Note: **Software engineering encompasses a wide range of disciplines " + "from requirements analysis to deployment and maintenance." + "* — see appendix for details.*" + ) + new_line = ( + "**Note: **Software engineering covers many disciplines " + "from design to operations.* — see appendix.*" + ) + edited = base_md.replace(old_line, new_line) + pandoc_out = _apply_and_verify(base_md, edited, doc, "bold_italic_paragraph_edit", tmp_path) + + assert "covers many disciplines" in pandoc_out + assert "encompasses a wide range" not in pandoc_out + + +# --------------------------------------------------------------------------- +# Scenario 4: Formatting changes (bold, italic, strikethrough) +# --------------------------------------------------------------------------- + + +class TestFormattingChanges: + """Add or change inline formatting.""" + + def test_add_bold_to_text(self, doc, base_md, tmp_path): + """Wrap an existing plain text phrase in bold.""" + edited = base_md.replace( + "1. 
Separation of concerns", + "1. **Separation of concerns**", + ) + pandoc_out = _apply_and_verify(base_md, edited, doc, "add_bold_formatting", tmp_path) + + assert "Separation of concerns" in pandoc_out + # pandoc should preserve bold markup + assert "**Separation of concerns**" in pandoc_out or "Separation of concerns" in pandoc_out + + def test_add_italic_to_heading(self, doc, base_md, tmp_path): + """Change a heading's text to include italic.""" + edited = base_md.replace( + "## 1.3 Core Principles", + "## 1.3 *Core* Principles", + ) + pandoc_out = _apply_and_verify(base_md, edited, doc, "add_italic_in_heading", tmp_path) + + assert "Core" in pandoc_out + assert "Principles" in pandoc_out + + def test_add_strikethrough(self, doc, base_md, tmp_path): + """Add strikethrough formatting to a list item.""" + edited = base_md.replace( + "- 2020s: LLM-assisted development", + "- ~~2020s: LLM-assisted development~~ (now mainstream)", + ) + pandoc_out = _apply_and_verify(base_md, edited, doc, "add_strikethrough", tmp_path) + + assert "LLM" in pandoc_out + assert "mainstream" in pandoc_out + + +# --------------------------------------------------------------------------- +# Scenario 5: Table cell edit +# --------------------------------------------------------------------------- + + +class TestTableCellEdit: + """Edit content in a table cell.""" + + def test_table_cell_text_replaced(self, doc, base_md, tmp_path): + """Change a table cell value.""" + # The SOLID principles table has SRP → DIP rows + edited = base_md.replace( + "| SRP | Single Responsibility Principle | One class per conc", + "| SRP | Single Responsibility Principle | One module per con", + ) + # Fix the truncated line by replacing just the visible prefix + edited = base_md.replace( + "One class per concern", + "One module per concern", + ) + pandoc_out = _apply_and_verify(base_md, edited, doc, "table_cell_edit", tmp_path) + + assert "One module per concern" in pandoc_out + assert "One class per concern" not 
in pandoc_out + + def test_table_header_text_replaced(self, doc, base_md, tmp_path): + """Change a deployment strategies table cell.""" + edited = base_md.replace( + "Big Bang", + "Full Cutover", + ) + pandoc_out = _apply_and_verify(base_md, edited, doc, "table_header_edit", tmp_path) + + assert "Full Cutover" in pandoc_out + assert "Big Bang" not in pandoc_out + + +# --------------------------------------------------------------------------- +# Scenario 6: Bullet list item edits +# --------------------------------------------------------------------------- + + +class TestBulletListItemEdit: + """Edit bullet list items.""" + + def test_list_item_text_changed(self, doc, base_md, tmp_path): + edited = base_md.replace( + "- 1990s: Agile manifesto and iterative development", + "- 1990s: Agile manifesto and rapid development", + ) + pandoc_out = _apply_and_verify( + base_md, edited, doc, "bullet_list_item_text_change", tmp_path + ) + + assert "rapid development" in pandoc_out + assert "iterative development" not in pandoc_out + + def test_two_list_items_changed(self, doc, base_md, tmp_path): + """Edit two different list items in the same list.""" + edited = base_md.replace( + "- 2000s: DevOps, cloud computing, microservices", + "- 2000s: DevOps and cloud-native architectures", + ).replace( + "- 2010s: AI/ML integration in software workflows", + "- 2010s: AI/ML and data-driven development", + ) + pandoc_out = _apply_and_verify(base_md, edited, doc, "two_bullet_items_changed", tmp_path) + + assert "cloud-native architectures" in pandoc_out + assert "data-driven development" in pandoc_out + + +# --------------------------------------------------------------------------- +# Scenario 7: Ordered list item edits +# --------------------------------------------------------------------------- + + +class TestOrderedListItemEdit: + """Edit numbered list items.""" + + def test_ordered_item_changed(self, doc, base_md, tmp_path): + edited = base_md.replace( + "1. 
Separation of concerns", + "1. Separation of responsibilities", + ) + pandoc_out = _apply_and_verify(base_md, edited, doc, "ordered_list_item_change", tmp_path) + + assert "Separation of responsibilities" in pandoc_out + assert "Separation of concerns" not in pandoc_out + + def test_ordered_item_with_inline_code(self, doc, base_md, tmp_path): + """Change an ordered list item to include inline code.""" + edited = base_md.replace( + "2. DRY (Don't Repeat Yourself)", + "2. DRY (`Don't Repeat Yourself`)", + ) + pandoc_out = _apply_and_verify(base_md, edited, doc, "ordered_item_with_code", tmp_path) + + assert "Don't Repeat Yourself" in pandoc_out + + +# --------------------------------------------------------------------------- +# Scenario 8: Block deletion +# --------------------------------------------------------------------------- + + +class TestBlockDeletion: + """Delete paragraphs and headings.""" + + def test_delete_paragraph(self, doc, base_md, tmp_path): + """Delete the bold/italic 'Note:' paragraph.""" + note_line = None + for line in base_md.split("\n"): + if "**Note: **Software engineering encompasses" in line: + note_line = line + break + assert note_line is not None, "Note line not found in base_md" + + edited = base_md.replace(note_line + "\n", "") + pandoc_out = _apply_and_verify(base_md, edited, doc, "delete_paragraph", tmp_path) + + assert "encompasses a wide range" not in pandoc_out + + def test_delete_h3_heading(self, doc, base_md, tmp_path): + """Delete a sub-heading.""" + edited = base_md.replace("### 2.1.1 Interviews\n", "") + pandoc_out = _apply_and_verify(base_md, edited, doc, "delete_h3_heading", tmp_path) + + assert "2.1.1 Interviews" not in pandoc_out + + def test_delete_list_item(self, doc, base_md, tmp_path): + """Remove one item from a bullet list.""" + edited = base_md.replace("- 1970s: Software crisis and the rise of methodologies\n", "") + pandoc_out = _apply_and_verify(base_md, edited, doc, "delete_list_item", tmp_path) + + assert 
"Software crisis" not in pandoc_out + # Other items should still be present + assert "1960s" in pandoc_out + assert "1980s" in pandoc_out + + +# --------------------------------------------------------------------------- +# Scenario 9: Block insertion +# --------------------------------------------------------------------------- + + +class TestBlockInsertion: + """Insert new paragraphs and headings.""" + + def test_insert_paragraph_after_heading(self, doc, base_md, tmp_path): + """Insert a new paragraph after an existing heading.""" + edited = base_md.replace( + "## 1.1 Overview\n", + "## 1.1 Overview\n\nThis section provides a high-level overview.\n", + ) + pandoc_out = _apply_and_verify(base_md, edited, doc, "insert_paragraph", tmp_path) + + assert "This section provides a high-level overview." in pandoc_out + + def test_insert_heading_before_section(self, doc, base_md, tmp_path): + """Insert a new h2 heading before an existing h2.""" + edited = base_md.replace( + "## 1.2 Historical Context\n", + "## 1.1b Context Background\n\nBackground information.\n\n## 1.2 Historical Context\n", + ) + pandoc_out = _apply_and_verify(base_md, edited, doc, "insert_heading", tmp_path) + + assert "Context Background" in pandoc_out + assert "Background information." 
in pandoc_out + + def test_insert_list_item(self, doc, base_md, tmp_path): + """Insert a new bullet item into an existing list.""" + edited = base_md.replace( + "- 1980s: Object-oriented programming emerges\n", + "- 1975s: Structured design methods\n- 1980s: Object-oriented programming emerges\n", + ) + pandoc_out = _apply_and_verify(base_md, edited, doc, "insert_list_item", tmp_path) + + assert "Structured design methods" in pandoc_out + + +# --------------------------------------------------------------------------- +# Scenario 10: Link handling +# --------------------------------------------------------------------------- + + +class TestLinkHandling: + """Add or modify links in text.""" + + def test_add_link_to_text(self, doc, base_md, tmp_path): + """Add a hyperlink to a word in a paragraph.""" + edited = base_md.replace( + "1. Separation of concerns", + "1. [Separation of concerns](https://en.wikipedia.org/wiki/Separation_of_concerns)", + ) + pandoc_out = _apply_and_verify(base_md, edited, doc, "add_link", tmp_path) + + assert "Separation of concerns" in pandoc_out + + +# --------------------------------------------------------------------------- +# Scenario 11: Complex multi-edit +# --------------------------------------------------------------------------- + + +class TestComplexMultiEdit: + """Multiple edits in one pass — realistic agent workflow.""" + + def test_multi_edit_chapter1(self, doc, base_md, tmp_path): + """Edit heading + list item + table cell in one pass.""" + edited = ( + base_md.replace( + "# Chapter 1: Introduction to Software Engineering", + "# Chapter 1: Modern Software Engineering", + ) + .replace( + "- 1960s: Birth of structured programming", + "- 1960s: Foundations of programming", + ) + .replace( + "One class per concern", + "One concern per class", + ) + ) + pandoc_out = _apply_and_verify(base_md, edited, doc, "multi_edit_chapter1", tmp_path) + + assert "Modern Software Engineering" in pandoc_out + assert "Foundations of programming" 
in pandoc_out + assert "One concern per class" in pandoc_out + + def test_chapter_restructure(self, doc, base_md, tmp_path): + """Demote a heading and edit surrounding content.""" + edited = base_md.replace( + "## 2.3 Acceptance Criteria", + "### 2.3 Acceptance Criteria", + ).replace( + "Acceptance criteria must be measurable, verifiable, and unambiguous.", + "Acceptance criteria must be clear, measurable, and verifiable.", + ) + pandoc_out = _apply_and_verify(base_md, edited, doc, "chapter_restructure", tmp_path) + + assert "Acceptance Criteria" in pandoc_out + assert "clear, measurable, and verifiable" in pandoc_out + + +# --------------------------------------------------------------------------- +# Scenario 12: Inline code in text +# --------------------------------------------------------------------------- + + +class TestInlineCode: + """Add inline code spans to text.""" + + def test_add_inline_code(self, doc, base_md, tmp_path): + """Replace a tool name with inline-code-formatted version.""" + edited = base_md.replace( + "- Lint and format check (ruff, ESLint, etc.)", + "- Lint and format check (`ruff`, `ESLint`, etc.)", + ) + pandoc_out = _apply_and_verify(base_md, edited, doc, "add_inline_code", tmp_path) + + assert "ruff" in pandoc_out + assert "ESLint" in pandoc_out + + +# --------------------------------------------------------------------------- +# Scenario 13: Fixture integrity — verify all fixtures were written +# --------------------------------------------------------------------------- + + +class TestFixtureIntegrity: + """Sanity-check that all expected fixture files were produced.""" + + EXPECTED_FIXTURES = [ + "h1_text_change", + "h2_text_change", + "h3_text_change", + "h2_to_h3_level_change", + "h1_to_h2_level_change", + "paragraph_text_replace", + "bold_italic_paragraph_edit", + "add_bold_formatting", + "add_italic_in_heading", + "add_strikethrough", + "table_cell_edit", + "table_header_edit", + "bullet_list_item_text_change", + 
"two_bullet_items_changed", + "ordered_list_item_change", + "ordered_item_with_code", + "delete_paragraph", + "delete_h3_heading", + "delete_list_item", + "insert_paragraph", + "insert_heading", + "insert_list_item", + "add_link", + "multi_edit_chapter1", + "chapter_restructure", + "add_inline_code", + ] + + def test_fixtures_directory_exists(self): + assert FIXTURES_DIR.exists() + + @pytest.mark.parametrize("name", EXPECTED_FIXTURES) + def test_fixture_file_exists(self, name): + """Each scenario should have produced a .docx file.""" + fixture = FIXTURES_DIR / f"{name}.docx" + assert fixture.exists(), f"Missing fixture: {fixture}" + assert fixture.stat().st_size > 1000, f"Fixture too small: {fixture}" + + def test_all_fixtures_pandoc_readable(self): + """All fixture files should be valid DOCX that pandoc can convert.""" + for name in self.EXPECTED_FIXTURES: + fixture = FIXTURES_DIR / f"{name}.docx" + if not fixture.exists(): + continue + pandoc_out = _pandoc(fixture) + assert len(pandoc_out) > 100, f"Pandoc output too short for {name}" diff --git a/extradocx/tests/test_md_diff.py b/extradocx/tests/test_md_diff.py new file mode 100644 index 00000000..5055d546 --- /dev/null +++ b/extradocx/tests/test_md_diff.py @@ -0,0 +1,868 @@ +""" +Tests for the markdown diff pipeline: parse_markdown + diff. + +Tests the public interface: + parse_markdown(text) -> Document + diff(base, derived) -> list[DiffOp] + +Strategy: construct base ASTs (either from markdown or manually), simulate +user edits by modifying the markdown, re-parse, and assert the diff produces +the expected operations. 
+""" + +from __future__ import annotations + +import pathlib + +import pytest + +from extradocx import diff, parse_markdown, to_markdown +from extradocx.ast_nodes import ( + BlockQuote, + BulletList, + CodeBlock, + Document, + Heading, + ListItem, + OrderedList, + Paragraph, + Table, + TextRun, + ThematicBreak, +) +from extradocx.diff_ops import ( + DeleteBlock, + DiffOp, + InsertBlock, + ReplaceBlockQuote, + ReplaceCodeBlock, + ReplaceHeading, + ReplaceList, + ReplaceParagraph, + ReplaceTable, +) + +TESTDATA = pathlib.Path(__file__).parent.parent / "testdata" + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _make_base_doc(*blocks) -> Document: + """Build a Document with xpaths on each block for traceability.""" + children = [] + for i, block in enumerate(blocks): + block.xpath = f"/w:document[1]/w:body[1]/w:p[{i + 1}]" + children.append(block) + return Document(children=children) + + +def _ops_of_type(ops: list[DiffOp], op_type: type) -> list: + return [op for op in ops if isinstance(op, op_type)] + + +# ========================================================================= +# parse_markdown tests +# ========================================================================= + + +class TestParseMarkdown: + """Test the markdown → AST parser.""" + + def test_empty_document(self): + doc = parse_markdown("") + assert isinstance(doc, Document) + assert doc.children == [] + + def test_single_paragraph(self): + doc = parse_markdown("Hello world.\n") + assert len(doc.children) == 1 + assert isinstance(doc.children[0], Paragraph) + + def test_paragraph_text(self): + doc = parse_markdown("Hello world.\n") + p = doc.children[0] + assert isinstance(p, Paragraph) + runs = [c for c in p.children if isinstance(c, TextRun)] + text = "".join(r.text for r in runs) + assert "Hello world." 
in text + + def test_atx_headings(self): + md = "# Heading 1\n\n## Heading 2\n\n### Heading 3\n" + doc = parse_markdown(md) + headings = [c for c in doc.children if isinstance(c, Heading)] + assert len(headings) == 3 + assert headings[0].level == 1 + assert headings[1].level == 2 + assert headings[2].level == 3 + + def test_heading_text(self): + doc = parse_markdown("# My Title\n") + h = doc.children[0] + assert isinstance(h, Heading) + text = "".join(r.text for r in h.children if isinstance(r, TextRun)) + assert text == "My Title" + + def test_fenced_code_block(self): + md = "```python\nprint('hello')\n```\n" + doc = parse_markdown(md) + assert len(doc.children) == 1 + cb = doc.children[0] + assert isinstance(cb, CodeBlock) + assert cb.language == "python" + assert cb.code == "print('hello')" + + def test_code_block_no_language(self): + md = "```\nsome code\n```\n" + doc = parse_markdown(md) + cb = doc.children[0] + assert isinstance(cb, CodeBlock) + assert cb.language == "" + + def test_bullet_list(self): + md = "- Item A\n- Item B\n- Item C\n" + doc = parse_markdown(md) + assert len(doc.children) == 1 + bl = doc.children[0] + assert isinstance(bl, BulletList) + assert len(bl.items) == 3 + + def test_ordered_list(self): + md = "1. First\n2. Second\n3. 
Third\n" + doc = parse_markdown(md) + assert len(doc.children) == 1 + ol = doc.children[0] + assert isinstance(ol, OrderedList) + assert len(ol.items) == 3 + assert ol.start == 1 + + def test_pipe_table(self): + md = "| A | B |\n| --- | --- |\n| 1 | 2 |\n| 3 | 4 |\n" + doc = parse_markdown(md) + assert len(doc.children) == 1 + tbl = doc.children[0] + assert isinstance(tbl, Table) + assert len(tbl.rows) == 3 # header + 2 data rows + + def test_thematic_break(self): + md = "---\n" + doc = parse_markdown(md) + assert len(doc.children) == 1 + assert isinstance(doc.children[0], ThematicBreak) + + def test_block_quote(self): + md = "> This is a quote\n" + doc = parse_markdown(md) + assert len(doc.children) == 1 + bq = doc.children[0] + assert isinstance(bq, BlockQuote) + assert len(bq.children) >= 1 + + def test_bold_text(self): + doc = parse_markdown("**bold text**\n") + p = doc.children[0] + runs = [c for c in p.children if isinstance(c, TextRun)] + bold_runs = [r for r in runs if r.bold] + assert len(bold_runs) >= 1 + assert "bold text" in bold_runs[0].text + + def test_italic_text(self): + doc = parse_markdown("*italic text*\n") + p = doc.children[0] + runs = [c for c in p.children if isinstance(c, TextRun)] + italic_runs = [r for r in runs if r.italic] + assert len(italic_runs) >= 1 + + def test_inline_code(self): + doc = parse_markdown("Use `code` here\n") + p = doc.children[0] + runs = [c for c in p.children if isinstance(c, TextRun)] + code_runs = [r for r in runs if r.code] + assert len(code_runs) >= 1 + assert code_runs[0].text == "code" + + def test_strikethrough(self): + doc = parse_markdown("~~deleted~~\n") + p = doc.children[0] + runs = [c for c in p.children if isinstance(c, TextRun)] + strike_runs = [r for r in runs if r.strikethrough] + assert len(strike_runs) >= 1 + + def test_link(self): + from extradocx.ast_nodes import Link + + doc = parse_markdown("[click here](https://example.com)\n") + p = doc.children[0] + links = [c for c in p.children if 
isinstance(c, Link)] + assert len(links) == 1 + assert links[0].href == "https://example.com" + + def test_image(self): + from extradocx.ast_nodes import Image + + doc = parse_markdown("![alt text](image.png)\n") + p = doc.children[0] + images = [c for c in p.children if isinstance(c, Image)] + assert len(images) == 1 + assert images[0].alt == "alt text" + assert images[0].src == "image.png" + + def test_mixed_document(self): + """Parse a realistic mixed document.""" + md = ( + "# Title\n\n" + "Some introductory text.\n\n" + "## Section 1\n\n" + "- Item A\n" + "- Item B\n\n" + "| Col1 | Col2 |\n" + "| --- | --- |\n" + "| A | B |\n\n" + "```python\nprint('hello')\n```\n\n" + "---\n\n" + "Final paragraph.\n" + ) + doc = parse_markdown(md) + types = [type(c).__name__ for c in doc.children] + assert "Heading" in types + assert "Paragraph" in types + assert "BulletList" in types + assert "Table" in types + assert "CodeBlock" in types + assert "ThematicBreak" in types + + +# ========================================================================= +# diff tests — no changes +# ========================================================================= + + +class TestDiffNoChanges: + """When base and derived are identical, diff should return empty list.""" + + def test_identical_paragraphs(self): + base = _make_base_doc( + Paragraph(children=[TextRun(text="Hello", xpath="")]), + Paragraph(children=[TextRun(text="World", xpath="")]), + ) + derived = parse_markdown("Hello\n\nWorld\n") + ops = diff(base, derived) + assert ops == [] + + def test_identical_headings(self): + base = _make_base_doc(Heading(level=1, children=[TextRun(text="Title", xpath="")])) + derived = parse_markdown("# Title\n") + ops = diff(base, derived) + assert ops == [] + + def test_identical_code_block(self): + base = _make_base_doc(CodeBlock(code="print('hello')", language="python")) + derived = parse_markdown("```python\nprint('hello')\n```\n") + ops = diff(base, derived) + assert ops == [] + + def 
test_identical_bullet_list(self): + base = _make_base_doc( + BulletList( + items=[ + ListItem(children=[Paragraph(children=[TextRun(text="A", xpath="")])]), + ListItem(children=[Paragraph(children=[TextRun(text="B", xpath="")])]), + ] + ) + ) + derived = parse_markdown("- A\n- B\n") + ops = diff(base, derived) + assert ops == [] + + def test_identical_table(self): + base = _make_base_doc( + Table( + rows=[ + __import__("extradocx.ast_nodes", fromlist=["TableRow"]).TableRow( + cells=[ + __import__("extradocx.ast_nodes", fromlist=["TableCell"]).TableCell( + children=[Paragraph(children=[TextRun(text="A", xpath="")])], + is_header=True, + ), + __import__("extradocx.ast_nodes", fromlist=["TableCell"]).TableCell( + children=[Paragraph(children=[TextRun(text="B", xpath="")])], + is_header=True, + ), + ], + is_header=True, + ), + __import__("extradocx.ast_nodes", fromlist=["TableRow"]).TableRow( + cells=[ + __import__("extradocx.ast_nodes", fromlist=["TableCell"]).TableCell( + children=[Paragraph(children=[TextRun(text="1", xpath="")])] + ), + __import__("extradocx.ast_nodes", fromlist=["TableCell"]).TableCell( + children=[Paragraph(children=[TextRun(text="2", xpath="")])] + ), + ] + ), + ] + ) + ) + derived = parse_markdown("| A | B |\n| --- | --- |\n| 1 | 2 |\n") + ops = diff(base, derived) + assert ops == [] + + def test_identical_thematic_break(self): + base = _make_base_doc(ThematicBreak()) + derived = parse_markdown("---\n") + ops = diff(base, derived) + assert ops == [] + + +# ========================================================================= +# diff tests — text edits +# ========================================================================= + + +class TestDiffTextEdits: + """Edits to text content of existing blocks.""" + + def test_paragraph_text_changed(self): + base = _make_base_doc(Paragraph(children=[TextRun(text="Hello world", xpath="")])) + derived = parse_markdown("Hello universe\n") + ops = diff(base, derived) + assert len(ops) == 1 + op = ops[0] 
+ assert isinstance(op, ReplaceParagraph) + assert op.old_text == "Hello world" + assert op.new_text == "Hello universe" + assert op.base_index == 0 + + def test_heading_text_changed(self): + base = _make_base_doc(Heading(level=2, children=[TextRun(text="Old Title", xpath="")])) + derived = parse_markdown("## New Title\n") + ops = diff(base, derived) + assert len(ops) == 1 + op = ops[0] + assert isinstance(op, ReplaceHeading) + assert op.old_text == "Old Title" + assert op.new_text == "New Title" + + def test_heading_level_changed(self): + base = _make_base_doc(Heading(level=1, children=[TextRun(text="Title", xpath="")])) + derived = parse_markdown("### Title\n") + ops = diff(base, derived) + assert len(ops) == 1 + op = ops[0] + assert isinstance(op, ReplaceHeading) + assert op.old_level == 1 + assert op.new_level == 3 + + def test_code_block_code_changed(self): + base = _make_base_doc(CodeBlock(code="x = 1", language="python")) + derived = parse_markdown("```python\nx = 2\n```\n") + ops = diff(base, derived) + assert len(ops) == 1 + op = ops[0] + assert isinstance(op, ReplaceCodeBlock) + assert op.old_code == "x = 1" + assert op.new_code == "x = 2" + + def test_code_block_language_changed(self): + base = _make_base_doc(CodeBlock(code="print('hi')", language="python")) + derived = parse_markdown("```javascript\nprint('hi')\n```\n") + ops = diff(base, derived) + assert len(ops) == 1 + op = ops[0] + assert isinstance(op, ReplaceCodeBlock) + assert op.old_language == "python" + assert op.new_language == "javascript" + + def test_multiple_paragraphs_edited(self): + base = _make_base_doc( + Paragraph(children=[TextRun(text="First paragraph", xpath="")]), + Paragraph(children=[TextRun(text="Second paragraph", xpath="")]), + Paragraph(children=[TextRun(text="Third paragraph", xpath="")]), + ) + derived = parse_markdown("First paragraph\n\nEdited second\n\nThird paragraph\n") + ops = diff(base, derived) + # Only the second paragraph should be flagged as changed + 
replace_ops = _ops_of_type(ops, ReplaceParagraph) + assert len(replace_ops) == 1 + assert replace_ops[0].base_index == 1 + assert replace_ops[0].new_text == "Edited second" + + +# ========================================================================= +# diff tests — structural changes (insert / delete) +# ========================================================================= + + +class TestDiffStructuralChanges: + """Insertions and deletions of blocks.""" + + def test_paragraph_inserted(self): + base = _make_base_doc( + Paragraph(children=[TextRun(text="First", xpath="")]), + Paragraph(children=[TextRun(text="Third", xpath="")]), + ) + derived = parse_markdown("First\n\nSecond\n\nThird\n") + ops = diff(base, derived) + insert_ops = _ops_of_type(ops, InsertBlock) + assert len(insert_ops) == 1 + inserted = insert_ops[0].block + assert isinstance(inserted, Paragraph) + + def test_paragraph_deleted(self): + base = _make_base_doc( + Paragraph(children=[TextRun(text="First", xpath="")]), + Paragraph(children=[TextRun(text="Second", xpath="")]), + Paragraph(children=[TextRun(text="Third", xpath="")]), + ) + derived = parse_markdown("First\n\nThird\n") + ops = diff(base, derived) + delete_ops = _ops_of_type(ops, DeleteBlock) + assert len(delete_ops) == 1 + assert delete_ops[0].base_index == 1 + + def test_heading_inserted(self): + base = _make_base_doc(Paragraph(children=[TextRun(text="Content", xpath="")])) + derived = parse_markdown("# New Heading\n\nContent\n") + ops = diff(base, derived) + insert_ops = _ops_of_type(ops, InsertBlock) + assert len(insert_ops) == 1 + assert isinstance(insert_ops[0].block, Heading) + + def test_multiple_blocks_deleted(self): + base = _make_base_doc( + Paragraph(children=[TextRun(text="Keep", xpath="")]), + Paragraph(children=[TextRun(text="Remove one", xpath="")]), + Paragraph(children=[TextRun(text="Remove two", xpath="")]), + Paragraph(children=[TextRun(text="Keep too", xpath="")]), + ) + derived = parse_markdown("Keep\n\nKeep 
too\n") + ops = diff(base, derived) + delete_ops = _ops_of_type(ops, DeleteBlock) + assert len(delete_ops) == 2 + + def test_block_replaced_with_different_type(self): + """A paragraph replaced with a heading (kind change).""" + base = _make_base_doc(Paragraph(children=[TextRun(text="Now a heading", xpath="")])) + derived = parse_markdown("# Now a heading\n") + ops = diff(base, derived) + # Should detect this as a heading replacement + heading_ops = _ops_of_type(ops, ReplaceHeading) + assert len(heading_ops) == 1 + assert heading_ops[0].new_level == 1 + + +# ========================================================================= +# diff tests — list edits +# ========================================================================= + + +class TestDiffListEdits: + """Edits within lists.""" + + def test_list_item_text_changed(self): + base = _make_base_doc( + BulletList( + items=[ + ListItem( + children=[Paragraph(children=[TextRun(text="Item A", xpath="")])], + xpath="/list/item1", + ), + ListItem( + children=[Paragraph(children=[TextRun(text="Item B", xpath="")])], + xpath="/list/item2", + ), + ] + ) + ) + derived = parse_markdown("- Item A\n- Item B modified\n") + ops = diff(base, derived) + list_ops = _ops_of_type(ops, ReplaceList) + assert len(list_ops) == 1 + assert list_ops[0].list_type == "bullet" + # Should have one ReplaceListItem inside + from extradocx.diff_ops import ReplaceListItem + + replace_items = [op for op in list_ops[0].item_ops if isinstance(op, ReplaceListItem)] + assert len(replace_items) == 1 + assert replace_items[0].base_item_index == 1 + + def test_list_item_added(self): + base = _make_base_doc( + BulletList( + items=[ + ListItem( + children=[Paragraph(children=[TextRun(text="Item A", xpath="")])], + xpath="/list/item1", + ), + ] + ) + ) + derived = parse_markdown("- Item A\n- Item B\n") + ops = diff(base, derived) + list_ops = _ops_of_type(ops, ReplaceList) + assert len(list_ops) == 1 + from extradocx.diff_ops import InsertListItem + + 
insert_items = [op for op in list_ops[0].item_ops if isinstance(op, InsertListItem)] + assert len(insert_items) == 1 + + def test_list_item_removed(self): + base = _make_base_doc( + BulletList( + items=[ + ListItem( + children=[Paragraph(children=[TextRun(text="Item A", xpath="")])], + xpath="/list/item1", + ), + ListItem( + children=[Paragraph(children=[TextRun(text="Item B", xpath="")])], + xpath="/list/item2", + ), + ListItem( + children=[Paragraph(children=[TextRun(text="Item C", xpath="")])], + xpath="/list/item3", + ), + ] + ) + ) + derived = parse_markdown("- Item A\n- Item C\n") + ops = diff(base, derived) + list_ops = _ops_of_type(ops, ReplaceList) + assert len(list_ops) == 1 + from extradocx.diff_ops import DeleteListItem + + delete_items = [op for op in list_ops[0].item_ops if isinstance(op, DeleteListItem)] + assert len(delete_items) == 1 + assert delete_items[0].base_item_index == 1 + + def test_ordered_list_edit(self): + base = _make_base_doc( + OrderedList( + items=[ + ListItem( + children=[Paragraph(children=[TextRun(text="Step one", xpath="")])], + xpath="/list/item1", + ), + ListItem( + children=[Paragraph(children=[TextRun(text="Step two", xpath="")])], + xpath="/list/item2", + ), + ], + start=1, + ) + ) + derived = parse_markdown("1. Step one\n2. 
Step two updated\n") + ops = diff(base, derived) + list_ops = _ops_of_type(ops, ReplaceList) + assert len(list_ops) == 1 + assert list_ops[0].list_type == "ordered" + + +# ========================================================================= +# diff tests — table edits +# ========================================================================= + + +class TestDiffTableEdits: + """Edits to tables.""" + + def test_table_cell_changed(self): + from extradocx.ast_nodes import TableCell, TableRow + + base = _make_base_doc( + Table( + rows=[ + TableRow( + cells=[ + TableCell( + children=[Paragraph(children=[TextRun(text="H1", xpath="")])], + is_header=True, + ), + TableCell( + children=[Paragraph(children=[TextRun(text="H2", xpath="")])], + is_header=True, + ), + ], + is_header=True, + ), + TableRow( + cells=[ + TableCell(children=[Paragraph(children=[TextRun(text="A", xpath="")])]), + TableCell(children=[Paragraph(children=[TextRun(text="B", xpath="")])]), + ] + ), + ] + ) + ) + derived = parse_markdown("| H1 | H2 |\n| --- | --- |\n| A | CHANGED |\n") + ops = diff(base, derived) + table_ops = _ops_of_type(ops, ReplaceTable) + assert len(table_ops) == 1 + assert table_ops[0].base_index == 0 + + +# ========================================================================= +# diff tests — block quote edits +# ========================================================================= + + +class TestDiffBlockQuoteEdits: + def test_blockquote_content_changed(self): + base = _make_base_doc( + BlockQuote(children=[Paragraph(children=[TextRun(text="Original quote", xpath="")])]) + ) + derived = parse_markdown("> Edited quote\n") + ops = diff(base, derived) + bq_ops = _ops_of_type(ops, ReplaceBlockQuote) + assert len(bq_ops) == 1 + + def test_blockquote_unchanged(self): + base = _make_base_doc( + BlockQuote(children=[Paragraph(children=[TextRun(text="Same quote", xpath="")])]) + ) + derived = parse_markdown("> Same quote\n") + ops = diff(base, derived) + assert ops == [] + + +# 
# =========================================================================
# diff tests — formatting changes
# =========================================================================


class TestDiffFormattingChanges:
    """Inline formatting edits that leave the paragraph text itself untouched."""

    def test_bold_added(self):
        """Bolding a plain run yields one ReplaceParagraph with equal old/new text."""
        base = _make_base_doc(Paragraph(children=[TextRun(text="important", xpath="")]))
        derived = parse_markdown("**important**\n")
        ops = diff(base, derived)
        # Text is the same but formatting changed — should detect a replace
        assert len(ops) == 1
        op = ops[0]
        assert isinstance(op, ReplaceParagraph)
        assert op.old_text == "important"
        assert op.new_text == "important"

    def test_italic_added(self):
        """Italicising a plain run yields exactly one ReplaceParagraph."""
        base = _make_base_doc(Paragraph(children=[TextRun(text="emphasis", xpath="")]))
        derived = parse_markdown("*emphasis*\n")
        ops = diff(base, derived)
        assert len(ops) == 1
        assert isinstance(ops[0], ReplaceParagraph)


# =========================================================================
# diff tests — complex scenarios
# =========================================================================


class TestDiffComplexScenarios:
    """Realistic multi-edit scenarios."""

    def test_interleaved_edits(self):
        """One paragraph edited and a new section inserted; nothing deleted."""
        base = _make_base_doc(
            Heading(level=1, children=[TextRun(text="Title", xpath="")]),
            Paragraph(children=[TextRun(text="Intro paragraph", xpath="")]),
            Heading(level=2, children=[TextRun(text="Section A", xpath="")]),
            Paragraph(children=[TextRun(text="Content A", xpath="")]),
            Heading(level=2, children=[TextRun(text="Section B", xpath="")]),
            Paragraph(children=[TextRun(text="Content B", xpath="")]),
        )
        derived = parse_markdown(
            "# Title\n\n"
            "Intro paragraph\n\n"
            "## Section A\n\n"
            "Modified content A\n\n"
            "## New Section\n\n"
            "Brand new content\n\n"
            "## Section B\n\n"
            "Content B\n"
        )
        ops = diff(base, derived)
        # The edit amounts to 1 replace (Content A) and 2 inserts (New Section
        # + new content); only lower bounds are asserted for those two kinds.
        replace_ops = _ops_of_type(ops, ReplaceParagraph)
        insert_ops = _ops_of_type(ops, InsertBlock)
        assert len(replace_ops) >= 1
        assert len(insert_ops) >= 1
        # No deletes
        delete_ops = _ops_of_type(ops, DeleteBlock)
        assert len(delete_ops) == 0

    def test_reorder_sections(self):
        """Swapping two sections must produce a non-empty diff."""
        base = _make_base_doc(
            Heading(level=2, children=[TextRun(text="Alpha", xpath="")]),
            Paragraph(children=[TextRun(text="Alpha content", xpath="")]),
            Heading(level=2, children=[TextRun(text="Beta", xpath="")]),
            Paragraph(children=[TextRun(text="Beta content", xpath="")]),
        )
        derived = parse_markdown("## Beta\n\nBeta content\n\n## Alpha\n\nAlpha content\n")
        ops = diff(base, derived)
        # The DP should find the minimum-cost alignment; depending on
        # similarity it may match some pairs and insert/delete others, so the
        # exact mix of op types is deliberately not pinned here.
        assert len(ops) > 0

    def test_empty_to_content(self):
        """Going from empty to having content should be all inserts."""
        base = Document(children=[])
        derived = parse_markdown("# Hello\n\nWorld\n")
        ops = diff(base, derived)
        assert all(isinstance(op, InsertBlock) for op in ops)
        assert len(ops) == 2

    def test_content_to_empty(self):
        """Going from content to empty should be all deletes."""
        base = _make_base_doc(
            Heading(level=1, children=[TextRun(text="Title", xpath="")]),
            Paragraph(children=[TextRun(text="Content", xpath="")]),
        )
        derived = Document(children=[])
        ops = diff(base, derived)
        assert all(isinstance(op, DeleteBlock) for op in ops)
        assert len(ops) == 2


# =========================================================================
# Round-trip test: DOCX → markdown → parse → diff (golden file)
# =========================================================================


class TestRoundTrip:
    """Test the full round-trip: parse DOCX → to_markdown → parse_markdown → diff.

    When no edits are made, the diff should be empty or minimal.
    """

    @pytest.fixture(scope="class")
    def docx_doc(self):
        """Parse ``test_report.docx`` from TESTDATA; skip when the file is absent."""
        from extradocx import DocxParser

        docx_path = TESTDATA / "test_report.docx"
        if not docx_path.exists():
            pytest.skip("test_report.docx not found")
        return DocxParser(docx_path).parse()

    def test_roundtrip_no_edits_produces_minimal_diff(self, docx_doc):
        """DOCX → markdown → parse_markdown → diff should produce few ops.

        We don't expect zero ops because the markdown serialization is lossy
        (escaping, whitespace normalization). But the number of ops should be
        small relative to document size.
        """
        md = to_markdown(docx_doc)
        reparsed = parse_markdown(md)
        ops = diff(docx_doc, reparsed)

        n_blocks = len(docx_doc.children)
        n_ops = len(ops)
        # The round-trip should preserve most content — allow up to 50% drift
        # (ops per top-level block) due to lossy serialization (escaping,
        # whitespace normalization, formatting loss for
        # underline/super/subscript).  max(..., 1) guards an empty document.
        ratio = n_ops / max(n_blocks, 1)
        assert ratio < 0.5, (
            f"Too many ops ({n_ops}) for {n_blocks} blocks (ratio={ratio:.2f}). "
            "Round-trip should be mostly stable."
        )

    def test_roundtrip_with_edit(self, docx_doc):
        """Inject a heading plus a paragraph into the markdown and expect an insert."""
        md = to_markdown(docx_doc)
        lines = md.split("\n")
        # Splice the new blocks in at line index 2.  The trailing blank line
        # keeps the injected paragraph a block of its own: without it the text
        # would sit directly above the original line 2 and, if that line is
        # plain text, markdown would read both as a single paragraph.
        lines[2:2] = [
            "",
            "## INJECTED HEADING",
            "",
            "This paragraph was injected by the test.",
            "",
        ]
        edited_md = "\n".join(lines)

        reparsed = parse_markdown(edited_md)
        ops = diff(docx_doc, reparsed)

        # Should have at least 1 insert for the injected heading
        insert_ops = _ops_of_type(ops, InsertBlock)
        assert len(insert_ops) >= 1

    def test_roundtrip_with_deletion(self, docx_doc):
        """Delete a heading from the markdown and verify diff detects it."""
        md = to_markdown(docx_doc)
        lines = md.split("\n")
        # First "## " heading located past line index 5, or None if there is none
        heading_idx = next(
            (i for i, line in enumerate(lines) if i > 5 and line.startswith("## ")),
            None,
        )
        if heading_idx is None:
            pytest.skip("No ## heading found to delete")

        # Remove the heading line and one adjacent blank line
        del lines[heading_idx]
        if heading_idx < len(lines) and not lines[heading_idx].strip():
            del lines[heading_idx]

        edited_md = "\n".join(lines)
        reparsed = parse_markdown(edited_md)
        ops = diff(docx_doc, reparsed)

        # Should have at least 1 delete
        delete_ops = _ops_of_type(ops, DeleteBlock)
        assert len(delete_ops) >= 1


# =========================================================================
# diff operation properties
# =========================================================================


class TestDiffOpProperties:
    """Verify structural properties of diff operations."""

    def test_delete_ops_have_xpath(self):
        """A DeleteBlock carries a non-empty ``base_xpath``."""
        base = _make_base_doc(
            Paragraph(children=[TextRun(text="To delete", xpath="")]),
            Paragraph(children=[TextRun(text="To keep", xpath="")]),
        )
        derived = parse_markdown("To keep\n")
        ops = diff(base, derived)
        delete_ops = _ops_of_type(ops, DeleteBlock)
        assert len(delete_ops) == 1
        assert delete_ops[0].base_xpath != ""  # xpath was set by _make_base_doc

    def test_replace_ops_have_xpath(self):
        """A ReplaceParagraph carries a non-empty ``base_xpath``."""
        base = _make_base_doc(Paragraph(children=[TextRun(text="Original text", xpath="")]))
        derived = parse_markdown("Modified text\n")
        ops = diff(base, derived)
        replace_ops = _ops_of_type(ops, ReplaceParagraph)
        assert len(replace_ops) == 1
        assert replace_ops[0].base_xpath != ""

    def test_insert_ops_have_position(self):
        """An InsertBlock exposes an integer ``position``."""
        base = _make_base_doc(Paragraph(children=[TextRun(text="Existing", xpath="")]))
        derived = parse_markdown("Existing\n\nNew paragraph\n")
        ops = diff(base, derived)
        insert_ops = _ops_of_type(ops, InsertBlock)
        assert len(insert_ops) == 1
        assert isinstance(insert_ops[0].position, int)

    def test_ops_sorted_deterministically(self):
        """If the diff contains a DeleteBlock, the first op must be one.

        The intended ordering is deletes, then replaces, then inserts; this
        test only checks the "deletes first" part, and only when a DeleteBlock
        is actually produced.
        """
        base = _make_base_doc(
            Paragraph(children=[TextRun(text="Delete me", xpath="")]),
            Paragraph(children=[TextRun(text="Edit me original", xpath="")]),
        )
        derived = parse_markdown("Edit me changed\n\nNew block\n")
        ops = diff(base, derived)

        # Op type names in order of first appearance (dict keys keep insertion
        # order and drop repeats).
        seen_types = list(dict.fromkeys(type(op).__name__ for op in ops))
        # DeleteBlock should come before others if present
        if "DeleteBlock" in seen_types:
            assert seen_types[0] == "DeleteBlock"