diff --git a/extradocx/pyproject.toml b/extradocx/pyproject.toml new file mode 100644 index 00000000..efdfb2da --- /dev/null +++ b/extradocx/pyproject.toml @@ -0,0 +1,28 @@ +[project] +name = "extradocx" +version = "0.1.0" +description = "Experimental DOCX → GFM Markdown AST converter" +requires-python = ">=3.11" +dependencies = [] + +[project.scripts] +extradocx = "extradocx.cli:main" + +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[tool.hatch.build.targets.wheel] +packages = ["src/extradocx"] + +[tool.uv] +dev-dependencies = [ + "pytest>=8.0", + "ruff>=0.4", +] + +[tool.ruff] +line-length = 100 + +[tool.ruff.lint] +select = ["E", "F", "I"] diff --git a/extradocx/src/extradocx/__init__.py b/extradocx/src/extradocx/__init__.py new file mode 100644 index 00000000..4756a411 --- /dev/null +++ b/extradocx/src/extradocx/__init__.py @@ -0,0 +1,24 @@ +""" +extradocx — experimental DOCX → GFM Markdown AST converter. + +Proof-of-concept for bidirectional DOCX ↔ Markdown transformation via an +intermediate AST that: + - Represents GFM markdown structure (headings, lists, tables, …) + - Preserves text at run granularity (bold, italic, … per TextRun node) + - Points every node back to the source DOCX XML via XPath + +Usage:: + + from extradocx import DocxParser, to_json, to_markdown + + parser = DocxParser("report.docx") + doc = parser.parse() + + json_str = to_json(doc) # full-fidelity JSON with XPath pointers + md_str = to_markdown(doc) # GFM markdown +""" + +from extradocx.parser import DocxParser +from extradocx.serializers import to_json, to_markdown + +__all__ = ["DocxParser", "to_json", "to_markdown"] diff --git a/extradocx/src/extradocx/ast_nodes.py b/extradocx/src/extradocx/ast_nodes.py new file mode 100644 index 00000000..8a40b128 --- /dev/null +++ b/extradocx/src/extradocx/ast_nodes.py @@ -0,0 +1,362 @@ +""" +AST node definitions for the DOCX → GFM Markdown AST. 
+ +Design principles: +- Every node carries an `xpath` field that points back to the originating + element in word/document.xml (or word/numbering.xml, word/styles.xml). +- Text content is always represented as `TextRun` leaf nodes, never bare + strings. This preserves run-level formatting (bold, italic, …) and + traceability. +- The shape of the tree is GFM-centric, not OOXML-centric. Heading levels, + lists, tables, etc. map to their GFM equivalents. + +Serialization: + - JSON — full fidelity (use `node_to_dict`) + - Markdown — lossy but human-readable (use serializers.to_markdown) + +Inspired by Pandoc's Haskell AST (Block / Inline split) but extended with +XPath pointers and text-run granularity. +""" + +from __future__ import annotations + +from dataclasses import dataclass, field +from typing import Union + + +# --------------------------------------------------------------------------- +# Inline nodes +# --------------------------------------------------------------------------- + + +@dataclass +class TextRun: + """A single OOXML run () turned into a leaf inline node. + + Formatting flags are read from on the run (or inherited from the + paragraph / character style). All formatting is *resolved* — i.e. the + effective value after style inheritance is applied. 
+ """ + + text: str + xpath: str # XPath to the element in document.xml + bold: bool = False + italic: bool = False + underline: bool = False + strikethrough: bool = False + code: bool = False # True when the run uses a monospace / code font + superscript: bool = False + subscript: bool = False + + def to_dict(self) -> dict: + d: dict = {"type": "text_run", "text": self.text, "xpath": self.xpath} + if self.bold: + d["bold"] = True + if self.italic: + d["italic"] = True + if self.underline: + d["underline"] = True + if self.strikethrough: + d["strikethrough"] = True + if self.code: + d["code"] = True + if self.superscript: + d["superscript"] = True + if self.subscript: + d["subscript"] = True + return d + + +@dataclass +class Link: + """Hyperlink () containing inline children.""" + + href: str + children: list[InlineNode] = field(default_factory=list) + title: str = "" + xpath: str = "" + + def to_dict(self) -> dict: + return { + "type": "link", + "href": self.href, + "title": self.title, + "xpath": self.xpath, + "children": [c.to_dict() for c in self.children], + } + + +@dataclass +class Image: + """Inline image ( or ).""" + + alt: str + src: str # rId resolved to a filename / URL when possible + xpath: str = "" + + def to_dict(self) -> dict: + return {"type": "image", "alt": self.alt, "src": self.src, "xpath": self.xpath} + + +@dataclass +class LineBreak: + """Explicit line break ().""" + + xpath: str = "" + + def to_dict(self) -> dict: + return {"type": "line_break", "xpath": self.xpath} + + +@dataclass +class SoftBreak: + """Soft (rendered) break — used for without an explicit type.""" + + xpath: str = "" + + def to_dict(self) -> dict: + return {"type": "soft_break", "xpath": self.xpath} + + +# Union of all inline node types +InlineNode = Union[TextRun, Link, Image, LineBreak, SoftBreak] + + +# --------------------------------------------------------------------------- +# Block nodes +# --------------------------------------------------------------------------- 
+ + +@dataclass +class Paragraph: + """A body paragraph () with no heading style.""" + + children: list[InlineNode] = field(default_factory=list) + xpath: str = "" + # Preserved style name from the source (e.g. "Normal", "Quote") + style_id: str = "" + + def to_dict(self) -> dict: + return { + "type": "paragraph", + "style_id": self.style_id, + "xpath": self.xpath, + "children": [c.to_dict() for c in self.children], + } + + +@dataclass +class Heading: + """A paragraph with a heading style, mapped to GFM h1–h6.""" + + level: int # 1–6 + children: list[InlineNode] = field(default_factory=list) + xpath: str = "" + style_id: str = "" + + def to_dict(self) -> dict: + return { + "type": "heading", + "level": self.level, + "style_id": self.style_id, + "xpath": self.xpath, + "children": [c.to_dict() for c in self.children], + } + + +@dataclass +class CodeBlock: + """A preformatted / code paragraph.""" + + code: str + language: str = "" + xpath: str = "" + + def to_dict(self) -> dict: + return { + "type": "code_block", + "language": self.language, + "code": self.code, + "xpath": self.xpath, + } + + +@dataclass +class BlockQuote: + """A block quote. DOCX doesn't have a native equivalent; mapped from + style names like 'Quote', 'Intense Quote', 'Block Text'.""" + + children: list[BlockNode] = field(default_factory=list) + xpath: str = "" + + def to_dict(self) -> dict: + return { + "type": "block_quote", + "xpath": self.xpath, + "children": [c.to_dict() for c in self.children], + } + + +@dataclass +class ListItem: + """A single list item. 
May contain nested blocks (continuation paragraphs + and sub-lists are represented as children).""" + + children: list[BlockNode] = field(default_factory=list) + xpath: str = "" + # The depth at which this item appears (0 = top level) + depth: int = 0 + + def to_dict(self) -> dict: + return { + "type": "list_item", + "depth": self.depth, + "xpath": self.xpath, + "children": [c.to_dict() for c in self.children], + } + + +@dataclass +class BulletList: + """An unordered list.""" + + items: list[ListItem] = field(default_factory=list) + xpath: str = "" + + def to_dict(self) -> dict: + return { + "type": "bullet_list", + "xpath": self.xpath, + "items": [i.to_dict() for i in self.items], + } + + +@dataclass +class OrderedList: + """An ordered list.""" + + items: list[ListItem] = field(default_factory=list) + start: int = 1 + xpath: str = "" + + def to_dict(self) -> dict: + return { + "type": "ordered_list", + "start": self.start, + "xpath": self.xpath, + "items": [i.to_dict() for i in self.items], + } + + +@dataclass +class TableCell: + """A single table cell ().""" + + children: list[BlockNode] = field(default_factory=list) + xpath: str = "" + colspan: int = 1 + rowspan: int = 1 + is_header: bool = False + + def to_dict(self) -> dict: + return { + "type": "table_cell", + "is_header": self.is_header, + "colspan": self.colspan, + "rowspan": self.rowspan, + "xpath": self.xpath, + "children": [c.to_dict() for c in self.children], + } + + +@dataclass +class TableRow: + """A table row ().""" + + cells: list[TableCell] = field(default_factory=list) + xpath: str = "" + is_header: bool = False + + def to_dict(self) -> dict: + return { + "type": "table_row", + "is_header": self.is_header, + "xpath": self.xpath, + "cells": [c.to_dict() for c in self.cells], + } + + +@dataclass +class Table: + """A table ().""" + + rows: list[TableRow] = field(default_factory=list) + xpath: str = "" + + def to_dict(self) -> dict: + return { + "type": "table", + "xpath": self.xpath, + "rows": 
[r.to_dict() for r in self.rows], + } + + +@dataclass +class ThematicBreak: + """A horizontal rule — mapped from page-break paragraphs or explicit HR + styles.""" + + xpath: str = "" + + def to_dict(self) -> dict: + return {"type": "thematic_break", "xpath": self.xpath} + + +@dataclass +class RawBlock: + """A block that couldn't be mapped to a GFM construct. The original XML + is preserved verbatim so it can be round-tripped.""" + + xml: str + xpath: str = "" + + def to_dict(self) -> dict: + return {"type": "raw_block", "xml": self.xml, "xpath": self.xpath} + + +# Union of all block node types +BlockNode = Union[ + Paragraph, + Heading, + CodeBlock, + BlockQuote, + BulletList, + OrderedList, + Table, + ThematicBreak, + RawBlock, +] + + +# --------------------------------------------------------------------------- +# Root +# --------------------------------------------------------------------------- + + +@dataclass +class Document: + """The root of the AST. Represents the full word/document.xml body.""" + + children: list[BlockNode] = field(default_factory=list) + # XPath to + xpath: str = "/w:document/w:body" + # Source metadata + source_path: str = "" + + def to_dict(self) -> dict: + return { + "type": "document", + "source_path": self.source_path, + "xpath": self.xpath, + "children": [c.to_dict() for c in self.children], + } diff --git a/extradocx/src/extradocx/cli.py b/extradocx/src/extradocx/cli.py new file mode 100644 index 00000000..24f6a364 --- /dev/null +++ b/extradocx/src/extradocx/cli.py @@ -0,0 +1,90 @@ +""" +CLI for extradocx. 
+ +Usage:: + + python -m extradocx [--output-dir DIR] [--json] [--markdown] + +Outputs: + .ast.json — full-fidelity AST JSON + .md — GFM markdown +""" + +from __future__ import annotations + +import argparse +import sys +from pathlib import Path + +from extradocx import DocxParser, to_json, to_markdown + + +def main(argv: list[str] | None = None) -> int: + parser = argparse.ArgumentParser( + prog="extradocx", + description="Convert a Microsoft Word .docx to GFM Markdown via an AST.", + ) + parser.add_argument("docx", metavar="INPUT.docx", help="Path to the .docx file") + parser.add_argument( + "--output-dir", + "-o", + default=None, + help="Directory for output files (default: same dir as INPUT.docx)", + ) + parser.add_argument( + "--json", + action="store_true", + default=True, + help="Write AST as JSON (default: on)", + ) + parser.add_argument( + "--no-json", + dest="json", + action="store_false", + help="Disable JSON output", + ) + parser.add_argument( + "--markdown", + action="store_true", + default=True, + help="Write GFM markdown (default: on)", + ) + parser.add_argument( + "--no-markdown", + dest="markdown", + action="store_false", + help="Disable markdown output", + ) + args = parser.parse_args(argv) + + docx_path = Path(args.docx) + if not docx_path.exists(): + print(f"error: file not found: {docx_path}", file=sys.stderr) + return 1 + if not docx_path.suffix.lower() == ".docx": + print(f"warning: file doesn't have .docx extension: {docx_path}", file=sys.stderr) + + out_dir = Path(args.output_dir) if args.output_dir else docx_path.parent + out_dir.mkdir(parents=True, exist_ok=True) + stem = docx_path.stem + + print(f"Parsing {docx_path} …", flush=True) + doc = DocxParser(docx_path).parse() + + if args.json: + json_path = out_dir / f"{stem}.ast.json" + json_str = to_json(doc) + json_path.write_text(json_str, encoding="utf-8") + print(f" AST JSON → {json_path} ({len(json_str):,} bytes)") + + if args.markdown: + md_path = out_dir / f"{stem}.md" + md_str = 
to_markdown(doc) + md_path.write_text(md_str, encoding="utf-8") + print(f" Markdown → {md_path} ({len(md_str):,} bytes)") + + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/extradocx/src/extradocx/parser.py b/extradocx/src/extradocx/parser.py new file mode 100644 index 00000000..a9032618 --- /dev/null +++ b/extradocx/src/extradocx/parser.py @@ -0,0 +1,758 @@ +""" +DOCX → GFM Markdown AST parser. + +Reads word/document.xml from a .docx archive and produces an AST whose nodes +are defined in ast_nodes.py. Every node carries an `xpath` attribute that +points back to the originating element in word/document.xml. + +Design notes: + - We use stdlib xml.etree.ElementTree for XML parsing. lxml would give us + getpath() for free, but we compute XPaths manually so there's no hard dep. + - Style inheritance is resolved from word/styles.xml. + - List detection uses word/numbering.xml to determine bullet vs ordered. + - Relationships (hyperlinks, images) are resolved from + word/_rels/document.xml.rels. 
+ +Usage:: + + from extradocx.parser import DocxParser + + parser = DocxParser("path/to/file.docx") + doc = parser.parse() # returns ast_nodes.Document +""" + +from __future__ import annotations + +import re +import zipfile +import xml.etree.ElementTree as ET +from dataclasses import dataclass, field +from pathlib import Path +from typing import Optional + +from extradocx.ast_nodes import ( + BlockNode, + BlockQuote, + BulletList, + CodeBlock, + Document, + Heading, + Image, + InlineNode, + LineBreak, + Link, + ListItem, + OrderedList, + Paragraph, + RawBlock, + SoftBreak, + Table, + TableCell, + TableRow, + TextRun, + ThematicBreak, +) + +# --------------------------------------------------------------------------- +# XML namespace map used throughout this file +# --------------------------------------------------------------------------- + +NS = { + "w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main", + "r": "http://schemas.openxmlformats.org/officeDocument/2006/relationships", + "wp": "http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing", + "a": "http://schemas.openxmlformats.org/drawingml/2006/main", + "pic": "http://schemas.openxmlformats.org/drawingml/2006/picture", + "v": "urn:schemas-microsoft-com:vml", + "mc": "http://schemas.openxmlformats.org/markup-compatibility/2006", + "w14": "http://schemas.microsoft.com/office/word/2010/wordml", +} + + +def _tag(ns: str, local: str) -> str: + """Return Clark-notation tag string, e.g. 
'{...ns...}local'.""" + return f"{{{NS[ns]}}}{local}" + + +# --------------------------------------------------------------------------- +# XPath helper – stdlib ET doesn't give getpath() so we track position manually +# --------------------------------------------------------------------------- + + +def _element_xpath(path_parts: list[tuple[str, int]]) -> str: + """Build an XPath string from a list of (clark-tag, 1-based-index) pairs.""" + parts: list[str] = [] + for clark, idx in path_parts: + # Simplify clark notation → prefix:local for readability + local = clark + for prefix, uri in NS.items(): + uri_braced = f"{{{uri}}}" + if clark.startswith(uri_braced): + local = f"{prefix}:{clark[len(uri_braced):]}" + break + parts.append(f"{local}[{idx}]") + return "/" + "/".join(parts) + + +# --------------------------------------------------------------------------- +# Style resolution helpers +# --------------------------------------------------------------------------- + + +@dataclass +class StyleInfo: + """Resolved style properties for a paragraph or character style.""" + + style_id: str = "" + name: str = "" + # Heading level 1-6 if this is a heading style, else None + heading_level: Optional[int] = None + is_code: bool = False + is_quote: bool = False + is_title: bool = False # Document title → rendered as h1 + is_bullet_list: bool = False # "List Bullet" family → bullet list item + is_ordered_list: bool = False # "List Number" family → ordered list item + list_depth: int = 0 # 0 = top level; 1 = nested once; etc. 
# Style-name patterns used to classify the entries of word/styles.xml.
# The heading pattern captures one or more digits so that the numeric suffix is
# read as a whole and then clamped to the GFM range inside _parse_styles.
_HEADING_RE = re.compile(r"heading\s*(\d+)", re.IGNORECASE)
_CODE_NAMES = {"Code", "CodeBlock", "Code Block", "Verbatim", "Preformatted"}
_QUOTE_NAMES = {"Quote", "Intense Quote", "Block Text", "Blockquote"}
# Matches "List Bullet", "List Bullet 2", "List Bullet 3"
_LIST_BULLET_RE = re.compile(r"list bullet\s*(\d?)", re.IGNORECASE)
# Matches "List Number", "List Number 2", "List Number 3"
_LIST_NUMBER_RE = re.compile(r"list number\s*(\d?)", re.IGNORECASE)

# Case-folded copies so code/quote names are matched the same way as the
# heading, title and list names (which are all matched case-insensitively).
_CODE_NAMES_FOLDED = frozenset(n.casefold() for n in _CODE_NAMES)
_QUOTE_NAMES_FOLDED = frozenset(n.casefold() for n in _QUOTE_NAMES)


def _list_depth(suffix: str) -> int:
    """Map the numeric suffix of a "List Bullet N" / "List Number N" name to a depth.

    An empty suffix means the top-level style (depth 0); "2" means depth 1, and
    so on. The result is never negative.
    """
    return max(0, int(suffix) - 1) if suffix else 0


def _parse_styles(xml_bytes: bytes) -> dict[str, StyleInfo]:
    """Parse word/styles.xml → map of styleId → StyleInfo.

    Each ``w:style`` element that has a ``w:styleId`` is classified from its
    display name (``w:name/@w:val``, falling back to the style id when there is
    no ``w:name`` child). Exactly one classification is applied, tried in this
    order: heading, title, code, quote, bullet list, ordered list.

    Parameters
    ----------
    xml_bytes:
        Raw bytes of the styles part.

    Returns
    -------
    dict[str, StyleInfo]
        One entry per style id; styles without an id are skipped.

    Raises
    ------
    xml.etree.ElementTree.ParseError
        If *xml_bytes* is not well-formed XML.
    """
    styles: dict[str, StyleInfo] = {}
    root = ET.fromstring(xml_bytes)

    for style_el in root.findall(f".//{_tag('w', 'style')}"):
        sid = style_el.get(_tag("w", "styleId"), "")
        if not sid:
            continue

        name_el = style_el.find(_tag("w", "name"))
        name = name_el.get(_tag("w", "val"), "") if name_el is not None else sid
        folded = name.casefold()

        info = StyleInfo(style_id=sid, name=name)
        styles[sid] = info

        heading = _HEADING_RE.match(name)
        if heading:
            # Clamp on both sides: GFM only has h1–h6, and a "heading 0" style
            # must not produce a level-0 heading.
            info.heading_level = max(1, min(int(heading.group(1)), 6))
            continue
        if folded == "title":
            info.is_title = True
            continue
        if folded in _CODE_NAMES_FOLDED:
            info.is_code = True
            continue
        if folded in _QUOTE_NAMES_FOLDED:
            info.is_quote = True
            continue

        bullet = _LIST_BULLET_RE.match(name)
        if bullet:
            info.is_bullet_list = True
            info.list_depth = _list_depth(bullet.group(1))
            continue
        number = _LIST_NUMBER_RE.match(name)
        if number:
            info.is_ordered_list = True
            info.list_depth = _list_depth(number.group(1))

    return styles


# ---------------------------------------------------------------------------
# Numbering resolution
# ---------------------------------------------------------------------------


@dataclass
class NumFmt:
    """Resolved numbering info for a numId+ilvl combination."""

    is_ordered: bool  # True = decimal/alpha/roman; False = bullet
    start_val: int = 1  # first number of the list at this level
def _int_val(el: Optional[ET.Element], default: int) -> int:
    """Return the integer ``w:val`` of *el*, or *default* if absent or malformed."""
    if el is None:
        return default
    try:
        return int(el.get(_tag("w", "val"), ""))
    except ValueError:
        # A non-numeric value must not abort parsing of the whole document.
        return default


def _level_format(lvl: ET.Element) -> NumFmt:
    """Build a NumFmt from a ``w:lvl`` element (its ``w:numFmt`` and ``w:start``)."""
    fmt_el = lvl.find(_tag("w", "numFmt"))
    # A level without w:numFmt is treated as a bullet level.
    fmt_val = fmt_el.get(_tag("w", "val"), "bullet") if fmt_el is not None else "bullet"
    return NumFmt(
        is_ordered=fmt_val not in ("bullet", "none", ""),
        start_val=_int_val(lvl.find(_tag("w", "start")), 1),
    )


def _parse_numbering(xml_bytes: bytes) -> dict[tuple[str, str], NumFmt]:
    """Parse word/numbering.xml → {(numId, ilvl): NumFmt}.

    Every ``w:num`` is resolved through its ``w:abstractNumId`` reference to the
    levels of the matching ``w:abstractNum``. ``w:lvlOverride`` children of the
    ``w:num`` are then applied on top: a nested ``w:lvl`` replaces the level
    definition and ``w:startOverride`` replaces the start value of a known
    level.

    Parameters
    ----------
    xml_bytes:
        Raw bytes of the numbering part.

    Returns
    -------
    dict[tuple[str, str], NumFmt]
        Keyed by the string values of ``w:numId`` and ``w:ilvl``. A ``w:num``
        without an ``w:abstractNumId`` child contributes nothing.

    Raises
    ------
    xml.etree.ElementTree.ParseError
        If *xml_bytes* is not well-formed XML.
    """
    root = ET.fromstring(xml_bytes)

    # Collect abstractNum definitions: abstractNumId → {ilvl: NumFmt}
    abstract: dict[str, dict[str, NumFmt]] = {}
    for abs_el in root.findall(f".//{_tag('w', 'abstractNum')}"):
        abs_id = abs_el.get(_tag("w", "abstractNumId"), "")
        abstract[abs_id] = {
            lvl.get(_tag("w", "ilvl"), "0"): _level_format(lvl)
            for lvl in abs_el.findall(f".//{_tag('w', 'lvl')}")
        }

    # Map numId → abstractNumId, then apply the per-instance overrides
    result: dict[tuple[str, str], NumFmt] = {}
    for num_el in root.findall(f".//{_tag('w', 'num')}"):
        abs_ref = num_el.find(_tag("w", "abstractNumId"))
        if abs_ref is None:
            continue
        num_id = num_el.get(_tag("w", "numId"), "")
        # Copy so overrides for one numId never leak into another numId that
        # shares the same abstract definition.
        levels = dict(abstract.get(abs_ref.get(_tag("w", "val"), ""), {}))

        for override in num_el.findall(_tag("w", "lvlOverride")):
            ilvl = override.get(_tag("w", "ilvl"), "0")
            replacement = override.find(_tag("w", "lvl"))
            if replacement is not None:
                levels[ilvl] = _level_format(replacement)
            start_override = override.find(_tag("w", "startOverride"))
            base = levels.get(ilvl)
            if start_override is not None and base is not None:
                levels[ilvl] = NumFmt(
                    is_ordered=base.is_ordered,
                    start_val=_int_val(start_override, base.start_val),
                )

        for ilvl, fmt in levels.items():
            result[(num_id, ilvl)] = fmt

    return result


# ---------------------------------------------------------------------------
# Relationship resolution
# ---------------------------------------------------------------------------


def _parse_rels(xml_bytes: bytes) -> dict[str, str]:
    """Parse word/_rels/document.xml.rels → {rId: target}.

    Every direct child of the root element that has a non-empty ``Id``
    attribute is recorded; a missing ``Target`` is stored as an empty string.

    Raises
    ------
    xml.etree.ElementTree.ParseError
        If *xml_bytes* is not well-formed XML.
    """
    rels: dict[str, str] = {}
    for rel in ET.fromstring(xml_bytes):
        rid = rel.get("Id", "")
        if rid:
            rels[rid] = rel.get("Target", "")
    return rels


# ---------------------------------------------------------------------------
# Run properties helper
# ---------------------------------------------------------------------------

# w:val values that switch an on/off ("toggle") run property off.
_TOGGLE_OFF_VALUES = ("false", "0", "off")


def _toggle_is_on(rpr: Optional[ET.Element], local: str) -> bool:
    """Return True if the on/off property ``w:<local>`` is enabled in *rpr*.

    The property counts as enabled when the element is present and its
    ``w:val`` attribute is either missing or not one of the "off" spellings.
    """
    if rpr is None:
        return False
    el = rpr.find(_tag("w", local))
    if el is None:
        return False
    return el.get(_tag("w", "val"), "true").lower() not in _TOGGLE_OFF_VALUES


def _vert_align(rpr: Optional[ET.Element]) -> str:
    """Return the ``w:vertAlign/@w:val`` of *rpr*, or "" when not set."""
    if rpr is None:
        return ""
    vert = rpr.find(_tag("w", "vertAlign"))
    return vert.get(_tag("w", "val"), "") if vert is not None else ""


def _run_is_bold(rpr: Optional[ET.Element]) -> bool:
    """Return True if the run properties enable bold (``w:b``)."""
    return _toggle_is_on(rpr, "b")


def _run_is_italic(rpr: Optional[ET.Element]) -> bool:
    """Return True if the run properties enable italic (``w:i``)."""
    return _toggle_is_on(rpr, "i")


def _run_is_underline(rpr: Optional[ET.Element]) -> bool:
    """Return True if the run properties carry an underline (``w:u``).

    ``w:u`` is not a plain toggle: its value names an underline style, so a
    missing value is read as "single" and only "none"/"false"/"0" mean off.
    """
    if rpr is None:
        return False
    u = rpr.find(_tag("w", "u"))
    if u is None:
        return False
    return u.get(_tag("w", "val"), "single").lower() not in ("none", "false", "0")


def _run_is_strike(rpr: Optional[ET.Element]) -> bool:
    """Return True if the run is struck through (``w:strike`` or ``w:dstrike``).

    The ``w:val`` attribute is honoured, so ``<w:strike w:val="false"/>`` —
    an explicit "off" — is not reported as strikethrough.
    """
    return _toggle_is_on(rpr, "strike") or _toggle_is_on(rpr, "dstrike")


def _run_is_code(rpr: Optional[ET.Element]) -> bool:
    """Detect monospace / code font by rStyle or font name.

    True when ``w:rStyle`` names one of a few known code character styles, or
    when any ``w:rFonts`` attribute value contains a monospace-looking
    substring. The font check is a substring heuristic and can therefore match
    unrelated fonts whose names happen to contain e.g. "mono".
    """
    if rpr is None:
        return False
    rstyle = rpr.find(_tag("w", "rStyle"))
    if rstyle is not None:
        val = rstyle.get(_tag("w", "val"), "")
        if val.lower() in ("verbatimchar", "code", "codechar", "inlinecode"):
            return True
    fonts = rpr.find(_tag("w", "rFonts"))
    if fonts is not None:
        markers = ("courier", "consolas", "mono", "code")
        return any(
            marker in attr.lower() for attr in fonts.attrib.values() for marker in markers
        )
    return False


def _run_is_super(rpr: Optional[ET.Element]) -> bool:
    """Return True if the run is vertically aligned as superscript."""
    return _vert_align(rpr) == "superscript"


def _run_is_sub(rpr: Optional[ET.Element]) -> bool:
    """Return True if the run is vertically aligned as subscript."""
    return _vert_align(rpr) == "subscript"
# ---------------------------------------------------------------------------
# Main parser
# ---------------------------------------------------------------------------


class DocxParser:
    """Parses a .docx file into a GFM-oriented AST.

    Parameters
    ----------
    docx_path:
        Path to the .docx file.
    """

    def __init__(self, docx_path: str | Path) -> None:
        self._path = Path(docx_path)
        self._styles: dict[str, StyleInfo] = {}
        self._numbering: dict[tuple[str, str], NumFmt] = {}
        self._rels: dict[str, str] = {}
        # The two attributes below are initialised here but are not read by any
        # method of this class; XPaths are built by string concatenation while
        # descending the tree instead.
        self._position_stack: list[dict[str, int]] = []
        self._xpath_parts: list[tuple[str, int]] = []

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def parse(self) -> Document:
        """Parse the docx and return the root Document AST node.

        The optional support parts (styles, numbering, relationships) are
        loaded when present in the archive; ``word/document.xml`` is required.

        Raises
        ------
        zipfile.BadZipFile
            If the file is not a valid zip archive.
        KeyError
            If the archive has no ``word/document.xml`` member.
        xml.etree.ElementTree.ParseError
            If one of the XML parts is not well-formed.
        """
        with zipfile.ZipFile(self._path) as zf:
            names = set(zf.namelist())

            # Load support files
            if "word/styles.xml" in names:
                self._styles = _parse_styles(zf.read("word/styles.xml"))
            if "word/numbering.xml" in names:
                self._numbering = _parse_numbering(zf.read("word/numbering.xml"))
            if "word/_rels/document.xml.rels" in names:
                self._rels = _parse_rels(zf.read("word/_rels/document.xml.rels"))

            doc_xml = zf.read("word/document.xml")

        root = ET.fromstring(doc_xml)
        body = root.find(_tag("w", "body"))
        if body is None:
            return Document(source_path=str(self._path))

        body_xpath = "/w:document[1]/w:body[1]"
        return Document(
            children=self._parse_body(body, body_xpath),
            xpath=body_xpath,
            source_path=str(self._path),
        )

    # ------------------------------------------------------------------
    # Body-level parsing
    # ------------------------------------------------------------------

    def _parse_body(self, body: ET.Element, body_xpath: str) -> list[BlockNode]:
        """Convert the children of the body element into a list of BlockNodes.

        Consecutive list paragraphs are grouped into BulletList / OrderedList
        nodes; a change of numId or list kind starts a new list.
        """
        return self._group_lists(self._collect_raw_blocks(body, body_xpath))

    def _collect_raw_blocks(
        self, parent: ET.Element, parent_xpath: str
    ) -> list[BlockNode]:
        """Convert each direct child of *parent* to a BlockNode (ungrouped).

        Paragraphs and tables are converted; structured document tags are
        replaced by the blocks of their ``w:sdtContent``. Every other child
        (e.g. ``w:sectPr``) is skipped, but still counted per tag so the
        positional XPath indices stay correct.
        """
        blocks: list[BlockNode] = []
        tag_counts: dict[str, int] = {}

        for child in parent:
            tag = child.tag
            tag_counts[tag] = tag_counts.get(tag, 0) + 1
            child_xpath = f"{parent_xpath}/{self._clark_to_prefix(tag)}[{tag_counts[tag]}]"

            if tag == _tag("w", "p"):
                block = self._parse_paragraph(child, child_xpath)
                if block is not None:
                    blocks.append(block)
            elif tag == _tag("w", "tbl"):
                blocks.append(self._parse_table(child, child_xpath))
            elif tag == _tag("w", "sdt"):
                # Structured document tag – descend into content
                content = child.find(_tag("w", "sdtContent"))
                if content is not None:
                    blocks.extend(
                        self._collect_raw_blocks(content, child_xpath + "/w:sdtContent[1]")
                    )

        return blocks

    # ------------------------------------------------------------------
    # Paragraph parsing
    # ------------------------------------------------------------------

    def _parse_paragraph(
        self, para: ET.Element, xpath: str
    ) -> Optional[BlockNode]:
        """Convert a paragraph element to the appropriate BlockNode.

        Returns None for a paragraph with no inline content at all. A
        paragraph whose only content is breaks and that contains a page break
        becomes a ThematicBreak. Explicit numbering (``w:numPr``) takes
        precedence over the paragraph style; otherwise the style decides
        between heading, code block, block quote, style-based list item and
        plain paragraph.
        """
        ppr = para.find(_tag("w", "pPr"))
        style_id = ""
        num_id: Optional[str] = None
        ilvl = "0"

        if ppr is not None:
            pstyle = ppr.find(_tag("w", "pStyle"))
            if pstyle is not None:
                style_id = pstyle.get(_tag("w", "val"), "")
            numpr = ppr.find(_tag("w", "numPr"))
            if numpr is not None:
                nid_el = numpr.find(_tag("w", "numId"))
                ilvl_el = numpr.find(_tag("w", "ilvl"))
                if nid_el is not None:
                    num_id = nid_el.get(_tag("w", "val"), None)
                if ilvl_el is not None:
                    ilvl = ilvl_el.get(_tag("w", "val"), "0")

        style_info = self._styles.get(style_id)
        inlines = self._parse_inlines(para, xpath)

        # _parse_run turns every w:br into a break node, so a page-break-only
        # paragraph is never "empty"; test for "nothing but breaks" instead.
        only_breaks = all(isinstance(n, (LineBreak, SoftBreak)) for n in inlines)
        if only_breaks:
            if self._has_page_break(para):
                return ThematicBreak(xpath=xpath)
            if not inlines:
                # Skip truly empty paragraphs (no content at all)
                return None

        # List paragraph via numPr (explicit numbering in XML)
        if num_id and num_id != "0":
            fmt = self._numbering.get((num_id, ilvl))
            return _ListParagraph(
                inlines=inlines,
                xpath=xpath,
                style_id=style_id,
                num_id=num_id,
                ilvl=self._to_int(ilvl, 0),
                is_ordered=fmt.is_ordered if fmt else False,
                start_val=fmt.start_val if fmt else 1,
            )

        if style_info is None:
            return Paragraph(children=inlines, xpath=xpath, style_id=style_id)

        if style_info.heading_level is not None:
            return Heading(
                level=style_info.heading_level,
                children=inlines,
                xpath=xpath,
                style_id=style_id,
            )
        if style_info.is_title:
            # Document title → treat as h1
            return Heading(level=1, children=inlines, xpath=xpath, style_id=style_id)
        if style_info.is_code:
            return CodeBlock(code=self._code_text(inlines), xpath=xpath)
        if style_info.is_quote:
            inner_para = Paragraph(children=inlines, xpath=xpath, style_id=style_id)
            return BlockQuote(children=[inner_para], xpath=xpath)
        # List via named style (e.g. ListBullet, ListNumber from python-docx)
        if style_info.is_bullet_list or style_info.is_ordered_list:
            return _ListParagraph(
                inlines=inlines,
                xpath=xpath,
                style_id=style_id,
                num_id="",
                ilvl=style_info.list_depth,
                # Bullet wins when both flags are set, as in the original order.
                is_ordered=not style_info.is_bullet_list,
            )

        return Paragraph(children=inlines, xpath=xpath, style_id=style_id)

    @staticmethod
    def _has_page_break(para: ET.Element) -> bool:
        """Return True if any run below *para* holds a ``w:br`` of type "page"."""
        br_tag = _tag("w", "br")
        type_attr = _tag("w", "type")
        return any(
            br.get(type_attr, "") == "page"
            for run in para.iter(_tag("w", "r"))
            for br in run.findall(br_tag)
        )

    @staticmethod
    def _code_text(inlines: list[InlineNode]) -> str:
        """Join the text of a code paragraph, keeping breaks as newlines."""
        pieces: list[str] = []
        for node in inlines:
            if isinstance(node, TextRun):
                pieces.append(node.text)
            elif isinstance(node, (LineBreak, SoftBreak)):
                # Without this, two code lines separated by w:br would be glued.
                pieces.append("\n")
        return "".join(pieces)

    @staticmethod
    def _to_int(value: Optional[str], default: int) -> int:
        """Return ``int(value)``, or *default* when *value* is None or not numeric."""
        try:
            return int(value)  # type: ignore[arg-type]
        except (TypeError, ValueError):
            return default

    # ------------------------------------------------------------------
    # Inline parsing
    # ------------------------------------------------------------------

    def _parse_inlines(self, para: ET.Element, para_xpath: str) -> list[InlineNode]:
        """Extract inline nodes from the direct children of a paragraph.

        Runs and hyperlinks are converted; runs inside a tracked insertion
        (``w:ins``) are treated as normal content. Tracked deletions, bookmarks
        and all other children are skipped.
        """
        inlines: list[InlineNode] = []
        counts: dict[str, int] = {}
        run_tag = _tag("w", "r")

        for child in para:
            tag = child.tag
            counts[tag] = counts.get(tag, 0) + 1
            child_xpath = f"{para_xpath}/{self._clark_to_prefix(tag)}[{counts[tag]}]"

            if tag == run_tag:
                inlines.extend(self._parse_run(child, child_xpath))
            elif tag == _tag("w", "hyperlink"):
                inlines.append(self._parse_hyperlink(child, child_xpath))
            elif tag == _tag("w", "ins"):
                # Track-change insertion – treat as normal content. Each run
                # gets its own positional XPath below the w:ins element so the
                # node still points at the run it came from.
                run_idx = 0
                for sub in child:
                    if sub.tag == run_tag:
                        run_idx += 1
                        inlines.extend(
                            self._parse_run(sub, f"{child_xpath}/w:r[{run_idx}]")
                        )
            # w:del (deleted text), bookmarks, smart tags, etc. are skipped

        return inlines

    def _parse_run(self, run: ET.Element, xpath: str) -> list[InlineNode]:
        """Convert a run element to one or more InlineNodes.

        Non-empty ``w:t`` and every ``w:tab`` become TextRun nodes carrying the
        run's formatting; ``w:br`` becomes a LineBreak (type "textWrapping") or
        a SoftBreak (any other or missing type); ``w:drawing`` becomes an Image
        when it references a picture.
        """
        rpr = run.find(_tag("w", "rPr"))
        fmt = {
            "bold": _run_is_bold(rpr),
            "italic": _run_is_italic(rpr),
            "underline": _run_is_underline(rpr),
            "strikethrough": _run_is_strike(rpr),
            "code": _run_is_code(rpr),
            "superscript": _run_is_super(rpr),
            "subscript": _run_is_sub(rpr),
        }

        nodes: list[InlineNode] = []
        counts: dict[str, int] = {}

        for child in run:
            tag = child.tag
            counts[tag] = counts.get(tag, 0) + 1
            child_xpath = f"{xpath}/{self._clark_to_prefix(tag)}[{counts[tag]}]"

            if tag == _tag("w", "t"):
                if child.text:
                    nodes.append(TextRun(text=child.text, xpath=child_xpath, **fmt))
            elif tag == _tag("w", "br"):
                if child.get(_tag("w", "type"), "") == "textWrapping":
                    nodes.append(LineBreak(xpath=child_xpath))
                else:
                    nodes.append(SoftBreak(xpath=child_xpath))
            elif tag == _tag("w", "drawing"):
                img = self._parse_drawing(child, child_xpath)
                if img is not None:
                    nodes.append(img)
            elif tag == _tag("w", "tab"):
                # A tab carries the same formatting as the text around it.
                nodes.append(TextRun(text="\t", xpath=child_xpath, **fmt))

        return nodes

    def _parse_hyperlink(self, el: ET.Element, xpath: str) -> Link:
        """Convert a hyperlink element to a Link node.

        The target is looked up through ``r:id`` in the relationships; when
        that yields nothing the ``w:anchor`` attribute is used as an in-document
        ``#anchor``. ``w:tooltip`` becomes the link title. Only direct run
        children contribute inline content.
        """
        href = self._rels.get(el.get(_tag("r", "id"), ""), "")
        if not href:
            # Inline anchor
            href = "#" + el.get(_tag("w", "anchor"), "")

        children: list[InlineNode] = []
        run_idx = 0
        for child in el:
            if child.tag == _tag("w", "r"):
                run_idx += 1
                children.extend(self._parse_run(child, f"{xpath}/w:r[{run_idx}]"))

        return Link(
            href=href,
            children=children,
            title=el.get(_tag("w", "tooltip"), ""),
            xpath=xpath,
        )

    def _parse_drawing(self, el: ET.Element, xpath: str) -> Optional[Image]:
        """Extract image info from a drawing element.

        Returns None when the drawing contains no ``a:blip`` (no picture
        reference), so such drawings do not produce an Image with an empty
        source. The source is the relationship target of ``r:embed`` (or
        ``r:link``), falling back to the raw id when it is not in the
        relationships. Alt text comes from ``wp:docPr`` (``descr``, else
        ``name``).
        """
        blip = el.find(f".//{_tag('a', 'blip')}")
        if blip is None:
            return None

        rid = blip.get(_tag("r", "embed"), "") or blip.get(_tag("r", "link"), "")
        src = self._rels.get(rid, rid)

        docpr = el.find(f".//{_tag('wp', 'docPr')}")
        alt = docpr.get("descr", docpr.get("name", "")) if docpr is not None else ""

        return Image(alt=alt, src=src, xpath=xpath)

    # ------------------------------------------------------------------
    # Table parsing
    # ------------------------------------------------------------------

    def _parse_table(self, tbl: ET.Element, xpath: str) -> Table:
        """Convert a table element; the first row is marked as the header row."""
        rows: list[TableRow] = []
        for child in tbl:
            if child.tag == _tag("w", "tr"):
                tr_idx = len(rows) + 1
                rows.append(
                    self._parse_table_row(child, f"{xpath}/w:tr[{tr_idx}]", tr_idx == 1)
                )
        return Table(rows=rows, xpath=xpath)

    def _parse_table_row(
        self, tr: ET.Element, xpath: str, is_first_row: bool
    ) -> TableRow:
        """Convert a table row; *is_first_row* is propagated as the header flag."""
        cells: list[TableCell] = []
        for child in tr:
            if child.tag == _tag("w", "tc"):
                cell_xpath = f"{xpath}/w:tc[{len(cells) + 1}]"
                cells.append(self._parse_table_cell(child, cell_xpath, is_first_row))
        return TableRow(cells=cells, xpath=xpath, is_header=is_first_row)

    def _parse_table_cell(
        self, tc: ET.Element, xpath: str, is_header_row: bool
    ) -> TableCell:
        """Convert a table cell, including nested blocks and its grid span.

        ``rowspan`` is left at the TableCell default; vertical merges are not
        resolved here.
        """
        children = self._group_lists(self._collect_raw_blocks(tc, xpath))

        # Detect grid span (colspan); a malformed or non-positive value means 1
        colspan = 1
        tcpr = tc.find(_tag("w", "tcPr"))
        if tcpr is not None:
            gspan = tcpr.find(_tag("w", "gridSpan"))
            if gspan is not None:
                colspan = max(1, self._to_int(gspan.get(_tag("w", "val"), "1"), 1))

        return TableCell(
            children=children,
            xpath=xpath,
            colspan=colspan,
            is_header=is_header_row,
        )

    # ------------------------------------------------------------------
    # List grouping
    # ------------------------------------------------------------------

    def _group_lists(self, blocks: list[BlockNode]) -> list[BlockNode]:
        """Group consecutive _ListParagraph nodes into BulletList / OrderedList.

        Non-list blocks are passed through unchanged and in order.
        """
        result: list[BlockNode] = []
        pending: list[_ListParagraph] = []

        for block in blocks:
            if isinstance(block, _ListParagraph):
                pending.append(block)
                continue
            if pending:
                result.extend(self._build_list_nodes(pending))
                pending = []
            result.append(block)

        if pending:
            result.extend(self._build_list_nodes(pending))
        return result

    def _build_list_nodes(
        self, group: list[_ListParagraph]
    ) -> list[BlockNode]:
        """Convert a flat run of _ListParagraph items into list nodes.

        The run is first split wherever a different list starts (see
        ``_continues_list``), so a bullet list directly followed by a numbered
        list yields two nodes. Within one list each item becomes a ListItem
        whose single child is a Paragraph; nesting depth is recorded on the
        item but items are not nested recursively in this experimental version.
        """
        segments: list[list[_ListParagraph]] = []
        for lp in group:
            if segments and self._continues_list(segments[-1][0], lp):
                segments[-1].append(lp)
            else:
                segments.append([lp])
        return [self._list_from_segment(segment) for segment in segments]

    @staticmethod
    def _continues_list(head: _ListParagraph, item: _ListParagraph) -> bool:
        """Return True if *item* belongs to the list started by *head*.

        Deeper items always belong to the current list. An item at the same or
        a shallower depth starts a new list when its numId differs, or when it
        is at the same depth but of the other kind (bullet vs ordered).
        """
        if item.ilvl > head.ilvl:
            return True
        if item.num_id != head.num_id:
            return False
        return item.ilvl < head.ilvl or item.is_ordered == head.is_ordered

    @staticmethod
    def _list_from_segment(segment: list[_ListParagraph]) -> BlockNode:
        """Build one BulletList / OrderedList; kind and start come from the first item."""
        first = segment[0]
        items = [
            ListItem(
                children=[Paragraph(children=lp.inlines, xpath=lp.xpath, style_id=lp.style_id)],
                xpath=lp.xpath,
                depth=lp.ilvl,
            )
            for lp in segment
        ]
        if first.is_ordered:
            return OrderedList(items=items, start=first.start_val, xpath=first.xpath)
        return BulletList(items=items, xpath=first.xpath)

    # ------------------------------------------------------------------
    # Utility
    # ------------------------------------------------------------------

    def _clark_to_prefix(self, clark_tag: str) -> str:
        """Convert Clark-notation tag to prefix:local for use in XPath strings.

        Tags whose namespace is not in NS are returned unchanged.
        """
        for prefix, uri in NS.items():
            uri_braced = f"{{{uri}}}"
            if clark_tag.startswith(uri_braced):
                return f"{prefix}:{clark_tag[len(uri_braced):]}"
        return clark_tag


# ---------------------------------------------------------------------------
# Internal-only dataclass for list detection (not part of public AST)
# ---------------------------------------------------------------------------


@dataclass
class _ListParagraph:
    """Intermediate node used during list grouping — never appears in the final AST."""

    inlines: list[InlineNode]
    xpath: str
    style_id: str
    num_id: str  # "" for lists detected from a paragraph style
    ilvl: int  # nesting depth, 0 = top level
    is_ordered: bool
    start_val: int = 1

    def to_dict(self) -> dict:  # pragma: no cover
        raise NotImplementedError("_ListParagraph is internal only")
+ +Two public functions: + + to_json(doc: Document) -> str + Full-fidelity JSON serialization preserving all XPath pointers, + formatting flags, and node types. + + to_markdown(doc: Document) -> str + Lossy but human-readable GFM markdown. Formatting information that + has no GFM equivalent (e.g. underline, superscript) is silently dropped. + +Both accept the root `Document` node from `ast_nodes.py`. +""" + +from __future__ import annotations + +import json +import re +from typing import Union + +from extradocx.ast_nodes import ( + BlockNode, + BlockQuote, + BulletList, + CodeBlock, + Document, + Heading, + Image, + InlineNode, + LineBreak, + Link, + ListItem, + OrderedList, + Paragraph, + RawBlock, + SoftBreak, + Table, + TableCell, + TableRow, + TextRun, + ThematicBreak, +) + +# --------------------------------------------------------------------------- +# JSON serializer +# --------------------------------------------------------------------------- + + +def to_json(doc: Document, *, indent: int = 2) -> str: + """Serialize the AST to a JSON string. + + The JSON is fully self-describing: every node carries a ``type`` key and + an ``xpath`` key. The output can be used to reconstruct the AST or to + trace any node back to the source DOCX XML. + """ + return json.dumps(doc.to_dict(), ensure_ascii=False, indent=indent) + + +# --------------------------------------------------------------------------- +# Markdown serializer +# --------------------------------------------------------------------------- + +# Characters that need escaping in GFM inline context. +# Only escape chars that alter rendering mid-sentence. +# NOT escaping: - . + ! # (only meaningful at line start) +_MD_ESCAPE_RE = re.compile(r"([\\`*_{}\[\]()|])") + + +def _escape(text: str) -> str: + """Escape GFM special characters in plain text (inline context).""" + return _MD_ESCAPE_RE.sub(r"\\\1", text) + + +def to_markdown(doc: Document) -> str: + """Serialize the AST to GFM markdown. 
+ + Conventions: + - Headings: ATX style (``# Heading``) + - Bold: ``**text**`` + - Italic: ``*text*`` + - Strikethrough: ``~~text~~`` + - Code spans: `` `text` `` + - Links: ``[text](href)`` + - Images: ``![alt](src)`` + - Bullet lists: ``- item`` + - Ordered lists: ``1. item`` + - Tables: GFM pipe tables + - Code blocks: fenced (``` ``` ```) + - Thematic break: ``---`` + - Block quote: ``> text`` + """ + lines = _blocks_to_lines(doc.children, depth=0) + return "\n".join(lines).rstrip() + "\n" + + +# --------------------------------------------------------------------------- +# Block rendering +# --------------------------------------------------------------------------- + + +def _blocks_to_lines(blocks: list[BlockNode], depth: int) -> list[str]: + """Render a list of block nodes to a list of text lines.""" + out: list[str] = [] + for i, block in enumerate(blocks): + block_lines = _block_to_lines(block, depth) + if block_lines: + if out: # blank line between blocks + out.append("") + out.extend(block_lines) + return out + + +def _block_to_lines(block: BlockNode, depth: int) -> list[str]: + if isinstance(block, Heading): + return _heading_to_lines(block) + elif isinstance(block, Paragraph): + return _paragraph_to_lines(block) + elif isinstance(block, CodeBlock): + return _codeblock_to_lines(block) + elif isinstance(block, BlockQuote): + return _blockquote_to_lines(block, depth) + elif isinstance(block, BulletList): + return _bulletlist_to_lines(block, depth) + elif isinstance(block, OrderedList): + return _orderedlist_to_lines(block, depth) + elif isinstance(block, Table): + return _table_to_lines(block) + elif isinstance(block, ThematicBreak): + return ["---"] + elif isinstance(block, RawBlock): + # Wrap in a comment so it's visible but doesn't break rendering + return [f""] + else: + return [] + + +def _heading_to_lines(h: Heading) -> list[str]: + level = max(1, min(6, h.level)) + prefix = "#" * level + text = _inlines_to_md(h.children) + return [f"{prefix} 
{text}"] + + +def _paragraph_to_lines(p: Paragraph) -> list[str]: + text = _inlines_to_md(p.children) + if not text.strip(): + return [] + # Wrap long paragraphs at 100 chars (soft wrap, preserve words) + return [text] + + +def _codeblock_to_lines(cb: CodeBlock) -> list[str]: + fence = "```" + lang = cb.language or "" + lines = cb.code.split("\n") + return [f"{fence}{lang}"] + lines + [fence] + + +def _blockquote_to_lines(bq: BlockQuote, depth: int) -> list[str]: + inner = _blocks_to_lines(bq.children, depth + 1) + return [f"> {line}" for line in inner] + + +def _bulletlist_to_lines(bl: BulletList, depth: int) -> list[str]: + lines: list[str] = [] + indent = " " * depth + for item in bl.items: + item_lines = _listitem_to_lines(item, depth, ordered=False, number=0) + for i, line in enumerate(item_lines): + if i == 0: + lines.append(f"{indent}- {line}") + else: + lines.append(f"{indent} {line}") + return lines + + +def _orderedlist_to_lines(ol: OrderedList, depth: int) -> list[str]: + lines: list[str] = [] + indent = " " * depth + for n, item in enumerate(ol.items, start=ol.start): + item_lines = _listitem_to_lines(item, depth, ordered=True, number=n) + for i, line in enumerate(item_lines): + if i == 0: + lines.append(f"{indent}{n}. 
{line}") + else: + lines.append(f"{indent} {line}") + return lines + + +def _listitem_to_lines( + item: ListItem, depth: int, ordered: bool, number: int +) -> list[str]: + """Render list item content (without the bullet/number prefix).""" + lines: list[str] = [] + for block in item.children: + block_lines = _block_to_lines(block, depth + 1) + lines.extend(block_lines) + return lines if lines else [""] + + +def _table_to_lines(tbl: Table) -> list[str]: + if not tbl.rows: + return [] + + # Collect cell texts + cell_texts: list[list[str]] = [] + for row in tbl.rows: + row_texts: list[str] = [] + for cell in row.cells: + # Flatten cell content to a single-line string + text = _blocks_to_cell_text(cell.children) + row_texts.append(text.replace("|", "\\|").replace("\n", " ")) + cell_texts.append(row_texts) + + if not cell_texts: + return [] + + # Determine column count + col_count = max(len(row) for row in cell_texts) + + # Pad rows + for row in cell_texts: + while len(row) < col_count: + row.append("") + + # Column widths + col_widths = [ + max(len(cell_texts[r][c]) for r in range(len(cell_texts))) + for c in range(col_count) + ] + col_widths = [max(w, 3) for w in col_widths] # min width 3 for separator + + def fmt_row(cells: list[str]) -> str: + parts = [cell.ljust(col_widths[i]) for i, cell in enumerate(cells)] + return "| " + " | ".join(parts) + " |" + + lines: list[str] = [] + lines.append(fmt_row(cell_texts[0])) + # Separator row + sep = ["-" * w for w in col_widths] + lines.append("| " + " | ".join(sep) + " |") + for row in cell_texts[1:]: + lines.append(fmt_row(row)) + + return lines + + +def _blocks_to_cell_text(blocks: list[BlockNode]) -> str: + """Flatten block content to a single string for table cells.""" + parts: list[str] = [] + for block in blocks: + if isinstance(block, Paragraph): + parts.append(_inlines_to_md(block.children)) + elif isinstance(block, Heading): + parts.append(_inlines_to_md(block.children)) + elif isinstance(block, CodeBlock): + 
parts.append(f"`{block.code}`") + else: + sub = _block_to_lines(block, 0) + parts.extend(sub) + return " ".join(p for p in parts if p) + + +# --------------------------------------------------------------------------- +# Inline rendering +# --------------------------------------------------------------------------- + + +def _inlines_to_md(inlines: list[InlineNode]) -> str: + """Render a list of inline nodes to a markdown string.""" + return "".join(_inline_to_md(n) for n in inlines) + + +def _inline_to_md(node: InlineNode) -> str: + if isinstance(node, TextRun): + return _textrun_to_md(node) + elif isinstance(node, Link): + inner = _inlines_to_md(node.children) + href = node.href + if node.title: + return f'[{inner}]({href} "{node.title}")' + return f"[{inner}]({href})" + elif isinstance(node, Image): + return f"![{node.alt}]({node.src})" + elif isinstance(node, LineBreak): + return " \n" + elif isinstance(node, SoftBreak): + return "\n" + else: + return "" + + +def _textrun_to_md(run: TextRun) -> str: + """Apply GFM markup for bold / italic / strikethrough / code.""" + text = run.text + + # Tab → spaces + text = text.replace("\t", " ") + + if run.code: + # Inline code — no further escaping or wrapping + # Use double backtick if text contains a backtick + if "`" in text: + return f"`` {text} ``" + return f"`{text}`" + + text = _escape(text) + + if run.strikethrough: + text = f"~~{text}~~" + if run.bold and run.italic: + text = f"***{text}***" + elif run.bold: + text = f"**{text}**" + elif run.italic: + text = f"*{text}*" + + return text diff --git a/extradocx/testdata/formatting.docx b/extradocx/testdata/formatting.docx new file mode 100644 index 00000000..356dc1ea Binary files /dev/null and b/extradocx/testdata/formatting.docx differ diff --git a/extradocx/testdata/generate_test_docx.py b/extradocx/testdata/generate_test_docx.py new file mode 100644 index 00000000..34e6ce68 --- /dev/null +++ b/extradocx/testdata/generate_test_docx.py @@ -0,0 +1,390 @@ +""" 
+Generate a rich test.docx with: + - Cover page (title, subtitle) + - Table of Contents section + - 6 chapters with h1/h2/h3 headings + - Body paragraphs, bold/italic text + - Bullet and numbered lists + - A table per chapter + - Code-style paragraph + - Links + - 20+ pages total +""" + +from docx import Document +from docx.shared import Inches, Pt, RGBColor +from docx.enum.text import WD_ALIGN_PARAGRAPH +from docx.oxml.ns import qn +from docx.oxml import OxmlElement + +doc = Document() + +# --------------------------------------------------------------------------- +# Cover Page +# --------------------------------------------------------------------------- +title = doc.add_heading("Comprehensive Software Engineering Report", 0) +title.alignment = WD_ALIGN_PARAGRAPH.CENTER + +subtitle = doc.add_paragraph("A Practical Guide to Modern Software Development Practices") +subtitle.alignment = WD_ALIGN_PARAGRAPH.CENTER +run = subtitle.runs[0] +run.italic = True +run.font.size = Pt(14) + +doc.add_paragraph( + "Author: Jane Smith\nDate: 2025-04-08\nVersion: 3.1" +).alignment = WD_ALIGN_PARAGRAPH.CENTER +doc.add_page_break() + +# --------------------------------------------------------------------------- +# Helper: add a lorem ipsum paragraph +# --------------------------------------------------------------------------- +LOREM = ( + "Lorem ipsum dolor sit amet, consectetur adipiscing elit. " + "Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. " + "Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris " + "nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in " + "reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla " + "pariatur. Excepteur sint occaecat cupidatat non proident, sunt in " + "culpa qui officia deserunt mollit anim id est laborum. 
" +) + +def lorem(doc, n=1): + for _ in range(n): + doc.add_paragraph(LOREM) + +def add_bullet_list(doc, items): + for item in items: + p = doc.add_paragraph(item, style='List Bullet') + +def add_numbered_list(doc, items): + for item in items: + p = doc.add_paragraph(item, style='List Number') + +def add_table(doc, headers, rows): + table = doc.add_table(rows=1 + len(rows), cols=len(headers)) + table.style = 'Table Grid' + hdr = table.rows[0].cells + for i, h in enumerate(headers): + hdr[i].text = h + hdr[i].paragraphs[0].runs[0].bold = True + for ri, row in enumerate(rows): + cells = table.rows[ri + 1].cells + for ci, val in enumerate(row): + cells[ci].text = val + doc.add_paragraph("") + +def add_mixed_paragraph(doc, text): + """Paragraph with bold and italic runs.""" + p = doc.add_paragraph() + p.add_run("Note: ").bold = True + p.add_run(text) + p.add_run(" — see appendix for details.").italic = True + +# --------------------------------------------------------------------------- +# Chapter 1: Introduction +# --------------------------------------------------------------------------- +doc.add_heading("Chapter 1: Introduction to Software Engineering", 1) +doc.add_heading("1.1 Overview", 2) +lorem(doc, 3) +add_mixed_paragraph(doc, "Software engineering encompasses a wide range of disciplines " + "from requirements analysis to deployment and maintenance.") + +doc.add_heading("1.2 Historical Context", 2) +lorem(doc, 2) +add_bullet_list(doc, [ + "1960s: Birth of structured programming", + "1970s: Software crisis and the rise of methodologies", + "1980s: Object-oriented programming emerges", + "1990s: Agile manifesto and iterative development", + "2000s: DevOps, cloud computing, microservices", + "2010s: AI/ML integration in software workflows", + "2020s: LLM-assisted development", +]) + +doc.add_heading("1.3 Core Principles", 2) +lorem(doc, 2) +add_numbered_list(doc, [ + "Separation of concerns", + "DRY (Don't Repeat Yourself)", + "SOLID principles", + "Fail fast, 
fail loudly", + "Composability over inheritance", +]) + +add_table(doc, + ["Principle", "Description", "Example"], + [ + ["SRP", "Single Responsibility Principle", "One class per concern"], + ["OCP", "Open/Closed Principle", "Extend, don't modify"], + ["LSP", "Liskov Substitution Principle", "Subtypes are substitutable"], + ["ISP", "Interface Segregation", "Many small interfaces"], + ["DIP", "Dependency Inversion", "Depend on abstractions"], + ]) +doc.add_page_break() + +# --------------------------------------------------------------------------- +# Chapter 2: Requirements Engineering +# --------------------------------------------------------------------------- +doc.add_heading("Chapter 2: Requirements Engineering", 1) +doc.add_heading("2.1 Elicitation Techniques", 2) +lorem(doc, 3) + +doc.add_heading("2.1.1 Interviews", 3) +lorem(doc, 2) +add_bullet_list(doc, [ + "Structured interviews: fixed questions, quantitative data", + "Semi-structured: guided conversation with flexibility", + "Unstructured: open exploration of stakeholder needs", +]) + +doc.add_heading("2.1.2 Workshops", 3) +lorem(doc, 2) + +doc.add_heading("2.2 Specification Formats", 2) +lorem(doc, 2) + +add_table(doc, + ["Format", "Formality", "Use Case", "Tooling"], + [ + ["User Stories", "Low", "Agile sprints", "Jira, Linear"], + ["Use Cases", "Medium", "UML modeling", "Enterprise Architect"], + ["SRS Document", "High", "Regulated industries", "Confluence, Word"], + ["BDD Scenarios", "Medium", "Test-driven", "Cucumber, Behave"], + ]) + +doc.add_heading("2.3 Acceptance Criteria", 2) +lorem(doc, 3) +add_mixed_paragraph(doc, "Acceptance criteria must be measurable, verifiable, and unambiguous.") +doc.add_page_break() + +# --------------------------------------------------------------------------- +# Chapter 3: System Design +# --------------------------------------------------------------------------- +doc.add_heading("Chapter 3: System Design", 1) +doc.add_heading("3.1 Architectural Patterns", 2) 
+lorem(doc, 2) + +doc.add_heading("3.1.1 Monolithic Architecture", 3) +lorem(doc, 2) +p = doc.add_paragraph() +p.add_run("Advantages: ").bold = True +p.add_run("Simple deployment, easy debugging, low operational overhead.") +p = doc.add_paragraph() +p.add_run("Disadvantages: ").bold = True +p.add_run("Tight coupling, difficult to scale horizontally, long build times.") + +doc.add_heading("3.1.2 Microservices", 3) +lorem(doc, 2) +add_bullet_list(doc, [ + "Independent deployability per service", + "Polyglot persistence (each service owns its data store)", + "Failure isolation through circuit breakers", + "Service mesh for traffic management (Istio, Linkerd)", +]) + +doc.add_heading("3.1.3 Event-Driven Architecture", 3) +lorem(doc, 2) +add_table(doc, + ["Pattern", "Broker", "Use Case"], + [ + ["Pub/Sub", "Kafka, Pub/Sub", "Stream processing"], + ["Event Sourcing", "EventStore", "Audit trail, CQRS"], + ["Saga", "Conductor", "Distributed transactions"], + ["Outbox Pattern", "Debezium", "Reliable messaging"], + ]) + +doc.add_heading("3.2 Data Modeling", 2) +lorem(doc, 3) +doc.add_page_break() + +# --------------------------------------------------------------------------- +# Chapter 4: Development Practices +# --------------------------------------------------------------------------- +doc.add_heading("Chapter 4: Development Practices", 1) +doc.add_heading("4.1 Version Control Strategies", 2) +lorem(doc, 2) + +doc.add_heading("4.1.1 Branching Models", 3) +add_numbered_list(doc, [ + "Trunk-based development: single long-lived main branch", + "Git Flow: feature/release/hotfix branch model", + "GitHub Flow: simplified flow with feature branches and PRs", + "GitLab Flow: environment-based branching", +]) +lorem(doc, 2) + +doc.add_heading("4.2 Code Review", 2) +lorem(doc, 2) +add_table(doc, + ["Practice", "Goal", "Anti-Pattern"], + [ + ["Pair Review", "Knowledge sharing", "Rubber-stamping"], + ["Automated Checks", "Consistency", "Gate-keeping"], + ["Author Self-Review", 
"Catch obvious bugs", "Skipping"], + ["Async Reviews", "Parallel work", "Long-pending PRs"], + ]) + +doc.add_heading("4.3 Testing Pyramid", 2) +lorem(doc, 2) +p = doc.add_paragraph() +p.add_run("Unit tests ").bold = True +p.add_run("form the base: fast, isolated, cheap. ") +p.add_run("Integration tests ").bold = True +p.add_run("verify component interaction. ") +p.add_run("End-to-end tests ").bold = True +p.add_run("validate user journeys but are slow and expensive.") + +add_numbered_list(doc, [ + "Unit: 70% of test suite — sub-millisecond execution", + "Integration: 20% — test real dependencies (DB, queues)", + "E2E: 10% — critical user paths only", +]) +doc.add_page_break() + +# --------------------------------------------------------------------------- +# Chapter 5: DevOps & CI/CD +# --------------------------------------------------------------------------- +doc.add_heading("Chapter 5: DevOps and Continuous Delivery", 1) +doc.add_heading("5.1 CI Pipeline Design", 2) +lorem(doc, 2) +add_bullet_list(doc, [ + "Lint and format check (ruff, ESLint, etc.)", + "Static type checking (mypy, TypeScript)", + "Unit test execution with coverage gate", + "Build artifact (Docker image, wheel, binary)", + "Integration test against ephemeral environment", + "Security scan (Trivy, Snyk, Dependabot)", + "Publish to staging registry", +]) + +doc.add_heading("5.2 Deployment Strategies", 2) +lorem(doc, 2) +add_table(doc, + ["Strategy", "Rollout", "Rollback", "Downtime"], + [ + ["Big Bang", "Immediate", "Manual restore", "Yes"], + ["Blue/Green", "Switch traffic", "Switch back", "No"], + ["Canary", "Gradual %", "Reduce %", "No"], + ["Shadow", "Duplicate traffic", "Remove shadow", "No"], + ["Rolling", "Pod-by-pod", "Version revert", "Minimal"], + ]) + +doc.add_heading("5.3 Observability", 2) +lorem(doc, 2) + +doc.add_heading("5.3.1 The Three Pillars", 3) +add_numbered_list(doc, [ + "Logs: structured, searchable event records", + "Metrics: time-series aggregates (Prometheus, Datadog)", 
+ "Traces: distributed request span correlation (OpenTelemetry)", +]) +lorem(doc, 2) +doc.add_page_break() + +# --------------------------------------------------------------------------- +# Chapter 6: Security Engineering +# --------------------------------------------------------------------------- +doc.add_heading("Chapter 6: Security Engineering", 1) +doc.add_heading("6.1 OWASP Top 10", 2) +lorem(doc, 2) +add_table(doc, + ["#", "Vulnerability", "Mitigation"], + [ + ["A01", "Broken Access Control", "RBAC, least privilege"], + ["A02", "Cryptographic Failures", "TLS 1.3, modern ciphers"], + ["A03", "Injection", "Parameterized queries, input validation"], + ["A04", "Insecure Design", "Threat modeling, secure by design"], + ["A05", "Security Misconfiguration", "Hardened defaults, IaC"], + ["A06", "Vulnerable Components", "Dependency scanning, SCA"], + ["A07", "Auth Failures", "MFA, secure session management"], + ["A08", "Software Integrity Failures", "Supply chain verification"], + ["A09", "Logging Failures", "Centralized SIEM, audit trails"], + ["A10", "SSRF", "Allowlist outbound connections"], + ]) + +doc.add_heading("6.2 Secure Development Lifecycle", 2) +lorem(doc, 3) +add_bullet_list(doc, [ + "Threat modeling at design phase (STRIDE, PASTA)", + "SAST: static analysis before merge (CodeQL, Semgrep)", + "DAST: dynamic scanning in staging (OWASP ZAP, Burp Suite)", + "Penetration testing annually or after major releases", + "Security champions program — embed sec in dev teams", +]) + +doc.add_heading("6.3 Secrets Management", 2) +lorem(doc, 2) +p = doc.add_paragraph() +p.add_run("Never ").bold = True +p.add_run("store secrets in source code. Use a secrets manager: ") +p.add_run("HashiCorp Vault, AWS Secrets Manager, GCP Secret Manager").italic = True +p.add_run(". 
Rotate secrets regularly and audit access logs.") +doc.add_page_break() + +# --------------------------------------------------------------------------- +# Chapter 7: Performance Engineering +# --------------------------------------------------------------------------- +doc.add_heading("Chapter 7: Performance Engineering", 1) +doc.add_heading("7.1 Performance Testing Types", 2) +lorem(doc, 2) +add_table(doc, + ["Type", "Goal", "Tool"], + [ + ["Load Test", "Verify at expected load", "k6, JMeter, Locust"], + ["Stress Test", "Find breaking point", "k6, Gatling"], + ["Soak Test", "Detect memory leaks", "Grafana k6"], + ["Spike Test", "Handle sudden traffic", "k6, Artillery"], + ["Chaos Test", "Failure resilience", "Chaos Monkey, Gremlin"], + ]) + +doc.add_heading("7.2 Profiling and Optimization", 2) +lorem(doc, 3) +add_numbered_list(doc, [ + "Profile first, optimize second — never guess", + "Database query optimization: explain plans, index design", + "Caching strategy: CDN, application cache, DB query cache", + "Async processing: offload heavy work to queues", + "Horizontal scaling: stateless services behind load balancer", + "Connection pooling: reuse DB connections", +]) +doc.add_page_break() + +# --------------------------------------------------------------------------- +# Appendix A: Glossary +# --------------------------------------------------------------------------- +doc.add_heading("Appendix A: Glossary", 1) +lorem(doc) +add_table(doc, + ["Term", "Definition"], + [ + ["API", "Application Programming Interface"], + ["CI/CD", "Continuous Integration / Continuous Delivery"], + ["CQRS", "Command Query Responsibility Segregation"], + ["DDD", "Domain-Driven Design"], + ["IaC", "Infrastructure as Code"], + ["MTTR", "Mean Time To Recovery"], + ["SLA", "Service Level Agreement"], + ["SLI", "Service Level Indicator"], + ["SLO", "Service Level Objective"], + ["TTL", "Time To Live"], + ]) + +# --------------------------------------------------------------------------- 
+# Appendix B: Bibliography +# --------------------------------------------------------------------------- +doc.add_heading("Appendix B: Bibliography", 1) +add_numbered_list(doc, [ + "Martin, Robert C. Clean Code. Prentice Hall, 2008.", + "Fowler, Martin. Refactoring. Addison-Wesley, 2018.", + "Newman, Sam. Building Microservices. O'Reilly, 2021.", + "Kim, Gene et al. The DevOps Handbook. IT Revolution Press, 2016.", + "Evans, Eric. Domain-Driven Design. Addison-Wesley, 2003.", + "OWASP Foundation. OWASP Top 10 2021. https://owasp.org/Top10/", + "Google SRE Team. Site Reliability Engineering. O'Reilly, 2016.", +]) +lorem(doc, 2) + +# Save +doc.save("test_report.docx") +print("Generated test_report.docx") diff --git a/extradocx/testdata/large_report.docx b/extradocx/testdata/large_report.docx new file mode 100644 index 00000000..f99ea249 Binary files /dev/null and b/extradocx/testdata/large_report.docx differ diff --git a/extradocx/testdata/output/pandoc_reference.md b/extradocx/testdata/output/pandoc_reference.md new file mode 100644 index 00000000..cef9de40 --- /dev/null +++ b/extradocx/testdata/output/pandoc_reference.md @@ -0,0 +1,763 @@ +*A Practical Guide to Modern Software Development Practices* + +Author: Jane Smith +Date: 2025-04-08 +Version: 3.1 + +# Chapter 1: Introduction to Software Engineering + +## 1.1 Overview + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod +tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim +veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea +commodo consequat. Duis aute irure dolor in reprehenderit in voluptate +velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint +occaecat cupidatat non proident, sunt in culpa qui officia deserunt +mollit anim id est laborum. + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod +tempor incididunt ut labore et dolore magna aliqua. 
Ut enim ad minim +veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea +commodo consequat. Duis aute irure dolor in reprehenderit in voluptate +velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint +occaecat cupidatat non proident, sunt in culpa qui officia deserunt +mollit anim id est laborum. + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod +tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim +veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea +commodo consequat. Duis aute irure dolor in reprehenderit in voluptate +velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint +occaecat cupidatat non proident, sunt in culpa qui officia deserunt +mollit anim id est laborum. + +**Note:** Software engineering encompasses a wide range of disciplines +from requirements analysis to deployment and maintenance. *— see +appendix for details.* + +## 1.2 Historical Context + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod +tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim +veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea +commodo consequat. Duis aute irure dolor in reprehenderit in voluptate +velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint +occaecat cupidatat non proident, sunt in culpa qui officia deserunt +mollit anim id est laborum. + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod +tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim +veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea +commodo consequat. Duis aute irure dolor in reprehenderit in voluptate +velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint +occaecat cupidatat non proident, sunt in culpa qui officia deserunt +mollit anim id est laborum. 
+ +- 1960s: Birth of structured programming + +- 1970s: Software crisis and the rise of methodologies + +- 1980s: Object-oriented programming emerges + +- 1990s: Agile manifesto and iterative development + +- 2000s: DevOps, cloud computing, microservices + +- 2010s: AI/ML integration in software workflows + +- 2020s: LLM-assisted development + +## 1.3 Core Principles + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod +tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim +veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea +commodo consequat. Duis aute irure dolor in reprehenderit in voluptate +velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint +occaecat cupidatat non proident, sunt in culpa qui officia deserunt +mollit anim id est laborum. + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod +tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim +veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea +commodo consequat. Duis aute irure dolor in reprehenderit in voluptate +velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint +occaecat cupidatat non proident, sunt in culpa qui officia deserunt +mollit anim id est laborum. + +1. Separation of concerns + +2. DRY (Don't Repeat Yourself) + +3. SOLID principles + +4. Fail fast, fail loudly + +5. 
Composability over inheritance + +| **Principle** | **Description** | **Example** | +|---------------|---------------------------------|----------------------------| +| SRP | Single Responsibility Principle | One class per concern | +| OCP | Open/Closed Principle | Extend, don't modify | +| LSP | Liskov Substitution Principle | Subtypes are substitutable | +| ISP | Interface Segregation | Many small interfaces | +| DIP | Dependency Inversion | Depend on abstractions | + +# Chapter 2: Requirements Engineering + +## 2.1 Elicitation Techniques + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod +tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim +veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea +commodo consequat. Duis aute irure dolor in reprehenderit in voluptate +velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint +occaecat cupidatat non proident, sunt in culpa qui officia deserunt +mollit anim id est laborum. + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod +tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim +veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea +commodo consequat. Duis aute irure dolor in reprehenderit in voluptate +velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint +occaecat cupidatat non proident, sunt in culpa qui officia deserunt +mollit anim id est laborum. + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod +tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim +veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea +commodo consequat. Duis aute irure dolor in reprehenderit in voluptate +velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint +occaecat cupidatat non proident, sunt in culpa qui officia deserunt +mollit anim id est laborum. + +### 2.1.1 Interviews + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. 
Sed do eiusmod +tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim +veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea +commodo consequat. Duis aute irure dolor in reprehenderit in voluptate +velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint +occaecat cupidatat non proident, sunt in culpa qui officia deserunt +mollit anim id est laborum. + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod +tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim +veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea +commodo consequat. Duis aute irure dolor in reprehenderit in voluptate +velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint +occaecat cupidatat non proident, sunt in culpa qui officia deserunt +mollit anim id est laborum. + +- Structured interviews: fixed questions, quantitative data + +- Semi-structured: guided conversation with flexibility + +- Unstructured: open exploration of stakeholder needs + +### 2.1.2 Workshops + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod +tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim +veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea +commodo consequat. Duis aute irure dolor in reprehenderit in voluptate +velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint +occaecat cupidatat non proident, sunt in culpa qui officia deserunt +mollit anim id est laborum. + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod +tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim +veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea +commodo consequat. Duis aute irure dolor in reprehenderit in voluptate +velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint +occaecat cupidatat non proident, sunt in culpa qui officia deserunt +mollit anim id est laborum. 
+ +## 2.2 Specification Formats + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod +tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim +veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea +commodo consequat. Duis aute irure dolor in reprehenderit in voluptate +velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint +occaecat cupidatat non proident, sunt in culpa qui officia deserunt +mollit anim id est laborum. + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod +tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim +veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea +commodo consequat. Duis aute irure dolor in reprehenderit in voluptate +velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint +occaecat cupidatat non proident, sunt in culpa qui officia deserunt +mollit anim id est laborum. + +| **Format** | **Formality** | **Use Case** | **Tooling** | +|---------------|---------------|----------------------|----------------------| +| User Stories | Low | Agile sprints | Jira, Linear | +| Use Cases | Medium | UML modeling | Enterprise Architect | +| SRS Document | High | Regulated industries | Confluence, Word | +| BDD Scenarios | Medium | Test-driven | Cucumber, Behave | + +## 2.3 Acceptance Criteria + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod +tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim +veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea +commodo consequat. Duis aute irure dolor in reprehenderit in voluptate +velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint +occaecat cupidatat non proident, sunt in culpa qui officia deserunt +mollit anim id est laborum. + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod +tempor incididunt ut labore et dolore magna aliqua. 
Ut enim ad minim +veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea +commodo consequat. Duis aute irure dolor in reprehenderit in voluptate +velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint +occaecat cupidatat non proident, sunt in culpa qui officia deserunt +mollit anim id est laborum. + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod +tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim +veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea +commodo consequat. Duis aute irure dolor in reprehenderit in voluptate +velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint +occaecat cupidatat non proident, sunt in culpa qui officia deserunt +mollit anim id est laborum. + +**Note:** Acceptance criteria must be measurable, verifiable, and +unambiguous. *— see appendix for details.* + +# Chapter 3: System Design + +## 3.1 Architectural Patterns + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod +tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim +veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea +commodo consequat. Duis aute irure dolor in reprehenderit in voluptate +velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint +occaecat cupidatat non proident, sunt in culpa qui officia deserunt +mollit anim id est laborum. + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod +tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim +veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea +commodo consequat. Duis aute irure dolor in reprehenderit in voluptate +velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint +occaecat cupidatat non proident, sunt in culpa qui officia deserunt +mollit anim id est laborum. + +### 3.1.1 Monolithic Architecture + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. 
Sed do eiusmod +tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim +veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea +commodo consequat. Duis aute irure dolor in reprehenderit in voluptate +velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint +occaecat cupidatat non proident, sunt in culpa qui officia deserunt +mollit anim id est laborum. + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod +tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim +veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea +commodo consequat. Duis aute irure dolor in reprehenderit in voluptate +velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint +occaecat cupidatat non proident, sunt in culpa qui officia deserunt +mollit anim id est laborum. + +**Advantages:** Simple deployment, easy debugging, low operational +overhead. + +**Disadvantages:** Tight coupling, difficult to scale horizontally, long +build times. + +### 3.1.2 Microservices + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod +tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim +veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea +commodo consequat. Duis aute irure dolor in reprehenderit in voluptate +velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint +occaecat cupidatat non proident, sunt in culpa qui officia deserunt +mollit anim id est laborum. + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod +tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim +veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea +commodo consequat. Duis aute irure dolor in reprehenderit in voluptate +velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint +occaecat cupidatat non proident, sunt in culpa qui officia deserunt +mollit anim id est laborum. 
+ +- Independent deployability per service + +- Polyglot persistence (each service owns its data store) + +- Failure isolation through circuit breakers + +- Service mesh for traffic management (Istio, Linkerd) + +### 3.1.3 Event-Driven Architecture + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod +tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim +veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea +commodo consequat. Duis aute irure dolor in reprehenderit in voluptate +velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint +occaecat cupidatat non proident, sunt in culpa qui officia deserunt +mollit anim id est laborum. + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod +tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim +veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea +commodo consequat. Duis aute irure dolor in reprehenderit in voluptate +velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint +occaecat cupidatat non proident, sunt in culpa qui officia deserunt +mollit anim id est laborum. + +| **Pattern** | **Broker** | **Use Case** | +|----------------|----------------|--------------------------| +| Pub/Sub | Kafka, Pub/Sub | Stream processing | +| Event Sourcing | EventStore | Audit trail, CQRS | +| Saga | Conductor | Distributed transactions | +| Outbox Pattern | Debezium | Reliable messaging | + +## 3.2 Data Modeling + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod +tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim +veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea +commodo consequat. Duis aute irure dolor in reprehenderit in voluptate +velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint +occaecat cupidatat non proident, sunt in culpa qui officia deserunt +mollit anim id est laborum. 
+ +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod +tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim +veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea +commodo consequat. Duis aute irure dolor in reprehenderit in voluptate +velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint +occaecat cupidatat non proident, sunt in culpa qui officia deserunt +mollit anim id est laborum. + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod +tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim +veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea +commodo consequat. Duis aute irure dolor in reprehenderit in voluptate +velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint +occaecat cupidatat non proident, sunt in culpa qui officia deserunt +mollit anim id est laborum. + +# Chapter 4: Development Practices + +## 4.1 Version Control Strategies + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod +tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim +veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea +commodo consequat. Duis aute irure dolor in reprehenderit in voluptate +velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint +occaecat cupidatat non proident, sunt in culpa qui officia deserunt +mollit anim id est laborum. + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod +tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim +veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea +commodo consequat. Duis aute irure dolor in reprehenderit in voluptate +velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint +occaecat cupidatat non proident, sunt in culpa qui officia deserunt +mollit anim id est laborum. + +### 4.1.1 Branching Models + +6. Trunk-based development: single long-lived main branch + +7. 
Git Flow: feature/release/hotfix branch model + +8. GitHub Flow: simplified flow with feature branches and PRs + +9. GitLab Flow: environment-based branching + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod +tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim +veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea +commodo consequat. Duis aute irure dolor in reprehenderit in voluptate +velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint +occaecat cupidatat non proident, sunt in culpa qui officia deserunt +mollit anim id est laborum. + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod +tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim +veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea +commodo consequat. Duis aute irure dolor in reprehenderit in voluptate +velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint +occaecat cupidatat non proident, sunt in culpa qui officia deserunt +mollit anim id est laborum. + +## 4.2 Code Review + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod +tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim +veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea +commodo consequat. Duis aute irure dolor in reprehenderit in voluptate +velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint +occaecat cupidatat non proident, sunt in culpa qui officia deserunt +mollit anim id est laborum. + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod +tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim +veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea +commodo consequat. Duis aute irure dolor in reprehenderit in voluptate +velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint +occaecat cupidatat non proident, sunt in culpa qui officia deserunt +mollit anim id est laborum. 
+ +| **Practice** | **Goal** | **Anti-Pattern** | +|--------------------|--------------------|------------------| +| Pair Review | Knowledge sharing | Rubber-stamping | +| Automated Checks | Consistency | Gate-keeping | +| Author Self-Review | Catch obvious bugs | Skipping | +| Async Reviews | Parallel work | Long-pending PRs | + +## 4.3 Testing Pyramid + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod +tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim +veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea +commodo consequat. Duis aute irure dolor in reprehenderit in voluptate +velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint +occaecat cupidatat non proident, sunt in culpa qui officia deserunt +mollit anim id est laborum. + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod +tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim +veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea +commodo consequat. Duis aute irure dolor in reprehenderit in voluptate +velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint +occaecat cupidatat non proident, sunt in culpa qui officia deserunt +mollit anim id est laborum. + +**Unit tests** form the base: fast, isolated, cheap. **Integration +tests** verify component interaction. **End-to-end tests** validate user +journeys but are slow and expensive. + +10. Unit: 70% of test suite — sub-millisecond execution + +11. Integration: 20% — test real dependencies (DB, queues) + +12. E2E: 10% — critical user paths only + +# Chapter 5: DevOps and Continuous Delivery + +## 5.1 CI Pipeline Design + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod +tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim +veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea +commodo consequat. 
Duis aute irure dolor in reprehenderit in voluptate +velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint +occaecat cupidatat non proident, sunt in culpa qui officia deserunt +mollit anim id est laborum. + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod +tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim +veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea +commodo consequat. Duis aute irure dolor in reprehenderit in voluptate +velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint +occaecat cupidatat non proident, sunt in culpa qui officia deserunt +mollit anim id est laborum. + +- Lint and format check (ruff, ESLint, etc.) + +- Static type checking (mypy, TypeScript) + +- Unit test execution with coverage gate + +- Build artifact (Docker image, wheel, binary) + +- Integration test against ephemeral environment + +- Security scan (Trivy, Snyk, Dependabot) + +- Publish to staging registry + +## 5.2 Deployment Strategies + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod +tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim +veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea +commodo consequat. Duis aute irure dolor in reprehenderit in voluptate +velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint +occaecat cupidatat non proident, sunt in culpa qui officia deserunt +mollit anim id est laborum. + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod +tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim +veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea +commodo consequat. Duis aute irure dolor in reprehenderit in voluptate +velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint +occaecat cupidatat non proident, sunt in culpa qui officia deserunt +mollit anim id est laborum. 
+ +| **Strategy** | **Rollout** | **Rollback** | **Downtime** | +|--------------|-------------------|----------------|--------------| +| Big Bang | Immediate | Manual restore | Yes | +| Blue/Green | Switch traffic | Switch back | No | +| Canary | Gradual % | Reduce % | No | +| Shadow | Duplicate traffic | Remove shadow | No | +| Rolling | Pod-by-pod | Version revert | Minimal | + +## 5.3 Observability + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod +tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim +veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea +commodo consequat. Duis aute irure dolor in reprehenderit in voluptate +velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint +occaecat cupidatat non proident, sunt in culpa qui officia deserunt +mollit anim id est laborum. + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod +tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim +veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea +commodo consequat. Duis aute irure dolor in reprehenderit in voluptate +velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint +occaecat cupidatat non proident, sunt in culpa qui officia deserunt +mollit anim id est laborum. + +### 5.3.1 The Three Pillars + +13. Logs: structured, searchable event records + +14. Metrics: time-series aggregates (Prometheus, Datadog) + +15. Traces: distributed request span correlation (OpenTelemetry) + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod +tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim +veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea +commodo consequat. Duis aute irure dolor in reprehenderit in voluptate +velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint +occaecat cupidatat non proident, sunt in culpa qui officia deserunt +mollit anim id est laborum. 
+ +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod +tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim +veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea +commodo consequat. Duis aute irure dolor in reprehenderit in voluptate +velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint +occaecat cupidatat non proident, sunt in culpa qui officia deserunt +mollit anim id est laborum. + +# Chapter 6: Security Engineering + +## 6.1 OWASP Top 10 + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod +tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim +veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea +commodo consequat. Duis aute irure dolor in reprehenderit in voluptate +velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint +occaecat cupidatat non proident, sunt in culpa qui officia deserunt +mollit anim id est laborum. + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod +tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim +veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea +commodo consequat. Duis aute irure dolor in reprehenderit in voluptate +velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint +occaecat cupidatat non proident, sunt in culpa qui officia deserunt +mollit anim id est laborum. 
+ +| **\#** | **Vulnerability** | **Mitigation** | +|--------|-----------------------------|-----------------------------------------| +| A01 | Broken Access Control | RBAC, least privilege | +| A02 | Cryptographic Failures | TLS 1.3, modern ciphers | +| A03 | Injection | Parameterized queries, input validation | +| A04 | Insecure Design | Threat modeling, secure by design | +| A05 | Security Misconfiguration | Hardened defaults, IaC | +| A06 | Vulnerable Components | Dependency scanning, SCA | +| A07 | Auth Failures | MFA, secure session management | +| A08 | Software Integrity Failures | Supply chain verification | +| A09 | Logging Failures | Centralized SIEM, audit trails | +| A10 | SSRF | Allowlist outbound connections | + +## 6.2 Secure Development Lifecycle + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod +tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim +veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea +commodo consequat. Duis aute irure dolor in reprehenderit in voluptate +velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint +occaecat cupidatat non proident, sunt in culpa qui officia deserunt +mollit anim id est laborum. + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod +tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim +veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea +commodo consequat. Duis aute irure dolor in reprehenderit in voluptate +velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint +occaecat cupidatat non proident, sunt in culpa qui officia deserunt +mollit anim id est laborum. + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod +tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim +veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea +commodo consequat. 
Duis aute irure dolor in reprehenderit in voluptate +velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint +occaecat cupidatat non proident, sunt in culpa qui officia deserunt +mollit anim id est laborum. + +- Threat modeling at design phase (STRIDE, PASTA) + +- SAST: static analysis before merge (CodeQL, Semgrep) + +- DAST: dynamic scanning in staging (OWASP ZAP, Burp Suite) + +- Penetration testing annually or after major releases + +- Security champions program — embed sec in dev teams + +## 6.3 Secrets Management + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod +tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim +veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea +commodo consequat. Duis aute irure dolor in reprehenderit in voluptate +velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint +occaecat cupidatat non proident, sunt in culpa qui officia deserunt +mollit anim id est laborum. + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod +tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim +veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea +commodo consequat. Duis aute irure dolor in reprehenderit in voluptate +velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint +occaecat cupidatat non proident, sunt in culpa qui officia deserunt +mollit anim id est laborum. + +**Never** store secrets in source code. Use a secrets manager: +*HashiCorp Vault, AWS Secrets Manager, GCP Secret Manager*. Rotate +secrets regularly and audit access logs. + +# Chapter 7: Performance Engineering + +## 7.1 Performance Testing Types + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod +tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim +veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea +commodo consequat. 
Duis aute irure dolor in reprehenderit in voluptate +velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint +occaecat cupidatat non proident, sunt in culpa qui officia deserunt +mollit anim id est laborum. + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod +tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim +veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea +commodo consequat. Duis aute irure dolor in reprehenderit in voluptate +velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint +occaecat cupidatat non proident, sunt in culpa qui officia deserunt +mollit anim id est laborum. + +| **Type** | **Goal** | **Tool** | +|-------------|-------------------------|-----------------------| +| Load Test | Verify at expected load | k6, JMeter, Locust | +| Stress Test | Find breaking point | k6, Gatling | +| Soak Test | Detect memory leaks | Grafana k6 | +| Spike Test | Handle sudden traffic | k6, Artillery | +| Chaos Test | Failure resilience | Chaos Monkey, Gremlin | + +## 7.2 Profiling and Optimization + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod +tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim +veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea +commodo consequat. Duis aute irure dolor in reprehenderit in voluptate +velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint +occaecat cupidatat non proident, sunt in culpa qui officia deserunt +mollit anim id est laborum. + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod +tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim +veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea +commodo consequat. Duis aute irure dolor in reprehenderit in voluptate +velit esse cillum dolore eu fugiat nulla pariatur. 
Excepteur sint +occaecat cupidatat non proident, sunt in culpa qui officia deserunt +mollit anim id est laborum. + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod +tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim +veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea +commodo consequat. Duis aute irure dolor in reprehenderit in voluptate +velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint +occaecat cupidatat non proident, sunt in culpa qui officia deserunt +mollit anim id est laborum. + +16. Profile first, optimize second — never guess + +17. Database query optimization: explain plans, index design + +18. Caching strategy: CDN, application cache, DB query cache + +19. Async processing: offload heavy work to queues + +20. Horizontal scaling: stateless services behind load balancer + +21. Connection pooling: reuse DB connections + +# Appendix A: Glossary + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod +tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim +veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea +commodo consequat. Duis aute irure dolor in reprehenderit in voluptate +velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint +occaecat cupidatat non proident, sunt in culpa qui officia deserunt +mollit anim id est laborum. + +| **Term** | **Definition** | +|----------|----------------------------------------------| +| API | Application Programming Interface | +| CI/CD | Continuous Integration / Continuous Delivery | +| CQRS | Command Query Responsibility Segregation | +| DDD | Domain-Driven Design | +| IaC | Infrastructure as Code | +| MTTR | Mean Time To Recovery | +| SLA | Service Level Agreement | +| SLI | Service Level Indicator | +| SLO | Service Level Objective | +| TTL | Time To Live | + +# Appendix B: Bibliography + +22. Martin, Robert C. Clean Code. Prentice Hall, 2008. + +23. Fowler, Martin. 
Refactoring. Addison-Wesley, 2018. + +24. Newman, Sam. Building Microservices. O'Reilly, 2021. + +25. Kim, Gene et al. The DevOps Handbook. IT Revolution Press, 2016. + +26. Evans, Eric. Domain-Driven Design. Addison-Wesley, 2003. + +27. OWASP Foundation. OWASP Top 10 2021. https://owasp.org/Top10/ + +28. Google SRE Team. Site Reliability Engineering. O'Reilly, 2016. + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod +tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim +veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea +commodo consequat. Duis aute irure dolor in reprehenderit in voluptate +velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint +occaecat cupidatat non proident, sunt in culpa qui officia deserunt +mollit anim id est laborum. + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod +tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim +veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea +commodo consequat. Duis aute irure dolor in reprehenderit in voluptate +velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint +occaecat cupidatat non proident, sunt in culpa qui officia deserunt +mollit anim id est laborum. diff --git a/extradocx/testdata/output/test_report.md b/extradocx/testdata/output/test_report.md new file mode 100644 index 00000000..2203197c --- /dev/null +++ b/extradocx/testdata/output/test_report.md @@ -0,0 +1,347 @@ +# Comprehensive Software Engineering Report + +*A Practical Guide to Modern Software Development Practices* + +Author: Jane Smith +Date: 2025-04-08 +Version: 3.1 + +# Chapter 1: Introduction to Software Engineering + +## 1.1 Overview + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. 
Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. + +**Note:** Software engineering encompasses a wide range of disciplines from requirements analysis to deployment and maintenance. *— see appendix for details.* + +## 1.2 Historical Context + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. 
Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. + +- 1960s: Birth of structured programming +- 1970s: Software crisis and the rise of methodologies +- 1980s: Object-oriented programming emerges +- 1990s: Agile manifesto and iterative development +- 2000s: DevOps, cloud computing, microservices +- 2010s: AI/ML integration in software workflows +- 2020s: LLM-assisted development + +## 1.3 Core Principles + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. + +1. Separation of concerns +2. DRY \(Don't Repeat Yourself\) +3. SOLID principles +4. Fail fast, fail loudly +5. 
Composability over inheritance + +| **Principle** | **Description** | **Example** | +| ------------- | ------------------------------- | -------------------------- | +| SRP | Single Responsibility Principle | One class per concern | +| OCP | Open/Closed Principle | Extend, don't modify | +| LSP | Liskov Substitution Principle | Subtypes are substitutable | +| ISP | Interface Segregation | Many small interfaces | +| DIP | Dependency Inversion | Depend on abstractions | + +# Chapter 2: Requirements Engineering + +## 2.1 Elicitation Techniques + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. + +### 2.1.1 Interviews + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. 
Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. + +- Structured interviews: fixed questions, quantitative data +- Semi-structured: guided conversation with flexibility +- Unstructured: open exploration of stakeholder needs + +### 2.1.2 Workshops + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. 
+ +## 2.2 Specification Formats + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. + +| **Format** | **Formality** | **Use Case** | **Tooling** | +| ------------- | ------------- | -------------------- | -------------------- | +| User Stories | Low | Agile sprints | Jira, Linear | +| Use Cases | Medium | UML modeling | Enterprise Architect | +| SRS Document | High | Regulated industries | Confluence, Word | +| BDD Scenarios | Medium | Test-driven | Cucumber, Behave | + +## 2.3 Acceptance Criteria + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. 
Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. + +**Note: **Acceptance criteria must be measurable, verifiable, and unambiguous.* — see appendix for details.* + +# Chapter 3: System Design + +## 3.1 Architectural Patterns + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. + +### 3.1.1 Monolithic Architecture + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. 
Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. + +**Advantages: **Simple deployment, easy debugging, low operational overhead. + +**Disadvantages: **Tight coupling, difficult to scale horizontally, long build times. + +### 3.1.2 Microservices + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. 
+ +- Independent deployability per service +- Polyglot persistence \(each service owns its data store\) +- Failure isolation through circuit breakers +- Service mesh for traffic management \(Istio, Linkerd\) + +### 3.1.3 Event-Driven Architecture + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. + +| **Pattern** | **Broker** | **Use Case** | +| -------------- | -------------- | ------------------------ | +| Pub/Sub | Kafka, Pub/Sub | Stream processing | +| Event Sourcing | EventStore | Audit trail, CQRS | +| Saga | Conductor | Distributed transactions | +| Outbox Pattern | Debezium | Reliable messaging | + +## 3.2 Data Modeling + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. 
Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. + +# Chapter 4: Development Practices + +## 4.1 Version Control Strategies + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. + +### 4.1.1 Branching Models + +1. Trunk-based development: single long-lived main branch +2. Git Flow: feature/release/hotfix branch model +3. 
GitHub Flow: simplified flow with feature branches and PRs +4. GitLab Flow: environment-based branching + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. + +## 4.2 Code Review + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. 
+ +| **Practice** | **Goal** | **Anti-Pattern** | +| ------------------ | ------------------ | ---------------- | +| Pair Review | Knowledge sharing | Rubber-stamping | +| Automated Checks | Consistency | Gate-keeping | +| Author Self-Review | Catch obvious bugs | Skipping | +| Async Reviews | Parallel work | Long-pending PRs | + +## 4.3 Testing Pyramid + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. + +**Unit tests **form the base: fast, isolated, cheap. **Integration tests **verify component interaction. **End-to-end tests **validate user journeys but are slow and expensive. + +1. Unit: 70% of test suite — sub-millisecond execution +2. Integration: 20% — test real dependencies \(DB, queues\) +3. E2E: 10% — critical user paths only + +# Chapter 5: DevOps and Continuous Delivery + +## 5.1 CI Pipeline Design + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. 
Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. + +- Lint and format check \(ruff, ESLint, etc.\) +- Static type checking \(mypy, TypeScript\) +- Unit test execution with coverage gate +- Build artifact \(Docker image, wheel, binary\) +- Integration test against ephemeral environment +- Security scan \(Trivy, Snyk, Dependabot\) +- Publish to staging registry + +## 5.2 Deployment Strategies + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. 
+ +| **Strategy** | **Rollout** | **Rollback** | **Downtime** | +| ------------ | ----------------- | -------------- | ------------ | +| Big Bang | Immediate | Manual restore | Yes | +| Blue/Green | Switch traffic | Switch back | No | +| Canary | Gradual % | Reduce % | No | +| Shadow | Duplicate traffic | Remove shadow | No | +| Rolling | Pod-by-pod | Version revert | Minimal | + +## 5.3 Observability + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. + +### 5.3.1 The Three Pillars + +1. Logs: structured, searchable event records +2. Metrics: time-series aggregates \(Prometheus, Datadog\) +3. Traces: distributed request span correlation \(OpenTelemetry\) + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. 
+ +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. + +# Chapter 6: Security Engineering + +## 6.1 OWASP Top 10 + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. 
+ +| **#** | **Vulnerability** | **Mitigation** | +| ----- | --------------------------- | --------------------------------------- | +| A01 | Broken Access Control | RBAC, least privilege | +| A02 | Cryptographic Failures | TLS 1.3, modern ciphers | +| A03 | Injection | Parameterized queries, input validation | +| A04 | Insecure Design | Threat modeling, secure by design | +| A05 | Security Misconfiguration | Hardened defaults, IaC | +| A06 | Vulnerable Components | Dependency scanning, SCA | +| A07 | Auth Failures | MFA, secure session management | +| A08 | Software Integrity Failures | Supply chain verification | +| A09 | Logging Failures | Centralized SIEM, audit trails | +| A10 | SSRF | Allowlist outbound connections | + +## 6.2 Secure Development Lifecycle + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. 
Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. + +- Threat modeling at design phase \(STRIDE, PASTA\) +- SAST: static analysis before merge \(CodeQL, Semgrep\) +- DAST: dynamic scanning in staging \(OWASP ZAP, Burp Suite\) +- Penetration testing annually or after major releases +- Security champions program — embed sec in dev teams + +## 6.3 Secrets Management + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. + +**Never **store secrets in source code. Use a secrets manager: *HashiCorp Vault, AWS Secrets Manager, GCP Secret Manager*. Rotate secrets regularly and audit access logs. + +# Chapter 7: Performance Engineering + +## 7.1 Performance Testing Types + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. 
Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. + +| **Type** | **Goal** | **Tool** | +| ----------- | ----------------------- | --------------------- | +| Load Test | Verify at expected load | k6, JMeter, Locust | +| Stress Test | Find breaking point | k6, Gatling | +| Soak Test | Detect memory leaks | Grafana k6 | +| Spike Test | Handle sudden traffic | k6, Artillery | +| Chaos Test | Failure resilience | Chaos Monkey, Gremlin | + +## 7.2 Profiling and Optimization + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. 
+ +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. + +1. Profile first, optimize second — never guess +2. Database query optimization: explain plans, index design +3. Caching strategy: CDN, application cache, DB query cache +4. Async processing: offload heavy work to queues +5. Horizontal scaling: stateless services behind load balancer +6. Connection pooling: reuse DB connections + +# Appendix A: Glossary + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. + +| **Term** | **Definition** | +| -------- | -------------------------------------------- | +| API | Application Programming Interface | +| CI/CD | Continuous Integration / Continuous Delivery | +| CQRS | Command Query Responsibility Segregation | +| DDD | Domain-Driven Design | +| IaC | Infrastructure as Code | +| MTTR | Mean Time To Recovery | +| SLA | Service Level Agreement | +| SLI | Service Level Indicator | +| SLO | Service Level Objective | +| TTL | Time To Live | + +# Appendix B: Bibliography + +1. Martin, Robert C. Clean Code. Prentice Hall, 2008. +2. Fowler, Martin. Refactoring. Addison-Wesley, 2018. +3. Newman, Sam. Building Microservices. O'Reilly, 2021. +4. Kim, Gene et al. The DevOps Handbook. 
IT Revolution Press, 2016. +5. Evans, Eric. Domain-Driven Design. Addison-Wesley, 2003. +6. OWASP Foundation. OWASP Top 10 2021. https://owasp.org/Top10/ +7. Google SRE Team. Site Reliability Engineering. O'Reilly, 2016. + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. diff --git a/extradocx/testdata/test_report.docx b/extradocx/testdata/test_report.docx new file mode 100644 index 00000000..8b0cdb69 Binary files /dev/null and b/extradocx/testdata/test_report.docx differ diff --git a/extradocx/tests/test_basic.py b/extradocx/tests/test_basic.py new file mode 100644 index 00000000..10e4756c --- /dev/null +++ b/extradocx/tests/test_basic.py @@ -0,0 +1,274 @@ +""" +Basic tests for the extradocx DOCX → GFM AST converter. + +Tests are organized around the public API: DocxParser, to_json, to_markdown. 
+""" + +from __future__ import annotations + +import json +import pathlib +import re +import zipfile +import xml.etree.ElementTree as ET + +import pytest + +from extradocx import DocxParser, to_json, to_markdown +from extradocx.ast_nodes import ( + BulletList, + Document, + Heading, + OrderedList, + Paragraph, + Table, + TextRun, +) + +TESTDATA = pathlib.Path(__file__).parent.parent / "testdata" +REPORT_DOCX = TESTDATA / "test_report.docx" + +NS = {"w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main"} + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def find_by_xpath(xpath_str: str, root: ET.Element) -> ET.Element | None: + """Resolve a /w:document[1]/... XPath from the document root element.""" + parts = xpath_str.lstrip("/").split("/") + current = root + for part in parts[1:]: # root IS w:document, skip first segment + m = re.match(r"(\w+):(\w+)\[(\d+)\]", part) + if not m: + return None + prefix, local, idx = m.group(1), m.group(2), int(m.group(3)) + uri = NS.get(prefix, "") + tag = f"{{{uri}}}{local}" + children = [c for c in current if c.tag == tag] + if idx > len(children): + return None + current = children[idx - 1] + return current + + +@pytest.fixture(scope="module") +def parsed_doc() -> Document: + return DocxParser(REPORT_DOCX).parse() + + +@pytest.fixture(scope="module") +def docx_root() -> ET.Element: + with zipfile.ZipFile(REPORT_DOCX) as zf: + return ET.fromstring(zf.read("word/document.xml")) + + +@pytest.fixture(scope="module") +def markdown_output(parsed_doc: Document) -> str: + return to_markdown(parsed_doc) + + +@pytest.fixture(scope="module") +def json_output(parsed_doc: Document) -> dict: + return json.loads(to_json(parsed_doc)) + + +# --------------------------------------------------------------------------- +# Parser tests +# 
--------------------------------------------------------------------------- + + +class TestParser: + def test_returns_document(self, parsed_doc): + assert isinstance(parsed_doc, Document) + + def test_has_children(self, parsed_doc): + assert len(parsed_doc.children) > 0 + + def test_detects_headings(self, parsed_doc): + headings = [c for c in parsed_doc.children if isinstance(c, Heading)] + assert len(headings) >= 10, "Should have at least 10 headings" + + def test_heading_levels(self, parsed_doc): + headings = [c for c in parsed_doc.children if isinstance(c, Heading)] + levels = {h.level for h in headings} + assert 1 in levels + assert 2 in levels + + def test_title_as_h1(self, parsed_doc): + first = parsed_doc.children[0] + assert isinstance(first, Heading) + assert first.level == 1 + text = "".join(r.text for r in first.children if isinstance(r, TextRun)) + assert "Software Engineering" in text + + def test_detects_bullet_lists(self, parsed_doc): + lists = [c for c in parsed_doc.children if isinstance(c, BulletList)] + assert len(lists) >= 3 + + def test_detects_ordered_lists(self, parsed_doc): + lists = [c for c in parsed_doc.children if isinstance(c, OrderedList)] + assert len(lists) >= 3 + + def test_detects_tables(self, parsed_doc): + tables = [c for c in parsed_doc.children if isinstance(c, Table)] + assert len(tables) >= 5 + + def test_paragraphs_have_text_runs(self, parsed_doc): + """Most body paragraphs should contain at least one TextRun. 
+ (A small number may contain only structural breaks — those are skipped.)""" + paras = [c for c in parsed_doc.children if isinstance(c, Paragraph)] + paras_with_runs = [ + p for p in paras if any(isinstance(r, TextRun) for r in p.children) + ] + assert len(paras_with_runs) >= 10, "Expected at least 10 paragraphs with text runs" + + def test_bold_runs(self, parsed_doc): + """Verify that some runs have bold=True (from the DOCX content).""" + all_runs: list[TextRun] = [] + for node in parsed_doc.children: + if isinstance(node, Paragraph): + all_runs.extend(r for r in node.children if isinstance(r, TextRun)) + elif isinstance(node, Heading): + all_runs.extend(r for r in node.children if isinstance(r, TextRun)) + bold_runs = [r for r in all_runs if r.bold] + assert len(bold_runs) >= 1, "Expected at least one bold text run" + + +# --------------------------------------------------------------------------- +# XPath traceability tests +# --------------------------------------------------------------------------- + + +class TestXPathPointers: + def test_body_xpath(self, parsed_doc): + assert parsed_doc.xpath == "/w:document[1]/w:body[1]" + + def test_paragraph_xpaths_are_unique(self, parsed_doc): + paras = [c for c in parsed_doc.children if isinstance(c, Paragraph)] + xpaths = [p.xpath for p in paras] + assert len(xpaths) == len(set(xpaths)), "Paragraph XPaths must be unique" + + def test_heading_xpaths_are_unique(self, parsed_doc): + headings = [c for c in parsed_doc.children if isinstance(c, Heading)] + xpaths = [h.xpath for h in headings] + assert len(xpaths) == len(set(xpaths)) + + def test_text_run_xpath_resolves_to_correct_text(self, parsed_doc, docx_root): + """XPaths in TextRun nodes must point to elements with matching text.""" + # Check title + first = parsed_doc.children[0] + assert isinstance(first, Heading) + for run in first.children: + if isinstance(run, TextRun): + el = find_by_xpath(run.xpath, docx_root) + assert el is not None, f"XPath not found: 
{run.xpath}" + assert el.text == run.text, f"Text mismatch at {run.xpath}" + + def test_table_cell_xpath_resolves(self, parsed_doc, docx_root): + """Table cell XPaths must resolve to w:tc elements.""" + tables = [c for c in parsed_doc.children if isinstance(c, Table)] + assert tables + tbl = tables[0] + for row in tbl.rows: + for cell in row.cells: + el = find_by_xpath(cell.xpath, docx_root) + assert el is not None, f"Cell XPath not found: {cell.xpath}" + # The element should be w:tc + assert el.tag.endswith("}tc"), f"Expected w:tc at {cell.xpath}" + + def test_list_item_xpath_resolves(self, parsed_doc, docx_root): + """List item XPaths must resolve to paragraph elements.""" + blists = [c for c in parsed_doc.children if isinstance(c, BulletList)] + assert blists + for item in blists[0].items[:3]: + el = find_by_xpath(item.xpath, docx_root) + assert el is not None, f"List item XPath not found: {item.xpath}" + assert el.tag.endswith("}p"), f"Expected w:p at {item.xpath}" + + +# --------------------------------------------------------------------------- +# Markdown serializer tests +# --------------------------------------------------------------------------- + + +class TestMarkdownSerializer: + def test_produces_non_empty_string(self, markdown_output): + assert len(markdown_output) > 1000 + + def test_headings_use_atx_syntax(self, markdown_output): + assert "# Chapter 1" in markdown_output + assert "## 1.1" in markdown_output + + def test_bullet_list_items(self, markdown_output): + assert "- 1960s:" in markdown_output + + def test_ordered_list_items(self, markdown_output): + assert re.search(r"^\d+\. 
", markdown_output, re.MULTILINE) + + def test_table_pipe_syntax(self, markdown_output): + assert "|" in markdown_output + # Check for separator row + assert re.search(r"\| -+", markdown_output) + + def test_italic_runs(self, markdown_output): + # The subtitle is italic + assert "*A Practical Guide" in markdown_output + + def test_ends_with_newline(self, markdown_output): + assert markdown_output.endswith("\n") + + +# --------------------------------------------------------------------------- +# JSON serializer tests +# --------------------------------------------------------------------------- + + +class TestJsonSerializer: + def test_root_type(self, json_output): + assert json_output["type"] == "document" + + def test_root_has_xpath(self, json_output): + assert json_output["xpath"] == "/w:document[1]/w:body[1]" + + def test_nodes_have_type_and_xpath(self, json_output): + for child in json_output["children"]: + assert "type" in child, f"Missing type: {child}" + assert "xpath" in child, f"Missing xpath: {child}" + + def test_text_runs_have_text_field(self, json_output): + def walk(node): + if node.get("type") == "text_run": + assert "text" in node + assert "xpath" in node + for child in node.get("children", []): + walk(child) + for item in node.get("items", []): + walk(item) + for row in node.get("rows", []): + walk(row) + for cell in row.get("cells", []) if isinstance(node.get("rows"), list) else []: + walk(cell) + + for child in json_output["children"]: + walk(child) + + def test_heading_has_level(self, json_output): + headings = [c for c in json_output["children"] if c["type"] == "heading"] + assert headings + for h in headings: + assert "level" in h + assert 1 <= h["level"] <= 6 + + def test_table_structure(self, json_output): + tables = [c for c in json_output["children"] if c["type"] == "table"] + assert tables + tbl = tables[0] + assert "rows" in tbl + assert tbl["rows"] + for row in tbl["rows"]: + assert "cells" in row + for cell in row["cells"]: + assert 
"children" in cell diff --git a/extradocx/uv.lock b/extradocx/uv.lock new file mode 100644 index 00000000..65418028 --- /dev/null +++ b/extradocx/uv.lock @@ -0,0 +1,108 @@ +version = 1 +revision = 3 +requires-python = ">=3.11" + +[[package]] +name = "colorama" +version = "0.4.6" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/d8/53/6f443c9a4a8358a93a6792e2acffb9d9d5cb0a5cfd8802644b7b1c9a02e4/colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44", size = 27697, upload-time = "2022-10-25T02:36:22.414Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335, upload-time = "2022-10-25T02:36:20.889Z" }, +] + +[[package]] +name = "extradocx" +version = "0.1.0" +source = { editable = "." 
} + +[package.dev-dependencies] +dev = [ + { name = "pytest" }, + { name = "ruff" }, +] + +[package.metadata] + +[package.metadata.requires-dev] +dev = [ + { name = "pytest", specifier = ">=8.0" }, + { name = "ruff", specifier = ">=0.4" }, +] + +[[package]] +name = "iniconfig" +version = "2.3.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/72/34/14ca021ce8e5dfedc35312d08ba8bf51fdd999c576889fc2c24cb97f4f10/iniconfig-2.3.0.tar.gz", hash = "sha256:c76315c77db068650d49c5b56314774a7804df16fee4402c1f19d6d15d8c4730", size = 20503, upload-time = "2025-10-18T21:55:43.219Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/cb/b1/3846dd7f199d53cb17f49cba7e651e9ce294d8497c8c150530ed11865bb8/iniconfig-2.3.0-py3-none-any.whl", hash = "sha256:f631c04d2c48c52b84d0d0549c99ff3859c98df65b3101406327ecc7d53fbf12", size = 7484, upload-time = "2025-10-18T21:55:41.639Z" }, +] + +[[package]] +name = "packaging" +version = "26.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/65/ee/299d360cdc32edc7d2cf530f3accf79c4fca01e96ffc950d8a52213bd8e4/packaging-26.0.tar.gz", hash = "sha256:00243ae351a257117b6a241061796684b084ed1c516a08c48a3f7e147a9d80b4", size = 143416, upload-time = "2026-01-21T20:50:39.064Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b7/b9/c538f279a4e237a006a2c98387d081e9eb060d203d8ed34467cc0f0b9b53/packaging-26.0-py3-none-any.whl", hash = "sha256:b36f1fef9334a5588b4166f8bcd26a14e521f2b55e6b9de3aaa80d3ff7a37529", size = 74366, upload-time = "2026-01-21T20:50:37.788Z" }, +] + +[[package]] +name = "pluggy" +version = "1.6.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/f9/e2/3e91f31a7d2b083fe6ef3fa267035b518369d9511ffab804f839851d2779/pluggy-1.6.0.tar.gz", hash = "sha256:7dcc130b76258d33b90f61b658791dede3486c3e6bfb003ee5c9bfb396dd22f3", size = 69412, 
upload-time = "2025-05-15T12:30:07.975Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/54/20/4d324d65cc6d9205fabedc306948156824eb9f0ee1633355a8f7ec5c66bf/pluggy-1.6.0-py3-none-any.whl", hash = "sha256:e920276dd6813095e9377c0bc5566d94c932c33b27a3e3945d8389c374dd4746", size = 20538, upload-time = "2025-05-15T12:30:06.134Z" }, +] + +[[package]] +name = "pygments" +version = "2.20.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/c3/b2/bc9c9196916376152d655522fdcebac55e66de6603a76a02bca1b6414f6c/pygments-2.20.0.tar.gz", hash = "sha256:6757cd03768053ff99f3039c1a36d6c0aa0b263438fcab17520b30a303a82b5f", size = 4955991, upload-time = "2026-03-29T13:29:33.898Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/f4/7e/a72dd26f3b0f4f2bf1dd8923c85f7ceb43172af56d63c7383eb62b332364/pygments-2.20.0-py3-none-any.whl", hash = "sha256:81a9e26dd42fd28a23a2d169d86d7ac03b46e2f8b59ed4698fb4785f946d0176", size = 1231151, upload-time = "2026-03-29T13:29:30.038Z" }, +] + +[[package]] +name = "pytest" +version = "9.0.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "colorama", marker = "sys_platform == 'win32'" }, + { name = "iniconfig" }, + { name = "packaging" }, + { name = "pluggy" }, + { name = "pygments" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/7d/0d/549bd94f1a0a402dc8cf64563a117c0f3765662e2e668477624baeec44d5/pytest-9.0.3.tar.gz", hash = "sha256:b86ada508af81d19edeb213c681b1d48246c1a91d304c6c81a427674c17eb91c", size = 1572165, upload-time = "2026-04-07T17:16:18.027Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d4/24/a372aaf5c9b7208e7112038812994107bc65a84cd00e0354a88c2c77a617/pytest-9.0.3-py3-none-any.whl", hash = "sha256:2c5efc453d45394fdd706ade797c0a81091eccd1d6e4bccfcd476e2b8e0ab5d9", size = 375249, upload-time = "2026-04-07T17:16:16.13Z" }, +] + +[[package]] +name = "ruff" +version = "0.15.9" +source = { 
registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/e6/97/e9f1ca355108ef7194e38c812ef40ba98c7208f47b13ad78d023caa583da/ruff-0.15.9.tar.gz", hash = "sha256:29cbb1255a9797903f6dde5ba0188c707907ff44a9006eb273b5a17bfa0739a2", size = 4617361, upload-time = "2026-04-02T18:17:20.829Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/0b/1f/9cdfd0ac4b9d1e5a6cf09bedabdf0b56306ab5e333c85c87281273e7b041/ruff-0.15.9-py3-none-linux_armv6l.whl", hash = "sha256:6efbe303983441c51975c243e26dff328aca11f94b70992f35b093c2e71801e1", size = 10511206, upload-time = "2026-04-02T18:16:41.574Z" }, + { url = "https://files.pythonhosted.org/packages/3d/f6/32bfe3e9c136b35f02e489778d94384118bb80fd92c6d92e7ccd97db12ce/ruff-0.15.9-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:4965bac6ac9ea86772f4e23587746f0b7a395eccabb823eb8bfacc3fa06069f7", size = 10923307, upload-time = "2026-04-02T18:17:08.645Z" }, + { url = "https://files.pythonhosted.org/packages/ca/25/de55f52ab5535d12e7aaba1de37a84be6179fb20bddcbe71ec091b4a3243/ruff-0.15.9-py3-none-macosx_11_0_arm64.whl", hash = "sha256:eaf05aad70ca5b5a0a4b0e080df3a6b699803916d88f006efd1f5b46302daab8", size = 10316722, upload-time = "2026-04-02T18:16:44.206Z" }, + { url = "https://files.pythonhosted.org/packages/48/11/690d75f3fd6278fe55fff7c9eb429c92d207e14b25d1cae4064a32677029/ruff-0.15.9-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9439a342adb8725f32f92732e2bafb6d5246bd7a5021101166b223d312e8fc59", size = 10623674, upload-time = "2026-04-02T18:16:50.951Z" }, + { url = "https://files.pythonhosted.org/packages/bd/ec/176f6987be248fc5404199255522f57af1b4a5a1b57727e942479fec98ad/ruff-0.15.9-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:9c5e6faf9d97c8edc43877c3f406f47446fc48c40e1442d58cfcdaba2acea745", size = 10351516, upload-time = "2026-04-02T18:16:57.206Z" }, + { url = 
"https://files.pythonhosted.org/packages/b2/fc/51cffbd2b3f240accc380171d51446a32aa2ea43a40d4a45ada67368fbd2/ruff-0.15.9-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7b34a9766aeec27a222373d0b055722900fbc0582b24f39661aa96f3fe6ad901", size = 11150202, upload-time = "2026-04-02T18:17:06.452Z" }, + { url = "https://files.pythonhosted.org/packages/d6/d4/25292a6dfc125f6b6528fe6af31f5e996e19bf73ca8e3ce6eb7fa5b95885/ruff-0.15.9-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:89dd695bc72ae76ff484ae54b7e8b0f6b50f49046e198355e44ea656e521fef9", size = 11988891, upload-time = "2026-04-02T18:17:18.575Z" }, + { url = "https://files.pythonhosted.org/packages/13/e1/1eebcb885c10e19f969dcb93d8413dfee8172578709d7ee933640f5e7147/ruff-0.15.9-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ce187224ef1de1bd225bc9a152ac7102a6171107f026e81f317e4257052916d5", size = 11480576, upload-time = "2026-04-02T18:16:52.986Z" }, + { url = "https://files.pythonhosted.org/packages/ff/6b/a1548ac378a78332a4c3dcf4a134c2475a36d2a22ddfa272acd574140b50/ruff-0.15.9-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2b0c7c341f68adb01c488c3b7d4b49aa8ea97409eae6462d860a79cf55f431b6", size = 11254525, upload-time = "2026-04-02T18:17:02.041Z" }, + { url = "https://files.pythonhosted.org/packages/42/aa/4bb3af8e61acd9b1281db2ab77e8b2c3c5e5599bf2a29d4a942f1c62b8d6/ruff-0.15.9-py3-none-manylinux_2_31_riscv64.whl", hash = "sha256:55cc15eee27dc0eebdfcb0d185a6153420efbedc15eb1d38fe5e685657b0f840", size = 11204072, upload-time = "2026-04-02T18:17:13.581Z" }, + { url = "https://files.pythonhosted.org/packages/69/48/d550dc2aa6e423ea0bcc1d0ff0699325ffe8a811e2dba156bd80750b86dc/ruff-0.15.9-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:a6537f6eed5cda688c81073d46ffdfb962a5f29ecb6f7e770b2dc920598997ed", size = 10594998, upload-time = "2026-04-02T18:16:46.369Z" }, + { url = 
"https://files.pythonhosted.org/packages/63/47/321167e17f5344ed5ec6b0aa2cff64efef5f9e985af8f5622cfa6536043f/ruff-0.15.9-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:6d3fcbca7388b066139c523bda744c822258ebdcfbba7d24410c3f454cc9af71", size = 10359769, upload-time = "2026-04-02T18:17:10.994Z" }, + { url = "https://files.pythonhosted.org/packages/67/5e/074f00b9785d1d2c6f8c22a21e023d0c2c1817838cfca4c8243200a1fa87/ruff-0.15.9-py3-none-musllinux_1_2_i686.whl", hash = "sha256:058d8e99e1bfe79d8a0def0b481c56059ee6716214f7e425d8e737e412d69677", size = 10850236, upload-time = "2026-04-02T18:16:48.749Z" }, + { url = "https://files.pythonhosted.org/packages/76/37/804c4135a2a2caf042925d30d5f68181bdbd4461fd0d7739da28305df593/ruff-0.15.9-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:8e1ddb11dbd61d5983fa2d7d6370ef3eb210951e443cace19594c01c72abab4c", size = 11358343, upload-time = "2026-04-02T18:16:55.068Z" }, + { url = "https://files.pythonhosted.org/packages/88/3d/1364fcde8656962782aa9ea93c92d98682b1ecec2f184e625a965ad3b4a6/ruff-0.15.9-py3-none-win32.whl", hash = "sha256:bde6ff36eaf72b700f32b7196088970bf8fdb2b917b7accd8c371bfc0fd573ec", size = 10583382, upload-time = "2026-04-02T18:17:04.261Z" }, + { url = "https://files.pythonhosted.org/packages/4c/56/5c7084299bd2cacaa07ae63a91c6f4ba66edc08bf28f356b24f6b717c799/ruff-0.15.9-py3-none-win_amd64.whl", hash = "sha256:45a70921b80e1c10cf0b734ef09421f71b5aa11d27404edc89d7e8a69505e43d", size = 11744969, upload-time = "2026-04-02T18:16:59.611Z" }, + { url = "https://files.pythonhosted.org/packages/03/36/76704c4f312257d6dbaae3c959add2a622f63fcca9d864659ce6d8d97d3d/ruff-0.15.9-py3-none-win_arm64.whl", hash = "sha256:0694e601c028fd97dc5c6ee244675bc241aeefced7ef80cd9c6935a871078f53", size = 11005870, upload-time = "2026-04-02T18:17:15.773Z" }, +]