Merged
14 changes: 14 additions & 0 deletions .gitignore
@@ -12,6 +12,20 @@ go/go.sum
php/vendor/
php/composer.lock

# Python
python/.venv/
python/.pytest_cache/
python/build/
python/dist/
python/*.egg-info/
python/htmltrust_canonicalization.egg-info/
__pycache__/
*.pyc

# Rust
rust/target/
rust/Cargo.lock

# IDE
.idea/
.vscode/
66 changes: 66 additions & 0 deletions python/README.md
@@ -0,0 +1,66 @@
# HTMLTrust Canonicalization -- Python

Python binding for the HTMLTrust canonical text normalization library. Produces byte-identical output to the JavaScript, Go, PHP, and Rust implementations for every test vector in the shared conformance suite.

## Status

Implemented. The 18-case normalization conformance suite from the JavaScript reference (`javascript/test.js`) passes. `extract_canonical_text` and `canonicalize_claims` have parity tests against the JavaScript / Go / PHP reference behaviour.

Out of scope for this package: signature verification and key resolution. Those live in the higher-level HTMLTrust client libraries (and will arrive in a follow-up PR for the Python binding once the JS surface area lands on `main`).

## Scope

This package provides three functions:

1. **`normalize_text(text: str, preserve_whitespace: bool = False) -> str`** -- applies the 8-phase canonicalization defined in [`../spec.md`](../spec.md) to a UTF-8 string. Mirrors the existing JavaScript/Go/PHP signatures.
2. **`extract_canonical_text(html: str, preserve_whitespace: bool = False) -> str`** -- parses an HTML fragment with BeautifulSoup, walks the DOM, emits text nodes in document order with single-space separators at block-element boundaries, and applies `normalize_text` to the result. This is the HTML -> canonical text extraction defined in §2.1 of [`../spec.md`](../spec.md).
3. **`canonicalize_claims(claims: Mapping[str, object]) -> str`** -- serializes a claim map to the canonical, hashable string used by the `claims-hash` field of the signature binding (each entry normalized, sorted lexically by name, joined with `\n` as `name=value`).

All three are pure functions: no network, no file I/O, deterministic output for the same input.
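
The block-boundary contract in (2) can be sketched without BeautifulSoup. The following stand-in is illustrative only: it uses the stdlib `html.parser` instead of the walker this package actually ships, handles only a handful of tags, and substitutes plain whitespace collapsing for the full `normalize_text` pipeline. It shows why `<p>A</p><p>B</p>` extracts to `A B` while an inline `<em>` introduces no separator:

```python
# Illustrative stand-in for the block-boundary rule -- NOT the shipped
# implementation (which uses BeautifulSoup and the full normalize pipeline).
from html.parser import HTMLParser
import re

BLOCK = {"p", "div", "li", "h1", "h2", "h3"}  # small subset, for illustration


class SketchExtractor(HTMLParser):
    def __init__(self):
        super().__init__()
        self.parts = []

    def handle_starttag(self, tag, attrs):
        if tag in BLOCK:          # block boundary -> separator
            self.parts.append(" ")

    def handle_endtag(self, tag):
        if tag in BLOCK:
            self.parts.append(" ")

    def handle_data(self, data):  # text nodes pass through verbatim
        self.parts.append(data)


def sketch_extract(html: str) -> str:
    p = SketchExtractor()
    p.feed(html)
    # Collapse whitespace runs and trim, standing in for normalize_text.
    return re.sub(r"\s+", " ", "".join(p.parts)).strip()


print(sketch_extract("<p>A</p><p>B</p>"))             # -> A B
print(sketch_extract("<p>hello <em>world</em></p>"))  # -> hello world
```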

## Dependencies

- `unicodedata` (stdlib) for NFKC normalization
- `beautifulsoup4 >= 4.12` for HTML parsing in `extract_canonical_text`
- No other runtime dependencies
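
NFKC, supplied by the stdlib above, is only one phase of the pipeline, but on its own it already collapses many compatibility equivalents to identical code points. A quick illustration (not the full `normalize_text` behaviour):

```python
import unicodedata

# Compatibility characters decompose to their plain forms under NFKC.
print(unicodedata.normalize("NFKC", "ﬁle"))   # ligature fi  -> file
print(unicodedata.normalize("NFKC", "…"))     # ellipsis     -> ...
print(unicodedata.normalize("NFKC", "Ｆull"))  # fullwidth F  -> Full
```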

## Conformance

`tests/test_normalize.py` runs all 18 normalization vectors from `javascript/test.js`. `tests/test_extract.py` and `tests/test_claims.py` cover the HTML extraction and claim canonicalization contracts. Output MUST stay byte-identical to the JavaScript / Go / PHP / Rust bindings.

## Installation

```bash
pip install htmltrust-canonicalization
# or for development:
cd python && pip install -e '.[dev]'
```

## Usage

```python
from htmltrust_canonicalization import (
normalize_text,
extract_canonical_text,
canonicalize_claims,
)

canonical = normalize_text('He said, "Hello…"')
# -> 'He said, "Hello..."'

from_html = extract_canonical_text('<p>Hello <em>world</em>!</p>')
# -> 'Hello world!'

claims_str = canonicalize_claims({
'License': 'CC-BY-4.0',
'AIAssistance': 'None',
})
# -> 'AIAssistance=None\nLicense=CC-BY-4.0'
```

## Tests

```bash
pip install -e '.[dev]'
pytest
```
22 changes: 22 additions & 0 deletions python/htmltrust_canonicalization/__init__.py
@@ -0,0 +1,22 @@
"""HTMLTrust canonicalization (Python binding).

Public API:
- normalize_text(text, preserve_whitespace=False) -> str
- extract_canonical_text(html, preserve_whitespace=False) -> str
- canonicalize_claims(claims) -> str

This binding produces byte-identical output to the JavaScript, Go, PHP,
and Rust implementations of the HTMLTrust canonicalization library.
"""

from ._normalize import normalize_text
from ._extract import extract_canonical_text
from ._claims import canonicalize_claims

__all__ = [
"normalize_text",
"extract_canonical_text",
"canonicalize_claims",
]

__version__ = "0.1.0"
39 changes: 39 additions & 0 deletions python/htmltrust_canonicalization/_claims.py
@@ -0,0 +1,39 @@
"""Canonical claims serialization (HTMLTrust spec §2.1).

Direct port of ``canonicalizeClaims`` from the JavaScript reference
implementation. Claims are normalized through the same pipeline as
content text and emitted as a sorted list of ``name=value`` pairs joined
by newlines. The caller is responsible for hashing the result.
"""

from __future__ import annotations

from collections.abc import Mapping

from ._normalize import normalize_text


def canonicalize_claims(claims: Mapping[str, object]) -> str:
"""Serialize ``claims`` to the canonical, sortable, hashable string form.

Each claim name and value is run through ``normalize_text`` so that
Unicode equivalents collapse to identical bytes. Entries are then
sorted lexically by name and joined with newlines as ``name=value``.

Args:
claims: Mapping of claim name to value. Values are coerced to
``str`` before normalization so callers may pass simple
scalar types.

Returns:
Canonical serialized string ready to be hashed.
"""
if not isinstance(claims, Mapping):
raise TypeError("canonicalize_claims expects a Mapping")

entries = [
(normalize_text(name), normalize_text(str(value)))
for name, value in claims.items()
]
entries.sort(key=lambda nv: nv[0])
return "\n".join(f"{name}={value}" for name, value in entries)
101 changes: 101 additions & 0 deletions python/htmltrust_canonicalization/_extract.py
@@ -0,0 +1,101 @@
"""HTML -> canonical text extraction (HTMLTrust spec §2.1).

Direct semantic port of ``extractCanonicalText`` from the JavaScript
reference implementation. The Python binding uses BeautifulSoup
(html.parser backend, stdlib) for parsing because real HTML is messy
and a forgiving parser produces more reliable output than the JS
binding's regex pipeline. The text-output contract (which elements
contribute, where whitespace separators go) is identical.
"""

from __future__ import annotations

from bs4 import BeautifulSoup, NavigableString, Tag

from ._normalize import normalize_text

# Elements whose text content is NEVER part of the signed content.
# `<meta>` is excluded because, inside a signed-section, it carries
# claim metadata, not signed content (claims are hashed separately into
# the claims-hash field).
_EXCLUDED_TAGS = frozenset({
"script", "style", "meta", "link", "head", "noscript",
})

# Block-level elements whose boundaries become whitespace separators.
# Inline elements (em, strong, a, span, etc.) do NOT introduce separators,
# so "<p>hello <em>world</em></p>" canonicalizes to "hello world".
_BLOCK_TAGS = frozenset({
"address", "article", "aside", "blockquote", "canvas", "dd", "div",
"dl", "dt", "fieldset", "figcaption", "figure", "footer", "form",
"h1", "h2", "h3", "h4", "h5", "h6",
"header", "hr", "li", "main", "nav", "noscript", "ol", "output",
"p", "pre", "section", "table", "tfoot", "thead",
"tr", "td", "th", "ul", "video",
})


def extract_canonical_text(html: str, preserve_whitespace: bool = False) -> str:
"""Extract canonical text content from an HTML fragment.

Given an HTML fragment (typically the inner contents of a
``<signed-section>`` element), this:

1. Strips excluded elements (script, style, meta, link, head, noscript)
and their contents.
2. Walks the remaining tree in document order, inserting a single
space at every block-element boundary so that ``<p>A</p><p>B</p>``
extracts to ``"A B"`` and not ``"AB"``.
3. Emits text nodes verbatim (entity-decoded by the parser).
4. Applies the full text-normalization pipeline (``normalize_text``).

Args:
html: HTML fragment to canonicalize.
preserve_whitespace: Passed through to ``normalize_text``.
Defaults to ``False``.

Returns:
Canonical text, ready to be hashed. Trimmed of leading/trailing
whitespace.
"""
if not isinstance(html, str):
raise TypeError("extract_canonical_text expects a str")

soup = BeautifulSoup(html, "html.parser")

# Remove excluded elements (and their text content) outright.
for tag_name in _EXCLUDED_TAGS:
for elem in soup.find_all(tag_name):
elem.decompose()

parts: list[str] = []
_walk(soup, parts)

text = "".join(parts)
return normalize_text(text, preserve_whitespace).strip()


def _walk(node, out: list[str]) -> None:
"""Walk ``node`` in document order, appending text and block-boundary
spaces to ``out`` in place.
"""
for child in getattr(node, "children", ()):
if isinstance(child, NavigableString):
# bs4 navigable strings include comments / doctypes / cdata.
# We only want plain text, not Comment / Doctype / CData.
# Comment is a NavigableString subclass; check the type name.
cls_name = type(child).__name__
if cls_name in ("Comment", "Doctype", "CData", "ProcessingInstruction"):
continue
out.append(str(child))
elif isinstance(child, Tag):
name = child.name.lower() if child.name else ""
is_block = name in _BLOCK_TAGS
if is_block:
out.append(" ")
_walk(child, out)
if is_block:
out.append(" ")
# Void elements (br, hr, img, etc.) within inline context: hr is
# already in _BLOCK_TAGS; br is treated as inline (no separator),
# matching the JS reference which strips br via ANY_TAG_RE.