diff --git a/.gitignore b/.gitignore index e00c140..199736c 100644 --- a/.gitignore +++ b/.gitignore @@ -12,6 +12,20 @@ go/go.sum php/vendor/ php/composer.lock +# Python +python/.venv/ +python/.pytest_cache/ +python/build/ +python/dist/ +python/*.egg-info/ +python/htmltrust_canonicalization.egg-info/ +__pycache__/ +*.pyc + +# Rust +rust/target/ +rust/Cargo.lock + # IDE .idea/ .vscode/ diff --git a/python/README.md b/python/README.md new file mode 100644 index 0000000..1c443aa --- /dev/null +++ b/python/README.md @@ -0,0 +1,66 @@ +# HTMLTrust Canonicalization -- Python + +Python binding for the HTMLTrust canonical text normalization library. Produces byte-identical output to the JavaScript, Go, PHP, and Rust implementations for every test vector in the shared conformance suite. + +## Status + +Implemented. The 18-case normalization conformance suite from the JavaScript reference (`javascript/test.js`) passes. `extract_canonical_text` and `canonicalize_claims` have parity tests against the JavaScript / Go / PHP reference behaviour. + +Out of scope for this package: signature verification and key resolution. Those live in the higher-level HTMLTrust client libraries (and will arrive in a follow-up PR for the Python binding once the JS surface area lands on `main`). + +## Scope + +This package provides three functions: + +1. **`normalize_text(text: str, preserve_whitespace: bool = False) -> str`** -- applies the 8-phase canonicalization defined in [`../spec.md`](../spec.md) to a UTF-8 string. Mirrors the existing JavaScript/Go/PHP signatures. +2. **`extract_canonical_text(html: str, preserve_whitespace: bool = False) -> str`** -- parses an HTML fragment with BeautifulSoup, walks the DOM, emits text nodes in document order with single-space separators between block elements, and applies `normalize_text` to the result. This is the HTML -> canonical text extraction defined in the paper's §2.1. +3. 
**`canonicalize_claims(claims: Mapping[str, object]) -> str`** -- serializes a claim map to the canonical, hashable string used by the `claims-hash` field of the signature binding (each entry normalized, sorted lexically by name, joined with `\n` as `name=value`). + +All three are pure functions: no network, no file I/O, deterministic output for the same input. + +## Dependencies + +- `unicodedata` (stdlib) for NFKC normalization +- `beautifulsoup4 >= 4.12` for HTML parsing in `extract_canonical_text` +- No other runtime dependencies + +## Conformance + +`tests/test_normalize.py` runs all 18 normalization vectors from `javascript/test.js`. `tests/test_extract.py` and `tests/test_claims.py` cover the HTML extraction and claim canonicalization contracts. Output MUST stay byte-identical to the JavaScript / Go / PHP / Rust bindings. + +## Installation + +```bash +pip install htmltrust-canonicalization +# or for development: +cd python && pip install -e '.[dev]' +``` + +## Usage + +```python +from htmltrust_canonicalization import ( + normalize_text, + extract_canonical_text, + canonicalize_claims, +) + +canonical = normalize_text('He said, "Hello…"') +# -> 'He said, "Hello..."' + +from_html = extract_canonical_text('

Hello world!

') +# -> 'Hello world!' + +claims_str = canonicalize_claims({ + 'License': 'CC-BY-4.0', + 'AIAssistance': 'None', +}) +# -> 'AIAssistance=None\nLicense=CC-BY-4.0' +``` + +## Tests + +```bash +pip install -e '.[dev]' +pytest +``` diff --git a/python/htmltrust_canonicalization/__init__.py b/python/htmltrust_canonicalization/__init__.py new file mode 100644 index 0000000..7309b6a --- /dev/null +++ b/python/htmltrust_canonicalization/__init__.py @@ -0,0 +1,22 @@ +"""HTMLTrust canonicalization (Python binding). + +Public API: + - normalize_text(text, preserve_whitespace=False) -> str + - extract_canonical_text(html, preserve_whitespace=False) -> str + - canonicalize_claims(claims) -> str + +This binding produces byte-identical output to the JavaScript, Go, PHP, +and Rust implementations of the HTMLTrust canonicalization library. +""" + +from ._normalize import normalize_text +from ._extract import extract_canonical_text +from ._claims import canonicalize_claims + +__all__ = [ + "normalize_text", + "extract_canonical_text", + "canonicalize_claims", +] + +__version__ = "0.1.0" diff --git a/python/htmltrust_canonicalization/_claims.py b/python/htmltrust_canonicalization/_claims.py new file mode 100644 index 0000000..e39aa93 --- /dev/null +++ b/python/htmltrust_canonicalization/_claims.py @@ -0,0 +1,39 @@ +"""Canonical claims serialization (HTMLTrust spec §2.1). + +Direct port of ``canonicalizeClaims`` from the JavaScript reference +implementation. Claims are normalized through the same pipeline as +content text and emitted as a sorted list of ``name=value`` pairs joined +by newlines. The caller is responsible for hashing the result. +""" + +from __future__ import annotations + +from collections.abc import Mapping + +from ._normalize import normalize_text + + +def canonicalize_claims(claims: Mapping[str, object]) -> str: + """Serialize ``claims`` to the canonical, sortable, hashable string form. 
+ + Each claim name and value is run through ``normalize_text`` so that + Unicode equivalents collapse to identical bytes. Entries are then + sorted lexically by name and joined with newlines as ``name=value``. + + Args: + claims: Mapping of claim name to value. Values are coerced to + ``str`` before normalization so callers may pass simple + scalar types. + + Returns: + Canonical serialized string ready to be hashed. + """ + if not isinstance(claims, Mapping): + raise TypeError("canonicalize_claims expects a Mapping") + + entries = [ + (normalize_text(name), normalize_text(str(value))) + for name, value in claims.items() + ] + entries.sort(key=lambda nv: nv[0]) + return "\n".join(f"{name}={value}" for name, value in entries) diff --git a/python/htmltrust_canonicalization/_extract.py b/python/htmltrust_canonicalization/_extract.py new file mode 100644 index 0000000..56a516e --- /dev/null +++ b/python/htmltrust_canonicalization/_extract.py @@ -0,0 +1,101 @@ +"""HTML -> canonical text extraction (HTMLTrust spec §2.1). + +Direct semantic port of ``extractCanonicalText`` from the JavaScript +reference implementation. The Python binding uses BeautifulSoup +(html.parser backend, stdlib) for parsing because real HTML is messy +and a forgiving parser produces more reliable output than the JS +binding's regex pipeline. The text-output contract (which elements +contribute, where whitespace separators go) is identical. +""" + +from __future__ import annotations + +from bs4 import BeautifulSoup, NavigableString, Tag + +from ._normalize import normalize_text + +# Elements whose text content is NEVER part of the signed content. +# `` is excluded because, inside a signed-section, it carries +# claim metadata, not signed content (claims are hashed separately into +# the claims-hash field). +_EXCLUDED_TAGS = frozenset({ + "script", "style", "meta", "link", "head", "noscript", +}) + +# Block-level elements whose boundaries become whitespace separators. 
+# Inline elements (em, strong, a, span, etc.) do NOT introduce separators, +# so "

hello world

" canonicalizes to "hello world". +_BLOCK_TAGS = frozenset({ + "address", "article", "aside", "blockquote", "canvas", "dd", "div", + "dl", "dt", "fieldset", "figcaption", "figure", "footer", "form", + "h1", "h2", "h3", "h4", "h5", "h6", + "header", "hr", "li", "main", "nav", "noscript", "ol", "output", + "p", "pre", "section", "table", "tfoot", "thead", + "tr", "td", "th", "ul", "video", +}) + + +def extract_canonical_text(html: str, preserve_whitespace: bool = False) -> str: + """Extract canonical text content from an HTML fragment. + + Given an HTML fragment (typically the inner contents of a + ```` element), this: + + 1. Strips excluded elements (script, style, meta, link, head, noscript) + and their contents. + 2. Walks the remaining tree in document order, inserting a single + space at every block-element boundary so that ``

A

B

`` + extracts to ``"A B"`` and not ``"AB"``. + 3. Emits text nodes verbatim (entity-decoded by the parser). + 4. Applies the full text-normalization pipeline (``normalize_text``). + + Args: + html: HTML fragment to canonicalize. + preserve_whitespace: Passed through to ``normalize_text``. + Defaults to ``False``. + + Returns: + Canonical text, ready to be hashed. Trimmed of leading/trailing + whitespace. + """ + if not isinstance(html, str): + raise TypeError("extract_canonical_text expects a str") + + soup = BeautifulSoup(html, "html.parser") + + # Remove excluded elements (and their text content) outright. + for tag_name in _EXCLUDED_TAGS: + for elem in soup.find_all(tag_name): + elem.decompose() + + parts: list[str] = [] + _walk(soup, parts) + + text = "".join(parts) + return normalize_text(text, preserve_whitespace).strip() + + +def _walk(node, out: list[str]) -> None: + """Walk ``node`` in document order, appending text and block-boundary + spaces to ``out`` in place. + """ + for child in getattr(node, "children", ()): + if isinstance(child, NavigableString): + # bs4 navigable strings include comments / doctypes / cdata. + # We only want plain text, not Comment / Doctype / CData. + # Comment is a NavigableString subclass; check the type name. + cls_name = type(child).__name__ + if cls_name in ("Comment", "Doctype", "CData", "ProcessingInstruction"): + continue + out.append(str(child)) + elif isinstance(child, Tag): + name = child.name.lower() if child.name else "" + is_block = name in _BLOCK_TAGS + if is_block: + out.append(" ") + _walk(child, out) + if is_block: + out.append(" ") + # Void elements (br, hr, img, etc.) within inline context: hr is + # already in _BLOCK_TAGS; br is treated as inline (no separator), + # matching the JS reference which strips br via ANY_TAG_RE. 
diff --git a/python/htmltrust_canonicalization/_normalize.py b/python/htmltrust_canonicalization/_normalize.py new file mode 100644 index 0000000..dd97408 --- /dev/null +++ b/python/htmltrust_canonicalization/_normalize.py @@ -0,0 +1,192 @@ +"""Text normalization (the 8-phase HTMLTrust canonicalization pipeline). + +Direct port of the JavaScript reference implementation +(htmltrust-canonicalization/javascript/index.js, function ``normalizeText``). + +The character classes below are byte-for-byte the same Unicode +codepoint sets used by the JavaScript and Go bindings; output MUST be +byte-identical across language implementations. + +To keep this source file pure-ASCII and immune to editor mangling, the +character sets are built programmatically from explicit codepoint +ranges via ``chr()``. Each list entry is an ``(int, int)`` pair giving +an inclusive range, OR a single ``int`` for a single codepoint. +""" + +from __future__ import annotations + +import re +import unicodedata +from typing import Iterable, Union + +_RangeOrPoint = Union[int, tuple[int, int]] + + +def _build_class(items: Iterable[_RangeOrPoint]) -> str: + """Build a regex character class string from ``items``. + + Each item is either a single codepoint (``int``) or an inclusive + ``(start, end)`` range. Returns ``"[]"`` ready for ``re.compile``. + """ + parts: list[str] = [] + for item in items: + if isinstance(item, int): + parts.append(re.escape(chr(item))) + else: + start, end = item + # No re.escape on the dash separator; we want the literal '-'. + parts.append(f"{re.escape(chr(start))}-{re.escape(chr(end))}") + return "[" + "".join(parts) + "]" + + +# --------------------------------------------------------------------------- +# Phase 6 + 7: Invisible / formatting / bidi characters to strip. +# +# Mirrors JS reference STRIP_RE byte-for-byte. ZWNJ (U+200C) and ZWJ +# (U+200D) are deliberately preserved -- they are semantic in Persian, +# Indic, and emoji. 
+# --------------------------------------------------------------------------- +_STRIP_CODEPOINTS: list[_RangeOrPoint] = [ + 0x00AD, # soft hyphen + 0x200B, # zero-width space + 0x200E, # LRM + 0x200F, # RLM + 0x2060, # word joiner + 0xFEFF, # BOM / ZWNBSP + 0x034F, # combining grapheme joiner + 0x061C, # arabic letter mark + 0x180E, # mongolian vowel separator + 0x0640, # arabic tatweel + (0xFE00, 0xFE0F), # variation selectors 1-16 + (0x202A, 0x202E), # bidi embedding controls + (0x2066, 0x2069), # bidi isolate controls + (0x2061, 0x2064), # invisible math operators + (0xFFF9, 0xFFFC), # interlinear annotation + obj replacement +] +_STRIP_RE = re.compile(_build_class(_STRIP_CODEPOINTS)) + +# Supplementary plane: variation selectors 17-256, tag characters. +_STRIP_SUPPLEMENTARY_RE = re.compile( + _build_class([(0xE0001, 0xE007F), (0xE0100, 0xE01EF)]) +) + +# --------------------------------------------------------------------------- +# Phase 2: Unicode whitespace -> U+0020. +# +# Mirrors JS reference WHITESPACE_RE byte-for-byte. +# --------------------------------------------------------------------------- +_WHITESPACE_CODEPOINTS: list[_RangeOrPoint] = [ + (0x0009, 0x000D), # HT, LF, VT, FF, CR + 0x0020, # SPACE + 0x0085, # NEL + 0x00A0, # NBSP + 0x1680, # ogham space mark + (0x2000, 0x200A), # en quad .. hair space + 0x2028, # line separator + 0x2029, # paragraph separator + 0x202F, # narrow no-break space + 0x205F, # medium mathematical space + 0x3000, # ideographic space +] +_WHITESPACE_RE = re.compile(_build_class(_WHITESPACE_CODEPOINTS)) +_RUN_OF_SPACES_RE = re.compile(r" {2,}") + +# --------------------------------------------------------------------------- +# Phase 3: Quotation marks. +# Mirrors JS SINGLE_QUOTE_RE / DOUBLE_QUOTE_RE / CJK_QUOTE_RE byte-for-byte. 
+# --------------------------------------------------------------------------- +_SINGLE_QUOTE_CODEPOINTS: list[_RangeOrPoint] = [ + 0x2018, # left single quote + 0x2019, # right single quote + 0x201B, # single high-reversed-9 + 0x2039, # single left guillemet + 0x203A, # single right guillemet + 0x0060, # grave accent + 0x00B4, # acute accent + 0x2032, # prime +] +_SINGLE_QUOTE_RE = re.compile(_build_class(_SINGLE_QUOTE_CODEPOINTS)) + +_DOUBLE_QUOTE_CODEPOINTS: list[_RangeOrPoint] = [ + 0x201A, # single low-9 quote (intentionally mapped to double) + 0x201C, # left double quote + 0x201D, # right double quote + 0x201E, # low double quote + 0x201F, # double high-reversed-9 + 0x00AB, # left guillemet + 0x00BB, # right guillemet + 0x2033, # double prime + 0x301D, # reversed double prime quotation mark + 0x301E, # double prime quotation mark + 0x301F, # low double prime quotation mark +] +_DOUBLE_QUOTE_RE = re.compile(_build_class(_DOUBLE_QUOTE_CODEPOINTS)) + +_CJK_QUOTE_CODEPOINTS: list[_RangeOrPoint] = [ + 0x300C, # left corner bracket + 0x300D, # right corner bracket + 0x300E, # left white corner bracket + 0x300F, # right white corner bracket + (0xFE41, 0xFE44), # presentation forms for vertical corner brackets +] +_CJK_QUOTE_RE = re.compile(_build_class(_CJK_QUOTE_CODEPOINTS)) + +# --------------------------------------------------------------------------- +# Phase 4: Dashes -> U+002D (includes minus sign from Phase 5). +# Mirrors JS DASH_RE byte-for-byte. +# --------------------------------------------------------------------------- +_DASH_CODEPOINTS: list[_RangeOrPoint] = [ + (0x2010, 0x2015), # hyphen .. horizontal bar + 0x2212, # minus sign + 0xFE58, # small em dash + 0xFE63, # small hyphen-minus +] +_DASH_RE = re.compile(_build_class(_DASH_CODEPOINTS)) + +# Phase 5: Ellipsis -> three periods. 
+_ELLIPSIS_RE = re.compile(re.escape(chr(0x2026))) + + +def normalize_text(text: str, preserve_whitespace: bool = False) -> str: + """Apply the HTMLTrust 8-phase canonicalization pipeline to ``text``. + + Order matches the JavaScript reference implementation precisely. + + Args: + text: Raw text content (typically the output of + ``extract_canonical_text``). + preserve_whitespace: Set ``True`` for ``
`` content where
+            whitespace is significant. Defaults to ``False``.
+
+    Returns:
+        Normalized text, suitable for hashing.
+    """
+    if not isinstance(text, str):
+        raise TypeError("normalize_text expects a str")
+
+    # Phase 1: NFKC -- ligatures, fullwidth/halfwidth, presentation forms,
+    # superscripts, CJK compatibility, Jamo composition.
+    text = unicodedata.normalize("NFKC", text)
+
+    # Phases 6 + 7: strip invisible / formatting / bidi characters.
+    # Done early so they don't interfere with later phases.
+    text = _STRIP_RE.sub("", text)
+    text = _STRIP_SUPPLEMENTARY_RE.sub("", text)
+
+    # Phase 2: whitespace normalization.
+    if not preserve_whitespace:
+        text = _WHITESPACE_RE.sub(" ", text)
+        text = _RUN_OF_SPACES_RE.sub(" ", text)
+
+    # Phase 3: quotation marks.
+    text = _SINGLE_QUOTE_RE.sub("'", text)
+    text = _DOUBLE_QUOTE_RE.sub('"', text)
+    text = _CJK_QUOTE_RE.sub('"', text)
+
+    # Phase 4: dashes / hyphens / minus.
+    text = _DASH_RE.sub("-", text)
+
+    # Phase 5: ellipsis.
+    text = _ELLIPSIS_RE.sub("...", text)
+
+    return text
diff --git a/python/pyproject.toml b/python/pyproject.toml
new file mode 100644
index 0000000..71b2062
--- /dev/null
+++ b/python/pyproject.toml
@@ -0,0 +1,45 @@
+[build-system]
+requires = ["setuptools>=68"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "htmltrust-canonicalization"
+version = "0.1.0"
+description = "Canonical text normalization and HTML extraction for HTMLTrust signed content. Byte-identical output to the JavaScript, Go, PHP, and Rust bindings."
+readme = "README.md"
+requires-python = ">=3.10"
+license = { text = "MIT" }
+authors = [
+    { name = "HTMLTrust contributors" },
+]
+keywords = ["htmltrust", "canonicalization", "signing", "html", "unicode"]
+classifiers = [
+    "License :: OSI Approved :: MIT License",
+    "Programming Language :: Python :: 3",
+    "Programming Language :: Python :: 3.10",
+    "Programming Language :: Python :: 3.11",
+    "Programming Language :: Python :: 3.12",
+    "Programming Language :: Python :: 3.13",
+    "Programming Language :: Python :: 3.14",
+    "Topic :: Text Processing",
+    "Topic :: Security :: Cryptography",
+]
+dependencies = [
+    "beautifulsoup4>=4.12",
+]
+
+[project.optional-dependencies]
+dev = [
+    "pytest>=8.0",
+]
+
+[project.urls]
+Homepage = "https://github.com/HTMLTrust/htmltrust-canonicalization"
+Repository = "https://github.com/HTMLTrust/htmltrust-canonicalization"
+
+[tool.setuptools.packages.find]
+where = ["."]
+include = ["htmltrust_canonicalization*"]
+
+[tool.pytest.ini_options]
+testpaths = ["tests"]
diff --git a/python/tests/__init__.py b/python/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/python/tests/test_claims.py b/python/tests/test_claims.py
new file mode 100644
index 0000000..b1961db
--- /dev/null
+++ b/python/tests/test_claims.py
@@ -0,0 +1,59 @@
+"""Conformance tests for ``canonicalize_claims``.
+
+Mirrors the JavaScript reference ``canonicalizeClaims``. Output MUST be
+byte-identical across language bindings: claim entries serialize as
+``name=value`` lines, sorted lexically by name, joined by ``\\n``.
+"""
+
+import pytest
+
+from htmltrust_canonicalization import canonicalize_claims
+
+
+def test_empty_claims():
+    assert canonicalize_claims({}) == ""
+
+
+def test_single_claim():
+    assert canonicalize_claims({"License": "CC-BY-4.0"}) == "License=CC-BY-4.0"
+
+
+def test_sorted_by_name():
+    """Order in -> sorted out, regardless of source ordering."""
+    out = canonicalize_claims({
+        "License": "CC-BY-4.0",
+        "AIAssistance": "None",
+        "ContentType": "Article",
+    })
+    assert out == (
+        "AIAssistance=None\n"
+        "ContentType=Article\n"
+        "License=CC-BY-4.0"
+    )
+
+
+def test_normalizes_values():
+    """Values run through normalize_text -- curly quotes collapse."""
+    out = canonicalize_claims({"author": "“Alice”"})
+    assert out == 'author="Alice"'
+
+
+def test_normalizes_names():
+    """Claim names also normalize -- ensures hash determinism."""
+    # An ellipsis in a claim name is exotic but tests the contract.
+    out = canonicalize_claims({"odd…name": "x"})
+    assert out == "odd...name=x"
+
+
+def test_coerces_value_to_string():
+    out = canonicalize_claims({"count": 42, "enabled": True})
+    # Booleans serialize as "True" / "False" via str(); that's fine for
+    # this layer -- callers should pre-stringify if they need different
+    # representations.
+    assert "count=42" in out
+    assert "enabled=True" in out
+
+
+def test_rejects_non_mapping():
+    with pytest.raises(TypeError):
+        canonicalize_claims([("a", "b")])  # type: ignore[arg-type]
diff --git a/python/tests/test_extract.py b/python/tests/test_extract.py
new file mode 100644
index 0000000..8a3d574
--- /dev/null
+++ b/python/tests/test_extract.py
@@ -0,0 +1,94 @@
+"""Conformance tests for ``extract_canonical_text``.
+
+These cases mirror the contract of the JavaScript reference
+``extractCanonicalText`` and confirm that block-element boundaries
+become whitespace, inline elements do not, excluded elements vanish
+entirely, and HTML entities are decoded by the parser before
+normalization.
+"""
+
+import pytest
+
+from htmltrust_canonicalization import extract_canonical_text
+
+
+def test_inline_no_separator():
+    """Inline elements like  must NOT introduce extra whitespace."""
+    assert (
+        extract_canonical_text("

hello world

") + == "hello world" + ) + + +def test_block_boundary_inserts_space(): + """

A

B

-> "A B" (not "AB").""" + assert ( + extract_canonical_text("

A

B

") == "A B" + ) + + +def test_excluded_elements_removed(): + """script/style/meta content must vanish entirely.""" + html = ( + "

before

" + "" + "" + "" + "

after

" + ) + assert extract_canonical_text(html) == "before after" + + +def test_entity_decoding(): + """HTML entities must be decoded by the parser.""" + assert ( + extract_canonical_text("

A & B — C

") + == "A & B - C" + ) + + +def test_normalization_pipeline_applied(): + """The canonicalization pipeline must run on the extracted text.""" + # Curly quotes inside HTML get extracted then normalized to straight. + assert ( + extract_canonical_text("

“Hello”

") == '"Hello"' + ) + + +def test_nested_blocks(): + """Deeply nested block structure still produces single-space joins.""" + html = ( + "
" + "

Title

" + "

Para one.

Para two.

" + "
" + ) + out = extract_canonical_text(html) + # We don't pin the exact spacing count beyond "single-space collapsed", + # since multiple block-boundary spaces must collapse via phase 2. + assert out == "Title Para one. Para two." + + +def test_list_items_separated(): + assert ( + extract_canonical_text("
  • a
  • b
  • c
") + == "a b c" + ) + + +def test_extract_rejects_non_string(): + with pytest.raises(TypeError): + extract_canonical_text(123) # type: ignore[arg-type] + + +def test_table_cells_separated(): + html = "
ab
cd
" + assert extract_canonical_text(html) == "a b c d" + + +def test_inline_link_no_separator(): + """Anchor tags are inline; they must NOT add separators.""" + assert ( + extract_canonical_text('

see here now

') + == "see here now" + ) diff --git a/python/tests/test_normalize.py b/python/tests/test_normalize.py new file mode 100644 index 0000000..2c5cd86 --- /dev/null +++ b/python/tests/test_normalize.py @@ -0,0 +1,88 @@ +"""Conformance tests for ``normalize_text``. + +The 18 test cases below are a direct port of +``htmltrust-canonicalization/javascript/test.js`` and MUST produce +byte-identical results across all language bindings. +""" + +import pytest + +from htmltrust_canonicalization import normalize_text + + +# (input_a, input_b, should_match, description) +NORMALIZATION_CASES = [ + ("“Hello”", '"Hello"', True, "Curly double quotes -> straight"), + ("café", "café", True, "Precomposed vs combining (NFKC)"), + ("find", "find", True, "fi ligature (NFKC)"), + ("word — word", "word - word", True, "Em dash -> hyphen-minus"), + ("«Bonjour»", '"Bonjour"', True, "Guillemets -> double quotes"), + ( + "「東京」", + '"東京"', + True, + "CJK corner brackets -> double quotes", + ), + ( + "می‌خواهم", + "میخواهم", + False, + "ZWNJ is semantic (Persian)", + ), + ( + "كتـــاب", + "كتاب", + True, + "Arabic tatweel stripped", + ), + ("A1", "A1", True, "Fullwidth ASCII (NFKC)"), + ("①", "1", True, "Circled digit (NFKC)"), + ("word​word", "wordword", True, "ZWSP stripped"), + ("word‌word", "wordword", False, "ZWNJ preserved (different)"), + ("Hello…", "Hello...", True, "Ellipsis -> three dots"), + ("‘Hello’", "'Hello'", True, "Curly single quotes -> straight"), + ("‚German“", '"German"', True, "Low-9 quotes -> straight"), + ("a b", "a b", True, "No-break space -> space"), + ("a b", "a b", True, "Ideographic space -> space"), + ("a \t b", "a b", True, "Whitespace collapse"), +] + + +@pytest.mark.parametrize("a,b,should_match,desc", NORMALIZATION_CASES) +def test_normalize_match(a: str, b: str, should_match: bool, desc: str): + norm_a = normalize_text(a) + norm_b = normalize_text(b) + if should_match: + assert norm_a == norm_b, ( + f"{desc!r}: expected match but got\n" + f" A={norm_a!r}\n 
B={norm_b!r}" + ) + else: + assert norm_a != norm_b, ( + f"{desc!r}: expected mismatch but both normalized to {norm_a!r}" + ) + + +def test_preserve_whitespace(): + """``preserve_whitespace=True`` must skip phase-2 collapsing.""" + src = "line1\n line2\t\tline3" + assert normalize_text(src, preserve_whitespace=True) == src + + +def test_normalize_text_rejects_non_string(): + with pytest.raises(TypeError): + normalize_text(123) # type: ignore[arg-type] + + +def test_zwj_preserved_emoji(): + """Family ZWJ sequence must survive normalization.""" + family = "\U0001F468‍\U0001F469‍\U0001F467" + assert normalize_text(family) == family + + +def test_idempotent(): + """Normalization must be a fixed-point operation.""" + src = "“Café—test…”" + once = normalize_text(src) + twice = normalize_text(once) + assert once == twice diff --git a/rust/Cargo.toml b/rust/Cargo.toml new file mode 100644 index 0000000..146ecc1 --- /dev/null +++ b/rust/Cargo.toml @@ -0,0 +1,23 @@ +[package] +name = "htmltrust-canonicalization" +version = "0.1.0" +edition = "2021" +rust-version = "1.74" +description = "Canonical text normalization and HTML extraction for HTMLTrust signed content. Byte-identical output to the JavaScript, Go, PHP, and Python bindings." +license = "MIT" +repository = "https://github.com/HTMLTrust/htmltrust-canonicalization" +readme = "README.md" +keywords = ["htmltrust", "canonicalization", "signing", "html", "unicode"] +categories = ["text-processing", "cryptography"] + +[lib] +name = "htmltrust_canonicalization" +path = "src/lib.rs" + +[dependencies] +unicode-normalization = "0.1" +scraper = "0.20" +ego-tree = "0.6" + +[dev-dependencies] +# pure stdlib tests; nothing extra required. diff --git a/rust/README.md b/rust/README.md new file mode 100644 index 0000000..41c797b --- /dev/null +++ b/rust/README.md @@ -0,0 +1,63 @@ +# HTMLTrust Canonicalization -- Rust + +Rust crate for the HTMLTrust canonical text normalization library. 
Produces byte-identical output to the JavaScript, Go, PHP, and Python implementations for every test vector in the shared conformance suite. + +## Status + +Implemented. The 18-case normalization conformance suite from the JavaScript reference (`javascript/test.js`) passes, along with parity tests for `extract_canonical_text` and `canonicalize_claims`. + +Out of scope for this crate: signature verification and key resolution. Those will arrive in a follow-up PR alongside the Python binding once the JavaScript surface area lands on `main`. + +## Scope + +This crate provides three functions: + +1. **`normalize_text(text: &str, preserve_whitespace: bool) -> String`** -- applies the 8-phase canonicalization defined in [`../spec.md`](../spec.md) to a UTF-8 string. +2. **`extract_canonical_text(html: &str) -> String`** -- parses an HTML fragment with `scraper` (html5ever), walks the DOM, emits text nodes in document order with single-space separators between block elements, and applies `normalize_text` to the result. +3. **`canonicalize_claims(claims: &BTreeMap) -> String`** -- serializes a claim map to the canonical, hashable string used by the `claims-hash` field of the signature binding. + +All three are pure functions: no I/O, deterministic output for the same input. + +## Dependencies + +- `unicode-normalization` for NFKC +- `scraper` (html5ever-backed) for HTML parsing in `extract_canonical_text` +- `ego-tree` for the DOM walk types re-exported by scraper + +## Conformance + +`tests/conformance.rs` runs all 18 normalization vectors from `javascript/test.js`, plus `extract_canonical_text` and `canonicalize_claims` parity cases. Output MUST stay byte-identical to the JavaScript / Go / PHP / Python bindings. 
+ +## Installation + +```toml +[dependencies] +htmltrust-canonicalization = "0.1" +``` + +## Usage + +```rust +use std::collections::BTreeMap; +use htmltrust_canonicalization::{ + normalize_text, extract_canonical_text, canonicalize_claims, +}; + +let canonical = normalize_text("He said, \"Hello\u{2026}\"", false); +// -> "He said, \"Hello...\"" + +let from_html = extract_canonical_text("

Hello world!

"); +// -> "Hello world!" + +let mut claims = BTreeMap::new(); +claims.insert("License".to_string(), "CC-BY-4.0".to_string()); +claims.insert("AIAssistance".to_string(), "None".to_string()); +let claims_str = canonicalize_claims(&claims); +// -> "AIAssistance=None\nLicense=CC-BY-4.0" +``` + +## Tests + +```bash +cargo test +``` diff --git a/rust/src/lib.rs b/rust/src/lib.rs new file mode 100644 index 0000000..be5b5c5 --- /dev/null +++ b/rust/src/lib.rs @@ -0,0 +1,313 @@ +//! HTMLTrust canonicalization (Rust binding). +//! +//! Public API: +//! +//! - [`normalize_text`] -- the 8-phase HTMLTrust canonicalization pipeline. +//! - [`extract_canonical_text`] -- HTML -> canonical text extraction +//! (spec §2.1), parses with `scraper` (html5ever) and walks the DOM. +//! - [`canonicalize_claims`] -- canonical serialization of claim metadata +//! for the `claims-hash` field of the signature binding. +//! +//! All three functions produce byte-identical output to the JavaScript, +//! Go, PHP, and Python bindings. The 18 conformance cases in +//! `tests/conformance.rs` are a direct port of the shared test suite +//! (`htmltrust-canonicalization/javascript/test.js`). + +use std::collections::BTreeMap; + +use scraper::{node::Node, Html}; +use ego_tree::NodeRef; +use unicode_normalization::UnicodeNormalization; + +// --------------------------------------------------------------------------- +// Codepoint ranges, mirroring the JS reference regex character classes +// byte-for-byte. Inclusive ranges. Single codepoints expressed as +// (cp, cp). +// --------------------------------------------------------------------------- + +/// Phase 6 + 7: invisible / formatting / bidi characters to strip. +/// ZWNJ (U+200C) and ZWJ (U+200D) are deliberately preserved -- they are +/// semantic in Persian, Indic, and emoji. 
+const STRIP_RANGES: &[(u32, u32)] = &[ + (0x00AD, 0x00AD), // soft hyphen + (0x200B, 0x200B), // zero-width space + (0x200E, 0x200E), // LRM + (0x200F, 0x200F), // RLM + (0x2060, 0x2060), // word joiner + (0xFEFF, 0xFEFF), // BOM / ZWNBSP + (0x034F, 0x034F), // combining grapheme joiner + (0x061C, 0x061C), // arabic letter mark + (0x180E, 0x180E), // mongolian vowel separator + (0x0640, 0x0640), // arabic tatweel + (0xFE00, 0xFE0F), // variation selectors 1-16 + (0x202A, 0x202E), // bidi embedding controls + (0x2066, 0x2069), // bidi isolate controls + (0x2061, 0x2064), // invisible math operators + (0xFFF9, 0xFFFC), // interlinear annotation + obj replacement + // Supplementary plane: variation selectors 17-256, tag characters. + (0xE0001, 0xE007F), + (0xE0100, 0xE01EF), +]; + +/// Phase 2: Unicode whitespace -> U+0020. +const WHITESPACE_RANGES: &[(u32, u32)] = &[ + (0x0009, 0x000D), // HT, LF, VT, FF, CR + (0x0020, 0x0020), // SPACE + (0x0085, 0x0085), // NEL + (0x00A0, 0x00A0), // NBSP + (0x1680, 0x1680), // ogham space mark + (0x2000, 0x200A), // en quad .. hair space + (0x2028, 0x2028), // line separator + (0x2029, 0x2029), // paragraph separator + (0x202F, 0x202F), // narrow no-break space + (0x205F, 0x205F), // medium mathematical space + (0x3000, 0x3000), // ideographic space +]; + +/// Phase 3: single quotes -> ASCII apostrophe. +const SINGLE_QUOTE_POINTS: &[u32] = &[ + 0x2018, // left single quote + 0x2019, // right single quote + 0x201B, // single high-reversed-9 + 0x2039, // single left guillemet + 0x203A, // single right guillemet + 0x0060, // grave accent + 0x00B4, // acute accent + 0x2032, // prime +]; + +/// Phase 3: double quotes -> ASCII double quote. 
+const DOUBLE_QUOTE_POINTS: &[u32] = &[ + 0x201A, // single low-9 quote (intentionally mapped to double) + 0x201C, // left double quote + 0x201D, // right double quote + 0x201E, // low double quote + 0x201F, // double high-reversed-9 + 0x00AB, // left guillemet + 0x00BB, // right guillemet + 0x2033, // double prime + 0x301D, // reversed double prime quotation mark + 0x301E, // double prime quotation mark + 0x301F, // low double prime quotation mark +]; + +/// Phase 3: CJK corner brackets -> ASCII double quote. +const CJK_QUOTE_RANGES: &[(u32, u32)] = &[ + (0x300C, 0x300F), // CJK corner brackets + (0xFE41, 0xFE44), // presentation forms for vertical corner brackets +]; + +/// Phase 4: dashes -> ASCII hyphen-minus. +const DASH_POINTS: &[u32] = &[ + 0x2212, // minus sign + 0xFE58, // small em dash + 0xFE63, // small hyphen-minus +]; +const DASH_RANGES: &[(u32, u32)] = &[ + (0x2010, 0x2015), // hyphen .. horizontal bar +]; + +/// Phase 5: ellipsis -> three periods. +const ELLIPSIS: char = '\u{2026}'; + +// --------------------------------------------------------------------------- +// Range / point membership helpers (linear; the sets are tiny). +// --------------------------------------------------------------------------- + +fn in_ranges(c: char, ranges: &[(u32, u32)]) -> bool { + let cp = c as u32; + ranges.iter().any(|&(start, end)| cp >= start && cp <= end) +} + +fn in_points(c: char, points: &[u32]) -> bool { + points.contains(&(c as u32)) +} + +// --------------------------------------------------------------------------- +// Public API +// --------------------------------------------------------------------------- + +/// Apply the HTMLTrust 8-phase canonicalization pipeline to `text`. +/// +/// Order matches the JavaScript reference implementation precisely. +/// +/// # Arguments +/// +/// * `text` -- raw text content (typically the output of +/// [`extract_canonical_text`]). +/// * `preserve_whitespace` -- `true` for `
`<pre>` content
+///   is significant; otherwise `false`.
+///
+/// # Returns
+///
+/// Normalized text, suitable for hashing.
+pub fn normalize_text(text: &str, preserve_whitespace: bool) -> String {
+    // Phase 1: NFKC.
+    let nfkc: String = text.nfkc().collect();
+
+    // Phases 6 + 7: strip invisible / formatting / bidi characters.
+    let stripped: String = nfkc.chars().filter(|&c| !in_ranges(c, STRIP_RANGES)).collect();
+
+    // Phase 2: whitespace normalization.
+    let ws: String = if preserve_whitespace {
+        stripped
+    } else {
+        let mut buf = String::with_capacity(stripped.len());
+        let mut prev_space = false;
+        for c in stripped.chars() {
+            if in_ranges(c, WHITESPACE_RANGES) {
+                if !prev_space {
+                    buf.push(' ');
+                    prev_space = true;
+                }
+            } else {
+                buf.push(c);
+                prev_space = false;
+            }
+        }
+        buf
+    };
+
+    // Phases 3, 4, 5 in a single pass.
+    let mut out = String::with_capacity(ws.len());
+    for c in ws.chars() {
+        if in_points(c, SINGLE_QUOTE_POINTS) {
+            out.push('\'');
+        } else if in_points(c, DOUBLE_QUOTE_POINTS) || in_ranges(c, CJK_QUOTE_RANGES) {
+            out.push('"');
+        } else if in_points(c, DASH_POINTS) || in_ranges(c, DASH_RANGES) {
+            out.push('-');
+        } else if c == ELLIPSIS {
+            out.push_str("...");
+        } else {
+            out.push(c);
+        }
+    }
+    out
+}
+
+/// Extract canonical text from an HTML fragment.
+///
+/// Implements the HTML -> canonical text extraction defined in spec §2.1
+/// and ports the contract of the JavaScript `extractCanonicalText`. Uses
+/// `scraper` (html5ever under the hood) for parsing.
+///
+/// # Arguments
+///
+/// * `html` -- HTML fragment to canonicalize.
+///
+/// # Returns
+///
+/// Canonical text, ready to be hashed. Trimmed of leading/trailing
+/// whitespace.
+pub fn extract_canonical_text(html: &str) -> String {
+    let document = Html::parse_fragment(html);
+
+    let mut out = String::new();
+    walk(document.tree.root(), &mut out);
+
+    normalize_text(&out, false).trim().to_string()
+}
+
+fn is_excluded_tag(name: &str) -> bool {
+    matches!(
+        name,
+        "script" | "style" | "meta" | "link" | "head" | "noscript"
+    )
+}
+
+fn is_block_tag(name: &str) -> bool {
+    matches!(
+        name,
+        "address"
+            | "article"
+            | "aside"
+            | "blockquote"
+            | "canvas"
+            | "dd"
+            | "div"
+            | "dl"
+            | "dt"
+            | "fieldset"
+            | "figcaption"
+            | "figure"
+            | "footer"
+            | "form"
+            | "h1"
+            | "h2"
+            | "h3"
+            | "h4"
+            | "h5"
+            | "h6"
+            | "header"
+            | "hr"
+            | "li"
+            | "main"
+            | "nav"
+            | "noscript"
+            | "ol"
+            | "output"
+            | "p"
+            | "pre"
+            | "section"
+            | "table"
+            | "tfoot"
+            | "thead"
+            | "tr"
+            | "td"
+            | "th"
+            | "ul"
+            | "video"
+    )
+}
+
+fn walk<'a>(node: NodeRef<'a, Node>, out: &mut String) {
+    for child in node.children() {
+        match child.value() {
+            Node::Text(t) => {
+                out.push_str(&t.text);
+            }
+            Node::Element(e) => {
+                let name = e.name();
+                if is_excluded_tag(name) {
+                    continue;
+                }
+                let block = is_block_tag(name);
+                if block {
+                    out.push(' ');
+                }
+                walk(child, out);
+                if block {
+                    out.push(' ');
+                }
+            }
+            _ => {
+                // Comments, doctypes, processing instructions -- not signed.
+            }
+        }
+    }
+}
+
+/// Compute the canonical serialization of a claim map.
+///
+/// Each name and value is run through [`normalize_text`] and entries are
+/// sorted lexically by name, then joined by `\n` as `name=value` pairs.
+/// The caller is responsible for hashing the result.
+///
+/// `BTreeMap` is used as the input type because its iteration order is
+/// already lexicographic, which makes the determinism property obvious
+/// at the type level. Callers with other map types can pass via
+/// `BTreeMap::from_iter(...)`.
+pub fn canonicalize_claims(claims: &BTreeMap<String, String>) -> String {
+    let mut entries: Vec<(String, String)> = claims
+        .iter()
+        .map(|(k, v)| (normalize_text(k, false), normalize_text(v, false)))
+        .collect();
+    // Re-sort after normalization in case normalization changes name order.
+    entries.sort_by(|a, b| a.0.cmp(&b.0));
+    entries
+        .into_iter()
+        .map(|(k, v)| format!("{}={}", k, v))
+        .collect::<Vec<_>>()
+        .join("\n")
+}
diff --git a/rust/tests/conformance.rs b/rust/tests/conformance.rs
new file mode 100644
index 0000000..4d3a04f
--- /dev/null
+++ b/rust/tests/conformance.rs
@@ -0,0 +1,208 @@
+//! Conformance tests for the HTMLTrust Rust binding.
+//!
+//! The 18 normalization cases below are a direct port of
+//! `htmltrust-canonicalization/javascript/test.js` and MUST produce
+//! byte-identical results across all language bindings.
+
+use std::collections::BTreeMap;
+
+use htmltrust_canonicalization::{
+    canonicalize_claims, extract_canonical_text, normalize_text,
+};
+
+/// One conformance vector. `(input_a, input_b, should_match, description)`.
+type Case = (&'static str, &'static str, bool, &'static str);
+
+const NORMALIZATION_CASES: &[Case] = &[
+    (
+        "\u{201C}Hello\u{201D}",
+        "\"Hello\"",
+        true,
+        "Curly double quotes -> straight",
+    ),
+    (
+        "caf\u{00E9}",
+        "cafe\u{0301}",
+        true,
+        "Precomposed vs combining (NFKC)",
+    ),
+    ("\u{FB01}nd", "find", true, "fi ligature (NFKC)"),
+    (
+        "word \u{2014} word",
+        "word - word",
+        true,
+        "Em dash -> hyphen-minus",
+    ),
+    (
+        "\u{00AB}Bonjour\u{00BB}",
+        "\"Bonjour\"",
+        true,
+        "Guillemets -> double quotes",
+    ),
+    (
+        "\u{300C}\u{6771}\u{4EAC}\u{300D}",
+        "\"\u{6771}\u{4EAC}\"",
+        true,
+        "CJK corner brackets -> double quotes",
+    ),
+    (
+        "\u{0645}\u{06CC}\u{200C}\u{062E}\u{0648}\u{0627}\u{0647}\u{0645}",
+        "\u{0645}\u{06CC}\u{062E}\u{0648}\u{0627}\u{0647}\u{0645}",
+        false,
+        "ZWNJ is semantic (Persian)",
+    ),
+    (
+        "\u{0643}\u{062A}\u{0640}\u{0640}\u{0640}\u{0627}\u{0628}",
+        "\u{0643}\u{062A}\u{0627}\u{0628}",
+        true,
+        "Arabic tatweel stripped",
+    ),
+    ("\u{FF21}\u{FF11}", "A1", true, "Fullwidth ASCII (NFKC)"),
+    ("\u{2460}", "1", true, "Circled digit (NFKC)"),
+    ("word\u{200B}word", "wordword", true, "ZWSP stripped"),
+    (
+        "word\u{200C}word",
+        "wordword",
+        false,
+        "ZWNJ preserved (different)",
+    ),
+    ("Hello\u{2026}", "Hello...", true, "Ellipsis -> three dots"),
+    (
+        "\u{2018}Hello\u{2019}",
+        "'Hello'",
+        true,
+        "Curly single quotes -> straight",
+    ),
+    (
+        "\u{201A}German\u{201C}",
+        "\"German\"",
+        true,
+        "Low-9 quotes -> straight",
+    ),
+    ("a\u{00A0}b", "a b", true, "No-break space -> space"),
+    ("a\u{3000}b", "a b", true, "Ideographic space -> space"),
+    ("a  \t  b", "a b", true, "Whitespace collapse"),
+];
+
+#[test]
+fn normalization_conformance() {
+    let mut failures = Vec::<String>::new();
+    for &(a, b, should_match, desc) in NORMALIZATION_CASES {
+        let na = normalize_text(a, false);
+        let nb = normalize_text(b, false);
+        let matched = na == nb;
+        if matched != should_match {
+            failures.push(format!(
+                "  {desc}: A={na:?} B={nb:?} expected match={should_match}, got match={matched}",
+            ));
+        }
+    }
+    assert!(
+        failures.is_empty(),
+        "{} failure(s):\n{}",
+        failures.len(),
+        failures.join("\n"),
+    );
+}
+
+#[test]
+fn preserve_whitespace_skips_collapse() {
+    let src = "line1\n    line2\t\tline3";
+    assert_eq!(normalize_text(src, true), src);
+}
+
+#[test]
+fn idempotent_for_typical_input() {
+    let src = "\u{201C}Caf\u{00E9}\u{2014}test\u{2026}\u{201D}";
+    let once = normalize_text(src, false);
+    let twice = normalize_text(&once, false);
+    assert_eq!(once, twice);
+}
+
+#[test]
+fn extract_inline_no_separator() {
+    assert_eq!(
+        extract_canonical_text("

hello world

"), + "hello world", + ); +} + +#[test] +fn extract_block_boundary_inserts_space() { + assert_eq!(extract_canonical_text("

A

B

"), "A B"); +} + +#[test] +fn extract_excluded_elements_removed() { + let html = "\ +

before

\ +\ +\ +\ +

after

"; + assert_eq!(extract_canonical_text(html), "before after"); +} + +#[test] +fn extract_entity_decoding() { + assert_eq!( + extract_canonical_text("

A & B — C

"), + "A & B - C", + ); +} + +#[test] +fn extract_normalization_pipeline_applied() { + assert_eq!( + extract_canonical_text("

\u{201C}Hello\u{201D}

"), + "\"Hello\"", + ); +} + +#[test] +fn extract_nested_blocks() { + let html = "

Title

\ +

Para one.

Para two.

"; + assert_eq!(extract_canonical_text(html), "Title Para one. Para two."); +} + +#[test] +fn extract_list_items_separated() { + assert_eq!( + extract_canonical_text("
  • a
  • b
  • c
"), + "a b c", + ); +} + +#[test] +fn extract_inline_link_no_separator() { + assert_eq!( + extract_canonical_text("

see here now

"), + "see here now", + ); +} + +#[test] +fn claims_empty() { + let claims: BTreeMap = BTreeMap::new(); + assert_eq!(canonicalize_claims(&claims), ""); +} + +#[test] +fn claims_sorted_by_name() { + let mut claims = BTreeMap::new(); + claims.insert("License".to_string(), "CC-BY-4.0".to_string()); + claims.insert("AIAssistance".to_string(), "None".to_string()); + claims.insert("ContentType".to_string(), "Article".to_string()); + assert_eq!( + canonicalize_claims(&claims), + "AIAssistance=None\nContentType=Article\nLicense=CC-BY-4.0", + ); +} + +#[test] +fn claims_normalize_values() { + let mut claims = BTreeMap::new(); + claims.insert("author".to_string(), "\u{201C}Alice\u{201D}".to_string()); + assert_eq!(canonicalize_claims(&claims), "author=\"Alice\""); +}