diff --git a/.gitignore b/.gitignore index e00c140..199736c 100644 --- a/.gitignore +++ b/.gitignore @@ -12,6 +12,20 @@ go/go.sum php/vendor/ php/composer.lock +# Python +python/.venv/ +python/.pytest_cache/ +python/build/ +python/dist/ +python/*.egg-info/ +python/htmltrust_canonicalization.egg-info/ +__pycache__/ +*.pyc + +# Rust +rust/target/ +rust/Cargo.lock + # IDE .idea/ .vscode/ diff --git a/python/README.md b/python/README.md new file mode 100644 index 0000000..1c443aa --- /dev/null +++ b/python/README.md @@ -0,0 +1,66 @@ +# HTMLTrust Canonicalization -- Python + +Python binding for the HTMLTrust canonical text normalization library. Produces byte-identical output to the JavaScript, Go, PHP, and Rust implementations for every test vector in the shared conformance suite. + +## Status + +Implemented. The 18-case normalization conformance suite from the JavaScript reference (`javascript/test.js`) passes. `extract_canonical_text` and `canonicalize_claims` have parity tests against the JavaScript / Go / PHP reference behaviour. + +Out of scope for this package: signature verification and key resolution. Those live in the higher-level HTMLTrust client libraries (and will arrive in a follow-up PR for the Python binding once the JS surface area lands on `main`). + +## Scope + +This package provides three functions: + +1. **`normalize_text(text: str, preserve_whitespace: bool = False) -> str`** -- applies the 8-phase canonicalization defined in [`../spec.md`](../spec.md) to a UTF-8 string. Mirrors the existing JavaScript/Go/PHP signatures. +2. **`extract_canonical_text(html: str, preserve_whitespace: bool = False) -> str`** -- parses an HTML fragment with BeautifulSoup, walks the DOM, emits text nodes in document order with single-space separators between block elements, and applies `normalize_text` to the result. This is the HTML -> canonical text extraction defined in the paper's §2.1. +3. 
**`canonicalize_claims(claims: Mapping[str, object]) -> str`** -- serializes a claim map to the canonical, hashable string used by the `claims-hash` field of the signature binding (each entry normalized, sorted lexically by name, joined with `\n` as `name=value`). + +All three are pure functions: no network, no file I/O, deterministic output for the same input. + +## Dependencies + +- `unicodedata` (stdlib) for NFKC normalization +- `beautifulsoup4 >= 4.12` for HTML parsing in `extract_canonical_text` +- No other runtime dependencies + +## Conformance + +`tests/test_normalize.py` runs all 18 normalization vectors from `javascript/test.js`. `tests/test_extract.py` and `tests/test_claims.py` cover the HTML extraction and claim canonicalization contracts. Output MUST stay byte-identical to the JavaScript / Go / PHP / Rust bindings. + +## Installation + +```bash +pip install htmltrust-canonicalization +# or for development: +cd python && pip install -e '.[dev]' +``` + +## Usage + +```python +from htmltrust_canonicalization import ( + normalize_text, + extract_canonical_text, + canonicalize_claims, +) + +canonical = normalize_text('He said, "Hello…"') +# -> 'He said, "Hello..."' + +from_html = extract_canonical_text('

Hello world!

') +# -> 'Hello world!' + +claims_str = canonicalize_claims({ + 'License': 'CC-BY-4.0', + 'AIAssistance': 'None', +}) +# -> 'AIAssistance=None\nLicense=CC-BY-4.0' +``` + +## Tests + +```bash +pip install -e '.[dev]' +pytest +``` diff --git a/python/htmltrust_canonicalization/__init__.py b/python/htmltrust_canonicalization/__init__.py new file mode 100644 index 0000000..7309b6a --- /dev/null +++ b/python/htmltrust_canonicalization/__init__.py @@ -0,0 +1,22 @@ +"""HTMLTrust canonicalization (Python binding). + +Public API: + - normalize_text(text, preserve_whitespace=False) -> str + - extract_canonical_text(html, preserve_whitespace=False) -> str + - canonicalize_claims(claims) -> str + +This binding produces byte-identical output to the JavaScript, Go, PHP, +and Rust implementations of the HTMLTrust canonicalization library. +""" + +from ._normalize import normalize_text +from ._extract import extract_canonical_text +from ._claims import canonicalize_claims + +__all__ = [ + "normalize_text", + "extract_canonical_text", + "canonicalize_claims", +] + +__version__ = "0.1.0" diff --git a/python/htmltrust_canonicalization/_claims.py b/python/htmltrust_canonicalization/_claims.py new file mode 100644 index 0000000..e39aa93 --- /dev/null +++ b/python/htmltrust_canonicalization/_claims.py @@ -0,0 +1,39 @@ +"""Canonical claims serialization (HTMLTrust spec §2.1). + +Direct port of ``canonicalizeClaims`` from the JavaScript reference +implementation. Claims are normalized through the same pipeline as +content text and emitted as a sorted list of ``name=value`` pairs joined +by newlines. The caller is responsible for hashing the result. +""" + +from __future__ import annotations + +from collections.abc import Mapping + +from ._normalize import normalize_text + + +def canonicalize_claims(claims: Mapping[str, object]) -> str: + """Serialize ``claims`` to the canonical, sortable, hashable string form. 
+ + Each claim name and value is run through ``normalize_text`` so that + Unicode equivalents collapse to identical bytes. Entries are then + sorted lexically by name and joined with newlines as ``name=value``. + + Args: + claims: Mapping of claim name to value. Values are coerced to + ``str`` before normalization so callers may pass simple + scalar types. + + Returns: + Canonical serialized string ready to be hashed. + """ + if not isinstance(claims, Mapping): + raise TypeError("canonicalize_claims expects a Mapping") + + entries = [ + (normalize_text(name), normalize_text(str(value))) + for name, value in claims.items() + ] + entries.sort(key=lambda nv: nv[0]) + return "\n".join(f"{name}={value}" for name, value in entries) diff --git a/python/htmltrust_canonicalization/_extract.py b/python/htmltrust_canonicalization/_extract.py new file mode 100644 index 0000000..56a516e --- /dev/null +++ b/python/htmltrust_canonicalization/_extract.py @@ -0,0 +1,101 @@ +"""HTML -> canonical text extraction (HTMLTrust spec §2.1). + +Direct semantic port of ``extractCanonicalText`` from the JavaScript +reference implementation. The Python binding uses BeautifulSoup +(html.parser backend, stdlib) for parsing because real HTML is messy +and a forgiving parser produces more reliable output than the JS +binding's regex pipeline. The text-output contract (which elements +contribute, where whitespace separators go) is identical. +""" + +from __future__ import annotations + +from bs4 import BeautifulSoup, NavigableString, Tag + +from ._normalize import normalize_text + +# Elements whose text content is NEVER part of the signed content. +# `` is excluded because, inside a signed-section, it carries +# claim metadata, not signed content (claims are hashed separately into +# the claims-hash field). +_EXCLUDED_TAGS = frozenset({ + "script", "style", "meta", "link", "head", "noscript", +}) + +# Block-level elements whose boundaries become whitespace separators. 
+# Inline elements (em, strong, a, span, etc.) do NOT introduce separators, +# so "

hello world

" canonicalizes to "hello world". +_BLOCK_TAGS = frozenset({ + "address", "article", "aside", "blockquote", "canvas", "dd", "div", + "dl", "dt", "fieldset", "figcaption", "figure", "footer", "form", + "h1", "h2", "h3", "h4", "h5", "h6", + "header", "hr", "li", "main", "nav", "noscript", "ol", "output", + "p", "pre", "section", "table", "tfoot", "thead", + "tr", "td", "th", "ul", "video", +}) + + +def extract_canonical_text(html: str, preserve_whitespace: bool = False) -> str: + """Extract canonical text content from an HTML fragment. + + Given an HTML fragment (typically the inner contents of a + ```` element), this: + + 1. Strips excluded elements (script, style, meta, link, head, noscript) + and their contents. + 2. Walks the remaining tree in document order, inserting a single + space at every block-element boundary so that ``

A

B

`` + extracts to ``"A B"`` and not ``"AB"``. + 3. Emits text nodes verbatim (entity-decoded by the parser). + 4. Applies the full text-normalization pipeline (``normalize_text``). + + Args: + html: HTML fragment to canonicalize. + preserve_whitespace: Passed through to ``normalize_text``. + Defaults to ``False``. + + Returns: + Canonical text, ready to be hashed. Trimmed of leading/trailing + whitespace. + """ + if not isinstance(html, str): + raise TypeError("extract_canonical_text expects a str") + + soup = BeautifulSoup(html, "html.parser") + + # Remove excluded elements (and their text content) outright. + for tag_name in _EXCLUDED_TAGS: + for elem in soup.find_all(tag_name): + elem.decompose() + + parts: list[str] = [] + _walk(soup, parts) + + text = "".join(parts) + return normalize_text(text, preserve_whitespace).strip() + + +def _walk(node, out: list[str]) -> None: + """Walk ``node`` in document order, appending text and block-boundary + spaces to ``out`` in place. + """ + for child in getattr(node, "children", ()): + if isinstance(child, NavigableString): + # bs4 navigable strings include comments / doctypes / cdata. + # We only want plain text, not Comment / Doctype / CData. + # Comment is a NavigableString subclass; check the type name. + cls_name = type(child).__name__ + if cls_name in ("Comment", "Doctype", "CData", "ProcessingInstruction"): + continue + out.append(str(child)) + elif isinstance(child, Tag): + name = child.name.lower() if child.name else "" + is_block = name in _BLOCK_TAGS + if is_block: + out.append(" ") + _walk(child, out) + if is_block: + out.append(" ") + # Void elements (br, hr, img, etc.) within inline context: hr is + # already in _BLOCK_TAGS; br is treated as inline (no separator), + # matching the JS reference which strips br via ANY_TAG_RE. 
diff --git a/python/htmltrust_canonicalization/_normalize.py b/python/htmltrust_canonicalization/_normalize.py new file mode 100644 index 0000000..dd97408 --- /dev/null +++ b/python/htmltrust_canonicalization/_normalize.py @@ -0,0 +1,192 @@ +"""Text normalization (the 8-phase HTMLTrust canonicalization pipeline). + +Direct port of the JavaScript reference implementation +(htmltrust-canonicalization/javascript/index.js, function ``normalizeText``). + +The character classes below are byte-for-byte the same Unicode +codepoint sets used by the JavaScript and Go bindings; output MUST be +byte-identical across language implementations. + +To keep this source file pure-ASCII and immune to editor mangling, the +character sets are built programmatically from explicit codepoint +ranges via ``chr()``. Each list entry is an ``(int, int)`` pair giving +an inclusive range, OR a single ``int`` for a single codepoint. +""" + +from __future__ import annotations + +import re +import unicodedata +from typing import Iterable, Union + +_RangeOrPoint = Union[int, tuple[int, int]] + + +def _build_class(items: Iterable[_RangeOrPoint]) -> str: + """Build a regex character class string from ``items``. + + Each item is either a single codepoint (``int``) or an inclusive + ``(start, end)`` range. Returns ``"[]"`` ready for ``re.compile``. + """ + parts: list[str] = [] + for item in items: + if isinstance(item, int): + parts.append(re.escape(chr(item))) + else: + start, end = item + # No re.escape on the dash separator; we want the literal '-'. + parts.append(f"{re.escape(chr(start))}-{re.escape(chr(end))}") + return "[" + "".join(parts) + "]" + + +# --------------------------------------------------------------------------- +# Phase 6 + 7: Invisible / formatting / bidi characters to strip. +# +# Mirrors JS reference STRIP_RE byte-for-byte. ZWNJ (U+200C) and ZWJ +# (U+200D) are deliberately preserved -- they are semantic in Persian, +# Indic, and emoji. 
+# --------------------------------------------------------------------------- +_STRIP_CODEPOINTS: list[_RangeOrPoint] = [ + 0x00AD, # soft hyphen + 0x200B, # zero-width space + 0x200E, # LRM + 0x200F, # RLM + 0x2060, # word joiner + 0xFEFF, # BOM / ZWNBSP + 0x034F, # combining grapheme joiner + 0x061C, # arabic letter mark + 0x180E, # mongolian vowel separator + 0x0640, # arabic tatweel + (0xFE00, 0xFE0F), # variation selectors 1-16 + (0x202A, 0x202E), # bidi embedding controls + (0x2066, 0x2069), # bidi isolate controls + (0x2061, 0x2064), # invisible math operators + (0xFFF9, 0xFFFC), # interlinear annotation + obj replacement +] +_STRIP_RE = re.compile(_build_class(_STRIP_CODEPOINTS)) + +# Supplementary plane: variation selectors 17-256, tag characters. +_STRIP_SUPPLEMENTARY_RE = re.compile( + _build_class([(0xE0001, 0xE007F), (0xE0100, 0xE01EF)]) +) + +# --------------------------------------------------------------------------- +# Phase 2: Unicode whitespace -> U+0020. +# +# Mirrors JS reference WHITESPACE_RE byte-for-byte. +# --------------------------------------------------------------------------- +_WHITESPACE_CODEPOINTS: list[_RangeOrPoint] = [ + (0x0009, 0x000D), # HT, LF, VT, FF, CR + 0x0020, # SPACE + 0x0085, # NEL + 0x00A0, # NBSP + 0x1680, # ogham space mark + (0x2000, 0x200A), # en quad .. hair space + 0x2028, # line separator + 0x2029, # paragraph separator + 0x202F, # narrow no-break space + 0x205F, # medium mathematical space + 0x3000, # ideographic space +] +_WHITESPACE_RE = re.compile(_build_class(_WHITESPACE_CODEPOINTS)) +_RUN_OF_SPACES_RE = re.compile(r" {2,}") + +# --------------------------------------------------------------------------- +# Phase 3: Quotation marks. +# Mirrors JS SINGLE_QUOTE_RE / DOUBLE_QUOTE_RE / CJK_QUOTE_RE byte-for-byte. 
+# --------------------------------------------------------------------------- +_SINGLE_QUOTE_CODEPOINTS: list[_RangeOrPoint] = [ + 0x2018, # left single quote + 0x2019, # right single quote + 0x201B, # single high-reversed-9 + 0x2039, # single left guillemet + 0x203A, # single right guillemet + 0x0060, # grave accent + 0x00B4, # acute accent + 0x2032, # prime +] +_SINGLE_QUOTE_RE = re.compile(_build_class(_SINGLE_QUOTE_CODEPOINTS)) + +_DOUBLE_QUOTE_CODEPOINTS: list[_RangeOrPoint] = [ + 0x201A, # single low-9 quote (intentionally mapped to double) + 0x201C, # left double quote + 0x201D, # right double quote + 0x201E, # low double quote + 0x201F, # double high-reversed-9 + 0x00AB, # left guillemet + 0x00BB, # right guillemet + 0x2033, # double prime + 0x301D, # reversed double prime quotation mark + 0x301E, # double prime quotation mark + 0x301F, # low double prime quotation mark +] +_DOUBLE_QUOTE_RE = re.compile(_build_class(_DOUBLE_QUOTE_CODEPOINTS)) + +_CJK_QUOTE_CODEPOINTS: list[_RangeOrPoint] = [ + 0x300C, # left corner bracket + 0x300D, # right corner bracket + 0x300E, # left white corner bracket + 0x300F, # right white corner bracket + (0xFE41, 0xFE44), # presentation forms for vertical corner brackets +] +_CJK_QUOTE_RE = re.compile(_build_class(_CJK_QUOTE_CODEPOINTS)) + +# --------------------------------------------------------------------------- +# Phase 4: Dashes -> U+002D (includes minus sign from Phase 5). +# Mirrors JS DASH_RE byte-for-byte. +# --------------------------------------------------------------------------- +_DASH_CODEPOINTS: list[_RangeOrPoint] = [ + (0x2010, 0x2015), # hyphen .. horizontal bar + 0x2212, # minus sign + 0xFE58, # small em dash + 0xFE63, # small hyphen-minus +] +_DASH_RE = re.compile(_build_class(_DASH_CODEPOINTS)) + +# Phase 5: Ellipsis -> three periods. 
+_ELLIPSIS_RE = re.compile(re.escape(chr(0x2026))) + + +def normalize_text(text: str, preserve_whitespace: bool = False) -> str: + """Apply the HTMLTrust 8-phase canonicalization pipeline to ``text``. + + Order matches the JavaScript reference implementation precisely. + + Args: + text: Raw text content (typically the output of + ``extract_canonical_text``). + preserve_whitespace: Set ``True`` for ``
`` content where
+            whitespace is significant. Defaults to ``False``.
+
+    Returns:
+        Normalized text, suitable for hashing.
+    """
+    if not isinstance(text, str):
+        raise TypeError("normalize_text expects a str")
+
+    # Phase 1: NFKC -- ligatures, fullwidth/halfwidth, presentation forms,
+    # superscripts, CJK compatibility, Jamo composition.
+    text = unicodedata.normalize("NFKC", text)
+
+    # Phases 6 + 7: strip invisible / formatting / bidi characters.
+    # Done early so they don't interfere with later phases.
+    text = _STRIP_RE.sub("", text)
+    text = _STRIP_SUPPLEMENTARY_RE.sub("", text)
+
+    # Phase 2: whitespace normalization.
+    if not preserve_whitespace:
+        text = _WHITESPACE_RE.sub(" ", text)
+        text = _RUN_OF_SPACES_RE.sub(" ", text)
+
+    # Phase 3: quotation marks.
+    text = _SINGLE_QUOTE_RE.sub("'", text)
+    text = _DOUBLE_QUOTE_RE.sub('"', text)
+    text = _CJK_QUOTE_RE.sub('"', text)
+
+    # Phase 4: dashes / hyphens / minus.
+    text = _DASH_RE.sub("-", text)
+
+    # Phase 5: ellipsis.
+    text = _ELLIPSIS_RE.sub("...", text)
+
+    return text
diff --git a/python/pyproject.toml b/python/pyproject.toml
new file mode 100644
index 0000000..71b2062
--- /dev/null
+++ b/python/pyproject.toml
@@ -0,0 +1,45 @@
+[build-system]
+requires = ["setuptools>=68"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "htmltrust-canonicalization"
+version = "0.1.0"
+description = "Canonical text normalization and HTML extraction for HTMLTrust signed content. Byte-identical output to the JavaScript, Go, PHP, and Rust bindings."
+readme = "README.md"
+requires-python = ">=3.10"
+license = { text = "MIT" }
+authors = [
+    { name = "HTMLTrust contributors" },
+]
+keywords = ["htmltrust", "canonicalization", "signing", "html", "unicode"]
+classifiers = [
+    "License :: OSI Approved :: MIT License",
+    "Programming Language :: Python :: 3",
+    "Programming Language :: Python :: 3.10",
+    "Programming Language :: Python :: 3.11",
+    "Programming Language :: Python :: 3.12",
+    "Programming Language :: Python :: 3.13",
+    "Programming Language :: Python :: 3.14",
+    "Topic :: Text Processing",
+    "Topic :: Security :: Cryptography",
+]
+dependencies = [
+    "beautifulsoup4>=4.12",
+]
+
+[project.optional-dependencies]
+dev = [
+    "pytest>=8.0",
+]
+
+[project.urls]
+Homepage = "https://github.com/HTMLTrust/htmltrust-canonicalization"
+Repository = "https://github.com/HTMLTrust/htmltrust-canonicalization"
+
+[tool.setuptools.packages.find]
+where = ["."]
+include = ["htmltrust_canonicalization*"]
+
+[tool.pytest.ini_options]
+testpaths = ["tests"]
diff --git a/python/tests/__init__.py b/python/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/python/tests/test_claims.py b/python/tests/test_claims.py
new file mode 100644
index 0000000..b1961db
--- /dev/null
+++ b/python/tests/test_claims.py
@@ -0,0 +1,59 @@
+"""Conformance tests for ``canonicalize_claims``.
+
+Mirrors the JavaScript reference ``canonicalizeClaims``. Output MUST be
+byte-identical across language bindings: claim entries serialize as
+``name=value`` lines, sorted lexically by name, joined by ``\\n``.
+"""
+
+import pytest
+
+from htmltrust_canonicalization import canonicalize_claims
+
+
+def test_empty_claims():
+    assert canonicalize_claims({}) == ""
+
+
+def test_single_claim():
+    assert canonicalize_claims({"License": "CC-BY-4.0"}) == "License=CC-BY-4.0"
+
+
+def test_sorted_by_name():
+    """Order in -> sorted out, regardless of source ordering."""
+    out = canonicalize_claims({
+        "License": "CC-BY-4.0",
+        "AIAssistance": "None",
+        "ContentType": "Article",
+    })
+    assert out == (
+        "AIAssistance=None\n"
+        "ContentType=Article\n"
+        "License=CC-BY-4.0"
+    )
+
+
+def test_normalizes_values():
+    """Values run through normalize_text -- curly quotes collapse."""
+    out = canonicalize_claims({"author": "“Alice”"})
+    assert out == 'author="Alice"'
+
+
+def test_normalizes_names():
+    """Claim names also normalize -- ensures hash determinism."""
+    # An ellipsis in a claim name is exotic but tests the contract.
+    out = canonicalize_claims({"odd…name": "x"})
+    assert out == "odd...name=x"
+
+
+def test_coerces_value_to_string():
+    out = canonicalize_claims({"count": 42, "enabled": True})
+    # Booleans serialize as "True" / "False" via str(); that's fine for
+    # this layer -- callers should pre-stringify if they need different
+    # representations.
+    assert "count=42" in out
+    assert "enabled=True" in out
+
+
+def test_rejects_non_mapping():
+    with pytest.raises(TypeError):
+        canonicalize_claims([("a", "b")])  # type: ignore[arg-type]
diff --git a/python/tests/test_extract.py b/python/tests/test_extract.py
new file mode 100644
index 0000000..8a3d574
--- /dev/null
+++ b/python/tests/test_extract.py
@@ -0,0 +1,94 @@
+"""Conformance tests for ``extract_canonical_text``.
+
+These cases mirror the contract of the JavaScript reference
+``extractCanonicalText`` and confirm that block-element boundaries
+become whitespace, inline elements do not, excluded elements vanish
+entirely, and HTML entities are decoded by the parser before
+normalization.
+"""
+
+import pytest
+
+from htmltrust_canonicalization import extract_canonical_text
+
+
+def test_inline_no_separator():
+    """Inline elements like  must NOT introduce extra whitespace."""
+    assert (
+        extract_canonical_text("

hello world

") + == "hello world" + ) + + +def test_block_boundary_inserts_space(): + """

A

B

-> "A B" (not "AB").""" + assert ( + extract_canonical_text("

A

B

") == "A B" + ) + + +def test_excluded_elements_removed(): + """script/style/meta content must vanish entirely.""" + html = ( + "

before

" + "" + "" + "" + "

after

" + ) + assert extract_canonical_text(html) == "before after" + + +def test_entity_decoding(): + """HTML entities must be decoded by the parser.""" + assert ( + extract_canonical_text("

A & B — C

") + == "A & B - C" + ) + + +def test_normalization_pipeline_applied(): + """The canonicalization pipeline must run on the extracted text.""" + # Curly quotes inside HTML get extracted then normalized to straight. + assert ( + extract_canonical_text("

“Hello”

") == '"Hello"' + ) + + +def test_nested_blocks(): + """Deeply nested block structure still produces single-space joins.""" + html = ( + "
" + "

Title

" + "

Para one.

Para two.

" + "
" + ) + out = extract_canonical_text(html) + # We don't pin the exact spacing count beyond "single-space collapsed", + # since multiple block-boundary spaces must collapse via phase 2. + assert out == "Title Para one. Para two." + + +def test_list_items_separated(): + assert ( + extract_canonical_text("
  • a
  • b
  • c
") + == "a b c" + ) + + +def test_extract_rejects_non_string(): + with pytest.raises(TypeError): + extract_canonical_text(123) # type: ignore[arg-type] + + +def test_table_cells_separated(): + html = "
ab
cd
" + assert extract_canonical_text(html) == "a b c d" + + +def test_inline_link_no_separator(): + """Anchor tags are inline; they must NOT add separators.""" + assert ( + extract_canonical_text('

see here now

') + == "see here now" + ) diff --git a/python/tests/test_normalize.py b/python/tests/test_normalize.py new file mode 100644 index 0000000..2c5cd86 --- /dev/null +++ b/python/tests/test_normalize.py @@ -0,0 +1,88 @@ +"""Conformance tests for ``normalize_text``. + +The 18 test cases below are a direct port of +``htmltrust-canonicalization/javascript/test.js`` and MUST produce +byte-identical results across all language bindings. +""" + +import pytest + +from htmltrust_canonicalization import normalize_text + + +# (input_a, input_b, should_match, description) +NORMALIZATION_CASES = [ + ("“Hello”", '"Hello"', True, "Curly double quotes -> straight"), + ("café", "café", True, "Precomposed vs combining (NFKC)"), + ("find", "find", True, "fi ligature (NFKC)"), + ("word — word", "word - word", True, "Em dash -> hyphen-minus"), + ("«Bonjour»", '"Bonjour"', True, "Guillemets -> double quotes"), + ( + "「東京」", + '"東京"', + True, + "CJK corner brackets -> double quotes", + ), + ( + "می‌خواهم", + "میخواهم", + False, + "ZWNJ is semantic (Persian)", + ), + ( + "كتـــاب", + "كتاب", + True, + "Arabic tatweel stripped", + ), + ("A1", "A1", True, "Fullwidth ASCII (NFKC)"), + ("①", "1", True, "Circled digit (NFKC)"), + ("word​word", "wordword", True, "ZWSP stripped"), + ("word‌word", "wordword", False, "ZWNJ preserved (different)"), + ("Hello…", "Hello...", True, "Ellipsis -> three dots"), + ("‘Hello’", "'Hello'", True, "Curly single quotes -> straight"), + ("‚German“", '"German"', True, "Low-9 quotes -> straight"), + ("a b", "a b", True, "No-break space -> space"), + ("a b", "a b", True, "Ideographic space -> space"), + ("a \t b", "a b", True, "Whitespace collapse"), +] + + +@pytest.mark.parametrize("a,b,should_match,desc", NORMALIZATION_CASES) +def test_normalize_match(a: str, b: str, should_match: bool, desc: str): + norm_a = normalize_text(a) + norm_b = normalize_text(b) + if should_match: + assert norm_a == norm_b, ( + f"{desc!r}: expected match but got\n" + f" A={norm_a!r}\n 
B={norm_b!r}" + ) + else: + assert norm_a != norm_b, ( + f"{desc!r}: expected mismatch but both normalized to {norm_a!r}" + ) + + +def test_preserve_whitespace(): + """``preserve_whitespace=True`` must skip phase-2 collapsing.""" + src = "line1\n line2\t\tline3" + assert normalize_text(src, preserve_whitespace=True) == src + + +def test_normalize_text_rejects_non_string(): + with pytest.raises(TypeError): + normalize_text(123) # type: ignore[arg-type] + + +def test_zwj_preserved_emoji(): + """Family ZWJ sequence must survive normalization.""" + family = "\U0001F468‍\U0001F469‍\U0001F467" + assert normalize_text(family) == family + + +def test_idempotent(): + """Normalization must be a fixed-point operation.""" + src = "“Café—test…”" + once = normalize_text(src) + twice = normalize_text(once) + assert once == twice diff --git a/rust/Cargo.toml b/rust/Cargo.toml new file mode 100644 index 0000000..146ecc1 --- /dev/null +++ b/rust/Cargo.toml @@ -0,0 +1,23 @@ +[package] +name = "htmltrust-canonicalization" +version = "0.1.0" +edition = "2021" +rust-version = "1.74" +description = "Canonical text normalization and HTML extraction for HTMLTrust signed content. Byte-identical output to the JavaScript, Go, PHP, and Python bindings." +license = "MIT" +repository = "https://github.com/HTMLTrust/htmltrust-canonicalization" +readme = "README.md" +keywords = ["htmltrust", "canonicalization", "signing", "html", "unicode"] +categories = ["text-processing", "cryptography"] + +[lib] +name = "htmltrust_canonicalization" +path = "src/lib.rs" + +[dependencies] +unicode-normalization = "0.1" +scraper = "0.20" +ego-tree = "0.6" + +[dev-dependencies] +# pure stdlib tests; nothing extra required. diff --git a/rust/README.md b/rust/README.md new file mode 100644 index 0000000..41c797b --- /dev/null +++ b/rust/README.md @@ -0,0 +1,63 @@ +# HTMLTrust Canonicalization -- Rust + +Rust crate for the HTMLTrust canonical text normalization library. 
Produces byte-identical output to the JavaScript, Go, PHP, and Python implementations for every test vector in the shared conformance suite. + +## Status + +Implemented. The 18-case normalization conformance suite from the JavaScript reference (`javascript/test.js`) passes, along with parity tests for `extract_canonical_text` and `canonicalize_claims`. + +Out of scope for this crate: signature verification and key resolution. Those will arrive in a follow-up PR alongside the Python binding once the JavaScript surface area lands on `main`. + +## Scope + +This crate provides three functions: + +1. **`normalize_text(text: &str, preserve_whitespace: bool) -> String`** -- applies the 8-phase canonicalization defined in [`../spec.md`](../spec.md) to a UTF-8 string. +2. **`extract_canonical_text(html: &str) -> String`** -- parses an HTML fragment with `scraper` (html5ever), walks the DOM, emits text nodes in document order with single-space separators between block elements, and applies `normalize_text` to the result. +3. **`canonicalize_claims(claims: &BTreeMap) -> String`** -- serializes a claim map to the canonical, hashable string used by the `claims-hash` field of the signature binding. + +All three are pure functions: no I/O, deterministic output for the same input. + +## Dependencies + +- `unicode-normalization` for NFKC +- `scraper` (html5ever-backed) for HTML parsing in `extract_canonical_text` +- `ego-tree` for the DOM walk types re-exported by scraper + +## Conformance + +`tests/conformance.rs` runs all 18 normalization vectors from `javascript/test.js`, plus `extract_canonical_text` and `canonicalize_claims` parity cases. Output MUST stay byte-identical to the JavaScript / Go / PHP / Python bindings. 
+ +## Installation + +```toml +[dependencies] +htmltrust-canonicalization = "0.1" +``` + +## Usage + +```rust +use std::collections::BTreeMap; +use htmltrust_canonicalization::{ + normalize_text, extract_canonical_text, canonicalize_claims, +}; + +let canonical = normalize_text("He said, \"Hello\u{2026}\"", false); +// -> "He said, \"Hello...\"" + +let from_html = extract_canonical_text("

Hello world!

"); +// -> "Hello world!" + +let mut claims = BTreeMap::new(); +claims.insert("License".to_string(), "CC-BY-4.0".to_string()); +claims.insert("AIAssistance".to_string(), "None".to_string()); +let claims_str = canonicalize_claims(&claims); +// -> "AIAssistance=None\nLicense=CC-BY-4.0" +``` + +## Tests + +```bash +cargo test +``` diff --git a/rust/src/lib.rs b/rust/src/lib.rs new file mode 100644 index 0000000..be5b5c5 --- /dev/null +++ b/rust/src/lib.rs @@ -0,0 +1,313 @@ +//! HTMLTrust canonicalization (Rust binding). +//! +//! Public API: +//! +//! - [`normalize_text`] -- the 8-phase HTMLTrust canonicalization pipeline. +//! - [`extract_canonical_text`] -- HTML -> canonical text extraction +//! (spec §2.1), parses with `scraper` (html5ever) and walks the DOM. +//! - [`canonicalize_claims`] -- canonical serialization of claim metadata +//! for the `claims-hash` field of the signature binding. +//! +//! All three functions produce byte-identical output to the JavaScript, +//! Go, PHP, and Python bindings. The 18 conformance cases in +//! `tests/conformance.rs` are a direct port of the shared test suite +//! (`htmltrust-canonicalization/javascript/test.js`). + +use std::collections::BTreeMap; + +use scraper::{node::Node, Html}; +use ego_tree::NodeRef; +use unicode_normalization::UnicodeNormalization; + +// --------------------------------------------------------------------------- +// Codepoint ranges, mirroring the JS reference regex character classes +// byte-for-byte. Inclusive ranges. Single codepoints expressed as +// (cp, cp). +// --------------------------------------------------------------------------- + +/// Phase 6 + 7: invisible / formatting / bidi characters to strip. +/// ZWNJ (U+200C) and ZWJ (U+200D) are deliberately preserved -- they are +/// semantic in Persian, Indic, and emoji. 
+const STRIP_RANGES: &[(u32, u32)] = &[ + (0x00AD, 0x00AD), // soft hyphen + (0x200B, 0x200B), // zero-width space + (0x200E, 0x200E), // LRM + (0x200F, 0x200F), // RLM + (0x2060, 0x2060), // word joiner + (0xFEFF, 0xFEFF), // BOM / ZWNBSP + (0x034F, 0x034F), // combining grapheme joiner + (0x061C, 0x061C), // arabic letter mark + (0x180E, 0x180E), // mongolian vowel separator + (0x0640, 0x0640), // arabic tatweel + (0xFE00, 0xFE0F), // variation selectors 1-16 + (0x202A, 0x202E), // bidi embedding controls + (0x2066, 0x2069), // bidi isolate controls + (0x2061, 0x2064), // invisible math operators + (0xFFF9, 0xFFFC), // interlinear annotation + obj replacement + // Supplementary plane: variation selectors 17-256, tag characters. + (0xE0001, 0xE007F), + (0xE0100, 0xE01EF), +]; + +/// Phase 2: Unicode whitespace -> U+0020. +const WHITESPACE_RANGES: &[(u32, u32)] = &[ + (0x0009, 0x000D), // HT, LF, VT, FF, CR + (0x0020, 0x0020), // SPACE + (0x0085, 0x0085), // NEL + (0x00A0, 0x00A0), // NBSP + (0x1680, 0x1680), // ogham space mark + (0x2000, 0x200A), // en quad .. hair space + (0x2028, 0x2028), // line separator + (0x2029, 0x2029), // paragraph separator + (0x202F, 0x202F), // narrow no-break space + (0x205F, 0x205F), // medium mathematical space + (0x3000, 0x3000), // ideographic space +]; + +/// Phase 3: single quotes -> ASCII apostrophe. +const SINGLE_QUOTE_POINTS: &[u32] = &[ + 0x2018, // left single quote + 0x2019, // right single quote + 0x201B, // single high-reversed-9 + 0x2039, // single left guillemet + 0x203A, // single right guillemet + 0x0060, // grave accent + 0x00B4, // acute accent + 0x2032, // prime +]; + +/// Phase 3: double quotes -> ASCII double quote. 
+const DOUBLE_QUOTE_POINTS: &[u32] = &[ + 0x201A, // single low-9 quote (intentionally mapped to double) + 0x201C, // left double quote + 0x201D, // right double quote + 0x201E, // low double quote + 0x201F, // double high-reversed-9 + 0x00AB, // left guillemet + 0x00BB, // right guillemet + 0x2033, // double prime + 0x301D, // reversed double prime quotation mark + 0x301E, // double prime quotation mark + 0x301F, // low double prime quotation mark +]; + +/// Phase 3: CJK corner brackets -> ASCII double quote. +const CJK_QUOTE_RANGES: &[(u32, u32)] = &[ + (0x300C, 0x300F), // CJK corner brackets + (0xFE41, 0xFE44), // presentation forms for vertical corner brackets +]; + +/// Phase 4: dashes -> ASCII hyphen-minus. +const DASH_POINTS: &[u32] = &[ + 0x2212, // minus sign + 0xFE58, // small em dash + 0xFE63, // small hyphen-minus +]; +const DASH_RANGES: &[(u32, u32)] = &[ + (0x2010, 0x2015), // hyphen .. horizontal bar +]; + +/// Phase 5: ellipsis -> three periods. +const ELLIPSIS: char = '\u{2026}'; + +// --------------------------------------------------------------------------- +// Range / point membership helpers (linear; the sets are tiny). +// --------------------------------------------------------------------------- + +fn in_ranges(c: char, ranges: &[(u32, u32)]) -> bool { + let cp = c as u32; + ranges.iter().any(|&(start, end)| cp >= start && cp <= end) +} + +fn in_points(c: char, points: &[u32]) -> bool { + points.contains(&(c as u32)) +} + +// --------------------------------------------------------------------------- +// Public API +// --------------------------------------------------------------------------- + +/// Apply the HTMLTrust 8-phase canonicalization pipeline to `text`. +/// +/// Order matches the JavaScript reference implementation precisely. +/// +/// # Arguments +/// +/// * `text` -- raw text content (typically the output of +/// [`extract_canonical_text`]). +/// * `preserve_whitespace` -- `true` for `
`<pre>` content
+///   is significant; otherwise `false`.
+///
+/// # Returns
+///
+/// Normalized text, suitable for hashing.
+pub fn normalize_text(text: &str, preserve_whitespace: bool) -> String {
+    // Phase 1: NFKC.
+    let nfkc: String = text.nfkc().collect();
+
+    // Phases 6 + 7: strip invisible / formatting / bidi characters.
+    let stripped: String = nfkc.chars().filter(|&c| !in_ranges(c, STRIP_RANGES)).collect();
+
+    // Phase 2: whitespace normalization.
+    let ws: String = if preserve_whitespace {
+        stripped
+    } else {
+        let mut buf = String::with_capacity(stripped.len());
+        let mut prev_space = false;
+        for c in stripped.chars() {
+            if in_ranges(c, WHITESPACE_RANGES) {
+                if !prev_space {
+                    buf.push(' ');
+                    prev_space = true;
+                }
+            } else {
+                buf.push(c);
+                prev_space = false;
+            }
+        }
+        buf
+    };
+
+    // Phases 3, 4, 5 in a single pass.
+    let mut out = String::with_capacity(ws.len());
+    for c in ws.chars() {
+        if in_points(c, SINGLE_QUOTE_POINTS) {
+            out.push('\'');
+        } else if in_points(c, DOUBLE_QUOTE_POINTS) || in_ranges(c, CJK_QUOTE_RANGES) {
+            out.push('"');
+        } else if in_points(c, DASH_POINTS) || in_ranges(c, DASH_RANGES) {
+            out.push('-');
+        } else if c == ELLIPSIS {
+            out.push_str("...");
+        } else {
+            out.push(c);
+        }
+    }
+    out
+}
+
+/// Extract canonical text from an HTML fragment.
+///
+/// Implements the HTML -> canonical text extraction defined in spec §2.1
+/// and ports the contract of the JavaScript `extractCanonicalText`. Uses
+/// `scraper` (html5ever under the hood) for parsing.
+///
+/// # Arguments
+///
+/// * `html` -- HTML fragment to canonicalize.
+///
+/// # Returns
+///
+/// Canonical text, ready to be hashed. Trimmed of leading/trailing
+/// whitespace.
+pub fn extract_canonical_text(html: &str) -> String {
+    let document = Html::parse_fragment(html);
+
+    let mut out = String::new();
+    walk(document.tree.root(), &mut out);
+
+    normalize_text(&out, false).trim().to_string()
+}
+
+fn is_excluded_tag(name: &str) -> bool {
+    matches!(
+        name,
+        "script" | "style" | "meta" | "link" | "head" | "noscript"
+    )
+}
+
+fn is_block_tag(name: &str) -> bool {
+    matches!(
+        name,
+        "address"
+            | "article"
+            | "aside"
+            | "blockquote"
+            | "canvas"
+            | "dd"
+            | "div"
+            | "dl"
+            | "dt"
+            | "fieldset"
+            | "figcaption"
+            | "figure"
+            | "footer"
+            | "form"
+            | "h1"
+            | "h2"
+            | "h3"
+            | "h4"
+            | "h5"
+            | "h6"
+            | "header"
+            | "hr"
+            | "li"
+            | "main"
+            | "nav"
+            | "noscript"
+            | "ol"
+            | "output"
+            | "p"
+            | "pre"
+            | "section"
+            | "table"
+            | "tfoot"
+            | "thead"
+            | "tr"
+            | "td"
+            | "th"
+            | "ul"
+            | "video"
+    )
+}
+
+fn walk<'a>(node: NodeRef<'a, Node>, out: &mut String) {
+    for child in node.children() {
+        match child.value() {
+            Node::Text(t) => {
+                out.push_str(&t.text);
+            }
+            Node::Element(e) => {
+                let name = e.name();
+                if is_excluded_tag(name) {
+                    continue;
+                }
+                let block = is_block_tag(name);
+                if block {
+                    out.push(' ');
+                }
+                walk(child, out);
+                if block {
+                    out.push(' ');
+                }
+            }
+            _ => {
+                // Comments, doctypes, processing instructions -- not signed.
+            }
+        }
+    }
+}
+
+/// Compute the canonical serialization of a claim map.
+///
+/// Each name and value is run through [`normalize_text`] and entries are
+/// sorted lexically by name, then joined by `\n` as `name=value` pairs.
+/// The caller is responsible for hashing the result.
+///
+/// `BTreeMap` is used as the input type because its iteration order is
+/// already lexicographic, which makes the determinism property obvious
+/// at the type level. Callers with other map types can pass via
+/// `BTreeMap::from_iter(...)`.
+pub fn canonicalize_claims(claims: &BTreeMap<String, String>) -> String {
+    let mut entries: Vec<(String, String)> = claims
+        .iter()
+        .map(|(k, v)| (normalize_text(k, false), normalize_text(v, false)))
+        .collect();
+    // Re-sort after normalization in case normalization changes name order.
+    entries.sort_by(|a, b| a.0.cmp(&b.0));
+    entries
+        .into_iter()
+        .map(|(k, v)| format!("{}={}", k, v))
+        .collect::<Vec<_>>()
+        .join("\n")
+}
diff --git a/rust/tests/conformance.rs b/rust/tests/conformance.rs
new file mode 100644
index 0000000..4d3a04f
--- /dev/null
+++ b/rust/tests/conformance.rs
@@ -0,0 +1,208 @@
+//! Conformance tests for the HTMLTrust Rust binding.
+//!
+//! The 18 normalization cases below are a direct port of
+//! `htmltrust-canonicalization/javascript/test.js` and MUST produce
+//! byte-identical results across all language bindings.
+
+use std::collections::BTreeMap;
+
+use htmltrust_canonicalization::{
+    canonicalize_claims, extract_canonical_text, normalize_text,
+};
+
+/// One conformance vector. `(input_a, input_b, should_match, description)`.
+type Case = (&'static str, &'static str, bool, &'static str);
+
+const NORMALIZATION_CASES: &[Case] = &[
+    (
+        "\u{201C}Hello\u{201D}",
+        "\"Hello\"",
+        true,
+        "Curly double quotes -> straight",
+    ),
+    (
+        "caf\u{00E9}",
+        "cafe\u{0301}",
+        true,
+        "Precomposed vs combining (NFKC)",
+    ),
+    ("\u{FB01}nd", "find", true, "fi ligature (NFKC)"),
+    (
+        "word \u{2014} word",
+        "word - word",
+        true,
+        "Em dash -> hyphen-minus",
+    ),
+    (
+        "\u{00AB}Bonjour\u{00BB}",
+        "\"Bonjour\"",
+        true,
+        "Guillemets -> double quotes",
+    ),
+    (
+        "\u{300C}\u{6771}\u{4EAC}\u{300D}",
+        "\"\u{6771}\u{4EAC}\"",
+        true,
+        "CJK corner brackets -> double quotes",
+    ),
+    (
+        "\u{0645}\u{06CC}\u{200C}\u{062E}\u{0648}\u{0627}\u{0647}\u{0645}",
+        "\u{0645}\u{06CC}\u{062E}\u{0648}\u{0627}\u{0647}\u{0645}",
+        false,
+        "ZWNJ is semantic (Persian)",
+    ),
+    (
+        "\u{0643}\u{062A}\u{0640}\u{0640}\u{0640}\u{0627}\u{0628}",
+        "\u{0643}\u{062A}\u{0627}\u{0628}",
+        true,
+        "Arabic tatweel stripped",
+    ),
+    ("\u{FF21}\u{FF11}", "A1", true, "Fullwidth ASCII (NFKC)"),
+    ("\u{2460}", "1", true, "Circled digit (NFKC)"),
+    ("word\u{200B}word", "wordword", true, "ZWSP stripped"),
+    (
+        "word\u{200C}word",
+        "wordword",
+        false,
+        "ZWNJ preserved (different)",
+    ),
+    ("Hello\u{2026}", "Hello...", true, "Ellipsis -> three dots"),
+    (
+        "\u{2018}Hello\u{2019}",
+        "'Hello'",
+        true,
+        "Curly single quotes -> straight",
+    ),
+    (
+        "\u{201A}German\u{201C}",
+        "\"German\"",
+        true,
+        "Low-9 quotes -> straight",
+    ),
+    ("a\u{00A0}b", "a b", true, "No-break space -> space"),
+    ("a\u{3000}b", "a b", true, "Ideographic space -> space"),
+    ("a  \t  b", "a b", true, "Whitespace collapse"),
+];
+
+#[test]
+fn normalization_conformance() {
+    let mut failures = Vec::<String>::new();
+    for &(a, b, should_match, desc) in NORMALIZATION_CASES {
+        let na = normalize_text(a, false);
+        let nb = normalize_text(b, false);
+        let matched = na == nb;
+        if matched != should_match {
+            failures.push(format!(
+                "  {desc}: A={na:?} B={nb:?} expected match={should_match}, got match={matched}",
+            ));
+        }
+    }
+    assert!(
+        failures.is_empty(),
+        "{} failure(s):\n{}",
+        failures.len(),
+        failures.join("\n"),
+    );
+}
+
+#[test]
+fn preserve_whitespace_skips_collapse() {
+    let src = "line1\n    line2\t\tline3";
+    assert_eq!(normalize_text(src, true), src);
+}
+
+#[test]
+fn idempotent_for_typical_input() {
+    let src = "\u{201C}Caf\u{00E9}\u{2014}test\u{2026}\u{201D}";
+    let once = normalize_text(src, false);
+    let twice = normalize_text(&once, false);
+    assert_eq!(once, twice);
+}
+
+#[test]
+fn extract_inline_no_separator() {
+    assert_eq!(
+        extract_canonical_text("

hello world

"), + "hello world", + ); +} + +#[test] +fn extract_block_boundary_inserts_space() { + assert_eq!(extract_canonical_text("

A

B

"), "A B"); +} + +#[test] +fn extract_excluded_elements_removed() { + let html = "\ +

before

\ +\ +\ +\ +

after

"; + assert_eq!(extract_canonical_text(html), "before after"); +} + +#[test] +fn extract_entity_decoding() { + assert_eq!( + extract_canonical_text("

A & B — C

"), + "A & B - C", + ); +} + +#[test] +fn extract_normalization_pipeline_applied() { + assert_eq!( + extract_canonical_text("

\u{201C}Hello\u{201D}

"), + "\"Hello\"", + ); +} + +#[test] +fn extract_nested_blocks() { + let html = "

Title

\ +

Para one.

Para two.

"; + assert_eq!(extract_canonical_text(html), "Title Para one. Para two."); +} + +#[test] +fn extract_list_items_separated() { + assert_eq!( + extract_canonical_text("
  • a
  • b
  • c
"), + "a b c", + ); +} + +#[test] +fn extract_inline_link_no_separator() { + assert_eq!( + extract_canonical_text("

see here now

"), + "see here now", + ); +} + +#[test] +fn claims_empty() { + let claims: BTreeMap = BTreeMap::new(); + assert_eq!(canonicalize_claims(&claims), ""); +} + +#[test] +fn claims_sorted_by_name() { + let mut claims = BTreeMap::new(); + claims.insert("License".to_string(), "CC-BY-4.0".to_string()); + claims.insert("AIAssistance".to_string(), "None".to_string()); + claims.insert("ContentType".to_string(), "Article".to_string()); + assert_eq!( + canonicalize_claims(&claims), + "AIAssistance=None\nContentType=Article\nLicense=CC-BY-4.0", + ); +} + +#[test] +fn claims_normalize_values() { + let mut claims = BTreeMap::new(); + claims.insert("author".to_string(), "\u{201C}Alice\u{201D}".to_string()); + assert_eq!(canonicalize_claims(&claims), "author=\"Alice\""); +}