Title
Para one.
Para two.
diff --git a/.gitignore b/.gitignore index e00c140..199736c 100644 --- a/.gitignore +++ b/.gitignore @@ -12,6 +12,20 @@ go/go.sum php/vendor/ php/composer.lock +# Python +python/.venv/ +python/.pytest_cache/ +python/build/ +python/dist/ +python/*.egg-info/ +python/htmltrust_canonicalization.egg-info/ +__pycache__/ +*.pyc + +# Rust +rust/target/ +rust/Cargo.lock + # IDE .idea/ .vscode/ diff --git a/python/README.md b/python/README.md new file mode 100644 index 0000000..1c443aa --- /dev/null +++ b/python/README.md @@ -0,0 +1,66 @@ +# HTMLTrust Canonicalization -- Python + +Python binding for the HTMLTrust canonical text normalization library. Produces byte-identical output to the JavaScript, Go, PHP, and Rust implementations for every test vector in the shared conformance suite. + +## Status + +Implemented. The 18-case normalization conformance suite from the JavaScript reference (`javascript/test.js`) passes. `extract_canonical_text` and `canonicalize_claims` have parity tests against the JavaScript / Go / PHP reference behaviour. + +Out of scope for this package: signature verification and key resolution. Those live in the higher-level HTMLTrust client libraries (and will arrive in a follow-up PR for the Python binding once the JS surface area lands on `main`). + +## Scope + +This package provides three functions: + +1. **`normalize_text(text: str, preserve_whitespace: bool = False) -> str`** -- applies the 8-phase canonicalization defined in [`../spec.md`](../spec.md) to a UTF-8 string. Mirrors the existing JavaScript/Go/PHP signatures. +2. **`extract_canonical_text(html: str, preserve_whitespace: bool = False) -> str`** -- parses an HTML fragment with BeautifulSoup, walks the DOM, emits text nodes in document order with single-space separators between block elements, and applies `normalize_text` to the result. This is the HTML -> canonical text extraction defined in the paper's §2.1. +3. 
**`canonicalize_claims(claims: Mapping[str, object]) -> str`** -- serializes a claim map to the canonical, hashable string used by the `claims-hash` field of the signature binding (each entry normalized, sorted lexically by name, joined with `\n` as `name=value`). + +All three are pure functions: no network, no file I/O, deterministic output for the same input. + +## Dependencies + +- `unicodedata` (stdlib) for NFKC normalization +- `beautifulsoup4 >= 4.12` for HTML parsing in `extract_canonical_text` +- No other runtime dependencies + +## Conformance + +`tests/test_normalize.py` runs all 18 normalization vectors from `javascript/test.js`. `tests/test_extract.py` and `tests/test_claims.py` cover the HTML extraction and claim canonicalization contracts. Output MUST stay byte-identical to the JavaScript / Go / PHP / Rust bindings. + +## Installation + +```bash +pip install htmltrust-canonicalization +# or for development: +cd python && pip install -e '.[dev]' +``` + +## Usage + +```python +from htmltrust_canonicalization import ( + normalize_text, + extract_canonical_text, + canonicalize_claims, +) + +canonical = normalize_text('He said, "Hello…"') +# -> 'He said, "Hello..."' + +from_html = extract_canonical_text('
Hello world!
') +# -> 'Hello world!' + +claims_str = canonicalize_claims({ + 'License': 'CC-BY-4.0', + 'AIAssistance': 'None', +}) +# -> 'AIAssistance=None\nLicense=CC-BY-4.0' +``` + +## Tests + +```bash +pip install -e '.[dev]' +pytest +``` diff --git a/python/htmltrust_canonicalization/__init__.py b/python/htmltrust_canonicalization/__init__.py new file mode 100644 index 0000000..7309b6a --- /dev/null +++ b/python/htmltrust_canonicalization/__init__.py @@ -0,0 +1,22 @@ +"""HTMLTrust canonicalization (Python binding). + +Public API: + - normalize_text(text, preserve_whitespace=False) -> str + - extract_canonical_text(html, preserve_whitespace=False) -> str + - canonicalize_claims(claims) -> str + +This binding produces byte-identical output to the JavaScript, Go, PHP, +and Rust implementations of the HTMLTrust canonicalization library. +""" + +from ._normalize import normalize_text +from ._extract import extract_canonical_text +from ._claims import canonicalize_claims + +__all__ = [ + "normalize_text", + "extract_canonical_text", + "canonicalize_claims", +] + +__version__ = "0.1.0" diff --git a/python/htmltrust_canonicalization/_claims.py b/python/htmltrust_canonicalization/_claims.py new file mode 100644 index 0000000..e39aa93 --- /dev/null +++ b/python/htmltrust_canonicalization/_claims.py @@ -0,0 +1,39 @@ +"""Canonical claims serialization (HTMLTrust spec §2.1). + +Direct port of ``canonicalizeClaims`` from the JavaScript reference +implementation. Claims are normalized through the same pipeline as +content text and emitted as a sorted list of ``name=value`` pairs joined +by newlines. The caller is responsible for hashing the result. +""" + +from __future__ import annotations + +from collections.abc import Mapping + +from ._normalize import normalize_text + + +def canonicalize_claims(claims: Mapping[str, object]) -> str: + """Serialize ``claims`` to the canonical, sortable, hashable string form. 
+ + Each claim name and value is run through ``normalize_text`` so that + Unicode equivalents collapse to identical bytes. Entries are then + sorted lexically by name and joined with newlines as ``name=value``. + + Args: + claims: Mapping of claim name to value. Values are coerced to + ``str`` before normalization so callers may pass simple + scalar types. + + Returns: + Canonical serialized string ready to be hashed. + """ + if not isinstance(claims, Mapping): + raise TypeError("canonicalize_claims expects a Mapping") + + entries = [ + (normalize_text(name), normalize_text(str(value))) + for name, value in claims.items() + ] + entries.sort(key=lambda nv: nv[0]) + return "\n".join(f"{name}={value}" for name, value in entries) diff --git a/python/htmltrust_canonicalization/_extract.py b/python/htmltrust_canonicalization/_extract.py new file mode 100644 index 0000000..56a516e --- /dev/null +++ b/python/htmltrust_canonicalization/_extract.py @@ -0,0 +1,101 @@ +"""HTML -> canonical text extraction (HTMLTrust spec §2.1). + +Direct semantic port of ``extractCanonicalText`` from the JavaScript +reference implementation. The Python binding uses BeautifulSoup +(html.parser backend, stdlib) for parsing because real HTML is messy +and a forgiving parser produces more reliable output than the JS +binding's regex pipeline. The text-output contract (which elements +contribute, where whitespace separators go) is identical. +""" + +from __future__ import annotations + +from bs4 import BeautifulSoup, NavigableString, Tag + +from ._normalize import normalize_text + +# Elements whose text content is NEVER part of the signed content. +# `` is excluded because, inside a signed-section, it carries +# claim metadata, not signed content (claims are hashed separately into +# the claims-hash field). +_EXCLUDED_TAGS = frozenset({ + "script", "style", "meta", "link", "head", "noscript", +}) + +# Block-level elements whose boundaries become whitespace separators. 
+# Inline elements (em, strong, a, span, etc.) do NOT introduce separators, +# so "hello world
" canonicalizes to "hello world". +_BLOCK_TAGS = frozenset({ + "address", "article", "aside", "blockquote", "canvas", "dd", "div", + "dl", "dt", "fieldset", "figcaption", "figure", "footer", "form", + "h1", "h2", "h3", "h4", "h5", "h6", + "header", "hr", "li", "main", "nav", "noscript", "ol", "output", + "p", "pre", "section", "table", "tfoot", "thead", + "tr", "td", "th", "ul", "video", +}) + + +def extract_canonical_text(html: str, preserve_whitespace: bool = False) -> str: + """Extract canonical text content from an HTML fragment. + + Given an HTML fragment (typically the inner contents of a + ``A
B
`` + extracts to ``"A B"`` and not ``"AB"``. + 3. Emits text nodes verbatim (entity-decoded by the parser). + 4. Applies the full text-normalization pipeline (``normalize_text``). + + Args: + html: HTML fragment to canonicalize. + preserve_whitespace: Passed through to ``normalize_text``. + Defaults to ``False``. + + Returns: + Canonical text, ready to be hashed. Trimmed of leading/trailing + whitespace. + """ + if not isinstance(html, str): + raise TypeError("extract_canonical_text expects a str") + + soup = BeautifulSoup(html, "html.parser") + + # Remove excluded elements (and their text content) outright. + for tag_name in _EXCLUDED_TAGS: + for elem in soup.find_all(tag_name): + elem.decompose() + + parts: list[str] = [] + _walk(soup, parts) + + text = "".join(parts) + return normalize_text(text, preserve_whitespace).strip() + + +def _walk(node, out: list[str]) -> None: + """Walk ``node`` in document order, appending text and block-boundary + spaces to ``out`` in place. + """ + for child in getattr(node, "children", ()): + if isinstance(child, NavigableString): + # bs4 navigable strings include comments / doctypes / cdata. + # We only want plain text, not Comment / Doctype / CData. + # Comment is a NavigableString subclass; check the type name. + cls_name = type(child).__name__ + if cls_name in ("Comment", "Doctype", "CData", "ProcessingInstruction"): + continue + out.append(str(child)) + elif isinstance(child, Tag): + name = child.name.lower() if child.name else "" + is_block = name in _BLOCK_TAGS + if is_block: + out.append(" ") + _walk(child, out) + if is_block: + out.append(" ") + # Void elements (br, hr, img, etc.) within inline context: hr is + # already in _BLOCK_TAGS; br is treated as inline (no separator), + # matching the JS reference which strips br via ANY_TAG_RE. 
diff --git a/python/htmltrust_canonicalization/_normalize.py b/python/htmltrust_canonicalization/_normalize.py new file mode 100644 index 0000000..dd97408 --- /dev/null +++ b/python/htmltrust_canonicalization/_normalize.py @@ -0,0 +1,192 @@ +"""Text normalization (the 8-phase HTMLTrust canonicalization pipeline). + +Direct port of the JavaScript reference implementation +(htmltrust-canonicalization/javascript/index.js, function ``normalizeText``). + +The character classes below are byte-for-byte the same Unicode +codepoint sets used by the JavaScript and Go bindings; output MUST be +byte-identical across language implementations. + +To keep this source file pure-ASCII and immune to editor mangling, the +character sets are built programmatically from explicit codepoint +ranges via ``chr()``. Each list entry is an ``(int, int)`` pair giving +an inclusive range, OR a single ``int`` for a single codepoint. +""" + +from __future__ import annotations + +import re +import unicodedata +from typing import Iterable, Union + +_RangeOrPoint = Union[int, tuple[int, int]] + + +def _build_class(items: Iterable[_RangeOrPoint]) -> str: + """Build a regex character class string from ``items``. + + Each item is either a single codepoint (``int``) or an inclusive + ``(start, end)`` range. Returns ``"[`` content where
+ whitespace is significant. Defaults to ``False``.
+
+ Returns:
+ Normalized text, suitable for hashing.
+ """
+ if not isinstance(text, str):
+ raise TypeError("normalize_text expects a str")
+
+ # Phase 1: NFKC -- ligatures, fullwidth/halfwidth, presentation forms,
+ # superscripts, CJK compatibility, Jamo composition.
+ text = unicodedata.normalize("NFKC", text)
+
+ # Phases 6 + 7: strip invisible / formatting / bidi characters.
+ # Done early so they don't interfere with later phases.
+ text = _STRIP_RE.sub("", text)
+ text = _STRIP_SUPPLEMENTARY_RE.sub("", text)
+
+ # Phase 2: whitespace normalization.
+ if not preserve_whitespace:
+ text = _WHITESPACE_RE.sub(" ", text)
+ text = _RUN_OF_SPACES_RE.sub(" ", text)
+
+ # Phase 3: quotation marks.
+ text = _SINGLE_QUOTE_RE.sub("'", text)
+ text = _DOUBLE_QUOTE_RE.sub('"', text)
+ text = _CJK_QUOTE_RE.sub('"', text)
+
+ # Phase 4: dashes / hyphens / minus.
+ text = _DASH_RE.sub("-", text)
+
+ # Phase 5: ellipsis.
+ text = _ELLIPSIS_RE.sub("...", text)
+
+ return text
diff --git a/python/pyproject.toml b/python/pyproject.toml
new file mode 100644
index 0000000..71b2062
--- /dev/null
+++ b/python/pyproject.toml
@@ -0,0 +1,45 @@
+[build-system]
+requires = ["setuptools>=68", "wheel"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "htmltrust-canonicalization"
+version = "0.1.0"
+description = "Canonical text normalization and HTML extraction for HTMLTrust signed content. Byte-identical output to the JavaScript, Go, PHP, and Rust bindings."
+readme = "README.md"
+requires-python = ">=3.10"
+license = { text = "MIT" }
+authors = [
+ { name = "HTMLTrust contributors" },
+]
+keywords = ["htmltrust", "canonicalization", "signing", "html", "unicode"]
+classifiers = [
+ "License :: OSI Approved :: MIT License",
+ "Programming Language :: Python :: 3",
+ "Programming Language :: Python :: 3.10",
+ "Programming Language :: Python :: 3.11",
+ "Programming Language :: Python :: 3.12",
+ "Programming Language :: Python :: 3.13",
+ "Programming Language :: Python :: 3.14",
+ "Topic :: Text Processing",
+ "Topic :: Security :: Cryptography",
+]
+dependencies = [
+ "beautifulsoup4>=4.12",
+]
+
+[project.optional-dependencies]
+dev = [
+ "pytest>=8.0",
+]
+
+[project.urls]
+Homepage = "https://github.com/HTMLTrust/htmltrust-canonicalization"
+Repository = "https://github.com/HTMLTrust/htmltrust-canonicalization"
+
+[tool.setuptools.packages.find]
+where = ["."]
+include = ["htmltrust_canonicalization*"]
+
+[tool.pytest.ini_options]
+testpaths = ["tests"]
diff --git a/python/tests/__init__.py b/python/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/python/tests/test_claims.py b/python/tests/test_claims.py
new file mode 100644
index 0000000..b1961db
--- /dev/null
+++ b/python/tests/test_claims.py
@@ -0,0 +1,59 @@
+"""Conformance tests for ``canonicalize_claims``.
+
+Mirrors the JavaScript reference ``canonicalizeClaims``. Output MUST be
+byte-identical across language bindings: claim entries serialize as
+``name=value`` lines, sorted lexically by name, joined by ``\\n``.
+"""
+
+import pytest
+
+from htmltrust_canonicalization import canonicalize_claims
+
+
+def test_empty_claims():
+ assert canonicalize_claims({}) == ""
+
+
+def test_single_claim():
+ assert canonicalize_claims({"License": "CC-BY-4.0"}) == "License=CC-BY-4.0"
+
+
+def test_sorted_by_name():
+ """Order in -> sorted out, regardless of source ordering."""
+ out = canonicalize_claims({
+ "License": "CC-BY-4.0",
+ "AIAssistance": "None",
+ "ContentType": "Article",
+ })
+ assert out == (
+ "AIAssistance=None\n"
+ "ContentType=Article\n"
+ "License=CC-BY-4.0"
+ )
+
+
+def test_normalizes_values():
+ """Values run through normalize_text -- curly quotes collapse."""
+ out = canonicalize_claims({"author": "“Alice”"})
+ assert out == 'author="Alice"'
+
+
+def test_normalizes_names():
+ """Claim names also normalize -- ensures hash determinism."""
+ # An ellipsis in a claim name is exotic but tests the contract.
+ out = canonicalize_claims({"odd…name": "x"})
+ assert out == "odd...name=x"
+
+
+def test_coerces_value_to_string():
+ out = canonicalize_claims({"count": 42, "enabled": True})
+ # Booleans serialize as "True" / "False" via str(); that's fine for
+ # this layer -- callers should pre-stringify if they need different
+ # representations.
+ assert "count=42" in out
+ assert "enabled=True" in out
+
+
+def test_rejects_non_mapping():
+ with pytest.raises(TypeError):
+ canonicalize_claims([("a", "b")]) # type: ignore[arg-type]
diff --git a/python/tests/test_extract.py b/python/tests/test_extract.py
new file mode 100644
index 0000000..8a3d574
--- /dev/null
+++ b/python/tests/test_extract.py
@@ -0,0 +1,94 @@
+"""Conformance tests for ``extract_canonical_text``.
+
+These cases mirror the contract of the JavaScript reference
+``extractCanonicalText`` and confirm that block-element boundaries
+become whitespace, inline elements do not, excluded elements vanish
+entirely, and HTML entities are decoded by the parser before
+normalization.
+"""
+
+import pytest
+
+from htmltrust_canonicalization import extract_canonical_text
+
+
def test_inline_no_separator():
    """Inline elements like <em> must NOT introduce extra whitespace."""
    # NOTE(review): the inline markup in this literal was stripped in
    # transit (the diff shows the string split mid-literal, which is not
    # valid Python). Reconstructed with <em> from the docstring's
    # contract -- confirm against the original branch before merging.
    assert extract_canonical_text("hello <em>world</em>") == "hello world"
+
+
def test_block_boundary_inserts_space():
    """Adjacent block elements must yield "A B", never "AB"."""
    # NOTE(review): the HTML tags in this literal were stripped in
    # transit, leaving an invalid split string. Reconstructed as two
    # sibling <p> blocks per the docstring's "A B (not AB)" contract --
    # confirm against the original branch before merging.
    assert extract_canonical_text("<p>A</p><p>B</p>") == "A B"
+
+
+def test_excluded_elements_removed():
+ """script/style/meta content must vanish entirely."""
+ html = (
+ "before
"
+ ""
+ ""
+ ""
+ "after
"
+ )
+ assert extract_canonical_text(html) == "before after"
+
+
+def test_entity_decoding():
+ """HTML entities must be decoded by the parser."""
+ assert (
+ extract_canonical_text("A & B — C
")
+ == "A & B - C"
+ )
+
+
+def test_normalization_pipeline_applied():
+ """The canonicalization pipeline must run on the extracted text."""
+ # Curly quotes inside HTML get extracted then normalized to straight.
+ assert (
+ extract_canonical_text("“Hello”
") == '"Hello"'
+ )
+
+
+def test_nested_blocks():
+ """Deeply nested block structure still produces single-space joins."""
+ html = (
+ ""
+ "Title
"
+ "Para one.
Para two.
"
+ " "
+ )
+ out = extract_canonical_text(html)
+ # We don't pin the exact spacing count beyond "single-space collapsed",
+ # since multiple block-boundary spaces must collapse via phase 2.
+ assert out == "Title Para one. Para two."
+
+
+def test_list_items_separated():
+ assert (
+ extract_canonical_text("- a
- b
- c
")
+ == "a b c"
+ )
+
+
+def test_extract_rejects_non_string():
+ with pytest.raises(TypeError):
+ extract_canonical_text(123) # type: ignore[arg-type]
+
+
+def test_table_cells_separated():
+ html = "a b c d
"
+ assert extract_canonical_text(html) == "a b c d"
+
+
+def test_inline_link_no_separator():
+ """Anchor tags are inline; they must NOT add separators."""
+ assert (
+ extract_canonical_text('see here now
')
+ == "see here now"
+ )
diff --git a/python/tests/test_normalize.py b/python/tests/test_normalize.py
new file mode 100644
index 0000000..2c5cd86
--- /dev/null
+++ b/python/tests/test_normalize.py
@@ -0,0 +1,88 @@
+"""Conformance tests for ``normalize_text``.
+
+The 18 test cases below are a direct port of
+``htmltrust-canonicalization/javascript/test.js`` and MUST produce
+byte-identical results across all language bindings.
+"""
+
+import pytest
+
+from htmltrust_canonicalization import normalize_text
+
+
# (input_a, input_b, should_match, description)
#
# NOTE(review): the vectors are spelled with explicit \u escapes rather
# than raw codepoints so that invisible characters (ZWSP, ZWNJ, NBSP,
# the combining acute) cannot be silently dropped by editors or
# transport -- the same pure-ASCII policy `_normalize.py` states for
# itself. The escaped values mirror rust/tests/conformance.rs.
NORMALIZATION_CASES = [
    ("\u201cHello\u201d", '"Hello"', True, "Curly double quotes -> straight"),
    ("caf\u00e9", "cafe\u0301", True, "Precomposed vs combining (NFKC)"),
    ("\ufb01nd", "find", True, "fi ligature (NFKC)"),
    ("word \u2014 word", "word - word", True, "Em dash -> hyphen-minus"),
    ("\u00abBonjour\u00bb", '"Bonjour"', True, "Guillemets -> double quotes"),
    (
        "\u300c\u6771\u4eac\u300d",
        '"\u6771\u4eac"',
        True,
        "CJK corner brackets -> double quotes",
    ),
    (
        # Persian: ZWNJ between letters is meaning-bearing and must survive.
        "\u0645\u06cc\u200c\u062e\u0648\u0627\u0647\u0645",
        "\u0645\u06cc\u062e\u0648\u0627\u0647\u0645",
        False,
        "ZWNJ is semantic (Persian)",
    ),
    (
        "\u0643\u062a\u0640\u0640\u0640\u0627\u0628",
        "\u0643\u062a\u0627\u0628",
        True,
        "Arabic tatweel stripped",
    ),
    ("\uff21\uff11", "A1", True, "Fullwidth ASCII (NFKC)"),
    ("\u2460", "1", True, "Circled digit (NFKC)"),
    ("word\u200bword", "wordword", True, "ZWSP stripped"),
    ("word\u200cword", "wordword", False, "ZWNJ preserved (different)"),
    ("Hello\u2026", "Hello...", True, "Ellipsis -> three dots"),
    ("\u2018Hello\u2019", "'Hello'", True, "Curly single quotes -> straight"),
    ("\u201aGerman\u201c", '"German"', True, "Low-9 quotes -> straight"),
    ("a\u00a0b", "a b", True, "No-break space -> space"),
    ("a\u3000b", "a b", True, "Ideographic space -> space"),
    ("a \t b", "a b", True, "Whitespace collapse"),
]
+
+
+@pytest.mark.parametrize("a,b,should_match,desc", NORMALIZATION_CASES)
+def test_normalize_match(a: str, b: str, should_match: bool, desc: str):
+ norm_a = normalize_text(a)
+ norm_b = normalize_text(b)
+ if should_match:
+ assert norm_a == norm_b, (
+ f"{desc!r}: expected match but got\n"
+ f" A={norm_a!r}\n B={norm_b!r}"
+ )
+ else:
+ assert norm_a != norm_b, (
+ f"{desc!r}: expected mismatch but both normalized to {norm_a!r}"
+ )
+
+
+def test_preserve_whitespace():
+ """``preserve_whitespace=True`` must skip phase-2 collapsing."""
+ src = "line1\n line2\t\tline3"
+ assert normalize_text(src, preserve_whitespace=True) == src
+
+
+def test_normalize_text_rejects_non_string():
+ with pytest.raises(TypeError):
+ normalize_text(123) # type: ignore[arg-type]
+
+
+def test_zwj_preserved_emoji():
+ """Family ZWJ sequence must survive normalization."""
+ family = "\U0001F468\U0001F469\U0001F467"
+ assert normalize_text(family) == family
+
+
+def test_idempotent():
+ """Normalization must be a fixed-point operation."""
+ src = "“Café—test…”"
+ once = normalize_text(src)
+ twice = normalize_text(once)
+ assert once == twice
diff --git a/rust/Cargo.toml b/rust/Cargo.toml
new file mode 100644
index 0000000..146ecc1
--- /dev/null
+++ b/rust/Cargo.toml
@@ -0,0 +1,23 @@
+[package]
+name = "htmltrust-canonicalization"
+version = "0.1.0"
+edition = "2021"
+rust-version = "1.74"
+description = "Canonical text normalization and HTML extraction for HTMLTrust signed content. Byte-identical output to the JavaScript, Go, PHP, and Python bindings."
+license = "MIT"
+repository = "https://github.com/HTMLTrust/htmltrust-canonicalization"
+readme = "README.md"
+keywords = ["htmltrust", "canonicalization", "signing", "html", "unicode"]
+categories = ["text-processing", "cryptography"]
+
+[lib]
+name = "htmltrust_canonicalization"
+path = "src/lib.rs"
+
+[dependencies]
+unicode-normalization = "0.1"
+scraper = "0.20"
+ego-tree = "0.6"
+
+[dev-dependencies]
+# pure stdlib tests; nothing extra required.
diff --git a/rust/README.md b/rust/README.md
new file mode 100644
index 0000000..41c797b
--- /dev/null
+++ b/rust/README.md
@@ -0,0 +1,63 @@
+# HTMLTrust Canonicalization -- Rust
+
+Rust crate for the HTMLTrust canonical text normalization library. Produces byte-identical output to the JavaScript, Go, PHP, and Python implementations for every test vector in the shared conformance suite.
+
+## Status
+
+Implemented. The 18-case normalization conformance suite from the JavaScript reference (`javascript/test.js`) passes, along with parity tests for `extract_canonical_text` and `canonicalize_claims`.
+
+Out of scope for this crate: signature verification and key resolution. Those will arrive in a follow-up PR alongside the Python binding once the JavaScript surface area lands on `main`.
+
+## Scope
+
+This crate provides three functions:
+
+1. **`normalize_text(text: &str, preserve_whitespace: bool) -> String`** -- applies the 8-phase canonicalization defined in [`../spec.md`](../spec.md) to a UTF-8 string.
+2. **`extract_canonical_text(html: &str) -> String`** -- parses an HTML fragment with `scraper` (html5ever), walks the DOM, emits text nodes in document order with single-space separators between block elements, and applies `normalize_text` to the result.
+3. **`canonicalize_claims(claims: &BTreeMap<String, String>) -> String`** -- serializes a claim map to the canonical, hashable string used by the `claims-hash` field of the signature binding.
+
+All three are pure functions: no I/O, deterministic output for the same input.
+
+## Dependencies
+
+- `unicode-normalization` for NFKC
+- `scraper` (html5ever-backed) for HTML parsing in `extract_canonical_text`
+- `ego-tree` for the DOM walk types re-exported by scraper
+
+## Conformance
+
+`tests/conformance.rs` runs all 18 normalization vectors from `javascript/test.js`, plus `extract_canonical_text` and `canonicalize_claims` parity cases. Output MUST stay byte-identical to the JavaScript / Go / PHP / Python bindings.
+
+## Installation
+
+```toml
+[dependencies]
+htmltrust-canonicalization = "0.1"
+```
+
+## Usage
+
+```rust
+use std::collections::BTreeMap;
+use htmltrust_canonicalization::{
+ normalize_text, extract_canonical_text, canonicalize_claims,
+};
+
+let canonical = normalize_text("He said, \"Hello\u{2026}\"", false);
+// -> "He said, \"Hello...\""
+
+let from_html = extract_canonical_text("<p>Hello world!</p>");
+// -> "Hello world!"
+
+let mut claims = BTreeMap::new();
+claims.insert("License".to_string(), "CC-BY-4.0".to_string());
+claims.insert("AIAssistance".to_string(), "None".to_string());
+let claims_str = canonicalize_claims(&claims);
+// -> "AIAssistance=None\nLicense=CC-BY-4.0"
+```
+
+## Tests
+
+```bash
+cargo test
+```
diff --git a/rust/src/lib.rs b/rust/src/lib.rs
new file mode 100644
index 0000000..be5b5c5
--- /dev/null
+++ b/rust/src/lib.rs
@@ -0,0 +1,313 @@
+//! HTMLTrust canonicalization (Rust binding).
+//!
+//! Public API:
+//!
+//! - [`normalize_text`] -- the 8-phase HTMLTrust canonicalization pipeline.
+//! - [`extract_canonical_text`] -- HTML -> canonical text extraction
+//! (spec §2.1), parses with `scraper` (html5ever) and walks the DOM.
+//! - [`canonicalize_claims`] -- canonical serialization of claim metadata
+//! for the `claims-hash` field of the signature binding.
+//!
+//! All three functions produce byte-identical output to the JavaScript,
+//! Go, PHP, and Python bindings. The 18 conformance cases in
+//! `tests/conformance.rs` are a direct port of the shared test suite
+//! (`htmltrust-canonicalization/javascript/test.js`).
+
+use std::collections::BTreeMap;
+
+use scraper::{node::Node, Html};
+use ego_tree::NodeRef;
+use unicode_normalization::UnicodeNormalization;
+
+// ---------------------------------------------------------------------------
+// Codepoint ranges, mirroring the JS reference regex character classes
+// byte-for-byte. Inclusive ranges. Single codepoints expressed as
+// (cp, cp).
+// ---------------------------------------------------------------------------
+
/// Phases 6 + 7: invisible, formatting, and bidi codepoints removed from
/// the text. ZWNJ (U+200C) and ZWJ (U+200D) are deliberately absent --
/// they carry meaning in Persian, Indic scripts, and emoji sequences.
/// Entries are sorted ascending; contiguous codepoints share a range.
const STRIP_RANGES: &[(u32, u32)] = &[
    (0x00AD, 0x00AD),   // SOFT HYPHEN
    (0x034F, 0x034F),   // COMBINING GRAPHEME JOINER
    (0x061C, 0x061C),   // ARABIC LETTER MARK
    (0x0640, 0x0640),   // ARABIC TATWEEL
    (0x180E, 0x180E),   // MONGOLIAN VOWEL SEPARATOR
    (0x200B, 0x200B),   // ZERO WIDTH SPACE
    (0x200E, 0x200F),   // LEFT-TO-RIGHT MARK + RIGHT-TO-LEFT MARK
    (0x202A, 0x202E),   // bidi embedding / override controls
    (0x2060, 0x2060),   // WORD JOINER
    (0x2061, 0x2064),   // invisible math operators
    (0x2066, 0x2069),   // bidi isolate controls
    (0xFE00, 0xFE0F),   // variation selectors VS1..VS16
    (0xFEFF, 0xFEFF),   // ZERO WIDTH NO-BREAK SPACE / BOM
    (0xFFF9, 0xFFFC),   // interlinear annotation + OBJECT REPLACEMENT
    (0xE0001, 0xE007F), // tag characters (supplementary plane)
    (0xE0100, 0xE01EF), // variation selectors VS17..VS256
];

/// Phase 2: every Unicode whitespace codepoint maps to U+0020.
const WHITESPACE_RANGES: &[(u32, u32)] = &[
    (0x0009, 0x000D), // TAB, LF, VT, FF, CR
    (0x0020, 0x0020), // SPACE
    (0x0085, 0x0085), // NEXT LINE
    (0x00A0, 0x00A0), // NO-BREAK SPACE
    (0x1680, 0x1680), // OGHAM SPACE MARK
    (0x2000, 0x200A), // EN QUAD through HAIR SPACE
    (0x2028, 0x2029), // LINE SEPARATOR + PARAGRAPH SEPARATOR
    (0x202F, 0x202F), // NARROW NO-BREAK SPACE
    (0x205F, 0x205F), // MEDIUM MATHEMATICAL SPACE
    (0x3000, 0x3000), // IDEOGRAPHIC SPACE
];

/// Phase 3: apostrophe-like marks folded to ASCII `'`.
const SINGLE_QUOTE_POINTS: &[u32] = &[
    0x2018, 0x2019, // LEFT / RIGHT SINGLE QUOTATION MARK
    0x201B,         // SINGLE HIGH-REVERSED-9 QUOTATION MARK
    0x2039, 0x203A, // SINGLE ANGLE QUOTATION MARKS
    0x0060,         // GRAVE ACCENT
    0x00B4,         // ACUTE ACCENT
    0x2032,         // PRIME
];

/// Phase 3: double-quote-like marks folded to ASCII `"`.
/// U+201A (single low-9) is intentionally mapped to a DOUBLE quote,
/// matching the JS reference and the German low-9 conformance vector.
const DOUBLE_QUOTE_POINTS: &[u32] = &[
    0x201A,                 // SINGLE LOW-9 QUOTATION MARK (deliberate)
    0x201C, 0x201D,         // LEFT / RIGHT DOUBLE QUOTATION MARK
    0x201E,                 // DOUBLE LOW-9 QUOTATION MARK
    0x201F,                 // DOUBLE HIGH-REVERSED-9 QUOTATION MARK
    0x00AB, 0x00BB,         // LEFT / RIGHT GUILLEMET
    0x2033,                 // DOUBLE PRIME
    0x301D, 0x301E, 0x301F, // CJK double-prime quotation marks
];

/// Phase 3: CJK corner brackets folded to ASCII `"`.
const CJK_QUOTE_RANGES: &[(u32, u32)] = &[
    (0x300C, 0x300F), // corner + white corner brackets
    (0xFE41, 0xFE44), // vertical presentation forms of the same
];

/// Phase 4: dash-like marks folded to ASCII `-`.
const DASH_POINTS: &[u32] = &[
    0x2212, // MINUS SIGN
    0xFE58, // SMALL EM DASH
    0xFE63, // SMALL HYPHEN-MINUS
];

/// Phase 4 (ranges): HYPHEN through HORIZONTAL BAR.
const DASH_RANGES: &[(u32, u32)] = &[
    (0x2010, 0x2015),
];

/// Phase 5: HORIZONTAL ELLIPSIS, expanded to three ASCII periods.
const ELLIPSIS: char = '\u{2026}';
+
+// ---------------------------------------------------------------------------
+// Range / point membership helpers (linear; the sets are tiny).
+// ---------------------------------------------------------------------------
+
/// True when `c` falls inside any of the inclusive codepoint `ranges`.
/// Linear scan: the tables are tiny, so no ordering is assumed.
fn in_ranges(c: char, ranges: &[(u32, u32)]) -> bool {
    let code = c as u32;
    for &(lo, hi) in ranges {
        if (lo..=hi).contains(&code) {
            return true;
        }
    }
    false
}
+
/// True when `c` is exactly one of the listed codepoints.
fn in_points(c: char, points: &[u32]) -> bool {
    let code = c as u32;
    points.iter().any(|&p| p == code)
}
+
+// ---------------------------------------------------------------------------
+// Public API
+// ---------------------------------------------------------------------------
+
+/// Apply the HTMLTrust 8-phase canonicalization pipeline to `text`.
+///
+/// Order matches the JavaScript reference implementation precisely.
+///
+/// # Arguments
+///
+/// * `text` -- raw text content (typically the output of
+/// [`extract_canonical_text`]).
+/// * `preserve_whitespace` -- `true` for `` content where whitespace
+/// is significant; otherwise `false`.
+///
+/// # Returns
+///
+/// Normalized text, suitable for hashing.
+pub fn normalize_text(text: &str, preserve_whitespace: bool) -> String {
+ // Phase 1: NFKC.
+ let nfkc: String = text.nfkc().collect();
+
+ // Phases 6 + 7: strip invisible / formatting / bidi characters.
+ let stripped: String = nfkc.chars().filter(|&c| !in_ranges(c, STRIP_RANGES)).collect();
+
+ // Phase 2: whitespace normalization.
+ let ws: String = if preserve_whitespace {
+ stripped
+ } else {
+ let mut buf = String::with_capacity(stripped.len());
+ let mut prev_space = false;
+ for c in stripped.chars() {
+ if in_ranges(c, WHITESPACE_RANGES) {
+ if !prev_space {
+ buf.push(' ');
+ prev_space = true;
+ }
+ } else {
+ buf.push(c);
+ prev_space = false;
+ }
+ }
+ buf
+ };
+
+ // Phases 3, 4, 5 in a single pass.
+ let mut out = String::with_capacity(ws.len());
+ for c in ws.chars() {
+ if in_points(c, SINGLE_QUOTE_POINTS) {
+ out.push('\'');
+ } else if in_points(c, DOUBLE_QUOTE_POINTS) || in_ranges(c, CJK_QUOTE_RANGES) {
+ out.push('"');
+ } else if in_points(c, DASH_POINTS) || in_ranges(c, DASH_RANGES) {
+ out.push('-');
+ } else if c == ELLIPSIS {
+ out.push_str("...");
+ } else {
+ out.push(c);
+ }
+ }
+ out
+}
+
+/// Extract canonical text from an HTML fragment.
+///
+/// Implements the HTML -> canonical text extraction defined in spec §2.1
+/// and ports the contract of the JavaScript `extractCanonicalText`. Uses
+/// `scraper` (html5ever under the hood) for parsing.
+///
+/// # Arguments
+///
+/// * `html` -- HTML fragment to canonicalize.
+///
+/// # Returns
+///
+/// Canonical text, ready to be hashed. Trimmed of leading/trailing
+/// whitespace.
+pub fn extract_canonical_text(html: &str) -> String {
+ let document = Html::parse_fragment(html);
+
+ let mut out = String::new();
+ walk(document.tree.root(), &mut out);
+
+ normalize_text(&out, false).trim().to_string()
+}
+
/// Tags whose text content is never part of the signed payload.
fn is_excluded_tag(name: &str) -> bool {
    const EXCLUDED: &[&str] = &["script", "style", "meta", "link", "head", "noscript"];
    EXCLUDED.contains(&name)
}
+
/// Block-level tags whose boundaries become whitespace separators.
/// Inline tags (em, strong, a, span, ...) are intentionally absent:
/// they must not introduce separators.
fn is_block_tag(name: &str) -> bool {
    const BLOCK: &[&str] = &[
        "address", "article", "aside", "blockquote", "canvas", "dd", "div",
        "dl", "dt", "fieldset", "figcaption", "figure", "footer", "form",
        "h1", "h2", "h3", "h4", "h5", "h6",
        "header", "hr", "li", "main", "nav", "noscript", "ol", "output",
        "p", "pre", "section", "table", "tfoot", "thead",
        "tr", "td", "th", "ul", "video",
    ];
    BLOCK.contains(&name)
}
+
+fn walk<'a>(node: NodeRef<'a, Node>, out: &mut String) {
+ for child in node.children() {
+ match child.value() {
+ Node::Text(t) => {
+ out.push_str(&t.text);
+ }
+ Node::Element(e) => {
+ let name = e.name();
+ if is_excluded_tag(name) {
+ continue;
+ }
+ let block = is_block_tag(name);
+ if block {
+ out.push(' ');
+ }
+ walk(child, out);
+ if block {
+ out.push(' ');
+ }
+ }
+ _ => {
+ // Comments, doctypes, processing instructions -- not signed.
+ }
+ }
+ }
+}
+
+/// Compute the canonical serialization of a claim map.
+///
+/// Each name and value is run through [`normalize_text`] and entries are
+/// sorted lexically by name, then joined by `\n` as `name=value` pairs.
+/// The caller is responsible for hashing the result.
+///
+/// `BTreeMap` is used as the input type because its iteration order is
+/// already lexicographic, which makes the determinism property obvious
+/// at the type level. Callers with other map types can pass via
+/// `BTreeMap::from_iter(...)`.
+pub fn canonicalize_claims(claims: &BTreeMap) -> String {
+ let mut entries: Vec<(String, String)> = claims
+ .iter()
+ .map(|(k, v)| (normalize_text(k, false), normalize_text(v, false)))
+ .collect();
+ // Re-sort after normalization in case normalization changes name order.
+ entries.sort_by(|a, b| a.0.cmp(&b.0));
+ entries
+ .into_iter()
+ .map(|(k, v)| format!("{}={}", k, v))
+ .collect::>()
+ .join("\n")
+}
diff --git a/rust/tests/conformance.rs b/rust/tests/conformance.rs
new file mode 100644
index 0000000..4d3a04f
--- /dev/null
+++ b/rust/tests/conformance.rs
@@ -0,0 +1,208 @@
+//! Conformance tests for the HTMLTrust Rust binding.
+//!
+//! The 18 normalization cases below are a direct port of
+//! `htmltrust-canonicalization/javascript/test.js` and MUST produce
+//! byte-identical results across all language bindings.
+
+use std::collections::BTreeMap;
+
+use htmltrust_canonicalization::{
+ canonicalize_claims, extract_canonical_text, normalize_text,
+};
+
+/// One conformance vector. `(input_a, input_b, should_match, description)`.
+/// `should_match` says whether the two inputs must normalize to identical bytes.
+type Case = (&'static str, &'static str, bool, &'static str);
+
+/// The 18 shared normalization vectors, ported from `javascript/test.js`.
+/// Results MUST stay byte-identical across all language bindings; do not
+/// edit a vector here without updating the shared suite everywhere.
+const NORMALIZATION_CASES: &[Case] = &[
+ (
+ "\u{201C}Hello\u{201D}",
+ "\"Hello\"",
+ true,
+ "Curly double quotes -> straight",
+ ),
+ (
+ "caf\u{00E9}",
+ "cafe\u{0301}",
+ true,
+ "Precomposed vs combining (NFKC)",
+ ),
+ ("\u{FB01}nd", "find", true, "fi ligature (NFKC)"),
+ (
+ "word \u{2014} word",
+ "word - word",
+ true,
+ "Em dash -> hyphen-minus",
+ ),
+ (
+ "\u{00AB}Bonjour\u{00BB}",
+ "\"Bonjour\"",
+ true,
+ "Guillemets -> double quotes",
+ ),
+ (
+ "\u{300C}\u{6771}\u{4EAC}\u{300D}",
+ "\"\u{6771}\u{4EAC}\"",
+ true,
+ "CJK corner brackets -> double quotes",
+ ),
+ (
+ "\u{0645}\u{06CC}\u{200C}\u{062E}\u{0648}\u{0627}\u{0647}\u{0645}",
+ "\u{0645}\u{06CC}\u{062E}\u{0648}\u{0627}\u{0647}\u{0645}",
+ false,
+ "ZWNJ is semantic (Persian)",
+ ),
+ (
+ "\u{0643}\u{062A}\u{0640}\u{0640}\u{0640}\u{0627}\u{0628}",
+ "\u{0643}\u{062A}\u{0627}\u{0628}",
+ true,
+ "Arabic tatweel stripped",
+ ),
+ ("\u{FF21}\u{FF11}", "A1", true, "Fullwidth ASCII (NFKC)"),
+ ("\u{2460}", "1", true, "Circled digit (NFKC)"),
+ ("word\u{200B}word", "wordword", true, "ZWSP stripped"),
+ (
+ "word\u{200C}word",
+ "wordword",
+ false,
+ "ZWNJ preserved (different)",
+ ),
+ ("Hello\u{2026}", "Hello...", true, "Ellipsis -> three dots"),
+ (
+ "\u{2018}Hello\u{2019}",
+ "'Hello'",
+ true,
+ "Curly single quotes -> straight",
+ ),
+ (
+ "\u{201A}German\u{201C}",
+ "\"German\"",
+ true,
+ "Low-9 quotes -> straight",
+ ),
+ ("a\u{00A0}b", "a b", true, "No-break space -> space"),
+ ("a\u{3000}b", "a b", true, "Ideographic space -> space"),
+ ("a \t b", "a b", true, "Whitespace collapse"),
+];
+
+/// Run every shared vector and report all divergences in one pass,
+/// rather than stopping at the first failing case.
+#[test]
+fn normalization_conformance() {
+    let mut failures = Vec::<String>::new();
+    for &(a, b, should_match, desc) in NORMALIZATION_CASES {
+        let na = normalize_text(a, false);
+        let nb = normalize_text(b, false);
+        let matched = na == nb;
+        if matched != should_match {
+            failures.push(format!(
+                "  {desc}: A={na:?} B={nb:?} expected match={should_match}, got match={matched}",
+            ));
+        }
+    }
+    assert!(
+        failures.is_empty(),
+        "{} failure(s):\n{}",
+        failures.len(),
+        failures.join("\n"),
+    );
+}
+
+/// With `preserve_whitespace = true` the collapse phase is skipped, so
+/// embedded newlines and tabs must survive unchanged.
+#[test]
+fn preserve_whitespace_skips_collapse() {
+    let input = "line1\n line2\t\tline3";
+    assert_eq!(normalize_text(input, true), input);
+}
+
+/// Applying the pipeline a second time must be a no-op:
+/// `normalize(normalize(x)) == normalize(x)`.
+#[test]
+fn idempotent_for_typical_input() {
+    let original = "\u{201C}Caf\u{00E9}\u{2014}test\u{2026}\u{201D}";
+    let first_pass = normalize_text(original, false);
+    assert_eq!(first_pass, normalize_text(&first_pass, false));
+}
+
+/// Inline elements must not introduce word separators.
+/// NOTE(review): the HTML fixture was lost to patch garbling and has been
+/// reconstructed from the test name and expected output -- confirm against
+/// the JavaScript reference suite.
+#[test]
+fn extract_inline_no_separator() {
+    assert_eq!(
+        extract_canonical_text("<p>hello <b>world</b></p>"),
+        "hello world",
+    );
+}
+
+/// Adjacent block elements must be separated by exactly one space.
+/// NOTE(review): fixture reconstructed from garbled patch -- confirm
+/// against the JavaScript reference suite.
+#[test]
+fn extract_block_boundary_inserts_space() {
+    assert_eq!(extract_canonical_text("<p>A</p><p>B</p>"), "A B");
+}
+
+/// Excluded (non-content) elements and their entire subtrees must be
+/// dropped from the canonical text.
+/// NOTE(review): the three excluded fixtures were lost to patch garbling;
+/// script/style/template reconstructed -- confirm against the JS suite.
+#[test]
+fn extract_excluded_elements_removed() {
+    let html = "\
+        <p>before</p>\
+        <script>var x = 1;</script>\
+        <style>.x { color: red; }</style>\
+        <template><p>hidden</p></template>\
+        <p>after</p>";
+    assert_eq!(extract_canonical_text(html), "before after");
+}
+
+/// HTML character references must be decoded before normalization
+/// (the em dash from `&mdash;` then becomes `-`).
+/// NOTE(review): the patch shows the *decoded* text; the entity forms
+/// below are reconstructed -- confirm against the JS suite.
+#[test]
+fn extract_entity_decoding() {
+    assert_eq!(
+        extract_canonical_text("<p>A &amp; B &mdash; C</p>"),
+        "A & B - C",
+    );
+}
+
+/// Extracted text must pass through the full normalize_text pipeline
+/// (curly quotes straightened here).
+/// NOTE(review): block wrapper reconstructed from garbled patch.
+#[test]
+fn extract_normalization_pipeline_applied() {
+    assert_eq!(
+        extract_canonical_text("<p>\u{201C}Hello\u{201D}</p>"),
+        "\"Hello\"",
+    );
+}
+
+/// Nested block structure flattens to single-space-separated words.
+/// NOTE(review): tag structure reconstructed from garbled patch (the
+/// surviving text was "Title / Para one. / Para two.") -- confirm
+/// against the JS suite.
+#[test]
+fn extract_nested_blocks() {
+    let html = "<article><h1>Title</h1>\
+        <p>Para one.</p><p>Para two.</p></article>";
+    assert_eq!(extract_canonical_text(html), "Title Para one. Para two.");
+}
+
+/// List items are block-level, so their texts must come out
+/// space-separated.
+/// NOTE(review): fixture reconstructed from garbled patch (items
+/// rendered as "- a / - b / - c") -- confirm against the JS suite.
+#[test]
+fn extract_list_items_separated() {
+    assert_eq!(
+        extract_canonical_text("<ul><li>a</li><li>b</li><li>c</li></ul>"),
+        "a b c",
+    );
+}
+
+/// Anchors are inline, so a link must not split or pad its words.
+/// NOTE(review): fixture reconstructed from garbled patch (surviving
+/// text "see here now") -- confirm href/shape against the JS suite.
+#[test]
+fn extract_inline_link_no_separator() {
+    assert_eq!(
+        extract_canonical_text("<p>see <a href=\"https://example.com\">here</a> now</p>"),
+        "see here now",
+    );
+}
+
+/// An empty claim map canonicalizes to the empty string.
+#[test]
+fn claims_empty() {
+    let claims: BTreeMap<String, String> = BTreeMap::new();
+    assert_eq!(canonicalize_claims(&claims), "");
+}
+
+/// Claims inserted in non-alphabetical order must come out sorted
+/// lexically by name in the canonical serialization.
+#[test]
+fn claims_sorted_by_name() {
+    let mut claims = BTreeMap::new();
+    for (name, value) in [
+        ("License", "CC-BY-4.0"),
+        ("AIAssistance", "None"),
+        ("ContentType", "Article"),
+    ] {
+        claims.insert(name.to_string(), value.to_string());
+    }
+    assert_eq!(
+        canonicalize_claims(&claims),
+        "AIAssistance=None\nContentType=Article\nLicense=CC-BY-4.0",
+    );
+}
+
+/// Claim values pass through normalization: curly quotes in a value
+/// must be straightened.
+#[test]
+fn claims_normalize_values() {
+    let claims = BTreeMap::from([(
+        "author".to_string(),
+        "\u{201C}Alice\u{201D}".to_string(),
+    )]);
+    assert_eq!(canonicalize_claims(&claims), "author=\"Alice\"");
+}