Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
540 changes: 523 additions & 17 deletions go/canonicalize_test.go

Large diffs are not rendered by default.

50 changes: 50 additions & 0 deletions go/endorsement.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
package canonicalize

import (
"context"
"errors"
)

// Endorsement is a third-party signed JSON attestation about a specific
// content hash, as defined in HTMLTrust spec §2.5.
//
// VerifyEndorsement requires Endorser, Endorsement, Signature and Timestamp
// to be non-empty; Algorithm is optional.
type Endorsement struct {
// Endorser identifies the signing party. VerifyEndorsement passes it to
// ResolveKey as the key identifier to look up.
Endorser string `json:"endorser"`
// Endorsement is the first component of the signed message
// "{endorsement}:{timestamp}".
Endorsement string `json:"endorsement"` // the targeted content-hash, e.g. "sha256:..."
// Signature is the value handed to VerifySignature together with the signed
// message. NOTE(review): its text encoding is not visible in this file.
Signature string `json:"signature"`
// Timestamp is the second component of the signed message. It is used
// verbatim as a string; NOTE(review): the expected format is not enforced here.
Timestamp string `json:"timestamp"`
// Algorithm is only a hint: VerifyEndorsement prefers the algorithm declared
// by the resolved key and falls back to this field, then to "ed25519".
Algorithm string `json:"algorithm,omitempty"` // defaults to "ed25519"
}

// VerifyEndorsement checks a third-party endorsement against the key resolved
// for its endorser.
//
// The endorser, endorsement (target content hash), signature and timestamp
// fields must all be non-empty; the first missing one, in that order, is
// reported as an error. The endorser is then resolved through resolvers via
// ResolveKey, and any resolution error is returned unchanged.
//
// The signature is verified over the binding "{endorsement}:{timestamp}".
// The algorithm is chosen in this order of precedence: the algorithm declared
// by the resolved key (the key is the source of truth about what the signer
// actually uses), then the endorsement's own hint, then "ed25519".
//
// The returned values are whatever VerifySignature reports for that message,
// signature, public key and algorithm.
func VerifyEndorsement(ctx context.Context, endorsement Endorsement, resolvers []KeyResolver) (bool, error) {
	// Mandatory fields, listed in the order they are validated.
	required := []struct {
		value   string
		problem string
	}{
		{endorsement.Endorser, "VerifyEndorsement: endorser is required"},
		{endorsement.Endorsement, "VerifyEndorsement: endorsement (target content hash) is required"},
		{endorsement.Signature, "VerifyEndorsement: signature is required"},
		{endorsement.Timestamp, "VerifyEndorsement: timestamp is required"},
	}
	for _, field := range required {
		if field.value == "" {
			return false, errors.New(field.problem)
		}
	}

	key, err := ResolveKey(ctx, endorsement.Endorser, resolvers)
	if err != nil {
		return false, err
	}

	// Start from the default and let more authoritative sources override it.
	algorithm := "ed25519"
	switch {
	case key.Algorithm != "":
		algorithm = key.Algorithm
	case endorsement.Algorithm != "":
		algorithm = endorsement.Algorithm
	}

	signed := endorsement.Endorsement + ":" + endorsement.Timestamp
	return VerifySignature(signed, endorsement.Signature, key.PublicKeyPEM, algorithm)
}
150 changes: 150 additions & 0 deletions go/extract.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,150 @@
package canonicalize

import (
"regexp"
"sort"
"strconv"
"strings"
)

// excludedPairTagNames lists the elements that are removed together with
// everything between their opening and closing tags before text is extracted,
// so that nothing inside them can become part of the signed content. This
// covers metadata-style elements, including meta tags inside a signed-section,
// which carry claims rather than content.
var excludedPairTagNames = []string{"script", "style", "meta", "link", "head", "noscript"}

// excludedPairREs holds one compiled pattern per entry of
// excludedPairTagNames, in the same order. Go's RE2 engine has no
// backreferences, so a single pattern cannot tie "<name ...>" to the matching
// "</name>"; instead each name gets its own case-insensitive, non-greedy,
// dot-matches-newline pattern and callers apply them one after another.
var excludedPairREs = func() []*regexp.Regexp {
	compiled := make([]*regexp.Regexp, len(excludedPairTagNames))
	for i, tag := range excludedPairTagNames {
		pattern := `(?is)<` + tag + `\b[^>]*>.*?</` + tag + `\s*>`
		compiled[i] = regexp.MustCompile(pattern)
	}
	return compiled
}()

// blockElements is the regexp alternation of block-level element names whose
// opening and closing tags are turned into whitespace separators.
const blockElements = `address|article|aside|blockquote|canvas|dd|div|dl|dt|` +
	`fieldset|figcaption|figure|footer|form|h[1-6]|header|hr|li|main|nav|` +
	`noscript|ol|output|p|pre|section|table|tfoot|thead|tr|td|th|ul|video`

// Tag-matching patterns, all case-insensitive.
var (
	// voidElementsRE matches a single tag of a void or self-closing element
	// (one that has no text content of its own).
	voidElementsRE = regexp.MustCompile(
		`(?i)<(meta|link|br|hr|img|input|source|track|wbr|area|base|col|embed|param)\b[^>]*/?>`,
	)

	// blockOpenRE matches the opening tag of any element in blockElements.
	blockOpenRE = regexp.MustCompile(`(?i)<(` + blockElements + `)\b[^>]*>`)

	// blockCloseRE matches the closing tag of any element in blockElements.
	blockCloseRE = regexp.MustCompile(`(?i)</(` + blockElements + `)\s*>`)

	// anyTagRE matches any opening or closing tag whose name starts with a
	// letter; it is meant for the inline markup left over once the patterns
	// above have been applied, which is dropped without inserting whitespace.
	anyTagRE = regexp.MustCompile(`(?i)<\/?[a-z][a-z0-9-]*\b[^>]*>`)
)

// namedEntities maps the supported named character references, written in
// lower case and including the leading '&' and trailing ';', to their
// replacement text. Only a common subset of HTML entities is covered; numeric
// references are handled separately by decodeEntities.
var namedEntities = map[string]string{
	"&amp;":    "&",
	"&lt;":     "<",
	"&gt;":     ">",
	"&quot;":   "\"",
	"&apos;":   "'",
	"&nbsp;":   " ",
	"&ndash;":  "–",
	"&mdash;":  "—",
	"&lsquo;":  "‘",
	"&rsquo;":  "’",
	"&ldquo;":  "“",
	"&rdquo;":  "”",
	"&hellip;": "…",
	"&copy;":   "©",
	"&reg;":    "®",
	"&trade;":  "™",
}

// entityRE matches one character reference in any supported form: decimal
// ("&#65;"), hexadecimal with a lower-case x ("&#x41;"), or named ("&amp;").
var entityRE = regexp.MustCompile(`&(?:#[0-9]+|#x[0-9a-fA-F]+|[a-zA-Z]+);`)

// decodeEntities replaces HTML character references in text with the
// characters they stand for and returns the result.
//
// All forms are decoded in a single left-to-right pass, so the output of one
// replacement is never scanned again: "&amp;#65;" yields the literal text
// "&#65;", not "A". Decoding in separate passes would let two visibly
// different inputs collapse to the same canonical text.
//
// Named references are looked up case-insensitively in namedEntities; unknown
// names are left untouched. Numeric references that do not fit in 32 bits are
// also left untouched rather than being truncated to an unrelated code point.
func decodeEntities(text string) string {
	return entityRE.ReplaceAllStringFunc(text, func(ref string) string {
		// Drop the surrounding '&' and ';' to look at the reference body.
		body := ref[1 : len(ref)-1]
		switch {
		case strings.HasPrefix(body, "#x"):
			return decodeNumericEntity(ref, body[2:], 16)
		case strings.HasPrefix(body, "#"):
			return decodeNumericEntity(ref, body[1:], 10)
		}
		if replacement, ok := namedEntities[strings.ToLower(ref)]; ok {
			return replacement
		}
		return ref
	})
}

// decodeNumericEntity converts the digits of a numeric character reference,
// interpreted in the given base, to the corresponding character. If the
// digits do not parse as a 32-bit value the original reference ref is
// returned unchanged. Values that parse but are not valid Unicode code points
// become U+FFFD through Go's rune-to-string conversion.
func decodeNumericEntity(ref, digits string, base int) string {
	n, err := strconv.ParseInt(digits, base, 32)
	if err != nil {
		return ref
	}
	return string(rune(n))
}

// ExtractCanonicalText turns an HTML fragment into the canonical text used as
// input for signing or verification. It is documented as mirroring the JS
// extractCanonicalText() reference implementation.
//
// The fragment is processed in this order:
//  1. excluded elements are removed together with their contents, and void
//     elements are removed, each replaced by a single space;
//  2. opening and closing tags of block-level elements become a single space;
//  3. any remaining (inline) tags are removed without inserting whitespace;
//  4. HTML entities are decoded;
//  5. the result goes through NormalizeText with opts and is then trimmed.
//
// Per HTMLTrust spec §2.1 the outcome is a text-only hash input: markup and
// attributes of the signed content itself are not covered by the hash.
//
// The returned error is always nil in the current implementation.
//
// NOTE(review): none of the patterns applied here match HTML comments or
// doctype declarations, so their text passes through to NormalizeText —
// confirm this matches the reference implementation.
func ExtractCanonicalText(html string, opts ...Options) (string, error) {
	stripped := html

	// Paired elements go first so that tags nested inside them disappear
	// with their parent instead of being handled individually below.
	for _, pairRE := range excludedPairREs {
		stripped = pairRE.ReplaceAllString(stripped, " ")
	}

	// Remaining tag handling, in order; only inline tags vanish without
	// leaving a separator behind.
	steps := []struct {
		re          *regexp.Regexp
		replacement string
	}{
		{voidElementsRE, " "},
		{blockOpenRE, " "},
		{blockCloseRE, " "},
		{anyTagRE, ""},
	}
	for _, step := range steps {
		stripped = step.re.ReplaceAllString(stripped, step.replacement)
	}

	normalized := NormalizeText(decodeEntities(stripped), opts...)
	return strings.TrimSpace(normalized), nil
}

// CanonicalizeClaims serializes a claims map as a sorted list of "name=value"
// pairs joined by "\n". Both names and values are pushed through NormalizeText
// before serialization so the output is independent of trivial Unicode noise.
// It is documented as mirroring the JS canonicalizeClaims() reference
// implementation.
//
// Entries are ordered by normalized name and, when two distinct keys
// normalize to the same name, by normalized value. The tie-break matters
// because Go map iteration order is random and sort.Slice is not stable:
// ordering by name alone would let the output vary between calls for the same
// input. An empty or nil map yields the empty string.
func CanonicalizeClaims(claims map[string]string) string {
	type entry struct{ name, value string }
	entries := make([]entry, 0, len(claims))
	for k, v := range claims {
		entries = append(entries, entry{NormalizeText(k), NormalizeText(v)})
	}
	sort.Slice(entries, func(i, j int) bool {
		if entries[i].name != entries[j].name {
			return entries[i].name < entries[j].name
		}
		// Same normalized name: fall back to the value for a total order.
		return entries[i].value < entries[j].value
	})
	parts := make([]string, len(entries))
	for i, e := range entries {
		parts[i] = e.name + "=" + e.value
	}
	return strings.Join(parts, "\n")
}
Loading
Loading