From 3c573681af3d6ed912a1d4883cb004bde5c2503d Mon Sep 17 00:00:00 2001
From: riddim-developer-bot <developer-bot@riddimsoftware.com>
Date: Sun, 14 Jun 2026 21:02:51 -0400
Subject: [PATCH] [EPAC-2301]: Recover intermediate bill XML links for diff
 ingestion

The bills indexer only scraped the first publication's DocumentViewer page
for an XML/PDF anchor, then derived every later stage's URL by sort order and
persisted it unvalidated. Intermediate stages (as-amended-by-committee,
as-passed-by-the-house-of-commons, as-passed-by-the-senate) ended up with no
ingested text, so the diff producer emitted bill_clause_diffs=0.

enrichVersions now reads each stage's own DocumentViewer page and resolves its
XML through a validated candidate chain: the page's direct .xml anchor, the XML
sibling beside the page's PDF anchor (same parl.ca per-version directory), then
a sort-order sibling derived from the first stage. A candidate is persisted only
when it fetches as a 2xx <Bill> XML payload, so a guess that 404s or redirects to
an HTML error page is dropped instead of stored as xml_url. Direct first-reading
and royal-assent anchors keep resolving unchanged.
---
 .../internal/adapter/legisinfo/fetcher.go     | 144 +++++++--
 .../adapter/legisinfo/fetcher_test.go         | 276 ++++++++++++++++++
 docs/architecture/use-case-catalog.md         |   2 +-
 3 files changed, 395 insertions(+), 27 deletions(-)
diff --git a/backend/bills-indexer/internal/adapter/legisinfo/fetcher.go b/backend/bills-indexer/internal/adapter/legisinfo/fetcher.go
index 35977b9e..6ec150cd 100644
--- a/backend/bills-indexer/internal/adapter/legisinfo/fetcher.go
+++ b/backend/bills-indexer/internal/adapter/legisinfo/fetcher.go
@@ -157,6 +157,8 @@ func (f *Fetcher) fetchDetail(ctx context.Context, session domain.Session, numbe
 
 func (f *Fetcher) enrichVersions(ctx context.Context, session domain.Session, number string, pubs []publicationJSON) []domain.BillVersion {
 	versions := make([]domain.BillVersion, 0, len(pubs))
+	// firstXMLURL / firstPDFURL hold the first version's resolved links (the "_1" base)
+	// so later stages that expose no link of their own can derive a sibling candidate.
 	var firstXMLURL, firstPDFURL string
 	for i, pub := range pubs {
 		stage := firstNonEmpty(pub.PublicationTypeNameEn, pub.PublicationTypeName)
@@ -172,34 +174,32 @@ func (f *Fetcher) enrichVersions(ctx context.Context, session domain.Session, nu
 			SortOrder:     i + 1,
 		}
 
-		var xmlURL, pdfURL string
-		if firstXMLURL == "" {
-			xmlURL, pdfURL = f.fetchDocumentLinks(ctx, htmlURL)
-			if xmlURL != "" {
-				firstXMLURL = xmlURL
-			}
-			if pdfURL != "" {
-				firstPDFURL = pdfURL
-			}
-		} else {
-			xmlURL = constructXMLURL(firstXMLURL, i+1)
-			pdfURL = constructPDFURL(firstPDFURL, i+1)
-		}
+		// Read this stage's own DocumentViewer page first: it links to the correct
+		// document version regardless of where the stage falls in the publication list.
+		directXML, directPDF := f.fetchDocumentLinks(ctx, htmlURL)
 
+		// PDF: prefer this stage's own anchor, else derive a sibling from the first stage's.
+		version.PDFURL = firstNonEmpty(directPDF, constructPDFURL(firstPDFURL, i+1))
+
+		// XML: try candidates in order of trust and persist only one that fetches as
+		// bill XML, so a guessed sibling that 404s or returns a non-bill page is dropped.
+		xmlURL, xmlData := f.resolveVersionXML(ctx, number, directXML, directPDF, firstXMLURL, i+1)
 		version.XMLURL = xmlURL
-		version.PDFURL = pdfURL
-
-		if xmlURL != "" {
-			xmlData, err := f.getBytes(ctx, xmlURL, "text/xml")
-			if err == nil {
-				hash := computeSHA256(xmlData)
-				version.TextHash = &hash
-				version.TextSourceURL = &xmlURL
-
-				sections, err := parseBillXML(xmlData)
-				if err == nil {
-					version.Sections = sections
-				}
+
+		if firstXMLURL == "" && xmlURL != "" {
+			firstXMLURL = xmlURL
+		}
+		if firstPDFURL == "" && version.PDFURL != "" {
+			firstPDFURL = version.PDFURL
+		}
+
+		if len(xmlData) > 0 {
+			hash := computeSHA256(xmlData)
+			version.TextHash = &hash
+			source := xmlURL
+			version.TextSourceURL = &source
+			if sections, err := parseBillXML(xmlData); err == nil {
+				version.Sections = sections
 			}
 		}
 
@@ -208,6 +208,40 @@ func (f *Fetcher) enrichVersions(ctx context.Context, session domain.Session, nu
 	return versions
 }
 
+// resolveVersionXML returns the authoritative XML URL for one publication along with the
+// fetched payload (reused by the caller to hash and parse, avoiding a second request).
+// Candidates are tried most-trusted first:
+//  1. the .xml anchor on the stage's own DocumentViewer page,
+//  2. the XML sibling that lives beside the stage's own PDF anchor (same version directory),
+//  3. a sort-order sibling derived from the first stage's resolved XML URL.
+//
+// Steps 1 and 2 come from this stage's own page, so they are always correctly attributed.
+// Step 3 is a best-effort guess that assumes parl.ca's document number tracks the
+// publication order; it only fires when the page exposes no links of its own. A candidate
+// wins only when it returns a 2xx bill-XML payload, so derived guesses that 404 or return a
+// non-bill page are never persisted as xml_url.
+func (f *Fetcher) resolveVersionXML(ctx context.Context, number, directXML, directPDF, firstXMLURL string, sortOrder int) (string, []byte) {
+	sortOrderSibling := constructXMLURL(firstXMLURL, sortOrder)
+	if sortOrderSibling == firstXMLURL {
+		// No version substitution happened (e.g. the base URL is not a "_1" directory),
+		// so the candidate is just the base version again — never reuse it for another stage.
+		sortOrderSibling = ""
+	}
+	candidates := dedupeNonEmpty(
+		directXML,
+		xmlSiblingFromPDF(directPDF, number),
+		sortOrderSibling,
+	)
+	for _, candidate := range candidates {
+		data, err := f.getBytes(ctx, candidate, "text/xml")
+		if err != nil || !looksLikeBillXML(data) {
+			continue
+		}
+		return candidate, data
+	}
+	return "", nil
+}
+
 func constructXMLURL(firstURL string, sortOrder int) string {
 	if firstURL == "" {
 		return ""
@@ -226,6 +260,64 @@ func constructPDFURL(firstURL string, sortOrder int) string {
 	return res
 }
 
+// xmlSiblingFromPDF derives the bill XML URL that lives in the same parl.ca version
+// directory as a PDF anchor. parl.ca keeps both files in a per-version folder (for
+// example .../C-11/C-11_3/C-11_3.PDF beside .../C-11/C-11_3/C-11_E.xml), so the PDF's
+// directory pins the correct version even when sort-order derivation would not. Returns
+// empty unless the input is a real .pdf path and the bill number is known.
+func xmlSiblingFromPDF(pdfURL, number string) string {
+	pdfURL = strings.TrimSpace(pdfURL)
+	number = strings.TrimSpace(number)
+	if pdfURL == "" || number == "" {
+		return ""
+	}
+	if !strings.HasSuffix(lowerURLPath(pdfURL), ".pdf") {
+		return ""
+	}
+	idx := strings.LastIndex(pdfURL, "/")
+	if idx < 0 {
+		return ""
+	}
+	return pdfURL[:idx+1] + number + "_E.xml"
+}
+
+// dedupeNonEmpty returns the inputs with blanks dropped and duplicates removed, preserving
+// first-seen order so the most-trusted candidate stays first.
+func dedupeNonEmpty(values ...string) []string {
+	seen := make(map[string]struct{}, len(values))
+	out := make([]string, 0, len(values))
+	for _, value := range values {
+		value = strings.TrimSpace(value)
+		if value == "" {
+			continue
+		}
+		if _, ok := seen[value]; ok {
+			continue
+		}
+		seen[value] = struct{}{}
+		out = append(out, value)
+	}
+	return out
+}
+
+// looksLikeBillXML reports whether data parses as XML whose root element is <Bill>, the
+// parl.ca legislative document root. It rejects empty bodies, HTML soft-error pages, and
+// unrelated XML, so only genuine bill payloads are accepted as a version's source text.
+func looksLikeBillXML(data []byte) bool {
+	if len(bytes.TrimSpace(data)) == 0 {
+		return false
+	}
+	decoder := xml.NewDecoder(bytes.NewReader(data))
+	for {
+		tok, err := decoder.Token()
+		if err != nil {
+			return false
+		}
+		if start, ok := tok.(xml.StartElement); ok {
+			return strings.EqualFold(start.Name.Local, "Bill")
+		}
+	}
+}
 
 func (f *Fetcher) fetchDocumentLinks(ctx context.Context, pageURL string) (string, string) {
 	body, err := f.getBytes(ctx, pageURL, "text/html")
diff --git a/backend/bills-indexer/internal/adapter/legisinfo/fetcher_test.go b/backend/bills-indexer/internal/adapter/legisinfo/fetcher_test.go
index e77e3bfe..eb755125 100644
--- a/backend/bills-indexer/internal/adapter/legisinfo/fetcher_test.go
+++ b/backend/bills-indexer/internal/adapter/legisinfo/fetcher_test.go
@@ -2,8 +2,10 @@ package legisinfo
 
 import (
 	"context"
+	"fmt"
 	"net/http"
 	"net/http/httptest"
+	"strings"
 	"testing"
 
 	"epac/bills-indexer/internal/domain"
@@ -204,3 +206,277 @@ func TestConstructURL(t *testing.T) {
 	})
 }
 
+func billXMLBody(label, text string) string {
+	return fmt.Sprintf(`<?xml version="1.0" encoding="utf-8"?>
+<Bill><Body><Section><Label>%s</Label><Text>%s</Text></Section></Body></Bill>`, label, text)
+}
+
+func assertVersionXML(t *testing.T, v domain.BillVersion, xmlSuffix, sectionLabel string) {
+	t.Helper()
+	if !strings.HasSuffix(v.XMLURL, xmlSuffix) {
+		t.Errorf("version %q XMLURL = %q, want suffix %q", v.StageSlug, v.XMLURL, xmlSuffix)
+	}
+	if v.TextHash == nil || *v.TextHash == "" {
+		t.Errorf("version %q TextHash not set", v.StageSlug)
+	}
+	if v.TextSourceURL == nil || !strings.HasSuffix(*v.TextSourceURL, xmlSuffix) {
+		t.Errorf("version %q TextSourceURL = %v, want suffix %q", v.StageSlug, v.TextSourceURL, xmlSuffix)
+	}
+	if len(v.Sections) != 1 || v.Sections[0].Label != sectionLabel {
+		t.Errorf("version %q Sections = %#v, want one section labeled %q", v.StageSlug, v.Sections, sectionLabel)
+	}
+}
+
+// TestEnrichVersionsResolvesDirectAndPDFSiblingXMLLinks covers the discovery happy paths:
+// stages that expose a direct .xml anchor stay populated and stable, and an intermediate
+// stage that exposes only a PDF anchor recovers its XML from the sibling beside that PDF —
+// at the document directory the page actually links, not the sort-order guess.
+func TestEnrichVersionsResolvesDirectAndPDFSiblingXMLLinks(t *testing.T) {
+	const xmlType = "application/xml"
+	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		switch r.URL.Path {
+		case "/DocumentViewer/en/45-1/bill/C-9/first-reading":
+			w.Header().Set("Content-Type", "text/html")
+			_, _ = w.Write([]byte(`<a href="/Content/Bills/451/Government/C-9/C-9_1/C-9_E.xml">XML</a><a href="/Content/Bills/451/Government/C-9/C-9_1/C-9_1.PDF">PDF</a>`))
+		case "/Content/Bills/451/Government/C-9/C-9_1/C-9_E.xml":
+			w.Header().Set("Content-Type", xmlType)
+			_, _ = w.Write([]byte(billXMLBody("1", "First reading clause.")))
+
+		// Intermediate stage: only a PDF anchor, at a non-sort-order directory (C-9_3).
+		case "/DocumentViewer/en/45-1/bill/C-9/as-amended-by-committee":
+			w.Header().Set("Content-Type", "text/html")
+			_, _ = w.Write([]byte(`<a href="/Content/Bills/451/Government/C-9/C-9_3/C-9_3.PDF">PDF</a>`))
+		case "/Content/Bills/451/Government/C-9/C-9_3/C-9_E.xml":
+			w.Header().Set("Content-Type", xmlType)
+			_, _ = w.Write([]byte(billXMLBody("2", "Amended clause.")))
+
+		// Royal assent: both anchors directly, at C-9_4.
+		case "/DocumentViewer/en/45-1/bill/C-9/royal-assent":
+			w.Header().Set("Content-Type", "text/html")
+			_, _ = w.Write([]byte(`<a href="/Content/Bills/451/Government/C-9/C-9_4/C-9_E.xml">XML</a><a href="/Content/Bills/451/Government/C-9/C-9_4/C-9_4.PDF">PDF</a>`))
+		case "/Content/Bills/451/Government/C-9/C-9_4/C-9_E.xml":
+			w.Header().Set("Content-Type", xmlType)
+			_, _ = w.Write([]byte(billXMLBody("3", "Royal assent clause.")))
+
+		default:
+			http.NotFound(w, r)
+		}
+	}))
+	defer server.Close()
+
+	fetcher := NewFetcher(WithBaseURL(server.URL), WithHTTPClient(server.Client()))
+	pubs := []publicationJSON{
+		{PublicationID: 1001, PublicationTypeNameEn: "First Reading"},
+		{PublicationID: 1002, PublicationTypeNameEn: "As Amended by Committee"},
+		{PublicationID: 1003, PublicationTypeNameEn: "Royal Assent"},
+	}
+	versions := fetcher.enrichVersions(context.Background(), domain.Session{ParliamentNumber: 45, SessionNumber: 1}, "C-9", pubs)
+	if len(versions) != 3 {
+		t.Fatalf("versions len = %d, want 3", len(versions))
+	}
+
+	assertVersionXML(t, versions[0], "/C-9_1/C-9_E.xml", "1")
+	if !strings.HasSuffix(versions[0].PDFURL, "/C-9_1/C-9_1.PDF") {
+		t.Errorf("first-reading PDFURL = %q", versions[0].PDFURL)
+	}
+
+	assertVersionXML(t, versions[1], "/C-9_3/C-9_E.xml", "2")
+	if !strings.HasSuffix(versions[1].PDFURL, "/C-9_3/C-9_3.PDF") {
+		t.Errorf("as-amended PDFURL = %q", versions[1].PDFURL)
+	}
+
+	assertVersionXML(t, versions[2], "/C-9_4/C-9_E.xml", "3")
+}
+
+// TestEnrichVersionsDerivesSortOrderSiblingWhenPageHasNoLinks covers the last-resort path:
+// a stage whose DocumentViewer page renders its links client-side (no .xml/.pdf anchors)
+// still resolves through the predictable sort-order sibling derived from the first stage.
+func TestEnrichVersionsDerivesSortOrderSiblingWhenPageHasNoLinks(t *testing.T) {
+	const xmlType = "application/xml"
+	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		switch r.URL.Path {
+		case "/DocumentViewer/en/45-1/bill/C-9/first-reading":
+			w.Header().Set("Content-Type", "text/html")
+			_, _ = w.Write([]byte(`<a href="/Content/Bills/451/Government/C-9/C-9_1/C-9_E.xml">XML</a><a href="/Content/Bills/451/Government/C-9/C-9_1/C-9_1.PDF">PDF</a>`))
+		case "/Content/Bills/451/Government/C-9/C-9_1/C-9_E.xml":
+			w.Header().Set("Content-Type", xmlType)
+			_, _ = w.Write([]byte(billXMLBody("1", "First reading clause.")))
+
+		case "/DocumentViewer/en/45-1/bill/C-9/as-passed-by-the-house-of-commons":
+			w.Header().Set("Content-Type", "text/html")
+			_, _ = w.Write([]byte(`<div id="viewer">loaded by script</div>`))
+		case "/Content/Bills/451/Government/C-9/C-9_2/C-9_E.xml":
+			w.Header().Set("Content-Type", xmlType)
+			_, _ = w.Write([]byte(billXMLBody("2", "As passed clause.")))
+		default:
+			http.NotFound(w, r)
+		}
+	}))
+	defer server.Close()
+
+	fetcher := NewFetcher(WithBaseURL(server.URL), WithHTTPClient(server.Client()))
+	pubs := []publicationJSON{
+		{PublicationID: 2001, PublicationTypeNameEn: "First Reading"},
+		{PublicationID: 2002, PublicationTypeNameEn: "As Passed by the House of Commons"},
+	}
+	versions := fetcher.enrichVersions(context.Background(), domain.Session{ParliamentNumber: 45, SessionNumber: 1}, "C-9", pubs)
+	if len(versions) != 2 {
+		t.Fatalf("versions len = %d, want 2", len(versions))
+	}
+	assertVersionXML(t, versions[0], "/C-9_1/C-9_E.xml", "1")
+	assertVersionXML(t, versions[1], "/C-9_2/C-9_E.xml", "2")
+}
+
+// TestEnrichVersionsDropsUnvalidatedXMLCandidates covers acceptance: a derived candidate
+// that returns an HTTP 200 HTML soft-error, or 404s, is never persisted as xml_url and
+// leaves the version carrying no text.
+func TestEnrichVersionsDropsUnvalidatedXMLCandidates(t *testing.T) {
+	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		switch r.URL.Path {
+		case "/DocumentViewer/en/45-1/bill/C-9/first-reading":
+			w.Header().Set("Content-Type", "text/html")
+			_, _ = w.Write([]byte(`<a href="/Content/Bills/451/Government/C-9/C-9_1/C-9_E.xml">XML</a>`))
+		case "/Content/Bills/451/Government/C-9/C-9_1/C-9_E.xml":
+			w.Header().Set("Content-Type", "application/xml")
+			_, _ = w.Write([]byte(billXMLBody("1", "First reading clause.")))
+
+		// Sort-order sibling exists (C-9_2) but returns an HTTP 200 HTML soft-error page.
+		case "/DocumentViewer/en/45-1/bill/C-9/as-amended-by-committee":
+			w.Header().Set("Content-Type", "text/html")
+			_, _ = w.Write([]byte(`<div>no document links here</div>`))
+		case "/Content/Bills/451/Government/C-9/C-9_2/C-9_E.xml":
+			w.Header().Set("Content-Type", "text/html")
+			_, _ = w.Write([]byte(`<html><body>Document not found</body></html>`))
+
+		// Sort-order sibling (C-9_3) is not served at all, i.e. 404s.
+		case "/DocumentViewer/en/45-1/bill/C-9/as-passed-by-the-senate":
+			w.Header().Set("Content-Type", "text/html")
+			_, _ = w.Write([]byte(`<div>still nothing</div>`))
+		default:
+			http.NotFound(w, r)
+		}
+	}))
+	defer server.Close()
+
+	fetcher := NewFetcher(WithBaseURL(server.URL), WithHTTPClient(server.Client()))
+	pubs := []publicationJSON{
+		{PublicationID: 3001, PublicationTypeNameEn: "First Reading"},
+		{PublicationID: 3002, PublicationTypeNameEn: "As Amended by Committee"},
+		{PublicationID: 3003, PublicationTypeNameEn: "As Passed by the Senate"},
+	}
+	versions := fetcher.enrichVersions(context.Background(), domain.Session{ParliamentNumber: 45, SessionNumber: 1}, "C-9", pubs)
+	if len(versions) != 3 {
+		t.Fatalf("versions len = %d, want 3", len(versions))
+	}
+	assertVersionXML(t, versions[0], "/C-9_1/C-9_E.xml", "1")
+
+	if versions[1].XMLURL != "" {
+		t.Errorf("soft-error stage XMLURL = %q, want empty", versions[1].XMLURL)
+	}
+	if versions[1].TextHash != nil || len(versions[1].Sections) != 0 {
+		t.Errorf("soft-error stage should carry no text: hash=%v sections=%#v", versions[1].TextHash, versions[1].Sections)
+	}
+	if versions[2].XMLURL != "" {
+		t.Errorf("missing-source stage XMLURL = %q, want empty", versions[2].XMLURL)
+	}
+	if versions[2].TextHash != nil || len(versions[2].Sections) != 0 {
+		t.Errorf("missing-source stage should carry no text: hash=%v sections=%#v", versions[2].TextHash, versions[2].Sections)
+	}
+}
+
+// TestEnrichVersionsDoesNotReuseBaseXMLForLaterStages guards the sort-order fallback: when
+// the first resolved XML is not a "_1" directory, a later stage with no links of its own
+// must be left empty rather than re-pointed at an earlier stage's document.
+func TestEnrichVersionsDoesNotReuseBaseXMLForLaterStages(t *testing.T) {
+	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		switch r.URL.Path {
+		// First-reading page renders client-side: no anchors, and no first-reading XML exists.
+		case "/DocumentViewer/en/45-1/bill/C-9/first-reading":
+			w.Header().Set("Content-Type", "text/html")
+			_, _ = w.Write([]byte(`<div>loaded by script</div>`))
+
+		// As amended exposes a direct anchor at a non-"_1" directory (C-9_3).
+		case "/DocumentViewer/en/45-1/bill/C-9/as-amended-by-committee":
+			w.Header().Set("Content-Type", "text/html")
+			_, _ = w.Write([]byte(`<a href="/Content/Bills/451/Government/C-9/C-9_3/C-9_E.xml">XML</a>`))
+		case "/Content/Bills/451/Government/C-9/C-9_3/C-9_E.xml":
+			w.Header().Set("Content-Type", "application/xml")
+			_, _ = w.Write([]byte(billXMLBody("3", "Amended clause.")))
+
+		// As passed renders client-side too: no anchors of its own.
+		case "/DocumentViewer/en/45-1/bill/C-9/as-passed-by-the-house-of-commons":
+			w.Header().Set("Content-Type", "text/html")
+			_, _ = w.Write([]byte(`<div>loaded by script</div>`))
+		default:
+			http.NotFound(w, r)
+		}
+	}))
+	defer server.Close()
+
+	fetcher := NewFetcher(WithBaseURL(server.URL), WithHTTPClient(server.Client()))
+	pubs := []publicationJSON{
+		{PublicationID: 4001, PublicationTypeNameEn: "First Reading"},
+		{PublicationID: 4002, PublicationTypeNameEn: "As Amended by Committee"},
+		{PublicationID: 4003, PublicationTypeNameEn: "As Passed by the House of Commons"},
+	}
+	versions := fetcher.enrichVersions(context.Background(), domain.Session{ParliamentNumber: 45, SessionNumber: 1}, "C-9", pubs)
+	if len(versions) != 3 {
+		t.Fatalf("versions len = %d, want 3", len(versions))
+	}
+	if versions[0].XMLURL != "" {
+		t.Errorf("first-reading XMLURL = %q, want empty", versions[0].XMLURL)
+	}
+	assertVersionXML(t, versions[1], "/C-9_3/C-9_E.xml", "3")
+	if versions[2].XMLURL != "" {
+		t.Errorf("as-passed XMLURL = %q, want empty (must not reuse the as-amended document)", versions[2].XMLURL)
+	}
+}
+
+func TestLooksLikeBillXML(t *testing.T) {
+	cases := []struct {
+		name string
+		body string
+		want bool
+	}{
+		{"bill with prolog", `<?xml version="1.0"?><Bill><Body/></Bill>`, true},
+		{"bill with namespace", `<Bill xmlns="http://parl.ca/schema"><Body/></Bill>`, true},
+		{"leading whitespace", "\n\t  <Bill></Bill>", true},
+		{"html soft error", `<html><body>Not found</body></html>`, false},
+		{"doctype html", `<!DOCTYPE html><html></html>`, false},
+		{"unrelated xml", `<Document><Section/></Document>`, false},
+		{"empty", "", false},
+		{"whitespace only", "   \n  ", false},
+		{"plain text", `Document not found`, false},
+	}
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			if got := looksLikeBillXML([]byte(tc.body)); got != tc.want {
+				t.Errorf("looksLikeBillXML(%q) = %v, want %v", tc.body, got, tc.want)
+			}
+		})
+	}
+}
+
+func TestXMLSiblingFromPDF(t *testing.T) {
+	got := xmlSiblingFromPDF("https://www.parl.ca/Content/Bills/451/Government/C-11/C-11_3/C-11_3.PDF", "C-11")
+	want := "https://www.parl.ca/Content/Bills/451/Government/C-11/C-11_3/C-11_E.xml"
+	if got != want {
+		t.Errorf("xmlSiblingFromPDF = %q, want %q", got, want)
+	}
+	if s := xmlSiblingFromPDF("https://www.parl.ca/Content/Bills/451/Government/C-11/C-11_3/C-11_E.xml", "C-11"); s != "" {
+		t.Errorf("xmlSiblingFromPDF on non-pdf input = %q, want empty", s)
+	}
+	if s := xmlSiblingFromPDF("", "C-11"); s != "" {
+		t.Errorf("xmlSiblingFromPDF empty url = %q, want empty", s)
+	}
+	if s := xmlSiblingFromPDF("https://www.parl.ca/x/C-11_3.PDF", ""); s != "" {
+		t.Errorf("xmlSiblingFromPDF empty number = %q, want empty", s)
+	}
+}
+
+func TestDedupeNonEmpty(t *testing.T) {
+	got := strings.Join(dedupeNonEmpty("a", "", "  ", "b", "a", "c", "b"), ",")
+	if got != "a,b,c" {
+		t.Errorf("dedupeNonEmpty = %q, want %q", got, "a,b,c")
+	}
+}
diff --git a/docs/architecture/use-case-catalog.md b/docs/architecture/use-case-catalog.md
index 12d9ed5b..1f10e1e8 100644
--- a/docs/architecture/use-case-catalog.md
+++ b/docs/architecture/use-case-catalog.md
@@ -345,7 +345,7 @@ Entities / values: BillVersion, VersionSection.
 Ports: backend Go: `BillSource`.
 Primary adapters: LEGISinfo/parl.ca XML crawler/parser.
 Current implementation:
-  backend/bills-indexer/internal/adapter/legisinfo/fetcher.go (enrichVersions: fetch XML + computeSHA256 hash; fetchDocumentLinks; parseBillXML: clause extraction)
+  backend/bills-indexer/internal/adapter/legisinfo/fetcher.go (enrichVersions: per-stage document-link discovery + computeSHA256 hash; fetchDocumentLinks; resolveVersionXML: validated XML candidate chain — direct anchor, then PDF-directory sibling, then sort-order sibling, accepting only a fetched bill-XML payload; parseBillXML: clause extraction)
   backend/bills-indexer/internal/domain/domain.go (BillVersion, VersionSection)
 ```