diff --git a/backend/bills-indexer/internal/adapter/legisinfo/fetcher.go b/backend/bills-indexer/internal/adapter/legisinfo/fetcher.go
index 35977b9e..6ec150cd 100644
--- a/backend/bills-indexer/internal/adapter/legisinfo/fetcher.go
+++ b/backend/bills-indexer/internal/adapter/legisinfo/fetcher.go
@@ -157,6 +157,8 @@ func (f *Fetcher) fetchDetail(ctx context.Context, session domain.Session, numbe
func (f *Fetcher) enrichVersions(ctx context.Context, session domain.Session, number string, pubs []publicationJSON) []domain.BillVersion {
versions := make([]domain.BillVersion, 0, len(pubs))
+ // firstXMLURL / firstPDFURL hold the first version's resolved links (the "_1" base)
+ // so later stages that expose no link of their own can derive a sibling candidate.
var firstXMLURL, firstPDFURL string
for i, pub := range pubs {
stage := firstNonEmpty(pub.PublicationTypeNameEn, pub.PublicationTypeName)
@@ -172,34 +174,32 @@ func (f *Fetcher) enrichVersions(ctx context.Context, session domain.Session, nu
SortOrder: i + 1,
}
- var xmlURL, pdfURL string
- if firstXMLURL == "" {
- xmlURL, pdfURL = f.fetchDocumentLinks(ctx, htmlURL)
- if xmlURL != "" {
- firstXMLURL = xmlURL
- }
- if pdfURL != "" {
- firstPDFURL = pdfURL
- }
- } else {
- xmlURL = constructXMLURL(firstXMLURL, i+1)
- pdfURL = constructPDFURL(firstPDFURL, i+1)
- }
+ // Read this stage's own DocumentViewer page first: it links to the correct
+ // document version regardless of where the stage falls in the publication list.
+ directXML, directPDF := f.fetchDocumentLinks(ctx, htmlURL)
+ // PDF: prefer this stage's own anchor, else derive a sibling from the first stage's.
+ version.PDFURL = firstNonEmpty(directPDF, constructPDFURL(firstPDFURL, i+1))
+
+ // XML: try candidates in order of trust and persist only one that fetches as
+ // bill XML, so a guessed sibling that 404s or returns a non-bill page is dropped.
+ xmlURL, xmlData := f.resolveVersionXML(ctx, number, directXML, directPDF, firstXMLURL, i+1)
version.XMLURL = xmlURL
- version.PDFURL = pdfURL
-
- if xmlURL != "" {
- xmlData, err := f.getBytes(ctx, xmlURL, "text/xml")
- if err == nil {
- hash := computeSHA256(xmlData)
- version.TextHash = &hash
- version.TextSourceURL = &xmlURL
-
- sections, err := parseBillXML(xmlData)
- if err == nil {
- version.Sections = sections
- }
+
+ if firstXMLURL == "" && xmlURL != "" {
+ firstXMLURL = xmlURL
+ }
+ if firstPDFURL == "" && version.PDFURL != "" {
+ firstPDFURL = version.PDFURL
+ }
+
+ if len(xmlData) > 0 {
+ hash := computeSHA256(xmlData)
+ version.TextHash = &hash
+ source := xmlURL
+ version.TextSourceURL = &source
+ if sections, err := parseBillXML(xmlData); err == nil {
+ version.Sections = sections
}
}
@@ -208,6 +208,40 @@ func (f *Fetcher) enrichVersions(ctx context.Context, session domain.Session, nu
return versions
}
+// resolveVersionXML returns the authoritative XML URL for one publication along with the
+// fetched payload (reused by the caller to hash and parse, avoiding a second request).
+// Candidates are tried most-trusted first:
+// 1. the .xml anchor on the stage's own DocumentViewer page,
+// 2. the XML sibling that lives beside the stage's own PDF anchor (same version directory),
+// 3. a sort-order sibling derived from the first stage's resolved XML URL.
+//
+// Steps 1 and 2 come from this stage's own page, so they are always correctly attributed.
+// Step 3 is a best-effort guess that assumes parl.ca's document number tracks the
+// publication order; it only fires when the page exposes no links of its own. A candidate
+// wins only when it returns a 2xx bill-XML payload, so derived guesses that 404 or return a
+// non-bill page are never persisted as xml_url.
+func (f *Fetcher) resolveVersionXML(ctx context.Context, number, directXML, directPDF, firstXMLURL string, sortOrder int) (string, []byte) {
+ sortOrderSibling := constructXMLURL(firstXMLURL, sortOrder)
+ if sortOrderSibling == firstXMLURL {
+ // No version substitution happened (e.g. the base URL is not a "_1" directory),
+ // so the candidate is just the base version again — never reuse it for another stage.
+ sortOrderSibling = ""
+ }
+ candidates := dedupeNonEmpty(
+ directXML,
+ xmlSiblingFromPDF(directPDF, number),
+ sortOrderSibling,
+ )
+ for _, candidate := range candidates {
+ data, err := f.getBytes(ctx, candidate, "text/xml")
+ if err != nil || !looksLikeBillXML(data) {
+ continue
+ }
+ return candidate, data
+ }
+ return "", nil
+}
+
func constructXMLURL(firstURL string, sortOrder int) string {
if firstURL == "" {
return ""
@@ -226,6 +260,64 @@ func constructPDFURL(firstURL string, sortOrder int) string {
return res
}
+// xmlSiblingFromPDF derives the bill XML URL that lives in the same parl.ca version
+// directory as a PDF anchor. parl.ca keeps both files in a per-version folder (for
+// example .../C-11/C-11_3/C-11_3.PDF beside .../C-11/C-11_3/C-11_E.xml), so the PDF's
+// directory pins the correct version even when sort-order derivation would not. Returns
+// empty unless the input is a real .pdf path and the bill number is known.
+func xmlSiblingFromPDF(pdfURL, number string) string {
+ pdfURL = strings.TrimSpace(pdfURL)
+ number = strings.TrimSpace(number)
+ if pdfURL == "" || number == "" {
+ return ""
+ }
+ if !strings.HasSuffix(lowerURLPath(pdfURL), ".pdf") {
+ return ""
+ }
+ idx := strings.LastIndex(pdfURL, "/")
+ if idx < 0 {
+ return ""
+ }
+ return pdfURL[:idx+1] + number + "_E.xml"
+}
+
+// dedupeNonEmpty returns the inputs with blanks dropped and duplicates removed, preserving
+// first-seen order so the most-trusted candidate stays first.
+func dedupeNonEmpty(values ...string) []string {
+ seen := make(map[string]struct{}, len(values))
+ out := make([]string, 0, len(values))
+ for _, value := range values {
+ value = strings.TrimSpace(value)
+ if value == "" {
+ continue
+ }
+ if _, ok := seen[value]; ok {
+ continue
+ }
+ seen[value] = struct{}{}
+ out = append(out, value)
+ }
+ return out
+}
+
+// looksLikeBillXML reports whether data parses as XML whose root element is , the
+// parl.ca legislative document root. It rejects empty bodies, HTML soft-error pages, and
+// unrelated XML, so only genuine bill payloads are accepted as a version's source text.
+func looksLikeBillXML(data []byte) bool {
+ if len(bytes.TrimSpace(data)) == 0 {
+ return false
+ }
+ decoder := xml.NewDecoder(bytes.NewReader(data))
+ for {
+ tok, err := decoder.Token()
+ if err != nil {
+ return false
+ }
+ if start, ok := tok.(xml.StartElement); ok {
+ return strings.EqualFold(start.Name.Local, "Bill")
+ }
+ }
+}
func (f *Fetcher) fetchDocumentLinks(ctx context.Context, pageURL string) (string, string) {
body, err := f.getBytes(ctx, pageURL, "text/html")
diff --git a/backend/bills-indexer/internal/adapter/legisinfo/fetcher_test.go b/backend/bills-indexer/internal/adapter/legisinfo/fetcher_test.go
index e77e3bfe..eb755125 100644
--- a/backend/bills-indexer/internal/adapter/legisinfo/fetcher_test.go
+++ b/backend/bills-indexer/internal/adapter/legisinfo/fetcher_test.go
@@ -2,8 +2,10 @@ package legisinfo
import (
"context"
+ "fmt"
"net/http"
"net/http/httptest"
+ "strings"
"testing"
"epac/bills-indexer/internal/domain"
@@ -204,3 +206,277 @@ func TestConstructURL(t *testing.T) {
})
}
+func billXMLBody(label, text string) string {
+ return fmt.Sprintf(`
+`, label, text)
+}
+
+func assertVersionXML(t *testing.T, v domain.BillVersion, xmlSuffix, sectionLabel string) {
+ t.Helper()
+ if !strings.HasSuffix(v.XMLURL, xmlSuffix) {
+ t.Errorf("version %q XMLURL = %q, want suffix %q", v.StageSlug, v.XMLURL, xmlSuffix)
+ }
+ if v.TextHash == nil || *v.TextHash == "" {
+ t.Errorf("version %q TextHash not set", v.StageSlug)
+ }
+ if v.TextSourceURL == nil || !strings.HasSuffix(*v.TextSourceURL, xmlSuffix) {
+ t.Errorf("version %q TextSourceURL = %v, want suffix %q", v.StageSlug, v.TextSourceURL, xmlSuffix)
+ }
+ if len(v.Sections) != 1 || v.Sections[0].Label != sectionLabel {
+ t.Errorf("version %q Sections = %#v, want one section labeled %q", v.StageSlug, v.Sections, sectionLabel)
+ }
+}
+
+// TestEnrichVersionsResolvesDirectAndPDFSiblingXMLLinks covers the discovery happy paths:
+// stages that expose a direct .xml anchor stay populated and stable, and an intermediate
+// stage that exposes only a PDF anchor recovers its XML from the sibling beside that PDF —
+// at the document directory the page actually links, not the sort-order guess.
+func TestEnrichVersionsResolvesDirectAndPDFSiblingXMLLinks(t *testing.T) {
+ const xmlType = "application/xml"
+ server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+ switch r.URL.Path {
+ case "/DocumentViewer/en/45-1/bill/C-9/first-reading":
+ w.Header().Set("Content-Type", "text/html")
+ _, _ = w.Write([]byte(`XMLPDF`))
+ case "/Content/Bills/451/Government/C-9/C-9_1/C-9_E.xml":
+ w.Header().Set("Content-Type", xmlType)
+ _, _ = w.Write([]byte(billXMLBody("1", "First reading clause.")))
+
+ // Intermediate stage: only a PDF anchor, at a non-sort-order directory (C-9_3).
+ case "/DocumentViewer/en/45-1/bill/C-9/as-amended-by-committee":
+ w.Header().Set("Content-Type", "text/html")
+ _, _ = w.Write([]byte(`PDF`))
+ case "/Content/Bills/451/Government/C-9/C-9_3/C-9_E.xml":
+ w.Header().Set("Content-Type", xmlType)
+ _, _ = w.Write([]byte(billXMLBody("2", "Amended clause.")))
+
+ // Royal assent: both anchors directly, at C-9_4.
+ case "/DocumentViewer/en/45-1/bill/C-9/royal-assent":
+ w.Header().Set("Content-Type", "text/html")
+ _, _ = w.Write([]byte(`XMLPDF`))
+ case "/Content/Bills/451/Government/C-9/C-9_4/C-9_E.xml":
+ w.Header().Set("Content-Type", xmlType)
+ _, _ = w.Write([]byte(billXMLBody("3", "Royal assent clause.")))
+
+ default:
+ http.NotFound(w, r)
+ }
+ }))
+ defer server.Close()
+
+ fetcher := NewFetcher(WithBaseURL(server.URL), WithHTTPClient(server.Client()))
+ pubs := []publicationJSON{
+ {PublicationID: 1001, PublicationTypeNameEn: "First Reading"},
+ {PublicationID: 1002, PublicationTypeNameEn: "As Amended by Committee"},
+ {PublicationID: 1003, PublicationTypeNameEn: "Royal Assent"},
+ }
+ versions := fetcher.enrichVersions(context.Background(), domain.Session{ParliamentNumber: 45, SessionNumber: 1}, "C-9", pubs)
+ if len(versions) != 3 {
+ t.Fatalf("versions len = %d, want 3", len(versions))
+ }
+
+ assertVersionXML(t, versions[0], "/C-9_1/C-9_E.xml", "1")
+ if !strings.HasSuffix(versions[0].PDFURL, "/C-9_1/C-9_1.PDF") {
+ t.Errorf("first-reading PDFURL = %q", versions[0].PDFURL)
+ }
+
+ assertVersionXML(t, versions[1], "/C-9_3/C-9_E.xml", "2")
+ if !strings.HasSuffix(versions[1].PDFURL, "/C-9_3/C-9_3.PDF") {
+ t.Errorf("as-amended PDFURL = %q", versions[1].PDFURL)
+ }
+
+ assertVersionXML(t, versions[2], "/C-9_4/C-9_E.xml", "3")
+}
+
+// TestEnrichVersionsDerivesSortOrderSiblingWhenPageHasNoLinks covers the last-resort path:
+// a stage whose DocumentViewer page renders its links client-side (no .xml/.pdf anchors)
+// still resolves through the predictable sort-order sibling derived from the first stage.
+func TestEnrichVersionsDerivesSortOrderSiblingWhenPageHasNoLinks(t *testing.T) {
+ const xmlType = "application/xml"
+ server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+ switch r.URL.Path {
+ case "/DocumentViewer/en/45-1/bill/C-9/first-reading":
+ w.Header().Set("Content-Type", "text/html")
+ _, _ = w.Write([]byte(`XMLPDF`))
+ case "/Content/Bills/451/Government/C-9/C-9_1/C-9_E.xml":
+ w.Header().Set("Content-Type", xmlType)
+ _, _ = w.Write([]byte(billXMLBody("1", "First reading clause.")))
+
+ case "/DocumentViewer/en/45-1/bill/C-9/as-passed-by-the-house-of-commons":
+ w.Header().Set("Content-Type", "text/html")
+ _, _ = w.Write([]byte(`loaded by script
`))
+ case "/Content/Bills/451/Government/C-9/C-9_2/C-9_E.xml":
+ w.Header().Set("Content-Type", xmlType)
+ _, _ = w.Write([]byte(billXMLBody("2", "As passed clause.")))
+ default:
+ http.NotFound(w, r)
+ }
+ }))
+ defer server.Close()
+
+ fetcher := NewFetcher(WithBaseURL(server.URL), WithHTTPClient(server.Client()))
+ pubs := []publicationJSON{
+ {PublicationID: 2001, PublicationTypeNameEn: "First Reading"},
+ {PublicationID: 2002, PublicationTypeNameEn: "As Passed by the House of Commons"},
+ }
+ versions := fetcher.enrichVersions(context.Background(), domain.Session{ParliamentNumber: 45, SessionNumber: 1}, "C-9", pubs)
+ if len(versions) != 2 {
+ t.Fatalf("versions len = %d, want 2", len(versions))
+ }
+ assertVersionXML(t, versions[0], "/C-9_1/C-9_E.xml", "1")
+ assertVersionXML(t, versions[1], "/C-9_2/C-9_E.xml", "2")
+}
+
+// TestEnrichVersionsDropsUnvalidatedXMLCandidates covers acceptance: a derived candidate
+// that returns an HTTP 200 HTML soft-error, or 404s, is never persisted as xml_url and
+// leaves the version carrying no text.
+func TestEnrichVersionsDropsUnvalidatedXMLCandidates(t *testing.T) {
+ server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+ switch r.URL.Path {
+ case "/DocumentViewer/en/45-1/bill/C-9/first-reading":
+ w.Header().Set("Content-Type", "text/html")
+ _, _ = w.Write([]byte(`XML`))
+ case "/Content/Bills/451/Government/C-9/C-9_1/C-9_E.xml":
+ w.Header().Set("Content-Type", "application/xml")
+ _, _ = w.Write([]byte(billXMLBody("1", "First reading clause.")))
+
+ // Sort-order sibling exists (C-9_2) but returns an HTTP 200 HTML soft-error page.
+ case "/DocumentViewer/en/45-1/bill/C-9/as-amended-by-committee":
+ w.Header().Set("Content-Type", "text/html")
+ _, _ = w.Write([]byte(`no document links here
`))
+ case "/Content/Bills/451/Government/C-9/C-9_2/C-9_E.xml":
+ w.Header().Set("Content-Type", "text/html")
+ _, _ = w.Write([]byte(`Document not found`))
+
+ // Sort-order sibling (C-9_3) is not served at all, i.e. 404s.
+ case "/DocumentViewer/en/45-1/bill/C-9/as-passed-by-the-senate":
+ w.Header().Set("Content-Type", "text/html")
+ _, _ = w.Write([]byte(`still nothing
`))
+ default:
+ http.NotFound(w, r)
+ }
+ }))
+ defer server.Close()
+
+ fetcher := NewFetcher(WithBaseURL(server.URL), WithHTTPClient(server.Client()))
+ pubs := []publicationJSON{
+ {PublicationID: 3001, PublicationTypeNameEn: "First Reading"},
+ {PublicationID: 3002, PublicationTypeNameEn: "As Amended by Committee"},
+ {PublicationID: 3003, PublicationTypeNameEn: "As Passed by the Senate"},
+ }
+ versions := fetcher.enrichVersions(context.Background(), domain.Session{ParliamentNumber: 45, SessionNumber: 1}, "C-9", pubs)
+ if len(versions) != 3 {
+ t.Fatalf("versions len = %d, want 3", len(versions))
+ }
+ assertVersionXML(t, versions[0], "/C-9_1/C-9_E.xml", "1")
+
+ if versions[1].XMLURL != "" {
+ t.Errorf("soft-error stage XMLURL = %q, want empty", versions[1].XMLURL)
+ }
+ if versions[1].TextHash != nil || len(versions[1].Sections) != 0 {
+ t.Errorf("soft-error stage should carry no text: hash=%v sections=%#v", versions[1].TextHash, versions[1].Sections)
+ }
+ if versions[2].XMLURL != "" {
+ t.Errorf("missing-source stage XMLURL = %q, want empty", versions[2].XMLURL)
+ }
+ if versions[2].TextHash != nil || len(versions[2].Sections) != 0 {
+ t.Errorf("missing-source stage should carry no text: hash=%v sections=%#v", versions[2].TextHash, versions[2].Sections)
+ }
+}
+
+// TestEnrichVersionsDoesNotReuseBaseXMLForLaterStages guards the sort-order fallback: when
+// the first resolved XML is not a "_1" directory, a later stage with no links of its own
+// must be left empty rather than re-pointed at an earlier stage's document.
+func TestEnrichVersionsDoesNotReuseBaseXMLForLaterStages(t *testing.T) {
+ server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+ switch r.URL.Path {
+ // First-reading page renders client-side: no anchors, and no first-reading XML exists.
+ case "/DocumentViewer/en/45-1/bill/C-9/first-reading":
+ w.Header().Set("Content-Type", "text/html")
+ _, _ = w.Write([]byte(`loaded by script
`))
+
+ // As amended exposes a direct anchor at a non-"_1" directory (C-9_3).
+ case "/DocumentViewer/en/45-1/bill/C-9/as-amended-by-committee":
+ w.Header().Set("Content-Type", "text/html")
+ _, _ = w.Write([]byte(`XML`))
+ case "/Content/Bills/451/Government/C-9/C-9_3/C-9_E.xml":
+ w.Header().Set("Content-Type", "application/xml")
+ _, _ = w.Write([]byte(billXMLBody("3", "Amended clause.")))
+
+ // As passed renders client-side too: no anchors of its own.
+ case "/DocumentViewer/en/45-1/bill/C-9/as-passed-by-the-house-of-commons":
+ w.Header().Set("Content-Type", "text/html")
+ _, _ = w.Write([]byte(`loaded by script
`))
+ default:
+ http.NotFound(w, r)
+ }
+ }))
+ defer server.Close()
+
+ fetcher := NewFetcher(WithBaseURL(server.URL), WithHTTPClient(server.Client()))
+ pubs := []publicationJSON{
+ {PublicationID: 4001, PublicationTypeNameEn: "First Reading"},
+ {PublicationID: 4002, PublicationTypeNameEn: "As Amended by Committee"},
+ {PublicationID: 4003, PublicationTypeNameEn: "As Passed by the House of Commons"},
+ }
+ versions := fetcher.enrichVersions(context.Background(), domain.Session{ParliamentNumber: 45, SessionNumber: 1}, "C-9", pubs)
+ if len(versions) != 3 {
+ t.Fatalf("versions len = %d, want 3", len(versions))
+ }
+ if versions[0].XMLURL != "" {
+ t.Errorf("first-reading XMLURL = %q, want empty", versions[0].XMLURL)
+ }
+ assertVersionXML(t, versions[1], "/C-9_3/C-9_E.xml", "3")
+ if versions[2].XMLURL != "" {
+ t.Errorf("as-passed XMLURL = %q, want empty (must not reuse the as-amended document)", versions[2].XMLURL)
+ }
+}
+
+func TestLooksLikeBillXML(t *testing.T) {
+ cases := []struct {
+ name string
+ body string
+ want bool
+ }{
+ {"bill with prolog", ``, true},
+ {"bill with namespace", ``, true},
+ {"leading whitespace", "\n\t ", true},
+ {"html soft error", `
Not found`, false},
+ {"doctype html", ``, false},
+ {"unrelated xml", ``, false},
+ {"empty", "", false},
+ {"whitespace only", " \n ", false},
+ {"plain text", `Document not found`, false},
+ }
+ for _, tc := range cases {
+ t.Run(tc.name, func(t *testing.T) {
+ if got := looksLikeBillXML([]byte(tc.body)); got != tc.want {
+ t.Errorf("looksLikeBillXML(%q) = %v, want %v", tc.body, got, tc.want)
+ }
+ })
+ }
+}
+
+func TestXMLSiblingFromPDF(t *testing.T) {
+ got := xmlSiblingFromPDF("https://www.parl.ca/Content/Bills/451/Government/C-11/C-11_3/C-11_3.PDF", "C-11")
+ want := "https://www.parl.ca/Content/Bills/451/Government/C-11/C-11_3/C-11_E.xml"
+ if got != want {
+ t.Errorf("xmlSiblingFromPDF = %q, want %q", got, want)
+ }
+ if s := xmlSiblingFromPDF("https://www.parl.ca/Content/Bills/451/Government/C-11/C-11_3/C-11_E.xml", "C-11"); s != "" {
+ t.Errorf("xmlSiblingFromPDF on non-pdf input = %q, want empty", s)
+ }
+ if s := xmlSiblingFromPDF("", "C-11"); s != "" {
+ t.Errorf("xmlSiblingFromPDF empty url = %q, want empty", s)
+ }
+ if s := xmlSiblingFromPDF("https://www.parl.ca/x/C-11_3.PDF", ""); s != "" {
+ t.Errorf("xmlSiblingFromPDF empty number = %q, want empty", s)
+ }
+}
+
+func TestDedupeNonEmpty(t *testing.T) {
+ got := strings.Join(dedupeNonEmpty("a", "", " ", "b", "a", "c", "b"), ",")
+ if got != "a,b,c" {
+ t.Errorf("dedupeNonEmpty = %q, want %q", got, "a,b,c")
+ }
+}
diff --git a/docs/architecture/use-case-catalog.md b/docs/architecture/use-case-catalog.md
index 12d9ed5b..1f10e1e8 100644
--- a/docs/architecture/use-case-catalog.md
+++ b/docs/architecture/use-case-catalog.md
@@ -345,7 +345,7 @@ Entities / values: BillVersion, VersionSection.
Ports: backend Go: `BillSource`.
Primary adapters: LEGISinfo/parl.ca XML crawler/parser.
Current implementation:
- backend/bills-indexer/internal/adapter/legisinfo/fetcher.go (enrichVersions: fetch XML + computeSHA256 hash; fetchDocumentLinks; parseBillXML: clause extraction)
+ backend/bills-indexer/internal/adapter/legisinfo/fetcher.go (enrichVersions: per-stage document-link discovery + computeSHA256 hash; fetchDocumentLinks; resolveVersionXML: validated XML candidate chain — direct anchor, then PDF-directory sibling, then sort-order sibling, accepting only a fetched bill-XML payload; parseBillXML: clause extraction)
backend/bills-indexer/internal/domain/domain.go (BillVersion, VersionSection)
```