From 3c573681af3d6ed912a1d4883cb004bde5c2503d Mon Sep 17 00:00:00 2001 From: riddim-developer-bot Date: Sun, 14 Jun 2026 21:02:51 -0400 Subject: [PATCH] [EPAC-2301]: Recover intermediate bill XML links for diff ingestion The bills indexer only scraped the first publication's DocumentViewer page for an XML/PDF anchor, then derived every later stage's URL by sort order and persisted it unvalidated. Intermediate stages (as-amended-by-committee, as-passed-by-the-house-of-commons, as-passed-by-the-senate) ended up with no ingested text, so the diff producer emitted bill_clause_diffs=0. enrichVersions now reads each stage's own DocumentViewer page and resolves its XML through a validated candidate chain: the page's direct .xml anchor, the XML sibling beside the page's PDF anchor (same parl.ca per-version directory), then a sort-order sibling derived from the first stage. A candidate is persisted only when it fetches as a 2xx XML payload, so a guess that 404s or redirects to an HTML error page is dropped instead of stored as xml_url. Direct first-reading and royal-assent anchors keep resolving unchanged. --- .../internal/adapter/legisinfo/fetcher.go | 144 +++++++-- .../adapter/legisinfo/fetcher_test.go | 276 ++++++++++++++++++ docs/architecture/use-case-catalog.md | 2 +- 3 files changed, 395 insertions(+), 27 deletions(-) diff --git a/backend/bills-indexer/internal/adapter/legisinfo/fetcher.go b/backend/bills-indexer/internal/adapter/legisinfo/fetcher.go index 35977b9e..6ec150cd 100644 --- a/backend/bills-indexer/internal/adapter/legisinfo/fetcher.go +++ b/backend/bills-indexer/internal/adapter/legisinfo/fetcher.go @@ -157,6 +157,8 @@ func (f *Fetcher) fetchDetail(ctx context.Context, session domain.Session, numbe func (f *Fetcher) enrichVersions(ctx context.Context, session domain.Session, number string, pubs []publicationJSON) []domain.BillVersion { versions := make([]domain.BillVersion, 0, len(pubs)) + // firstXMLURL / firstPDFURL hold the first version's resolved links (the "_1" base) + // so later stages that expose no link of their own can derive a sibling candidate. var firstXMLURL, firstPDFURL string for i, pub := range pubs { stage := firstNonEmpty(pub.PublicationTypeNameEn, pub.PublicationTypeName) @@ -172,34 +174,32 @@ func (f *Fetcher) enrichVersions(ctx context.Context, session domain.Session, nu SortOrder: i + 1, } - var xmlURL, pdfURL string - if firstXMLURL == "" { - xmlURL, pdfURL = f.fetchDocumentLinks(ctx, htmlURL) - if xmlURL != "" { - firstXMLURL = xmlURL - } - if pdfURL != "" { - firstPDFURL = pdfURL - } - } else { - xmlURL = constructXMLURL(firstXMLURL, i+1) - pdfURL = constructPDFURL(firstPDFURL, i+1) - } + // Read this stage's own DocumentViewer page first: it links to the correct + // document version regardless of where the stage falls in the publication list. + directXML, directPDF := f.fetchDocumentLinks(ctx, htmlURL) + // PDF: prefer this stage's own anchor, else derive a sibling from the first stage's. + version.PDFURL = firstNonEmpty(directPDF, constructPDFURL(firstPDFURL, i+1)) + + // XML: try candidates in order of trust and persist only one that fetches as + // bill XML, so a guessed sibling that 404s or returns a non-bill page is dropped. + xmlURL, xmlData := f.resolveVersionXML(ctx, number, directXML, directPDF, firstXMLURL, i+1) version.XMLURL = xmlURL - version.PDFURL = pdfURL - - if xmlURL != "" { - xmlData, err := f.getBytes(ctx, xmlURL, "text/xml") - if err == nil { - hash := computeSHA256(xmlData) - version.TextHash = &hash - version.TextSourceURL = &xmlURL - - sections, err := parseBillXML(xmlData) - if err == nil { - version.Sections = sections - } + + if firstXMLURL == "" && xmlURL != "" { + firstXMLURL = xmlURL + } + if firstPDFURL == "" && version.PDFURL != "" { + firstPDFURL = version.PDFURL + } + + if len(xmlData) > 0 { + hash := computeSHA256(xmlData) + version.TextHash = &hash + source := xmlURL + version.TextSourceURL = &source + if sections, err := parseBillXML(xmlData); err == nil { + version.Sections = sections } } @@ -208,6 +208,40 @@ func (f *Fetcher) enrichVersions(ctx context.Context, session domain.Session, nu return versions } +// resolveVersionXML returns the authoritative XML URL for one publication along with the +// fetched payload (reused by the caller to hash and parse, avoiding a second request). +// Candidates are tried most-trusted first: +// 1. the .xml anchor on the stage's own DocumentViewer page, +// 2. the XML sibling that lives beside the stage's own PDF anchor (same version directory), +// 3. a sort-order sibling derived from the first stage's resolved XML URL. +// +// Steps 1 and 2 come from this stage's own page, so they are always correctly attributed. +// Step 3 is a best-effort guess that assumes parl.ca's document number tracks the +// publication order; it only fires when the page exposes no links of its own. A candidate +// wins only when it returns a 2xx bill-XML payload, so derived guesses that 404 or return a +// non-bill page are never persisted as xml_url. +func (f *Fetcher) resolveVersionXML(ctx context.Context, number, directXML, directPDF, firstXMLURL string, sortOrder int) (string, []byte) { + sortOrderSibling := constructXMLURL(firstXMLURL, sortOrder) + if sortOrderSibling == firstXMLURL { + // No version substitution happened (e.g. the base URL is not a "_1" directory), + // so the candidate is just the base version again — never reuse it for another stage. + sortOrderSibling = "" + } + candidates := dedupeNonEmpty( + directXML, + xmlSiblingFromPDF(directPDF, number), + sortOrderSibling, + ) + for _, candidate := range candidates { + data, err := f.getBytes(ctx, candidate, "text/xml") + if err != nil || !looksLikeBillXML(data) { + continue + } + return candidate, data + } + return "", nil +} + func constructXMLURL(firstURL string, sortOrder int) string { if firstURL == "" { return "" @@ -226,6 +260,64 @@ func constructPDFURL(firstURL string, sortOrder int) string { return res } +// xmlSiblingFromPDF derives the bill XML URL that lives in the same parl.ca version +// directory as a PDF anchor. parl.ca keeps both files in a per-version folder (for +// example .../C-11/C-11_3/C-11_3.PDF beside .../C-11/C-11_3/C-11_E.xml), so the PDF's +// directory pins the correct version even when sort-order derivation would not. Returns +// empty unless the input is a real .pdf path and the bill number is known. +func xmlSiblingFromPDF(pdfURL, number string) string { + pdfURL = strings.TrimSpace(pdfURL) + number = strings.TrimSpace(number) + if pdfURL == "" || number == "" { + return "" + } + if !strings.HasSuffix(lowerURLPath(pdfURL), ".pdf") { + return "" + } + idx := strings.LastIndex(pdfURL, "/") + if idx < 0 { + return "" + } + return pdfURL[:idx+1] + number + "_E.xml" +} + +// dedupeNonEmpty returns the inputs with blanks dropped and duplicates removed, preserving +// first-seen order so the most-trusted candidate stays first. +func dedupeNonEmpty(values ...string) []string { + seen := make(map[string]struct{}, len(values)) + out := make([]string, 0, len(values)) + for _, value := range values { + value = strings.TrimSpace(value) + if value == "" { + continue + } + if _, ok := seen[value]; ok { + continue + } + seen[value] = struct{}{} + out = append(out, value) + } + return out +} + +// looksLikeBillXML reports whether data parses as XML whose root element is , the +// parl.ca legislative document root. It rejects empty bodies, HTML soft-error pages, and +// unrelated XML, so only genuine bill payloads are accepted as a version's source text. +func looksLikeBillXML(data []byte) bool { + if len(bytes.TrimSpace(data)) == 0 { + return false + } + decoder := xml.NewDecoder(bytes.NewReader(data)) + for { + tok, err := decoder.Token() + if err != nil { + return false + } + if start, ok := tok.(xml.StartElement); ok { + return strings.EqualFold(start.Name.Local, "Bill") + } + } +} func (f *Fetcher) fetchDocumentLinks(ctx context.Context, pageURL string) (string, string) { body, err := f.getBytes(ctx, pageURL, "text/html") diff --git a/backend/bills-indexer/internal/adapter/legisinfo/fetcher_test.go b/backend/bills-indexer/internal/adapter/legisinfo/fetcher_test.go index e77e3bfe..eb755125 100644 --- a/backend/bills-indexer/internal/adapter/legisinfo/fetcher_test.go +++ b/backend/bills-indexer/internal/adapter/legisinfo/fetcher_test.go @@ -2,8 +2,10 @@ package legisinfo import ( "context" + "fmt" "net/http" "net/http/httptest" + "strings" "testing" "epac/bills-indexer/internal/domain" @@ -204,3 +206,277 @@ func TestConstructURL(t *testing.T) { }) } +func billXMLBody(label, text string) string { + return fmt.Sprintf(` +
%s
`, label, text) +} + +func assertVersionXML(t *testing.T, v domain.BillVersion, xmlSuffix, sectionLabel string) { + t.Helper() + if !strings.HasSuffix(v.XMLURL, xmlSuffix) { + t.Errorf("version %q XMLURL = %q, want suffix %q", v.StageSlug, v.XMLURL, xmlSuffix) + } + if v.TextHash == nil || *v.TextHash == "" { + t.Errorf("version %q TextHash not set", v.StageSlug) + } + if v.TextSourceURL == nil || !strings.HasSuffix(*v.TextSourceURL, xmlSuffix) { + t.Errorf("version %q TextSourceURL = %v, want suffix %q", v.StageSlug, v.TextSourceURL, xmlSuffix) + } + if len(v.Sections) != 1 || v.Sections[0].Label != sectionLabel { + t.Errorf("version %q Sections = %#v, want one section labeled %q", v.StageSlug, v.Sections, sectionLabel) + } +} + +// TestEnrichVersionsResolvesDirectAndPDFSiblingXMLLinks covers the discovery happy paths: +// stages that expose a direct .xml anchor stay populated and stable, and an intermediate +// stage that exposes only a PDF anchor recovers its XML from the sibling beside that PDF — +// at the document directory the page actually links, not the sort-order guess. +func TestEnrichVersionsResolvesDirectAndPDFSiblingXMLLinks(t *testing.T) { + const xmlType = "application/xml" + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + switch r.URL.Path { + case "/DocumentViewer/en/45-1/bill/C-9/first-reading": + w.Header().Set("Content-Type", "text/html") + _, _ = w.Write([]byte(`XMLPDF`)) + case "/Content/Bills/451/Government/C-9/C-9_1/C-9_E.xml": + w.Header().Set("Content-Type", xmlType) + _, _ = w.Write([]byte(billXMLBody("1", "First reading clause."))) + + // Intermediate stage: only a PDF anchor, at a non-sort-order directory (C-9_3). + case "/DocumentViewer/en/45-1/bill/C-9/as-amended-by-committee": + w.Header().Set("Content-Type", "text/html") + _, _ = w.Write([]byte(`PDF`)) + case "/Content/Bills/451/Government/C-9/C-9_3/C-9_E.xml": + w.Header().Set("Content-Type", xmlType) + _, _ = w.Write([]byte(billXMLBody("2", "Amended clause."))) + + // Royal assent: both anchors directly, at C-9_4. + case "/DocumentViewer/en/45-1/bill/C-9/royal-assent": + w.Header().Set("Content-Type", "text/html") + _, _ = w.Write([]byte(`XMLPDF`)) + case "/Content/Bills/451/Government/C-9/C-9_4/C-9_E.xml": + w.Header().Set("Content-Type", xmlType) + _, _ = w.Write([]byte(billXMLBody("3", "Royal assent clause."))) + + default: + http.NotFound(w, r) + } + })) + defer server.Close() + + fetcher := NewFetcher(WithBaseURL(server.URL), WithHTTPClient(server.Client())) + pubs := []publicationJSON{ + {PublicationID: 1001, PublicationTypeNameEn: "First Reading"}, + {PublicationID: 1002, PublicationTypeNameEn: "As Amended by Committee"}, + {PublicationID: 1003, PublicationTypeNameEn: "Royal Assent"}, + } + versions := fetcher.enrichVersions(context.Background(), domain.Session{ParliamentNumber: 45, SessionNumber: 1}, "C-9", pubs) + if len(versions) != 3 { + t.Fatalf("versions len = %d, want 3", len(versions)) + } + + assertVersionXML(t, versions[0], "/C-9_1/C-9_E.xml", "1") + if !strings.HasSuffix(versions[0].PDFURL, "/C-9_1/C-9_1.PDF") { + t.Errorf("first-reading PDFURL = %q", versions[0].PDFURL) + } + + assertVersionXML(t, versions[1], "/C-9_3/C-9_E.xml", "2") + if !strings.HasSuffix(versions[1].PDFURL, "/C-9_3/C-9_3.PDF") { + t.Errorf("as-amended PDFURL = %q", versions[1].PDFURL) + } + + assertVersionXML(t, versions[2], "/C-9_4/C-9_E.xml", "3") +} + +// TestEnrichVersionsDerivesSortOrderSiblingWhenPageHasNoLinks covers the last-resort path: +// a stage whose DocumentViewer page renders its links client-side (no .xml/.pdf anchors) +// still resolves through the predictable sort-order sibling derived from the first stage. +func TestEnrichVersionsDerivesSortOrderSiblingWhenPageHasNoLinks(t *testing.T) { + const xmlType = "application/xml" + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + switch r.URL.Path { + case "/DocumentViewer/en/45-1/bill/C-9/first-reading": + w.Header().Set("Content-Type", "text/html") + _, _ = w.Write([]byte(`XMLPDF`)) + case "/Content/Bills/451/Government/C-9/C-9_1/C-9_E.xml": + w.Header().Set("Content-Type", xmlType) + _, _ = w.Write([]byte(billXMLBody("1", "First reading clause."))) + + case "/DocumentViewer/en/45-1/bill/C-9/as-passed-by-the-house-of-commons": + w.Header().Set("Content-Type", "text/html") + _, _ = w.Write([]byte(`
loaded by script
`)) + case "/Content/Bills/451/Government/C-9/C-9_2/C-9_E.xml": + w.Header().Set("Content-Type", xmlType) + _, _ = w.Write([]byte(billXMLBody("2", "As passed clause."))) + default: + http.NotFound(w, r) + } + })) + defer server.Close() + + fetcher := NewFetcher(WithBaseURL(server.URL), WithHTTPClient(server.Client())) + pubs := []publicationJSON{ + {PublicationID: 2001, PublicationTypeNameEn: "First Reading"}, + {PublicationID: 2002, PublicationTypeNameEn: "As Passed by the House of Commons"}, + } + versions := fetcher.enrichVersions(context.Background(), domain.Session{ParliamentNumber: 45, SessionNumber: 1}, "C-9", pubs) + if len(versions) != 2 { + t.Fatalf("versions len = %d, want 2", len(versions)) + } + assertVersionXML(t, versions[0], "/C-9_1/C-9_E.xml", "1") + assertVersionXML(t, versions[1], "/C-9_2/C-9_E.xml", "2") +} + +// TestEnrichVersionsDropsUnvalidatedXMLCandidates covers acceptance: a derived candidate +// that returns an HTTP 200 HTML soft-error, or 404s, is never persisted as xml_url and +// leaves the version carrying no text. +func TestEnrichVersionsDropsUnvalidatedXMLCandidates(t *testing.T) { + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + switch r.URL.Path { + case "/DocumentViewer/en/45-1/bill/C-9/first-reading": + w.Header().Set("Content-Type", "text/html") + _, _ = w.Write([]byte(`XML`)) + case "/Content/Bills/451/Government/C-9/C-9_1/C-9_E.xml": + w.Header().Set("Content-Type", "application/xml") + _, _ = w.Write([]byte(billXMLBody("1", "First reading clause."))) + + // Sort-order sibling exists (C-9_2) but returns an HTTP 200 HTML soft-error page. + case "/DocumentViewer/en/45-1/bill/C-9/as-amended-by-committee": + w.Header().Set("Content-Type", "text/html") + _, _ = w.Write([]byte(`
no document links here
`)) + case "/Content/Bills/451/Government/C-9/C-9_2/C-9_E.xml": + w.Header().Set("Content-Type", "text/html") + _, _ = w.Write([]byte(`Document not found`)) + + // Sort-order sibling (C-9_3) is not served at all, i.e. 404s. + case "/DocumentViewer/en/45-1/bill/C-9/as-passed-by-the-senate": + w.Header().Set("Content-Type", "text/html") + _, _ = w.Write([]byte(`
still nothing
`)) + default: + http.NotFound(w, r) + } + })) + defer server.Close() + + fetcher := NewFetcher(WithBaseURL(server.URL), WithHTTPClient(server.Client())) + pubs := []publicationJSON{ + {PublicationID: 3001, PublicationTypeNameEn: "First Reading"}, + {PublicationID: 3002, PublicationTypeNameEn: "As Amended by Committee"}, + {PublicationID: 3003, PublicationTypeNameEn: "As Passed by the Senate"}, + } + versions := fetcher.enrichVersions(context.Background(), domain.Session{ParliamentNumber: 45, SessionNumber: 1}, "C-9", pubs) + if len(versions) != 3 { + t.Fatalf("versions len = %d, want 3", len(versions)) + } + assertVersionXML(t, versions[0], "/C-9_1/C-9_E.xml", "1") + + if versions[1].XMLURL != "" { + t.Errorf("soft-error stage XMLURL = %q, want empty", versions[1].XMLURL) + } + if versions[1].TextHash != nil || len(versions[1].Sections) != 0 { + t.Errorf("soft-error stage should carry no text: hash=%v sections=%#v", versions[1].TextHash, versions[1].Sections) + } + if versions[2].XMLURL != "" { + t.Errorf("missing-source stage XMLURL = %q, want empty", versions[2].XMLURL) + } + if versions[2].TextHash != nil || len(versions[2].Sections) != 0 { + t.Errorf("missing-source stage should carry no text: hash=%v sections=%#v", versions[2].TextHash, versions[2].Sections) + } +} + +// TestEnrichVersionsDoesNotReuseBaseXMLForLaterStages guards the sort-order fallback: when +// the first resolved XML is not a "_1" directory, a later stage with no links of its own +// must be left empty rather than re-pointed at an earlier stage's document. +func TestEnrichVersionsDoesNotReuseBaseXMLForLaterStages(t *testing.T) { + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + switch r.URL.Path { + // First-reading page renders client-side: no anchors, and no first-reading XML exists. + case "/DocumentViewer/en/45-1/bill/C-9/first-reading": + w.Header().Set("Content-Type", "text/html") + _, _ = w.Write([]byte(`
loaded by script
`)) + + // As amended exposes a direct anchor at a non-"_1" directory (C-9_3). + case "/DocumentViewer/en/45-1/bill/C-9/as-amended-by-committee": + w.Header().Set("Content-Type", "text/html") + _, _ = w.Write([]byte(`XML`)) + case "/Content/Bills/451/Government/C-9/C-9_3/C-9_E.xml": + w.Header().Set("Content-Type", "application/xml") + _, _ = w.Write([]byte(billXMLBody("3", "Amended clause."))) + + // As passed renders client-side too: no anchors of its own. + case "/DocumentViewer/en/45-1/bill/C-9/as-passed-by-the-house-of-commons": + w.Header().Set("Content-Type", "text/html") + _, _ = w.Write([]byte(`
loaded by script
`)) + default: + http.NotFound(w, r) + } + })) + defer server.Close() + + fetcher := NewFetcher(WithBaseURL(server.URL), WithHTTPClient(server.Client())) + pubs := []publicationJSON{ + {PublicationID: 4001, PublicationTypeNameEn: "First Reading"}, + {PublicationID: 4002, PublicationTypeNameEn: "As Amended by Committee"}, + {PublicationID: 4003, PublicationTypeNameEn: "As Passed by the House of Commons"}, + } + versions := fetcher.enrichVersions(context.Background(), domain.Session{ParliamentNumber: 45, SessionNumber: 1}, "C-9", pubs) + if len(versions) != 3 { + t.Fatalf("versions len = %d, want 3", len(versions)) + } + if versions[0].XMLURL != "" { + t.Errorf("first-reading XMLURL = %q, want empty", versions[0].XMLURL) + } + assertVersionXML(t, versions[1], "/C-9_3/C-9_E.xml", "3") + if versions[2].XMLURL != "" { + t.Errorf("as-passed XMLURL = %q, want empty (must not reuse the as-amended document)", versions[2].XMLURL) + } +} + +func TestLooksLikeBillXML(t *testing.T) { + cases := []struct { + name string + body string + want bool + }{ + {"bill with prolog", ``, true}, + {"bill with namespace", ``, true}, + {"leading whitespace", "\n\t ", true}, + {"html soft error", `Not found`, false}, + {"doctype html", ``, false}, + {"unrelated xml", `
`, false}, + {"empty", "", false}, + {"whitespace only", " \n ", false}, + {"plain text", `Document not found`, false}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + if got := looksLikeBillXML([]byte(tc.body)); got != tc.want { + t.Errorf("looksLikeBillXML(%q) = %v, want %v", tc.body, got, tc.want) + } + }) + } +} + +func TestXMLSiblingFromPDF(t *testing.T) { + got := xmlSiblingFromPDF("https://www.parl.ca/Content/Bills/451/Government/C-11/C-11_3/C-11_3.PDF", "C-11") + want := "https://www.parl.ca/Content/Bills/451/Government/C-11/C-11_3/C-11_E.xml" + if got != want { + t.Errorf("xmlSiblingFromPDF = %q, want %q", got, want) + } + if s := xmlSiblingFromPDF("https://www.parl.ca/Content/Bills/451/Government/C-11/C-11_3/C-11_E.xml", "C-11"); s != "" { + t.Errorf("xmlSiblingFromPDF on non-pdf input = %q, want empty", s) + } + if s := xmlSiblingFromPDF("", "C-11"); s != "" { + t.Errorf("xmlSiblingFromPDF empty url = %q, want empty", s) + } + if s := xmlSiblingFromPDF("https://www.parl.ca/x/C-11_3.PDF", ""); s != "" { + t.Errorf("xmlSiblingFromPDF empty number = %q, want empty", s) + } +} + +func TestDedupeNonEmpty(t *testing.T) { + got := strings.Join(dedupeNonEmpty("a", "", " ", "b", "a", "c", "b"), ",") + if got != "a,b,c" { + t.Errorf("dedupeNonEmpty = %q, want %q", got, "a,b,c") + } +} diff --git a/docs/architecture/use-case-catalog.md b/docs/architecture/use-case-catalog.md index 12d9ed5b..1f10e1e8 100644 --- a/docs/architecture/use-case-catalog.md +++ b/docs/architecture/use-case-catalog.md @@ -345,7 +345,7 @@ Entities / values: BillVersion, VersionSection. Ports: backend Go: `BillSource`. Primary adapters: LEGISinfo/parl.ca XML crawler/parser. Current implementation: - backend/bills-indexer/internal/adapter/legisinfo/fetcher.go (enrichVersions: fetch XML + computeSHA256 hash; fetchDocumentLinks; parseBillXML: clause extraction) + backend/bills-indexer/internal/adapter/legisinfo/fetcher.go (enrichVersions: per-stage document-link discovery + computeSHA256 hash; fetchDocumentLinks; resolveVersionXML: validated XML candidate chain — direct anchor, then PDF-directory sibling, then sort-order sibling, accepting only a fetched bill-XML payload; parseBillXML: clause extraction) backend/bills-indexer/internal/domain/domain.go (BillVersion, VersionSection) ```