diff --git a/backend/bills-indexer/internal/adapter/legisinfo/diff_test.go b/backend/bills-indexer/internal/adapter/legisinfo/diff_test.go new file mode 100644 index 00000000..65e8bc20 --- /dev/null +++ b/backend/bills-indexer/internal/adapter/legisinfo/diff_test.go @@ -0,0 +1,163 @@ +package legisinfo + +import ( + "reflect" + "testing" + + "epac/bills-indexer/internal/domain" +) + +func TestParseBillXML(t *testing.T) { + xmlData := []byte(` + + + Some Heading +
+ + This is clause one text with a newline + and extra spaces. + + + + Subparagraph text + + +
+
+ + This is clause two text. +
+ +
`) + + expected := []domain.VersionSection{ + { + Label: "1", + Text: "This is clause one text with a newline and extra spaces. (a) Subparagraph text", + }, + { + Label: "2", + Text: "This is clause two text.", + }, + } + + sections, err := parseBillXML(xmlData) + if err != nil { + t.Fatalf("parseBillXML: %v", err) + } + + if !reflect.DeepEqual(sections, expected) { + t.Errorf("parsed sections mismatch.\nExpected: %+v\nGot: %+v", expected, sections) + } +} + +func TestDiffClauses(t *testing.T) { + from := []domain.VersionSection{ + {Label: "1", Text: "Original text of section 1"}, + {Label: "2", Text: "Original text of section 2"}, + {Label: "3", Text: "Original text of section 3"}, + } + + to := []domain.VersionSection{ + {Label: "1", Text: "Original text of section 1"}, + {Label: "2", Text: "Modified text of section 2"}, + {Label: "2.1", Text: "Added text of section 2.1"}, + } + + expected := []domain.BillClauseDiff{ + { + Label: "1", + ChangeType: "unchanged", + FromText: "Original text of section 1", + ToText: "Original text of section 1", + }, + { + Label: "2", + ChangeType: "modified", + FromText: "Original text of section 2", + ToText: "Modified text of section 2", + }, + { + Label: "3", + ChangeType: "removed", + FromText: "Original text of section 3", + ToText: "", + }, + { + Label: "2.1", + ChangeType: "added", + FromText: "", + ToText: "Added text of section 2.1", + }, + } + + diffs := DiffClauses(from, to) + + if !reflect.DeepEqual(diffs, expected) { + t.Errorf("diff mismatch.\nExpected: %+v\nGot: %+v", expected, diffs) + } +} + +func TestBuildDiffsCases(t *testing.T) { + // Case 1: Multi-version bill with text available + v1 := domain.BillVersion{ + ID: "v1", + SortOrder: 1, + Sections: []domain.VersionSection{ + {Label: "1", Text: "Hello"}, + }, + TextHash: ptrString("hash-1"), + TextSourceURL: ptrString("https://example.test/xml1"), + } + v2 := domain.BillVersion{ + ID: "v2", + SortOrder: 2, + Sections: []domain.VersionSection{ + {Label: "1", Text: 
"Hello World"}, + }, + TextHash: ptrString("hash-2"), + TextSourceURL: ptrString("https://example.test/xml2"), + } + + diffs := buildDiffs("C-2", []domain.BillVersion{v1, v2}, "https://example.test/bill") + if len(diffs) != 1 { + t.Fatalf("expected 1 diff, got %d", len(diffs)) + } + if diffs[0].FromVersionID != "v1" || diffs[0].ToVersionID != "v2" { + t.Errorf("incorrect version pair in diff: %+v", diffs[0]) + } + if len(diffs[0].Clauses) != 1 || diffs[0].Clauses[0].Label != "1" || diffs[0].Clauses[0].ChangeType != "modified" { + t.Errorf("expected modified clause, got: %+v", diffs[0].Clauses) + } + + // Case 2: One-version bill -> no diff records should be built + diffsOne := buildDiffs("C-2", []domain.BillVersion{v1}, "https://example.test/bill") + if len(diffsOne) != 0 { + t.Errorf("expected 0 diffs for single version, got %d", len(diffsOne)) + } + + // Case 3: Multi-version bill with missing text -> creates diff records but no clauses + v1Missing := domain.BillVersion{ + ID: "v1", + SortOrder: 1, + TextHash: nil, + TextSourceURL: nil, + } + v2Missing := domain.BillVersion{ + ID: "v2", + SortOrder: 2, + TextHash: nil, + TextSourceURL: nil, + } + diffsMissing := buildDiffs("C-2", []domain.BillVersion{v1Missing, v2Missing}, "https://example.test/bill") + if len(diffsMissing) != 1 { + t.Fatalf("expected 1 diff, got %d", len(diffsMissing)) + } + if len(diffsMissing[0].Clauses) != 0 { + t.Errorf("expected 0 clauses in diff when text is missing, got %d", len(diffsMissing[0].Clauses)) + } +} + +func ptrString(s string) *string { + return &s +} diff --git a/backend/bills-indexer/internal/adapter/legisinfo/fetcher.go b/backend/bills-indexer/internal/adapter/legisinfo/fetcher.go index fcf06cfc..23e996fa 100644 --- a/backend/bills-indexer/internal/adapter/legisinfo/fetcher.go +++ b/backend/bills-indexer/internal/adapter/legisinfo/fetcher.go @@ -1,8 +1,12 @@ package legisinfo import ( + "bytes" "context" + "crypto/sha256" + "encoding/hex" "encoding/json" + "encoding/xml" 
"fmt" "html" "io" @@ -170,6 +174,21 @@ func (f *Fetcher) enrichVersions(ctx context.Context, session domain.Session, nu xmlURL, pdfURL := f.fetchDocumentLinks(ctx, htmlURL) version.XMLURL = xmlURL version.PDFURL = pdfURL + + if xmlURL != "" { + xmlData, err := f.getBytes(ctx, xmlURL, "text/xml") + if err == nil { + hash := computeSHA256(xmlData) + version.TextHash = &hash + version.TextSourceURL = &xmlURL + + sections, err := parseBillXML(xmlData) + if err == nil { + version.Sections = sections + } + } + } + versions = append(versions, version) } return versions @@ -613,11 +632,36 @@ func buildDiffs(number string, versions []domain.BillVersion, detailURL string) sort.SliceStable(ordered, func(i, j int) bool { return ordered[i].SortOrder < ordered[j].SortOrder }) diffs := make([]domain.BillDiff, 0, len(ordered)-1) for i := 1; i < len(ordered); i++ { + fromVer := ordered[i-1] + toVer := ordered[i] + diffID := stableID("diff", number, fromVer.ID, toVer.ID) + + var clauseDiffs []domain.BillClauseDiff + if len(fromVer.Sections) > 0 && len(toVer.Sections) > 0 { + rawDiffs := DiffClauses(fromVer.Sections, toVer.Sections) + clauseDiffs = make([]domain.BillClauseDiff, 0, len(rawDiffs)) + for idx, rd := range rawDiffs { + clauseID := stableID("clause", number, diffID, rd.Label) + if rd.Label == "" { + clauseID = stableID("clause", number, diffID, strconv.Itoa(idx)) + } + clauseDiffs = append(clauseDiffs, domain.BillClauseDiff{ + ID: clauseID, + Label: rd.Label, + ChangeType: rd.ChangeType, + FromText: rd.FromText, + ToText: rd.ToText, + HansardAnchorURL: nil, + }) + } + } + diffs = append(diffs, domain.BillDiff{ - ID: stableID("diff", number, ordered[i-1].ID, ordered[i].ID), - FromVersionID: ordered[i-1].ID, - ToVersionID: ordered[i].ID, + ID: diffID, + FromVersionID: fromVer.ID, + ToVersionID: toVer.ID, SourceURL: detailURL, + Clauses: clauseDiffs, }) } return diffs @@ -753,3 +797,140 @@ var ( nonSlugPattern = regexp.MustCompile(`[^a-z0-9]+`) hrefPattern = 
regexp.MustCompile(`(?i)href=["']([^"']+)["']`) ) + +func computeSHA256(data []byte) string { + hash := sha256.Sum256(data) + return hex.EncodeToString(hash[:]) +} + +func parseBillXML(xmlData []byte) ([]domain.VersionSection, error) { + decoder := xml.NewDecoder(bytes.NewReader(xmlData)) + var sections []domain.VersionSection + var currentSec *domain.VersionSection + var inLabel bool + var labelDepth int + var secDepth int + + for { + t, err := decoder.Token() + if err == io.EOF { + break + } + if err != nil { + return nil, err + } + + switch se := t.(type) { + case xml.StartElement: + if se.Name.Local == "Section" { + currentSec = &domain.VersionSection{} + secDepth = 1 + } else if currentSec != nil { + secDepth++ + if se.Name.Local == "Label" && secDepth == 2 { + inLabel = true + labelDepth = secDepth + } + } + case xml.EndElement: + if currentSec != nil { + if se.Name.Local == "Section" && secDepth == 1 { + currentSec.Label = strings.TrimSpace(currentSec.Label) + currentSec.Text = cleanSectionText(currentSec.Text) + sections = append(sections, *currentSec) + currentSec = nil + } else { + if inLabel && secDepth == labelDepth { + inLabel = false + } + secDepth-- + } + } + case xml.CharData: + if currentSec != nil { + str := string(se) + if inLabel { + currentSec.Label += str + } else { + currentSec.Text += str + } + } + } + } + return sections, nil +} + +func cleanSectionText(s string) string { + words := strings.Fields(s) + return strings.Join(words, " ") +} + +func DiffClauses(fromClauses, toClauses []domain.VersionSection) []domain.BillClauseDiff { + n := len(fromClauses) + m := len(toClauses) + + dp := make([][]int, n+1) + for i := range dp { + dp[i] = make([]int, m+1) + } + + for i := 1; i <= n; i++ { + for j := 1; j <= m; j++ { + if fromClauses[i-1].Label != "" && fromClauses[i-1].Label == toClauses[j-1].Label { + dp[i][j] = dp[i-1][j-1] + 1 + } else { + dp[i][j] = maxInt(dp[i-1][j], dp[i][j-1]) + } + } + } + + var diffs []domain.BillClauseDiff + i, j := n, 
m + for i > 0 || j > 0 { + if i > 0 && j > 0 && fromClauses[i-1].Label != "" && fromClauses[i-1].Label == toClauses[j-1].Label { + fc := fromClauses[i-1] + tc := toClauses[j-1] + changeType := "unchanged" + if fc.Text != tc.Text { + changeType = "modified" + } + diffs = append(diffs, domain.BillClauseDiff{ + Label: fc.Label, + ChangeType: changeType, + FromText: fc.Text, + ToText: tc.Text, + }) + i-- + j-- + } else if j > 0 && (i == 0 || dp[i][j-1] >= dp[i-1][j]) { + tc := toClauses[j-1] + diffs = append(diffs, domain.BillClauseDiff{ + Label: tc.Label, + ChangeType: "added", + ToText: tc.Text, + }) + j-- + } else { + fc := fromClauses[i-1] + diffs = append(diffs, domain.BillClauseDiff{ + Label: fc.Label, + ChangeType: "removed", + FromText: fc.Text, + }) + i-- + } + } + + for k := 0; k < len(diffs)/2; k++ { + diffs[k], diffs[len(diffs)-1-k] = diffs[len(diffs)-1-k], diffs[k] + } + + return diffs +} + +func maxInt(a, b int) int { + if a > b { + return a + } + return b +} diff --git a/backend/bills-indexer/internal/adapter/legisinfo/fetcher_test.go b/backend/bills-indexer/internal/adapter/legisinfo/fetcher_test.go index f3af0a09..40fbc6de 100644 --- a/backend/bills-indexer/internal/adapter/legisinfo/fetcher_test.go +++ b/backend/bills-indexer/internal/adapter/legisinfo/fetcher_test.go @@ -111,6 +111,17 @@ func TestFetcherBuildsRelationalBillRecordsFromLegisInfoExports(t *testing.T) { case "/DocumentViewer/en/45-1/bill/C-2/first-reading": w.Header().Set("Content-Type", "text/html") _, _ = w.Write([]byte(`XMLPDF`)) + case "/Content/Bills/451/Government/C-2/C-2_1/C-2_E.xml": + w.Header().Set("Content-Type", "application/xml") + _, _ = w.Write([]byte(` + + +
+ + Verbatim clause text of section 1. +
+ +
`)) default: t.Fatalf("unexpected path: %s", r.URL.String()) } diff --git a/backend/bills-indexer/internal/adapter/sqlite/writer.go b/backend/bills-indexer/internal/adapter/sqlite/writer.go index 6526a466..b4df3bcf 100644 --- a/backend/bills-indexer/internal/adapter/sqlite/writer.go +++ b/backend/bills-indexer/internal/adapter/sqlite/writer.go @@ -109,6 +109,7 @@ func (w *Writer) Write(ctx context.Context, dbPath string, batch domain.Batch) ( } const schemaSQL = ` +DROP TABLE IF EXISTS bill_clause_diffs; DROP TABLE IF EXISTS pbo_costings; DROP TABLE IF EXISTS bill_amendments; DROP TABLE IF EXISTS bill_diffs; @@ -211,6 +212,8 @@ CREATE TABLE bill_versions ( published_date TEXT, source TEXT NOT NULL DEFAULT '', sort_order INTEGER NOT NULL DEFAULT 0, + text_hash TEXT, + text_source_url TEXT, PRIMARY KEY (bill_id, id) ); CREATE TABLE bill_diffs ( @@ -221,6 +224,19 @@ CREATE TABLE bill_diffs ( source_url TEXT NOT NULL DEFAULT '', PRIMARY KEY (bill_id, id) ); +CREATE TABLE bill_clause_diffs ( + bill_id TEXT NOT NULL, + diff_id TEXT NOT NULL, + id TEXT NOT NULL, + label TEXT, + change_type TEXT NOT NULL, + from_text TEXT, + to_text TEXT, + hansard_anchor_url TEXT, + sort_order INTEGER NOT NULL, + PRIMARY KEY (bill_id, diff_id, id), + FOREIGN KEY (bill_id, diff_id) REFERENCES bill_diffs(bill_id, id) ON DELETE CASCADE +); CREATE TABLE bill_amendments ( bill_id TEXT NOT NULL REFERENCES bills(id) ON DELETE CASCADE, id TEXT NOT NULL, @@ -384,10 +400,11 @@ INSERT OR REPLACE INTO bill_committee_meetings ( if _, err := tx.ExecContext(ctx, ` INSERT OR REPLACE INTO bill_versions ( bill_id, id, publication_id, stage, stage_slug, html_url, xml_url, pdf_url, - published_date, source, sort_order -) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, + published_date, source, sort_order, text_hash, text_source_url +) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, bill.ID, version.ID, version.PublicationID, version.Stage, version.StageSlug, version.HTMLURL, version.XMLURL, - version.PDFURL, 
emptyToNil(version.PublishedDate), version.Source, version.SortOrder); err != nil { + version.PDFURL, emptyToNil(version.PublishedDate), version.Source, version.SortOrder, + emptyToNil(derefString(version.TextHash)), emptyToNil(derefString(version.TextSourceURL))); err != nil { return fmt.Errorf("insert bill version %s/%s: %w", bill.Number, version.ID, err) } stats.VersionCount++ @@ -400,6 +417,17 @@ VALUES (?, ?, ?, ?, ?)`, return fmt.Errorf("insert bill diff %s/%s: %w", bill.Number, diff.ID, err) } stats.DiffCount++ + + for idx, clause := range diff.Clauses { + if _, err := tx.ExecContext(ctx, ` +INSERT OR REPLACE INTO bill_clause_diffs ( + bill_id, diff_id, id, label, change_type, from_text, to_text, hansard_anchor_url, sort_order +) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)`, + bill.ID, diff.ID, clause.ID, emptyToNil(clause.Label), clause.ChangeType, + emptyToNil(clause.FromText), emptyToNil(clause.ToText), clause.HansardAnchorURL, idx+1); err != nil { + return fmt.Errorf("insert bill clause diff %s/%s/%s: %w", bill.Number, diff.ID, clause.ID, err) + } + } } for _, amendment := range bill.Amendments { if _, err := tx.ExecContext(ctx, ` @@ -468,6 +496,7 @@ func countTables(ctx context.Context, db *sql.DB) (map[string]int, error) { "bill_committee_meetings", "bill_versions", "bill_diffs", + "bill_clause_diffs", "bill_amendments", "pbo_costings", "bill_related_links", @@ -533,3 +562,10 @@ func removeSQLiteFiles(path string) error { } return nil } + +func derefString(s *string) string { + if s == nil { + return "" + } + return *s +} diff --git a/backend/bills-indexer/internal/adapter/sqlite/writer_test.go b/backend/bills-indexer/internal/adapter/sqlite/writer_test.go index e40c884a..78d677c9 100644 --- a/backend/bills-indexer/internal/adapter/sqlite/writer_test.go +++ b/backend/bills-indexer/internal/adapter/sqlite/writer_test.go @@ -31,6 +31,13 @@ func TestWriterCreatesBillsRelationalSchema(t *testing.T) { Events: []domain.BillEvent{{ID: "event-1", StageID: "60029", 
Name: "Introduction"}}, Versions: []domain.BillVersion{{ ID: "v1", Stage: "First Reading", HTMLURL: "https://example.test/html", XMLURL: "https://example.test/xml", SortOrder: 1, + TextHash: ptrString("abc123hash"), TextSourceURL: ptrString("https://example.test/xml"), + }}, + Diffs: []domain.BillDiff{{ + ID: "diff-1", FromVersionID: "v1", ToVersionID: "v2", SourceURL: "https://example.test/diff", + Clauses: []domain.BillClauseDiff{{ + ID: "clause-1", Label: "1", ChangeType: "modified", FromText: "Old text", ToText: "New text", HansardAnchorURL: ptrString("https://hansard.test/anchor"), + }}, }}, Amendments: []domain.Amendment{{ID: "a1", EventID: "event-1", AmendmentCount: 1}}, PBOCostings: []domain.PBOCosting{{ID: "p1", Title: "PBO costing", URL: "https://pbo.test"}}, @@ -59,7 +66,7 @@ func TestWriterCreatesBillsRelationalSchema(t *testing.T) { if err != nil { t.Fatalf("Write: %v", err) } - if stats.TableCounts["bills"] != 1 || stats.TableCounts["bill_versions"] != 1 || stats.TableCounts["pbo_costings"] != 1 { + if stats.TableCounts["bills"] != 1 || stats.TableCounts["bill_versions"] != 1 || stats.TableCounts["pbo_costings"] != 1 || stats.TableCounts["bill_clause_diffs"] != 1 { t.Fatalf("table counts = %#v", stats.TableCounts) } @@ -89,6 +96,22 @@ func TestWriterCreatesBillsRelationalSchema(t *testing.T) { if meetingNumber != 42 { t.Fatalf("meeting number = %d", meetingNumber) } + + var textHash, textSourceURL string + if err := db.QueryRow("SELECT text_hash, text_source_url FROM bill_versions WHERE bill_id = ? 
AND id = ?", "13543613", "v1").Scan(&textHash, &textSourceURL); err != nil { + t.Fatalf("query bill version text info: %v", err) + } + if textHash != "abc123hash" || textSourceURL != "https://example.test/xml" { + t.Fatalf("text_hash = %q, text_source_url = %q", textHash, textSourceURL) + } + + var changeType, fromText, toText, hansardAnchor string + if err := db.QueryRow("SELECT change_type, from_text, to_text, hansard_anchor_url FROM bill_clause_diffs WHERE bill_id = ? AND diff_id = ? AND id = ?", "13543613", "diff-1", "clause-1").Scan(&changeType, &fromText, &toText, &hansardAnchor); err != nil { + t.Fatalf("query bill clause diff: %v", err) + } + if changeType != "modified" || fromText != "Old text" || toText != "New text" || hansardAnchor != "https://hansard.test/anchor" { + t.Fatalf("clause diff fields mismatch: %q, %q, %q, %q", changeType, fromText, toText, hansardAnchor) + } } type fixedClock struct{} @@ -96,3 +119,7 @@ type fixedClock struct{} func (fixedClock) Now() time.Time { return time.Date(2026, 6, 10, 12, 0, 0, 0, time.UTC) } + +func ptrString(s string) *string { + return &s +} diff --git a/backend/bills-indexer/internal/domain/domain.go b/backend/bills-indexer/internal/domain/domain.go index d288b6f8..064cad80 100644 --- a/backend/bills-indexer/internal/domain/domain.go +++ b/backend/bills-indexer/internal/domain/domain.go @@ -87,6 +87,11 @@ type BillCommitteeMeeting struct { SortOrder int } +type VersionSection struct { + Label string + Text string +} + type BillVersion struct { ID string PublicationID string @@ -98,6 +103,18 @@ type BillVersion struct { PublishedDate string Source string SortOrder int + TextHash *string + TextSourceURL *string + Sections []VersionSection +} + +type BillClauseDiff struct { + ID string + Label string + ChangeType string // "added", "removed", "modified", "unchanged" + FromText string + ToText string + HansardAnchorURL *string } type BillDiff struct { @@ -105,6 +122,7 @@ type BillDiff struct { FromVersionID string 
ToVersionID string SourceURL string + Clauses []BillClauseDiff } type Amendment struct { diff --git a/docs/architecture/use-case-catalog.md b/docs/architecture/use-case-catalog.md index b8f7adbe..7c62ef9f 100644 --- a/docs/architecture/use-case-catalog.md +++ b/docs/architecture/use-case-catalog.md @@ -334,6 +334,38 @@ Current implementation: --- +### IngestBillVersionText + +``` +Actor: System (Backend Ingest / SQLite Writer boundary) +Goal: Fetch and extract the text content and clauses of a bill version from its LEGISinfo XML URL, compute its stable hash, and associate it with the bill version metadata. +Inputs: XML URL of the bill version. +Outputs: Extracted list of clauses (VersionSection) and stable SHA256 text hash. +Entities / values: BillVersion, VersionSection. +Ports: backend Go: `BillSource`. +Primary adapters: LEGISinfo/parl.ca XML crawler/parser. +``` + +> **Boundary rule:** XML retrieval, parsing, and clause extraction must stay entirely inside the backend indexer. Downstream iOS apps or APIs only consume the structured metadata and stable hash. + +--- + +### ComputeBillVersionDiff + +``` +Actor: System (Backend Ingest / SQLite Writer boundary) +Goal: Compute clause-level differences (additions, deletions, modifications, and unchanged clauses) between two consecutive version records of a bill using an alignment algorithm on their clauses. +Inputs: "Before" version clauses, "After" version clauses. +Outputs: Ordered list of clause-level differences (BillClauseDiff) with stable IDs. +Entities / values: BillDiff, BillClauseDiff. +Ports: backend Go: `BillSource`. +Primary adapters: Backend indexer diffing logic. +``` + +> **Boundary rule:** The clause-aware diff algorithm is a use-case policy executed during ingestion. Only the computed diff rows are persisted in the database to be served to downstream clients. + +--- + ### IngestMembers ```