diff --git a/backend/bills-indexer/internal/adapter/legisinfo/fetcher.go b/backend/bills-indexer/internal/adapter/legisinfo/fetcher.go
index e712806e..35977b9e 100644
--- a/backend/bills-indexer/internal/adapter/legisinfo/fetcher.go
+++ b/backend/bills-indexer/internal/adapter/legisinfo/fetcher.go
@@ -13,7 +13,6 @@ import (
"net/http"
"net/url"
"regexp"
- "sort"
"strconv"
"strings"
"time"
@@ -419,7 +418,6 @@ func (d billDetailJSON) toDomain(baseURL string, list billListJSON) domain.Bill
bill.Stages, bill.Events, bill.Amendments = d.extractStagesAndEvents()
bill.CommitteeStages = d.extractCommitteeStages(baseURL)
bill.RelatedLinks, bill.PBOCostings = d.extractReferences(baseURL)
- bill.Diffs = buildDiffs(number, bill.Versions, d.detailURL)
if len(bill.Amendments) == 0 && d.LatestBillEventNumberOfAmendments > 0 {
noteID := anyString(d.LatestBillEventAmendmentNoteID)
bill.Amendments = append(bill.Amendments, domain.Amendment{
@@ -658,49 +656,6 @@ func (d billDetailJSON) extractReferences(baseURL string) ([]domain.RelatedLink,
return links, costings
}
-func buildDiffs(number string, versions []domain.BillVersion, detailURL string) []domain.BillDiff {
- if len(versions) < 2 {
- return nil
- }
- ordered := append([]domain.BillVersion(nil), versions...)
- sort.SliceStable(ordered, func(i, j int) bool { return ordered[i].SortOrder < ordered[j].SortOrder })
- diffs := make([]domain.BillDiff, 0, len(ordered)-1)
- for i := 1; i < len(ordered); i++ {
- fromVer := ordered[i-1]
- toVer := ordered[i]
- diffID := stableID("diff", number, fromVer.ID, toVer.ID)
-
- var clauseDiffs []domain.BillClauseDiff
- if len(fromVer.Sections) > 0 && len(toVer.Sections) > 0 {
- rawDiffs := DiffClauses(fromVer.Sections, toVer.Sections)
- clauseDiffs = make([]domain.BillClauseDiff, 0, len(rawDiffs))
- for idx, rd := range rawDiffs {
- clauseID := stableID("clause", number, diffID, rd.Label)
- if rd.Label == "" {
- clauseID = stableID("clause", number, diffID, strconv.Itoa(idx))
- }
- clauseDiffs = append(clauseDiffs, domain.BillClauseDiff{
- ID: clauseID,
- Label: rd.Label,
- ChangeType: rd.ChangeType,
- FromText: rd.FromText,
- ToText: rd.ToText,
- HansardAnchorURL: nil,
- })
- }
- }
-
- diffs = append(diffs, domain.BillDiff{
- ID: diffID,
- FromVersionID: fromVer.ID,
- ToVersionID: toVer.ID,
- SourceURL: detailURL,
- Clauses: clauseDiffs,
- })
- }
- return diffs
-}
-
func publicationSlug(stage string) string {
normalized := strings.ToLower(strings.TrimSpace(stage))
normalized = strings.ReplaceAll(normalized, "'", "")
@@ -898,73 +853,3 @@ func cleanSectionText(s string) string {
words := strings.Fields(s)
return strings.Join(words, " ")
}
-
-func DiffClauses(fromClauses, toClauses []domain.VersionSection) []domain.BillClauseDiff {
- n := len(fromClauses)
- m := len(toClauses)
-
- dp := make([][]int, n+1)
- for i := range dp {
- dp[i] = make([]int, m+1)
- }
-
- for i := 1; i <= n; i++ {
- for j := 1; j <= m; j++ {
- if fromClauses[i-1].Label != "" && fromClauses[i-1].Label == toClauses[j-1].Label {
- dp[i][j] = dp[i-1][j-1] + 1
- } else {
- dp[i][j] = maxInt(dp[i-1][j], dp[i][j-1])
- }
- }
- }
-
- var diffs []domain.BillClauseDiff
- i, j := n, m
- for i > 0 || j > 0 {
- if i > 0 && j > 0 && fromClauses[i-1].Label != "" && fromClauses[i-1].Label == toClauses[j-1].Label {
- fc := fromClauses[i-1]
- tc := toClauses[j-1]
- changeType := "unchanged"
- if fc.Text != tc.Text {
- changeType = "modified"
- }
- diffs = append(diffs, domain.BillClauseDiff{
- Label: fc.Label,
- ChangeType: changeType,
- FromText: fc.Text,
- ToText: tc.Text,
- })
- i--
- j--
- } else if j > 0 && (i == 0 || dp[i][j-1] >= dp[i-1][j]) {
- tc := toClauses[j-1]
- diffs = append(diffs, domain.BillClauseDiff{
- Label: tc.Label,
- ChangeType: "added",
- ToText: tc.Text,
- })
- j--
- } else {
- fc := fromClauses[i-1]
- diffs = append(diffs, domain.BillClauseDiff{
- Label: fc.Label,
- ChangeType: "removed",
- FromText: fc.Text,
- })
- i--
- }
- }
-
- for k := 0; k < len(diffs)/2; k++ {
- diffs[k], diffs[len(diffs)-1-k] = diffs[len(diffs)-1-k], diffs[k]
- }
-
- return diffs
-}
-
-func maxInt(a, b int) int {
- if a > b {
- return a
- }
- return b
-}
diff --git a/backend/bills-indexer/internal/adapter/legisinfo/parse_test.go b/backend/bills-indexer/internal/adapter/legisinfo/parse_test.go
new file mode 100644
index 00000000..35d98b36
--- /dev/null
+++ b/backend/bills-indexer/internal/adapter/legisinfo/parse_test.go
@@ -0,0 +1,52 @@
+package legisinfo
+
+import (
+ "reflect"
+ "testing"
+
+ "epac/bills-indexer/internal/domain"
+)
+
+func TestParseBillXML(t *testing.T) {
+ xmlData := []byte(`
+
+
+ Some Heading
+
+
+ This is clause one text with a newline
+ and extra spaces.
+
+
+
+ Subparagraph text
+
+
+
+
+
+ This is clause two text.
+
+
+`)
+
+ expected := []domain.VersionSection{
+ {
+ Label: "1",
+ Text: "This is clause one text with a newline and extra spaces. (a) Subparagraph text",
+ },
+ {
+ Label: "2",
+ Text: "This is clause two text.",
+ },
+ }
+
+ sections, err := parseBillXML(xmlData)
+ if err != nil {
+ t.Fatalf("parseBillXML: %v", err)
+ }
+
+ if !reflect.DeepEqual(sections, expected) {
+ t.Errorf("parsed sections mismatch.\nExpected: %+v\nGot: %+v", expected, sections)
+ }
+}
diff --git a/backend/bills-indexer/internal/usecase/compute_bill_version_diff.go b/backend/bills-indexer/internal/usecase/compute_bill_version_diff.go
new file mode 100644
index 00000000..df3937c8
--- /dev/null
+++ b/backend/bills-indexer/internal/usecase/compute_bill_version_diff.go
@@ -0,0 +1,161 @@
+package usecase
+
+import (
+ "regexp"
+ "sort"
+ "strconv"
+ "strings"
+
+ "epac/bills-indexer/internal/domain"
+)
+
+// ComputeBillVersionDiff is the application policy that turns a bill's parsed
+// versions into ordered version-to-version diffs. It works purely on domain
+// values, independent of any source wire format, so it lives in the application
+// layer rather than in a source adapter.
+//
+// versions is left unmodified: a copy is stably sorted by SortOrder and each
+// adjacent pair yields one BillDiff, so n versions give n-1 diffs and fewer than
+// two give nil. number seeds the stable diff/clause IDs; detailURL is copied to
+// each BillDiff.SourceURL. When either side of a pair has no parsed sections
+// (text unavailable), the diff record is still produced but with nil Clauses.
+// NOTE(review): a label reported twice in one diff (e.g. removed and re-added
+// after a reorder) gets the same clause ID — confirm the writer tolerates that.
+func ComputeBillVersionDiff(number string, versions []domain.BillVersion, detailURL string) []domain.BillDiff {
+ if len(versions) < 2 {
+ return nil
+ }
+ ordered := append([]domain.BillVersion(nil), versions...) // sort a copy, not the caller's slice
+ sort.SliceStable(ordered, func(i, j int) bool { return ordered[i].SortOrder < ordered[j].SortOrder })
+ diffs := make([]domain.BillDiff, 0, len(ordered)-1)
+ for i := 1; i < len(ordered); i++ {
+ fromVer := ordered[i-1]
+ toVer := ordered[i]
+ diffID := stableID("diff", number, fromVer.ID, toVer.ID)
+
+ var clauseDiffs []domain.BillClauseDiff
+ if len(fromVer.Sections) > 0 && len(toVer.Sections) > 0 {
+ rawDiffs := DiffClauses(fromVer.Sections, toVer.Sections)
+ clauseDiffs = make([]domain.BillClauseDiff, 0, len(rawDiffs))
+ for idx, rd := range rawDiffs {
+ clauseID := stableID("clause", number, diffID, rd.Label)
+ if rd.Label == "" { // stableID drops empty parts, so unlabeled clauses would all share one ID
+ clauseID = stableID("clause", number, diffID, strconv.Itoa(idx))
+ }
+ clauseDiffs = append(clauseDiffs, domain.BillClauseDiff{
+ ID: clauseID,
+ Label: rd.Label,
+ ChangeType: rd.ChangeType,
+ FromText: rd.FromText,
+ ToText: rd.ToText,
+ HansardAnchorURL: nil, // not populated by this policy
+ })
+ }
+ }
+
+ diffs = append(diffs, domain.BillDiff{
+ ID: diffID,
+ FromVersionID: fromVer.ID,
+ ToVersionID: toVer.ID,
+ SourceURL: detailURL,
+ Clauses: clauseDiffs,
+ })
+ }
+ return diffs
+}
+
+// DiffClauses aligns two clause lists by label with a longest-common-subsequence
+// pass and returns one BillClauseDiff per clause, front to back, with ID unset:
+// matched labels are "unchanged" or "modified" (by Text), the rest "added"/"removed".
+func DiffClauses(fromClauses, toClauses []domain.VersionSection) []domain.BillClauseDiff {
+ n := len(fromClauses)
+ m := len(toClauses)
+
+ dp := make([][]int, n+1) // dp[i][j] = LCS length of fromClauses[:i] and toClauses[:j]
+ for i := range dp {
+ dp[i] = make([]int, m+1)
+ }
+
+ for i := 1; i <= n; i++ {
+ for j := 1; j <= m; j++ {
+ if fromClauses[i-1].Label != "" && fromClauses[i-1].Label == toClauses[j-1].Label { // empty labels never match
+ dp[i][j] = dp[i-1][j-1] + 1
+ } else {
+ dp[i][j] = maxInt(dp[i-1][j], dp[i][j-1])
+ }
+ }
+ }
+
+ var diffs []domain.BillClauseDiff
+ i, j := n, m // walk the table backwards; entries are collected last-to-first
+ for i > 0 || j > 0 {
+ if i > 0 && j > 0 && fromClauses[i-1].Label != "" && fromClauses[i-1].Label == toClauses[j-1].Label {
+ fc := fromClauses[i-1]
+ tc := toClauses[j-1]
+ changeType := "unchanged"
+ if fc.Text != tc.Text {
+ changeType = "modified"
+ }
+ diffs = append(diffs, domain.BillClauseDiff{
+ Label: fc.Label,
+ ChangeType: changeType,
+ FromText: fc.Text,
+ ToText: tc.Text,
+ })
+ i--
+ j--
+ } else if j > 0 && (i == 0 || dp[i][j-1] >= dp[i-1][j]) { // ties take the addition first here, so removals precede additions after the reversal
+ tc := toClauses[j-1]
+ diffs = append(diffs, domain.BillClauseDiff{
+ Label: tc.Label,
+ ChangeType: "added",
+ ToText: tc.Text,
+ })
+ j--
+ } else {
+ fc := fromClauses[i-1]
+ diffs = append(diffs, domain.BillClauseDiff{
+ Label: fc.Label,
+ ChangeType: "removed",
+ FromText: fc.Text,
+ })
+ i--
+ }
+ }
+
+ for k := 0; k < len(diffs)/2; k++ { // reverse in place into front-to-back order
+ diffs[k], diffs[len(diffs)-1-k] = diffs[len(diffs)-1-k], diffs[k]
+ }
+
+ return diffs
+}
+
+// stableID builds a deterministic, slug-safe identifier: parts are trimmed, empty
+// ones dropped, the rest joined with "-" and lowercased, then each run of non
+// [a-z0-9] characters becomes one "-" and edge dashes are cut; "" yields "unknown".
+// Meant to mirror the adapters' slug rule so relocated diff IDs stay identical — TODO confirm.
+func stableID(parts ...string) string {
+ filtered := make([]string, 0, len(parts))
+ for _, part := range parts {
+ part = strings.TrimSpace(part)
+ if part != "" {
+ filtered = append(filtered, part)
+ }
+ }
+ id := strings.ToLower(strings.Join(filtered, "-")) // lowercase first: the pattern below only keeps a-z and 0-9
+ id = nonSlugPattern.ReplaceAllString(id, "-")
+ id = strings.Trim(id, "-")
+ if id == "" {
+ return "unknown"
+ }
+ return id
+}
+
+var nonSlugPattern = regexp.MustCompile(`[^a-z0-9]+`) // one or more characters outside the slug alphabet
+
+func maxInt(a, b int) int { // returns the larger of a and b; NOTE(review): Go 1.21+ has a built-in max — switch if the module's go version allows
+ if a > b {
+ return a
+ }
+ return b
+}
diff --git a/backend/bills-indexer/internal/adapter/legisinfo/diff_test.go b/backend/bills-indexer/internal/usecase/compute_bill_version_diff_test.go
similarity index 67%
rename from backend/bills-indexer/internal/adapter/legisinfo/diff_test.go
rename to backend/bills-indexer/internal/usecase/compute_bill_version_diff_test.go
index 65e8bc20..e52c9a02 100644
--- a/backend/bills-indexer/internal/adapter/legisinfo/diff_test.go
+++ b/backend/bills-indexer/internal/usecase/compute_bill_version_diff_test.go
@@ -1,4 +1,4 @@
-package legisinfo
+package usecase
import (
"reflect"
@@ -7,50 +7,6 @@ import (
"epac/bills-indexer/internal/domain"
)
-func TestParseBillXML(t *testing.T) {
- xmlData := []byte(`
-
-
- Some Heading
-
-
- This is clause one text with a newline
- and extra spaces.
-
-
-
- Subparagraph text
-
-
-
-
-
- This is clause two text.
-
-
-`)
-
- expected := []domain.VersionSection{
- {
- Label: "1",
- Text: "This is clause one text with a newline and extra spaces. (a) Subparagraph text",
- },
- {
- Label: "2",
- Text: "This is clause two text.",
- },
- }
-
- sections, err := parseBillXML(xmlData)
- if err != nil {
- t.Fatalf("parseBillXML: %v", err)
- }
-
- if !reflect.DeepEqual(sections, expected) {
- t.Errorf("parsed sections mismatch.\nExpected: %+v\nGot: %+v", expected, sections)
- }
-}
-
func TestDiffClauses(t *testing.T) {
from := []domain.VersionSection{
{Label: "1", Text: "Original text of section 1"},
@@ -98,7 +54,7 @@ func TestDiffClauses(t *testing.T) {
}
}
-func TestBuildDiffsCases(t *testing.T) {
+func TestComputeBillVersionDiffCases(t *testing.T) {
// Case 1: Multi-version bill with text available
v1 := domain.BillVersion{
ID: "v1",
@@ -119,7 +75,7 @@ func TestBuildDiffsCases(t *testing.T) {
TextSourceURL: ptrString("https://example.test/xml2"),
}
- diffs := buildDiffs("C-2", []domain.BillVersion{v1, v2}, "https://example.test/bill")
+ diffs := ComputeBillVersionDiff("C-2", []domain.BillVersion{v1, v2}, "https://example.test/bill")
if len(diffs) != 1 {
t.Fatalf("expected 1 diff, got %d", len(diffs))
}
@@ -131,7 +87,7 @@ func TestBuildDiffsCases(t *testing.T) {
}
// Case 2: One-version bill -> no diff records should be built
- diffsOne := buildDiffs("C-2", []domain.BillVersion{v1}, "https://example.test/bill")
+ diffsOne := ComputeBillVersionDiff("C-2", []domain.BillVersion{v1}, "https://example.test/bill")
if len(diffsOne) != 0 {
t.Errorf("expected 0 diffs for single version, got %d", len(diffsOne))
}
@@ -149,7 +105,7 @@ func TestBuildDiffsCases(t *testing.T) {
TextHash: nil,
TextSourceURL: nil,
}
- diffsMissing := buildDiffs("C-2", []domain.BillVersion{v1Missing, v2Missing}, "https://example.test/bill")
+ diffsMissing := ComputeBillVersionDiff("C-2", []domain.BillVersion{v1Missing, v2Missing}, "https://example.test/bill")
if len(diffsMissing) != 1 {
t.Fatalf("expected 1 diff, got %d", len(diffsMissing))
}
diff --git a/backend/bills-indexer/internal/usecase/usecase.go b/backend/bills-indexer/internal/usecase/usecase.go
index 7f2463e3..d6b12c4e 100644
--- a/backend/bills-indexer/internal/usecase/usecase.go
+++ b/backend/bills-indexer/internal/usecase/usecase.go
@@ -115,6 +115,13 @@ func (u *IngestBills) Execute(ctx context.Context, input Input) (Output, error)
if err != nil {
return Output{}, fmt.Errorf("fetch bills: %w", err)
}
+ // Diff computation is application policy, not source-format work: the source
+ // adapter fetches and parses version text into sections, then this use case
+ // computes the clause-aware diffs, overwriting any Diffs already on the bill.
+ for i := range batch.Bills {
+ bill := &batch.Bills[i]
+ bill.Diffs = ComputeBillVersionDiff(bill.Number, bill.Versions, bill.SourceURL)
+ }
stats, err := u.writer.Write(ctx, u.dbPath, batch)
if err != nil {
return Output{}, fmt.Errorf("write bills sqlite: %w", err)
diff --git a/backend/bills-indexer/internal/usecase/usecase_test.go b/backend/bills-indexer/internal/usecase/usecase_test.go
index 7d7ff47c..808e1c38 100644
--- a/backend/bills-indexer/internal/usecase/usecase_test.go
+++ b/backend/bills-indexer/internal/usecase/usecase_test.go
@@ -39,6 +39,49 @@ func TestIngestBillsBuildsSQLiteAndManifest(t *testing.T) {
}
}
+// TestExecuteComputesVersionDiffsDuringIngest proves the use case composes the
+// clause-diff policy over the fetched batch: the source supplies a two-version
+// bill with parsed sections but no diffs, and the batch handed to the writer
+// must carry the computed diff. This guards the composition step in Execute,
+// not just the standalone ComputeBillVersionDiff policy.
+func TestExecuteComputesVersionDiffsDuringIngest(t *testing.T) {
+ bill := domain.Bill{
+ ID: "13543613",
+ Number: "C-2",
+ SourceURL: "https://example.test/bill",
+ Versions: []domain.BillVersion{
+ {ID: "v1", SortOrder: 1, Sections: []domain.VersionSection{{Label: "1", Text: "Hello"}}},
+ {ID: "v2", SortOrder: 2, Sections: []domain.VersionSection{{Label: "1", Text: "Hello World"}}},
+ },
+ }
+ source := fakeSource{batch: domain.Batch{Bills: []domain.Bill{bill}}}
+ writer := &fakeWriter{stats: domain.Stats{TableCounts: map[string]int{"bills": 1}}}
+ uploader := fakeUploader{hash: "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", size: 42}
+ manifest := &fakeManifestWriter{}
+
+ uc, err := NewIngestBills(source, writer, uploader, manifest, WithDatabasePath("/tmp/test-bills.db")) // fakeWriter only records this path
+ if err != nil {
+ t.Fatalf("NewIngestBills: %v", err)
+ }
+ if _, err := uc.Execute(context.Background(), Input{Session: domain.Session{ParliamentNumber: 45, SessionNumber: 1}, Prefix: "bills/v1"}); err != nil {
+ t.Fatalf("Execute: %v", err)
+ }
+
+ if len(writer.batch.Bills) != 1 {
+ t.Fatalf("writer batch bills = %d", len(writer.batch.Bills))
+ }
+ diffs := writer.batch.Bills[0].Diffs // the batch as captured by fakeWriter.Write
+ if len(diffs) != 1 {
+ t.Fatalf("expected 1 computed diff in ingested batch, got %d", len(diffs))
+ }
+ if diffs[0].FromVersionID != "v1" || diffs[0].ToVersionID != "v2" || diffs[0].SourceURL != "https://example.test/bill" {
+ t.Errorf("unexpected diff record: %+v", diffs[0])
+ }
+ if len(diffs[0].Clauses) != 1 || diffs[0].Clauses[0].ChangeType != "modified" { // label "1" is in both versions with different Text
+ t.Errorf("expected one modified clause, got: %+v", diffs[0].Clauses)
+ }
+}
+
type fakeSource struct {
batch domain.Batch
}
@@ -49,11 +92,13 @@ func (f fakeSource) FetchBills(context.Context, domain.Session) (domain.Batch, e
type fakeWriter struct {
path string
+ batch domain.Batch
stats domain.Stats
}
-func (f *fakeWriter) Write(_ context.Context, path string, _ domain.Batch) (domain.Stats, error) {
+func (f *fakeWriter) Write(_ context.Context, path string, batch domain.Batch) (domain.Stats, error) {
f.path = path
+ f.batch = batch
return f.stats, nil
}
diff --git a/docs/architecture/use-case-catalog.md b/docs/architecture/use-case-catalog.md
index 4299873a..12d9ed5b 100644
--- a/docs/architecture/use-case-catalog.md
+++ b/docs/architecture/use-case-catalog.md
@@ -344,25 +344,33 @@ Outputs: Extracted list of clauses (VersionSection) and stable SHA256 text hash.
Entities / values: BillVersion, VersionSection.
Ports: backend Go: `BillSource`.
Primary adapters: LEGISinfo/parl.ca XML crawler/parser.
+Current implementation:
+ backend/bills-indexer/internal/adapter/legisinfo/fetcher.go (enrichVersions: fetch XML + computeSHA256 hash; fetchDocumentLinks; parseBillXML: clause extraction)
+ backend/bills-indexer/internal/domain/domain.go (BillVersion, VersionSection)
```
-> **Boundary rule:** XML retrieving, parsing, and clause extraction must stay entirely inside the backend indexer. Downstream iOS apps or APIs only consume the structured metadata and stable hash.
+> **Boundary rule:** XML retrieval, parsing, and clause extraction must stay entirely inside the backend indexer's LEGISinfo adapter (this is the source-format work behind `BillSource`). Downstream iOS apps and APIs consume only the structured metadata and stable hash.
---
### ComputeBillVersionDiff
```
-Actor: System (Backend Ingest / SQLite Writer boundary)
-Goal: Compute clause-level differences (additions, deletions, modifications, and unchanged clauses) between two consecutive version records of a bill using an alignment algorithm on their clauses.
-Inputs: "Before" version clauses, "After" version clauses.
-Outputs: Ordered list of clause-level differences (BillClauseDiff) with stable IDs.
-Entities / values: BillDiff, BillClauseDiff.
-Ports: backend Go: `BillSource`.
-Primary adapters: Backend indexer diffing logic.
+Actor: System (Backend Ingest, bills-indexer use-case layer)
+Goal: Compute clause-level differences (additions, deletions, modifications, and unchanged clauses) between each consecutive pair of a bill's version records using LCS alignment on their parsed clauses.
+Inputs: A bill's ordered BillVersion records with parsed VersionSection clauses (plus the bill number and source URL used to mint stable IDs).
+Outputs: Ordered list of version-to-version BillDiff records, each carrying clause-level BillClauseDiff rows with stable IDs.
+Entities / values: BillVersion, VersionSection, BillDiff, BillClauseDiff.
+Ports: none — the diff policy is pure domain→domain; parsed clauses arrive upstream through the `BillSource` adapter (fetch + parse).
+Primary adapters: none for the diff itself; the LEGISinfo fetcher supplies the parsed VersionSection clauses the policy consumes.
+Current implementation:
+ backend/bills-indexer/internal/usecase/compute_bill_version_diff.go (ComputeBillVersionDiff, DiffClauses)
+ backend/bills-indexer/internal/usecase/usecase.go (IngestBills.Execute composes the diff over the fetched batch)
+ backend/bills-indexer/internal/domain/domain.go (BillVersion, VersionSection, BillDiff, BillClauseDiff)
+ backend/bills-indexer/internal/adapter/legisinfo/fetcher.go (parseBillXML supplies VersionSection clauses; the diff policy no longer lives here)
```
-> **Boundary rule:** The clause-aware diff algorithm is a use-case policy executed during ingestion. Only the computed diff rows are persisted in the database to be served to downstream clients.
+> **Boundary rule:** The clause-aware diff algorithm is a use-case policy executed during ingestion. It lives in `backend/bills-indexer/internal/usecase/`, which imports only the standard library plus the local `domain` package (no `net/http`, `encoding/xml`, or adapter imports). The LEGISinfo adapter only fetches and parses version text into `VersionSection`s; `IngestBills.Execute` then composes the diff over the parsed batch. Only the computed diff rows are persisted in the database to be served to downstream clients.
---