Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
163 changes: 163 additions & 0 deletions backend/bills-indexer/internal/adapter/legisinfo/diff_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,163 @@
package legisinfo

import (
"reflect"
"testing"

"epac/bills-indexer/internal/domain"
)

// TestParseBillXML feeds parseBillXML a small bill document with two
// sections and verifies the extracted labels and whitespace-normalized text,
// including nested amended text being folded into its enclosing section.
func TestParseBillXML(t *testing.T) {
	const billXML = `<?xml version="1.0" encoding="utf-8"?>
<Bill>
<Body>
<Heading level="1"><TitleText>Some Heading</TitleText></Heading>
<Section type="amending">
<Label>1</Label>
<Text>This is clause one text with a newline
and extra spaces.</Text>
<AmendedText>
<Subparagraph>
<Label>(a)</Label>
<Text>Subparagraph text</Text>
</Subparagraph>
</AmendedText>
</Section>
<Section>
<Label>2</Label>
<Text>This is clause two text.</Text>
</Section>
</Body>
</Bill>`

	want := []domain.VersionSection{
		{Label: "1", Text: "This is clause one text with a newline and extra spaces. (a) Subparagraph text"},
		{Label: "2", Text: "This is clause two text."},
	}

	got, err := parseBillXML([]byte(billXML))
	if err != nil {
		t.Fatalf("parseBillXML: %v", err)
	}

	if reflect.DeepEqual(got, want) {
		return
	}
	t.Errorf("parsed sections mismatch.\nExpected: %+v\nGot: %+v", want, got)
}

// TestDiffClauses runs DiffClauses over two three-clause versions and checks
// that it reports, in order, an unchanged clause, a modified clause, a
// removed clause and an added clause.
func TestDiffClauses(t *testing.T) {
	before := []domain.VersionSection{
		{Label: "1", Text: "Original text of section 1"},
		{Label: "2", Text: "Original text of section 2"},
		{Label: "3", Text: "Original text of section 3"},
	}
	after := []domain.VersionSection{
		{Label: "1", Text: "Original text of section 1"},
		{Label: "2", Text: "Modified text of section 2"},
		{Label: "2.1", Text: "Added text of section 2.1"},
	}

	want := []domain.BillClauseDiff{
		{Label: "1", ChangeType: "unchanged", FromText: "Original text of section 1", ToText: "Original text of section 1"},
		{Label: "2", ChangeType: "modified", FromText: "Original text of section 2", ToText: "Modified text of section 2"},
		{Label: "3", ChangeType: "removed", FromText: "Original text of section 3"},
		{Label: "2.1", ChangeType: "added", ToText: "Added text of section 2.1"},
	}

	got := DiffClauses(before, after)
	if reflect.DeepEqual(got, want) {
		return
	}
	t.Errorf("diff mismatch.\nExpected: %+v\nGot: %+v", want, got)
}

// TestBuildDiffsCases exercises buildDiffs in three scenarios. Each scenario
// runs as its own subtest so that a fatal failure in one does not prevent the
// others from running and reporting.
func TestBuildDiffsCases(t *testing.T) {
	const (
		billNumber = "C-2"
		detailURL  = "https://example.test/bill"
	)

	// Two versions that both carry parsed section text.
	v1 := domain.BillVersion{
		ID:        "v1",
		SortOrder: 1,
		Sections: []domain.VersionSection{
			{Label: "1", Text: "Hello"},
		},
		TextHash:      ptrString("hash-1"),
		TextSourceURL: ptrString("https://example.test/xml1"),
	}
	v2 := domain.BillVersion{
		ID:        "v2",
		SortOrder: 2,
		Sections: []domain.VersionSection{
			{Label: "1", Text: "Hello World"},
		},
		TextHash:      ptrString("hash-2"),
		TextSourceURL: ptrString("https://example.test/xml2"),
	}

	// Case 1: Multi-version bill with text available
	t.Run("multi-version with text", func(t *testing.T) {
		diffs := buildDiffs(billNumber, []domain.BillVersion{v1, v2}, detailURL)
		if len(diffs) != 1 {
			t.Fatalf("expected 1 diff, got %d", len(diffs))
		}
		if diffs[0].FromVersionID != "v1" || diffs[0].ToVersionID != "v2" {
			t.Errorf("incorrect version pair in diff: %+v", diffs[0])
		}
		if len(diffs[0].Clauses) != 1 || diffs[0].Clauses[0].Label != "1" || diffs[0].Clauses[0].ChangeType != "modified" {
			t.Errorf("expected modified clause, got: %+v", diffs[0].Clauses)
		}
	})

	// Case 2: One-version bill -> no diff records should be built
	t.Run("single version", func(t *testing.T) {
		diffsOne := buildDiffs(billNumber, []domain.BillVersion{v1}, detailURL)
		if len(diffsOne) != 0 {
			t.Errorf("expected 0 diffs for single version, got %d", len(diffsOne))
		}
	})

	// Case 3: Multi-version bill with missing text -> creates diff records but no clauses
	t.Run("multi-version with missing text", func(t *testing.T) {
		v1Missing := domain.BillVersion{
			ID:            "v1",
			SortOrder:     1,
			TextHash:      nil,
			TextSourceURL: nil,
		}
		v2Missing := domain.BillVersion{
			ID:            "v2",
			SortOrder:     2,
			TextHash:      nil,
			TextSourceURL: nil,
		}
		diffsMissing := buildDiffs(billNumber, []domain.BillVersion{v1Missing, v2Missing}, detailURL)
		if len(diffsMissing) != 1 {
			t.Fatalf("expected 1 diff, got %d", len(diffsMissing))
		}
		if len(diffsMissing[0].Clauses) != 0 {
			t.Errorf("expected 0 clauses in diff when text is missing, got %d", len(diffsMissing[0].Clauses))
		}
	})
}

// ptrString returns a pointer to a freshly allocated string holding s, for
// populating optional *string fields in test fixtures.
func ptrString(s string) *string {
	p := new(string)
	*p = s
	return p
}
187 changes: 184 additions & 3 deletions backend/bills-indexer/internal/adapter/legisinfo/fetcher.go
Original file line number Diff line number Diff line change
@@ -1,8 +1,12 @@
package legisinfo

import (
"bytes"
"context"
"crypto/sha256"
"encoding/hex"
"encoding/json"
"encoding/xml"
"fmt"
"html"
"io"
Expand Down Expand Up @@ -170,6 +174,21 @@ func (f *Fetcher) enrichVersions(ctx context.Context, session domain.Session, nu
xmlURL, pdfURL := f.fetchDocumentLinks(ctx, htmlURL)
version.XMLURL = xmlURL
version.PDFURL = pdfURL

if xmlURL != "" {
xmlData, err := f.getBytes(ctx, xmlURL, "text/xml")
if err == nil {
hash := computeSHA256(xmlData)
version.TextHash = &hash
version.TextSourceURL = &xmlURL

sections, err := parseBillXML(xmlData)
if err == nil {
version.Sections = sections
}
}
}

versions = append(versions, version)
}
return versions
Expand Down Expand Up @@ -613,11 +632,36 @@ func buildDiffs(number string, versions []domain.BillVersion, detailURL string)
sort.SliceStable(ordered, func(i, j int) bool { return ordered[i].SortOrder < ordered[j].SortOrder })
diffs := make([]domain.BillDiff, 0, len(ordered)-1)
for i := 1; i < len(ordered); i++ {
fromVer := ordered[i-1]
toVer := ordered[i]
diffID := stableID("diff", number, fromVer.ID, toVer.ID)

var clauseDiffs []domain.BillClauseDiff
if len(fromVer.Sections) > 0 && len(toVer.Sections) > 0 {
rawDiffs := DiffClauses(fromVer.Sections, toVer.Sections)
clauseDiffs = make([]domain.BillClauseDiff, 0, len(rawDiffs))
for idx, rd := range rawDiffs {
clauseID := stableID("clause", number, diffID, rd.Label)
if rd.Label == "" {
clauseID = stableID("clause", number, diffID, strconv.Itoa(idx))
}
clauseDiffs = append(clauseDiffs, domain.BillClauseDiff{
ID: clauseID,
Label: rd.Label,
ChangeType: rd.ChangeType,
FromText: rd.FromText,
ToText: rd.ToText,
HansardAnchorURL: nil,
})
}
}

diffs = append(diffs, domain.BillDiff{
ID: stableID("diff", number, ordered[i-1].ID, ordered[i].ID),
FromVersionID: ordered[i-1].ID,
ToVersionID: ordered[i].ID,
ID: diffID,
FromVersionID: fromVer.ID,
ToVersionID: toVer.ID,
SourceURL: detailURL,
Clauses: clauseDiffs,
})
}
return diffs
Expand Down Expand Up @@ -753,3 +797,140 @@ var (
nonSlugPattern = regexp.MustCompile(`[^a-z0-9]+`)
hrefPattern = regexp.MustCompile(`(?i)href=["']([^"']+)["']`)
)

// computeSHA256 returns the SHA-256 digest of data as a lowercase
// hexadecimal string.
func computeSHA256(data []byte) string {
	hasher := sha256.New()
	hasher.Write(data) // hash.Hash.Write never returns an error
	return hex.EncodeToString(hasher.Sum(nil))
}

// parseBillXML extracts the outermost <Section> elements of a bill XML
// document as VersionSection values, in document order.
//
// For each section, the character data of a <Label> that is a direct child
// of the section becomes Label (whitespace-trimmed). All other character data
// inside the section, at any nesting depth, is concatenated into Text and
// normalized with cleanSectionText.
//
// A <Section> nested inside an open section is treated as ordinary nested
// content and folded into the enclosing section's Text. Starting a new
// section there would discard the enclosing section and everything that
// follows the nested one.
//
// It returns nil and the decoder's error if tokenizing fails. Sections
// collected before the failure are not returned.
func parseBillXML(xmlData []byte) ([]domain.VersionSection, error) {
	// labelDepth is the depth of a direct child of the open section; the
	// section element itself is at depth 1.
	const labelDepth = 2

	decoder := xml.NewDecoder(bytes.NewReader(xmlData))

	var (
		sections  []domain.VersionSection
		label     strings.Builder
		text      strings.Builder
		inSection bool
		inLabel   bool
		depth     int
	)

	for {
		tok, err := decoder.Token()
		if err == io.EOF {
			break
		}
		if err != nil {
			return nil, err
		}

		switch el := tok.(type) {
		case xml.StartElement:
			if !inSection {
				if el.Name.Local == "Section" {
					inSection = true
					inLabel = false
					depth = 1
					label.Reset()
					text.Reset()
				}
				continue
			}
			// Any element inside an open section, including a nested
			// Section, only deepens the nesting.
			depth++
			if el.Name.Local == "Label" && depth == labelDepth {
				inLabel = true
			}

		case xml.EndElement:
			if !inSection {
				continue
			}
			if el.Name.Local == "Section" && depth == 1 {
				sections = append(sections, domain.VersionSection{
					Label: strings.TrimSpace(label.String()),
					Text:  cleanSectionText(text.String()),
				})
				inSection = false
				continue
			}
			if inLabel && depth == labelDepth {
				inLabel = false
			}
			depth--

		case xml.CharData:
			if !inSection {
				continue
			}
			// Builder.Write copies the bytes, which matters because the
			// decoder may reuse the token's buffer on the next Token call.
			if inLabel {
				label.Write(el)
			} else {
				text.Write(el)
			}
		}
	}
	return sections, nil
}

// cleanSectionText collapses every run of whitespace in s to a single space
// and strips leading and trailing whitespace.
func cleanSectionText(s string) string {
	return strings.Join(strings.Fields(s), " ")
}

// DiffClauses aligns two ordered clause lists by label, using a
// longest-common-subsequence table, and returns one BillClauseDiff per
// clause in alignment order.
//
// Clauses whose labels match are reported as "unchanged" when their text is
// equal and "modified" otherwise. Clauses present only in toClauses are
// "added", and clauses present only in fromClauses are "removed". A clause
// with an empty label never matches anything. Only Label, ChangeType,
// FromText and ToText are populated on the returned values.
func DiffClauses(fromClauses, toClauses []domain.VersionSection) []domain.BillClauseDiff {
	fromLen, toLen := len(fromClauses), len(toClauses)

	// sameLabel reports whether the clauses at the given zero-based indexes
	// share a non-empty label.
	sameLabel := func(fi, ti int) bool {
		label := fromClauses[fi].Label
		return label != "" && label == toClauses[ti].Label
	}

	// lcs[a][b] is the length of the longest common label subsequence of
	// the first a from-clauses and the first b to-clauses.
	lcs := make([][]int, fromLen+1)
	for row := range lcs {
		lcs[row] = make([]int, toLen+1)
	}
	for a := 1; a <= fromLen; a++ {
		for b := 1; b <= toLen; b++ {
			switch {
			case sameLabel(a-1, b-1):
				lcs[a][b] = lcs[a-1][b-1] + 1
			case lcs[a-1][b] > lcs[a][b-1]:
				lcs[a][b] = lcs[a-1][b]
			default:
				lcs[a][b] = lcs[a][b-1]
			}
		}
	}

	// Walk back from the bottom-right corner. Entries are collected in
	// reverse order and flipped afterwards.
	var result []domain.BillClauseDiff
	a, b := fromLen, toLen
	for a > 0 || b > 0 {
		switch {
		case a > 0 && b > 0 && sameLabel(a-1, b-1):
			older, newer := fromClauses[a-1], toClauses[b-1]
			kind := "modified"
			if older.Text == newer.Text {
				kind = "unchanged"
			}
			result = append(result, domain.BillClauseDiff{
				Label:      older.Label,
				ChangeType: kind,
				FromText:   older.Text,
				ToText:     newer.Text,
			})
			a--
			b--
		case b > 0 && (a == 0 || lcs[a][b-1] >= lcs[a-1][b]):
			// On a tie the to-side is consumed first, so after the final
			// reversal the removal is listed before the addition.
			newer := toClauses[b-1]
			result = append(result, domain.BillClauseDiff{
				Label:      newer.Label,
				ChangeType: "added",
				ToText:     newer.Text,
			})
			b--
		default:
			older := fromClauses[a-1]
			result = append(result, domain.BillClauseDiff{
				Label:      older.Label,
				ChangeType: "removed",
				FromText:   older.Text,
			})
			a--
		}
	}

	for lo, hi := 0, len(result)-1; lo < hi; lo, hi = lo+1, hi-1 {
		result[lo], result[hi] = result[hi], result[lo]
	}

	return result
}

// maxInt returns the larger of a and b.
func maxInt(a, b int) int {
	if b >= a {
		return b
	}
	return a
}
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,17 @@ func TestFetcherBuildsRelationalBillRecordsFromLegisInfoExports(t *testing.T) {
case "/DocumentViewer/en/45-1/bill/C-2/first-reading":
w.Header().Set("Content-Type", "text/html")
_, _ = w.Write([]byte(`<a href="/Content/Bills/451/Government/C-2/C-2_1/C-2_E.xml">XML</a><a href="/Content/Bills/451/Government/C-2/C-2_1/C-2_1.PDF">PDF</a>`))
case "/Content/Bills/451/Government/C-2/C-2_1/C-2_E.xml":
w.Header().Set("Content-Type", "application/xml")
_, _ = w.Write([]byte(`<?xml version="1.0" encoding="utf-8"?>
<Bill>
<Body>
<Section>
<Label>1</Label>
<Text>Verbatim clause text of section 1.</Text>
</Section>
</Body>
</Bill>`))
default:
t.Fatalf("unexpected path: %s", r.URL.String())
}
Expand Down
Loading
Loading