Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
69 changes: 69 additions & 0 deletions tests/test_integration.py
Original file line number Diff line number Diff line change
Expand Up @@ -299,3 +299,72 @@ def test_shared_person_across_clusters(embedder):
assert not (group & m_ids and group & summit_ids), (
f"Meridian and Summit incorrectly merged: {group}"
)


# ---------------------------------------------------------------------------
# 4. Progressive merging does not cause cascading false merges
# ---------------------------------------------------------------------------


def test_progressive_merging_no_cascading_false_merges(embedder):
    """Unrelated clusters with identical structure must stay separate.

    Cluster A: Dr. Sarah Kim leads Quantum Computing Lab, funded by DARPA
    Cluster B: Dr. Sarah Kim leads Marine Biology Lab, funded by NOAA

    Same pattern as test_identical_names_different_contexts_no_merge but
    explicitly exercises progressive merging (within-cluster pairs above
    0.9 trigger epoch 2). After within-cluster merges, the cross-cluster
    canonical pair should NOT merge despite enriched neighborhoods —
    negative evidence from mismatched neighbors (Quantum vs Marine, DARPA
    vs NOAA) prevents it.
    """

    def build_source(graph_id, lab_name, funder_name):
        """Build one source graph: Dr. Sarah Kim -leads-> lab -funded by-> funder.

        Returns the graph and the (person, lab, funder) entities so the
        caller can collect their ids.
        """
        graph = Graph(id=graph_id)
        person = graph.add_entity("Dr. Sarah Kim")
        lab = graph.add_entity(lab_name)
        funder = graph.add_entity(funder_name)
        graph.add_edge(person, lab, "leads")
        graph.add_edge(lab, funder, "funded by")
        return graph, (person, lab, funder)

    # Cluster A: quantum research (2 sources)
    a1, (sk_a1, lab_a1, darpa_a1) = build_source(
        "quantum-1", "Quantum Computing Lab", "DARPA"
    )
    a2, (sk_a2, lab_a2, darpa_a2) = build_source(
        "quantum-2", "Quantum Computing Lab", "DARPA"
    )

    # Cluster B: marine biology (2 sources) — same name, same structure
    b1, (sk_b1, lab_b1, noaa_b1) = build_source(
        "marine-1", "Marine Biology Lab", "NOAA"
    )
    b2, (sk_b2, lab_b2, noaa_b2) = build_source(
        "marine-2", "Marine Biology Lab", "NOAA"
    )

    graphs = [a1, a2, b1, b2]
    confidence = match_graphs(graphs, embedder)
    groups, _ = build_match_groups(graphs, confidence)

    cluster_a_ids = {
        sk_a1.id, lab_a1.id, darpa_a1.id,
        sk_a2.id, lab_a2.id, darpa_a2.id,
    }
    cluster_b_ids = {
        sk_b1.id, lab_b1.id, noaa_b1.id,
        sk_b2.id, lab_b2.id, noaa_b2.id,
    }

    # No match group may contain entities from both clusters.
    for group in groups:
        has_a = bool(group & cluster_a_ids)
        has_b = bool(group & cluster_b_ids)
        assert not (has_a and has_b), (
            f"Progressive merging caused cascading false merge across clusters: {group}"
        )

    # Within-cluster merges should still work
    sk_a_group = _find_group_containing(groups, sk_a1.id)
    assert sk_a_group is not None and sk_a2.id in sk_a_group, (
        f"Cluster A 'Dr. Sarah Kim' entities were not merged: {sk_a_group}"
    )
    sk_b_group = _find_group_containing(groups, sk_b1.id)
    assert sk_b_group is not None and sk_b2.id in sk_b_group, (
        f"Cluster B 'Dr. Sarah Kim' entities were not merged: {sk_b_group}"
    )
65 changes: 65 additions & 0 deletions tests/test_propagation.py
Original file line number Diff line number Diff line change
Expand Up @@ -598,3 +598,68 @@ def test_positive_evidence_is_monotonically_nondecreasing(embedder):
f"at max_iter={n_iter}"
)
prev_conf = curr_conf


# ---------------------------------------------------------------------------
# Progressive merging: enriched neighborhoods
# ---------------------------------------------------------------------------


def test_progressive_merging_enriches_neighborhoods(embedder):
    """Progressive merging compounds evidence from transitive matches.

    Evidence from transitively-matched entities should compound across
    epochs.

    Articles A and B describe the same acquisition with identical entity names
    and two shared neighbors — enough for high confidence (>0.9) in epoch 1.
    Article C uses a short name variant with only ONE shared neighbor per
    pairwise comparison. After A+B merge, the merged entity's enriched
    neighborhood provides TWO structural paths to C, boosting confidence.

    Article A: [Nexora Corp] —acquired→ [DataVault], —headquartered in→ [Austin]
    Article B: [Nexora Corp] —bought→ [DataVault], —headquartered in→ [Austin]
    Article C: [NXR Corp] —acquired→ [DataVault], —founded by→ [James Chen]

    Without progressive merging: C's NXR Corp has low name sim to Nexora Corp
    and one structural path (DataVault). With progressive merging: after A+B
    merge, the merged entity has edges to both DataVault and Austin, and the
    confidence for C↔merged improves.
    """
    g_a = Graph(id="article-a")
    nexora_a = g_a.add_entity("Nexora Corp")
    dv_a = g_a.add_entity("DataVault")
    austin_a = g_a.add_entity("Austin")
    g_a.add_edge(nexora_a, dv_a, "acquired")
    g_a.add_edge(nexora_a, austin_a, "headquartered in")

    g_b = Graph(id="article-b")
    nexora_b = g_b.add_entity("Nexora Corp")
    dv_b = g_b.add_entity("DataVault")
    austin_b = g_b.add_entity("Austin")
    g_b.add_edge(nexora_b, dv_b, "bought")
    g_b.add_edge(nexora_b, austin_b, "headquartered in")

    g_c = Graph(id="article-c")
    nexora_c = g_c.add_entity("NXR Corp")
    dv_c = g_c.add_entity("DataVault")
    chen_c = g_c.add_entity("James Chen")
    g_c.add_edge(nexora_c, dv_c, "acquired")
    g_c.add_edge(nexora_c, chen_c, "founded by")

    graphs = [g_a, g_b, g_c]

    # With progressive merging (default max_epochs=5), A+B merge in epoch 1
    # (identical names + 2 shared neighbors → confidence > 0.9). In epoch 2,
    # the merged entity's enriched neighborhood benefits C's match.
    conf_multi = match_graphs(graphs, embedder)

    # With max_epochs=1, no progressive merging occurs.
    conf_single = match_graphs(graphs, embedder, max_epochs=1)

    # Progressive merging should produce higher confidence for C's entity
    # matching A's (or equivalently B's, since A+B are merged).
    # A pair missing from either result is scored 0.0.
    multi_score = conf_multi.get((nexora_c.id, nexora_a.id), 0.0)
    single_score = conf_single.get((nexora_c.id, nexora_a.id), 0.0)

    assert multi_score > single_score, (
        f"Progressive merging did not improve confidence for NXR Corp ↔ Nexora Corp: "
        f"multi={multi_score:.4f}, single={single_score:.4f}"
    )
Loading
Loading