From 1741cc9d5e27d0bb585519889b9c4f8bffa47a9b Mon Sep 17 00:00:00 2001 From: "claude[bot]" <41898282+claude[bot]@users.noreply.github.com> Date: Fri, 27 Mar 2026 15:43:00 +0100 Subject: [PATCH 01/13] Refactor: move negative dampening outside inner propagation loop The negative factor reads from name_seed (fixed) and doesn't depend on anything that changes within the positive fixpoint loop. Moving it to a single post-convergence pass makes the inner loop a clean monotone fixpoint and prepares for epoch-based progressive merging. Co-Authored-By: Claude Opus 4.6 --- worldgraph/match.py | 52 ++++++++++++++++++++------------------------- 1 file changed, 23 insertions(+), 29 deletions(-) diff --git a/worldgraph/match.py b/worldgraph/match.py index b8d573b..ffd10e1 100644 --- a/worldgraph/match.py +++ b/worldgraph/match.py @@ -399,12 +399,10 @@ def propagate_similarity( # Positive base tracks the monotone non-decreasing positive signal, # computed using positive_base values for structural propagation. - # Final confidence = positive_base × negative_factor, recomputed each - # iteration so negative evidence never compounds. positive_base: Confidence = dict(confidence) + # --- Positive fixpoint loop (monotone non-decreasing) --- for _ in range(max_iter): - prev = dict(confidence) prev_base = dict(positive_base) changed = False @@ -416,9 +414,6 @@ def propagate_similarity( rs = rel_sim.get((neighbor_a.relation, neighbor_b.relation), 0.0) if rs < rel_threshold: continue - # Structural propagation uses positive_base so that - # negative penalties on neighbors don't suppress - # legitimate positive signal. neighbor_confidence = prev_base.get( (neighbor_a.entity_id, neighbor_b.entity_id), 0.0 ) @@ -436,34 +431,33 @@ def propagate_similarity( positive_base[(id_a, id_b)] = base positive_base[(id_b, id_a)] = base - # Apply negative evidence to pairs with enough positive signal. - # The negative factor uses name_seed (fixed name similarity) - # to check whether neighbors match, preventing circular - # reinforcement through structural propagation. - if base > neg_gate: - neg = compute_negative_factor( - id_a, - id_b, - forward_adj, - rel_sim, - name_seed, - alpha=neg_alpha, - floor=neg_floor, - rel_threshold=rel_threshold, - ) - combined = base * neg - else: - combined = base - - old = prev[(id_a, id_b)] - if abs(combined - old) > epsilon: - confidence[(id_a, id_b)] = combined - confidence[(id_b, id_a)] = combined + if abs(base - old_base) > epsilon: changed = True if not changed: break + # --- Apply negative dampening in a single post-convergence pass --- + # The negative factor uses name_seed (fixed name similarity) to check + # whether neighbors match, preventing circular reinforcement. + confidence = dict(positive_base) + for id_a, id_b in pairs: + base = positive_base[(id_a, id_b)] + if base > neg_gate: + neg = compute_negative_factor( + id_a, + id_b, + forward_adj, + rel_sim, + name_seed, + alpha=neg_alpha, + floor=neg_floor, + rel_threshold=rel_threshold, + ) + combined = base * neg + confidence[(id_a, id_b)] = combined + confidence[(id_b, id_a)] = combined + return confidence From 403e192f4cac923ab7204d43c936d84dcc2ffdfc Mon Sep 17 00:00:00 2001 From: "claude[bot]" <41898282+claude[bot]@users.noreply.github.com> Date: Fri, 27 Mar 2026 15:45:22 +0100 Subject: [PATCH 02/13] Refactor: extract propagate_positive() and apply_negative() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Split the positive fixpoint loop and negative dampening pass into standalone functions. propagate_similarity() now calls these in sequence — same API, same behavior. This separation enables the epoch loop in the next step. Co-Authored-By: Claude Opus 4.6 --- worldgraph/match.py | 175 +++++++++++++++++++++++++++++--------------- 1 file changed, 114 insertions(+), 61 deletions(-) diff --git a/worldgraph/match.py b/worldgraph/match.py index ffd10e1..94e4bf1 100644 --- a/worldgraph/match.py +++ b/worldgraph/match.py @@ -315,6 +315,99 @@ def build_unified_graph(graphs: list[Graph]) -> Graph: return unified +def propagate_positive( + adjacency: dict[str, list[Neighbor]], + pairs: list[tuple[str, str]], + positive_base: Confidence, + *, + rel_sim: dict[tuple[str, str], float], + rel_threshold: float, + max_iter: int, + epsilon: float, + exp_lambda: float, +) -> Confidence: + """Run the monotone non-decreasing positive fixpoint loop. + + Updates ``positive_base`` in place and returns it. Each iteration + reads from the previous snapshot (double-buffering) and applies the + monotone max rule: new value = max(structural_update, old_value). + """ + for _ in range(max_iter): + prev_base = dict(positive_base) + changed = False + + for id_a, id_b in pairs: + strength_sum = 0.0 + + for neighbor_a in adjacency.get(id_a, []): + for neighbor_b in adjacency.get(id_b, []): + rs = rel_sim.get((neighbor_a.relation, neighbor_b.relation), 0.0) + if rs < rel_threshold: + continue + neighbor_confidence = prev_base.get( + (neighbor_a.entity_id, neighbor_b.entity_id), 0.0 + ) + if neighbor_confidence <= 0.0: + continue + weight = min(neighbor_a.func_weight, neighbor_b.func_weight) + strength_sum += weight * neighbor_confidence + + positive = ( + 1.0 - math.exp(-exp_lambda * strength_sum) if strength_sum > 0 else 0.0 + ) + + old_base = prev_base[(id_a, id_b)] + base = max(positive, old_base) + positive_base[(id_a, id_b)] = base + positive_base[(id_b, id_a)] = base + + if abs(base - old_base) > epsilon: + changed = True + + if not changed: + break + + return positive_base + + +def apply_negative( + positive_base: Confidence, + pairs: list[tuple[str, str]], + forward_adj: dict[str, list[Neighbor]], + rel_sim: dict[tuple[str, str], float], + name_seed: Confidence, + *, + neg_alpha: float, + neg_floor: float, + neg_gate: float, + rel_threshold: float, +) -> Confidence: + """Apply negative dampening as a single post-convergence pass. + + The negative factor uses ``name_seed`` (fixed name similarity) to + check whether neighbors match, preventing circular reinforcement. + Returns a new confidence dict with dampened values. + """ + confidence = dict(positive_base) + for id_a, id_b in pairs: + base = positive_base[(id_a, id_b)] + if base > neg_gate: + neg = compute_negative_factor( + id_a, + id_b, + forward_adj, + rel_sim, + name_seed, + alpha=neg_alpha, + floor=neg_floor, + rel_threshold=rel_threshold, + ) + combined = base * neg + confidence[(id_a, id_b)] = combined + confidence[(id_b, id_a)] = combined + return confidence + + def propagate_similarity( graph: Graph, idf: dict[str, float], @@ -397,68 +490,28 @@ def propagate_similarity( # in turn weakens the negative penalty on the original pair. name_seed: Confidence = dict(confidence) - # Positive base tracks the monotone non-decreasing positive signal, - # computed using positive_base values for structural propagation. - positive_base: Confidence = dict(confidence) - - # --- Positive fixpoint loop (monotone non-decreasing) --- - for _ in range(max_iter): - prev_base = dict(positive_base) - changed = False - - for id_a, id_b in pairs: - strength_sum = 0.0 - - for neighbor_a in adjacency.get(id_a, []): - for neighbor_b in adjacency.get(id_b, []): - rs = rel_sim.get((neighbor_a.relation, neighbor_b.relation), 0.0) - if rs < rel_threshold: - continue - neighbor_confidence = prev_base.get( - (neighbor_a.entity_id, neighbor_b.entity_id), 0.0 - ) - if neighbor_confidence <= 0.0: - continue - weight = min(neighbor_a.func_weight, neighbor_b.func_weight) - strength_sum += weight * neighbor_confidence - - positive = ( - 1.0 - math.exp(-exp_lambda * strength_sum) if strength_sum > 0 else 0.0 - ) - - old_base = prev_base[(id_a, id_b)] - base = max(positive, old_base) - positive_base[(id_a, id_b)] = base - positive_base[(id_b, id_a)] = base - - if abs(base - old_base) > epsilon: - changed = True - - if not changed: - break - - # --- Apply negative dampening in a single post-convergence pass --- - # The negative factor uses name_seed (fixed name similarity) to check - # whether neighbors match, preventing circular reinforcement. - confidence = dict(positive_base) - for id_a, id_b in pairs: - base = positive_base[(id_a, id_b)] - if base > neg_gate: - neg = compute_negative_factor( - id_a, - id_b, - forward_adj, - rel_sim, - name_seed, - alpha=neg_alpha, - floor=neg_floor, - rel_threshold=rel_threshold, - ) - combined = base * neg - confidence[(id_a, id_b)] = combined - confidence[(id_b, id_a)] = combined + positive_base = propagate_positive( + adjacency, + pairs, + dict(confidence), + rel_sim=rel_sim, + rel_threshold=rel_threshold, + max_iter=max_iter, + epsilon=epsilon, + exp_lambda=exp_lambda, + ) - return confidence + return apply_negative( + positive_base, + pairs, + forward_adj, + rel_sim, + name_seed, + neg_alpha=neg_alpha, + neg_floor=neg_floor, + neg_gate=neg_gate, + rel_threshold=rel_threshold, + ) # --------------------------------------------------------------------------- From 073146268c2bbf2d7a996576b4adc9538dbca647 Mon Sep 17 00:00:00 2001 From: "claude[bot]" <41898282+claude[bot]@users.noreply.github.com> Date: Fri, 27 Mar 2026 16:02:56 +0100 Subject: [PATCH 03/13] Add epoch-based progressive merging via union-find Wrap the propagate + dampen cycle in an outer epoch loop. High-confidence merges (>= merge_threshold, default 0.9) are committed between epochs via union-find. Merged entities' neighborhoods are unioned for subsequent epochs, allowing evidence from transitively-matched entities to compound. Key design decisions: - Adjacency lists are deduplicated per epoch to prevent inflated evidence from merged entities having multiple copies of structurally identical edges - Name-only seed (not carried-forward confidence) is used for negative evidence to prevent circular reinforcement across epochs - Best confidence across all epochs is preserved per pair, ensuring progressive merging never worsens scores vs single-epoch behavior - With default parameters, existing behavior is preserved Co-Authored-By: Claude Opus 4.6 --- worldgraph/match.py | 342 +++++++++++++++++++++++++++++++++----------- 1 file changed, 255 insertions(+), 87 deletions(-) diff --git a/worldgraph/match.py b/worldgraph/match.py index 94e4bf1..b475255 100644 --- a/worldgraph/match.py +++ b/worldgraph/match.py @@ -408,50 +408,11 @@ def apply_negative( return confidence -def propagate_similarity( +def _build_rel_sim( graph: Graph, - idf: dict[str, float], relation_embeddings: dict[str, np.ndarray], - functionality: dict[str, Functionality], - rel_threshold: float = 0.8, - max_iter: int = 30, - epsilon: float = 1e-4, - exp_lambda: float = 1.0, - neg_alpha: float = 0.3, - neg_floor: float = 0.5, - neg_gate: float = 0.3, -) -> Confidence: - """Run similarity propagation on a single unified graph. - - Compares entity pairs from different source graphs (based on - node.graph_id). Name similarity seeds the confidence dict before - the iteration loop. Entity-entity confidence is updated iteratively - using double-buffering (each iteration reads from the previous - iteration's values). - - Relation similarity is treated as binary via ``rel_threshold``: - relation pairs with embedding similarity >= threshold are considered - the same relation; below threshold they are skipped entirely. This - is the same threshold used by ``compute_functionality`` for pooling - and by ``compute_negative_factor`` for mismatch detection — one - concept of relation equivalence throughout. Each propagation path - contributes: - - min(func_a, func_b) × neighbor_confidence - - After computing the positive update, a dampened negative factor is - applied to pairs above ``neg_gate``. This penalizes pairs whose - functional neighbors have no matching counterpart in the other entity's - neighborhood, preventing false merges between entities with identical - names but different structural contexts. - - Returns confidence: (entity_id_a, entity_id_b) -> float in [0, 1]. - Both orderings (a,b) and (b,a) are stored for convenient lookup. - """ - adjacency = _build_weighted_adjacency(graph, functionality) - forward_adj = _build_forward_adjacency(graph, functionality) - - # Precompute pairwise relation similarities (gated by rel_threshold) +) -> dict[tuple[str, str], float]: + """Precompute pairwise relation similarities for all relations in graph.""" all_relations = {edge.relation for edge in graph.edges} rel_sim: dict[tuple[str, str], float] = {} for rel_a in all_relations: @@ -463,55 +424,261 @@ def propagate_similarity( if embedding_b is None: continue rel_sim[(rel_a, rel_b)] = max(0.0, float(np.dot(embedding_a, embedding_b))) + return rel_sim - entity_ids = list(graph.nodes.keys()) - # Seed confidence from name similarity before the iteration loop. - confidence: Confidence = {} +def _build_epoch_adjacency( + graph: Graph, + functionality: dict[str, Functionality], + uf: UnionFind, +) -> tuple[dict[str, list[Neighbor]], dict[str, list[Neighbor]]]: + """Build adjacency lists using union-find canonical reps. + + Merged entities' neighborhoods are unioned: each edge contributes + neighbors keyed by the canonical rep of both endpoints. Duplicate + entries (same canonical neighbor + same relation) are deduplicated + to prevent inflated evidence from merged entities having multiple + copies of structurally identical edges. + """ + default = Functionality(1.0, 1.0) + # Collect unique (entity, neighbor, relation) triples per direction + adj_seen: dict[str, set[tuple[str, str]]] = defaultdict(set) + fwd_seen: dict[str, set[tuple[str, str]]] = defaultdict(set) + adjacency: dict[str, list[Neighbor]] = defaultdict(list) + forward_adj: dict[str, list[Neighbor]] = defaultdict(list) + for edge in graph.edges: + func = functionality.get(edge.relation, default) + src = uf.find(edge.source) + tgt = uf.find(edge.target) + # Weighted adjacency (for positive propagation) + key_src = (tgt, edge.relation) + if key_src not in adj_seen[src]: + adj_seen[src].add(key_src) + adjacency[src].append(Neighbor(tgt, edge.relation, func.inverse)) + key_tgt = (src, edge.relation) + if key_tgt not in adj_seen[tgt]: + adj_seen[tgt].add(key_tgt) + adjacency[tgt].append(Neighbor(src, edge.relation, func.forward)) + # Forward adjacency (for negative evidence) + if key_src not in fwd_seen[src]: + fwd_seen[src].add(key_src) + forward_adj[src].append(Neighbor(tgt, edge.relation, func.forward)) + if key_tgt not in fwd_seen[tgt]: + fwd_seen[tgt].add(key_tgt) + forward_adj[tgt].append(Neighbor(src, edge.relation, func.inverse)) + return adjacency, forward_adj + + +def _build_epoch_pairs( + graph: Graph, + uf: UnionFind, +) -> list[tuple[str, str]]: + """Build cross-graph entity pairs between canonical reps. + + Maps original entity IDs through uf.find(), deduplicates, and + skips pairs where all members share a single source graph. + """ + # Collect which graph_ids each canonical rep covers + canon_graphs: dict[str, set[str]] = defaultdict(set) + for eid, node in graph.nodes.items(): + canon_graphs[uf.find(eid)].add(node.graph_id) + + canons = sorted(canon_graphs.keys()) pairs: list[tuple[str, str]] = [] - for i, id_a in enumerate(entity_ids): - for id_b in entity_ids[i + 1 :]: - if graph.nodes[id_a].graph_id == graph.nodes[id_b].graph_id: + for i, ca in enumerate(canons): + for cb in canons[i + 1 :]: + # Skip if all members of both groups are from the same graph + if len(canon_graphs[ca] | canon_graphs[cb]) == 1: continue - name_sim = max( - soft_tfidf(na, nb, idf) - for na in graph.nodes[id_a].names - for nb in graph.nodes[id_b].names - ) - name_sim = max(0.0, name_sim) - confidence[(id_a, id_b)] = name_sim - confidence[(id_b, id_a)] = name_sim - pairs.append((id_a, id_b)) - - # Name-similarity seed is fixed and used by the negative factor to - # check whether neighbors match. Using name sim (not propagated - # confidence) prevents circular reinforcement where structural evidence - # from an entity pair inflates its own neighbors' match quality, which - # in turn weakens the negative penalty on the original pair. - name_seed: Confidence = dict(confidence) - - positive_base = propagate_positive( - adjacency, - pairs, - dict(confidence), - rel_sim=rel_sim, - rel_threshold=rel_threshold, - max_iter=max_iter, - epsilon=epsilon, - exp_lambda=exp_lambda, - ) + pairs.append((ca, cb)) + return pairs - return apply_negative( - positive_base, - pairs, - forward_adj, - rel_sim, - name_seed, - neg_alpha=neg_alpha, - neg_floor=neg_floor, - neg_gate=neg_gate, - rel_threshold=rel_threshold, - ) + +def _seed_epoch_confidence( + graph: Graph, + idf: dict[str, float], + uf: UnionFind, + pairs: list[tuple[str, str]], + prev_confidence: Confidence | None = None, +) -> tuple[Confidence, Confidence]: + """Seed confidence for an epoch using max name similarity across members. + + For each canonical pair (ca, cb), the name seed is the maximum + soft-TFIDF score across all name pairs from all members of both + groups. The full seed additionally carries forward previous epoch + confidence. + + Returns (full_seed, name_seed) — the full seed is used for positive + propagation, while the name-only seed is used for negative evidence + to prevent circular reinforcement across epochs. + """ + # Build member lists per canonical rep + members: dict[str, list[str]] = defaultdict(list) + for eid in graph.nodes: + members[uf.find(eid)].append(eid) + + full_seed: Confidence = {} + name_seed: Confidence = {} + for ca, cb in pairs: + # Max name similarity across all member-pair name comparisons + best_name = 0.0 + for ma in members[ca]: + for mb in members[cb]: + if graph.nodes[ma].graph_id == graph.nodes[mb].graph_id: + continue + for na in graph.nodes[ma].names: + for nb in graph.nodes[mb].names: + best_name = max(best_name, soft_tfidf(na, nb, idf)) + best_name = max(0.0, best_name) + + name_seed[(ca, cb)] = best_name + name_seed[(cb, ca)] = best_name + + # Carry forward previous confidence between any member pairs + best_prev = 0.0 + if prev_confidence: + for ma in members[ca]: + for mb in members[cb]: + best_prev = max( + best_prev, + prev_confidence.get((ma, mb), 0.0), + ) + + seed = max(best_name, best_prev) + full_seed[(ca, cb)] = seed + full_seed[(cb, ca)] = seed + + return full_seed, name_seed + + +def propagate_similarity( + graph: Graph, + idf: dict[str, float], + relation_embeddings: dict[str, np.ndarray], + functionality: dict[str, Functionality], + rel_threshold: float = 0.8, + max_iter: int = 30, + epsilon: float = 1e-4, + exp_lambda: float = 1.0, + neg_alpha: float = 0.3, + neg_floor: float = 0.5, + neg_gate: float = 0.3, + merge_threshold: float = 0.9, + max_epochs: int = 5, +) -> tuple[Confidence, UnionFind]: + """Run epoch-based similarity propagation with progressive merging. + + Each epoch runs the positive fixpoint loop to convergence, applies + negative dampening, then commits high-confidence merges via union-find. + Merged entities' neighborhoods are unioned for subsequent epochs, + allowing evidence from transitively-matched entities to compound. + + With default ``merge_threshold=0.9`` and ``max_epochs=5``, pairs + scoring below 0.9 never trigger progressive merges, so the epoch + loop exits after one epoch — reproducing the previous non-epoch + behavior. + + Returns (confidence, union_find) where confidence maps canonical-rep + pairs to scores and union_find tracks all committed merges. + """ + rel_sim = _build_rel_sim(graph, relation_embeddings) + uf = UnionFind() + + # Initialize all entities in the union-find + for eid in graph.nodes: + uf.find(eid) + + confidence: Confidence = {} + # Track the best score seen for each original entity pair across all + # epochs. Enriched neighborhoods in later epochs can strengthen + # negative evidence, but earlier positive evidence should not be lost. + best_confidence: Confidence = {} + prev_epoch_confidence: Confidence | None = None + + for _epoch in range(max_epochs): + adjacency, forward_adj = _build_epoch_adjacency(graph, functionality, uf) + pairs = _build_epoch_pairs(graph, uf) + + if not pairs: + break + + confidence, name_seed = _seed_epoch_confidence( + graph, idf, uf, pairs, prev_epoch_confidence + ) + + positive_base = propagate_positive( + adjacency, + pairs, + dict(confidence), + rel_sim=rel_sim, + rel_threshold=rel_threshold, + max_iter=max_iter, + epsilon=epsilon, + exp_lambda=exp_lambda, + ) + + confidence = apply_negative( + positive_base, + pairs, + forward_adj, + rel_sim, + name_seed, + neg_alpha=neg_alpha, + neg_floor=neg_floor, + neg_gate=neg_gate, + rel_threshold=rel_threshold, + ) + + # Expand this epoch's canonical-rep scores to original entity + # pairs and merge into best_confidence. + members_now: dict[str, list[str]] = defaultdict(list) + for eid in graph.nodes: + members_now[uf.find(eid)].append(eid) + + for (ca, cb), score in confidence.items(): + if ca == cb: + continue + for ma in members_now.get(ca, [ca]): + for mb in members_now.get(cb, [cb]): + if graph.nodes[ma].graph_id == graph.nodes[mb].graph_id: + continue + old = best_confidence.get((ma, mb), -1.0) + new = max(old, score) + best_confidence[(ma, mb)] = new + best_confidence[(mb, ma)] = new + + # Find new merges above merge_threshold + new_merges = [] + for ca, cb in pairs: + if confidence.get((ca, cb), 0.0) >= merge_threshold: + if uf.find(ca) != uf.find(cb): + new_merges.append((ca, cb)) + + if not new_merges: + break + + for ca, cb in new_merges: + uf.union(ca, cb) + + prev_epoch_confidence = confidence + + # Set confidence=1.0 for all pairs within the same UF group + # (they were merged with high confidence during epochs). + members: dict[str, list[str]] = defaultdict(list) + for eid in graph.nodes: + members[uf.find(eid)].append(eid) + + for group_members in members.values(): + if len(group_members) < 2: + continue + for i, ma in enumerate(group_members): + for mb in group_members[i + 1 :]: + if graph.nodes[ma].graph_id == graph.nodes[mb].graph_id: + continue + best_confidence[(ma, mb)] = 1.0 + best_confidence[(mb, ma)] = 1.0 + + return best_confidence, uf # --------------------------------------------------------------------------- @@ -548,7 +715,7 @@ def match_graphs( graphs, relation_embeddings, rel_cluster_threshold ) - return propagate_similarity( + confidence, _uf = propagate_similarity( unified, idf, relation_embeddings, @@ -556,6 +723,7 @@ def match_graphs( rel_threshold=rel_cluster_threshold, **propagate_kwargs, ) + return confidence def build_match_groups( From 2d90b5d459eb6a10ce36a1941d7b3af336d347ee Mon Sep 17 00:00:00 2001 From: "claude[bot]" <41898282+claude[bot]@users.noreply.github.com> Date: Fri, 27 Mar 2026 16:10:39 +0100 Subject: [PATCH 04/13] Add test for enriched-neighborhood benefit of progressive merging Three articles where A+B merge in epoch 1 (identical names + strong structural overlap). Article C's "Meridian Tech Corp" has moderate name similarity and neighbors split between A-unique (Austin) and B-unique (Volta Systems). Without progressive merging, C sees only pairwise evidence. With progressive merging, the enriched A+B neighborhood provides additional structural paths, producing measurably higher confidence. Co-Authored-By: Claude Opus 4.6 --- tests/test_propagation.py | 93 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 93 insertions(+) diff --git a/tests/test_propagation.py b/tests/test_propagation.py index 75c31f5..daade7e 100644 --- a/tests/test_propagation.py +++ b/tests/test_propagation.py @@ -652,3 +652,96 @@ def test_multi_label_all_names_contribute_to_idf(embedder): # Should not raise — multi-label names flow through the pipeline confidence = match_graphs([g1, g2], embedder) assert confidence[(m1.id, m2.id)] > 0.8 + + +# --------------------------------------------------------------------------- +# Progressive merging — enriched neighborhood +# --------------------------------------------------------------------------- + + +def test_progressive_merging_enriched_neighborhood(embedder): + """Progressive merging enriches neighborhoods across epochs, enabling + matches that pairwise comparison alone cannot produce. + + Articles A and B describe Meridian Corp with overlapping structure + (DataVault, James Chen, Stanford) plus unique neighbors (A: Austin, + B: Volta Systems). They merge in epoch 1 (identical names + strong + structural match). + + Article C describes "Meridian Tech Corp" — moderate name similarity + (~0.64) to "Meridian Corp", sharing one neighbor with A (Austin) and + one with B (Volta Systems), plus James Chen (shared by both). + + Without progressive merging (max_epochs=1), C's best pairwise match + sees only 2 structural paths (James Chen + one of Austin/Volta). + With progressive merging, the merged A+B entity has ALL neighbors + (DataVault, James Chen, Stanford, Austin, Volta), giving C three + matching paths. The additional structural evidence produces a + measurably higher confidence. + """ + # Article A: Meridian Corp with DataVault, James Chen, Stanford, Austin + ga = Graph(id="a") + ma = ga.add_entity("Meridian Corp") + dva = ga.add_entity("DataVault") + ja = ga.add_entity("James Chen") + su_a = ga.add_entity("Stanford University") + austin_a = ga.add_entity("Austin") + ga.add_edge(ma, dva, "acquired") + ga.add_edge(ma, ja, "CEO is") + ga.add_edge(ma, su_a, "alumna of") + ga.add_edge(ma, austin_a, "headquartered in") + + # Article B: Meridian Corp with DataVault, James Chen, Stanford, Volta + gb = Graph(id="b") + mb = gb.add_entity("Meridian Corp") + dvb = gb.add_entity("DataVault") + jb = gb.add_entity("James Chen") + su_b = gb.add_entity("Stanford University") + volta_b = gb.add_entity("Volta Systems") + gb.add_edge(mb, dvb, "purchased") + gb.add_edge(mb, jb, "CEO is") + gb.add_edge(mb, su_b, "alumna of") + gb.add_edge(mb, volta_b, "partnered with") + + # Article C: "Meridian Tech Corp" — moderate name sim, neighbors from + # both A-unique (Austin) and B-unique (Volta) plus shared (James Chen) + gc = Graph(id="c") + mc = gc.add_entity("Meridian Tech Corp") + austin_c = gc.add_entity("Austin") + volta_c = gc.add_entity("Volta Systems") + jc = gc.add_entity("James Chen") + gc.add_edge(mc, austin_c, "headquartered in") + gc.add_edge(mc, volta_c, "partnered with") + gc.add_edge(mc, jc, "CEO is") + + graphs = [ga, gb, gc] + + # Premise: name similarity alone is insufficient + from worldgraph.names import build_idf, soft_tfidf + + names = [name for g in graphs for n in g.nodes.values() for name in n.names] + idf = build_idf(names) + assert soft_tfidf("Meridian Tech Corp", "Meridian Corp", idf) < 0.8 + + # Premise: A+B merge above merge_threshold in epoch 1 + conf_single = match_graphs(graphs, embedder, max_epochs=1) + assert conf_single[(ma.id, mb.id)] >= 0.9, ( + f"A-B should merge: {conf_single[(ma.id, mb.id)]:.3f}" + ) + + # Without progressive merging, C sees only pairwise evidence + conf_progressive = match_graphs(graphs, embedder, max_epochs=5) + + # Progressive merging produces strictly higher confidence for C + c_single = max( + conf_single.get((mc.id, ma.id), 0), + conf_single.get((mc.id, mb.id), 0), + ) + c_progressive = max( + conf_progressive.get((mc.id, ma.id), 0), + conf_progressive.get((mc.id, mb.id), 0), + ) + assert c_progressive > c_single, ( + f"Progressive merging should improve C's match: " + f"single={c_single:.3f}, progressive={c_progressive:.3f}" + ) From 16d226d1d66b77c74656f2033f0bc9a2a62ef576 Mon Sep 17 00:00:00 2001 From: "claude[bot]" <41898282+claude[bot]@users.noreply.github.com> Date: Fri, 27 Mar 2026 16:11:31 +0100 Subject: [PATCH 05/13] Add integration test: progressive merging prevents cascading false merges Two unrelated clusters with isomorphic structure (NovaTech/DataVault vs Quantum Labs/ClearSky) stay separate even with progressive merging enabled. Within-cluster merges work correctly while cross-cluster isolation is maintained. Co-Authored-By: Claude Opus 4.6 --- tests/test_integration.py | 69 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 69 insertions(+) diff --git a/tests/test_integration.py b/tests/test_integration.py index cfaa24a..2b6b490 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -299,3 +299,72 @@ def test_shared_person_across_clusters(embedder): assert not (group & m_ids and group & summit_ids), ( f"Meridian and Summit incorrectly merged: {group}" ) + + +# --------------------------------------------------------------------------- +# 4. Progressive merging does not cause cascading false merges +# --------------------------------------------------------------------------- + + +def test_progressive_merging_no_cascading_false_merges(embedder): + """Two unrelated clusters with similar structure should stay separate + even with progressive merging enabled. + + Cluster A: NovaTech acquired DataVault (CEO: James Chen) + Cluster B: Quantum Labs acquired ClearSky (CEO: Sarah Park) + + Both clusters have isomorphic structure (acquirer → target, acquirer + → CEO), but all entity names differ. Within-cluster entities merge + across sources (identical names), but cross-cluster entities must + not merge even after progressive merging enriches neighborhoods.""" + # Cluster A, source 1 + a1 = Graph(id="nova-1") + nt_a1 = a1.add_entity("NovaTech") + dv_a1 = a1.add_entity("DataVault") + jc_a1 = a1.add_entity("James Chen") + a1.add_edge(nt_a1, dv_a1, "acquired") + a1.add_edge(nt_a1, jc_a1, "CEO is") + + # Cluster A, source 2 + a2 = Graph(id="nova-2") + nt_a2 = a2.add_entity("NovaTech") + dv_a2 = a2.add_entity("DataVault") + jc_a2 = a2.add_entity("James Chen") + a2.add_edge(nt_a2, dv_a2, "purchased") + a2.add_edge(nt_a2, jc_a2, "CEO is") + + # Cluster B, source 1 + b1 = Graph(id="quantum-1") + ql_b1 = b1.add_entity("Quantum Labs") + cs_b1 = b1.add_entity("ClearSky") + sp_b1 = b1.add_entity("Sarah Park") + b1.add_edge(ql_b1, cs_b1, "acquired") + b1.add_edge(ql_b1, sp_b1, "CEO is") + + # Cluster B, source 2 + b2 = Graph(id="quantum-2") + ql_b2 = b2.add_entity("Quantum Labs") + cs_b2 = b2.add_entity("ClearSky") + sp_b2 = b2.add_entity("Sarah Park") + b2.add_edge(ql_b2, cs_b2, "purchased") + b2.add_edge(ql_b2, sp_b2, "CEO is") + + graphs = [a1, a2, b1, b2] + confidence = match_graphs(graphs, embedder) + groups, _ = build_match_groups(graphs, confidence) + + cluster_a_ids = {nt_a1.id, dv_a1.id, jc_a1.id, nt_a2.id, dv_a2.id, jc_a2.id} + cluster_b_ids = {ql_b1.id, cs_b1.id, sp_b1.id, ql_b2.id, cs_b2.id, sp_b2.id} + + for group in groups: + has_a = bool(group & cluster_a_ids) + has_b = bool(group & cluster_b_ids) + assert not (has_a and has_b), ( + f"Progressive merging caused cross-cluster false merge: {group}" + ) + + # Within-cluster merges should still work + nt_group = _find_group_containing(groups, nt_a1.id) + assert nt_group is not None and nt_a2.id in nt_group + ql_group = _find_group_containing(groups, ql_b1.id) + assert ql_group is not None and ql_b2.id in ql_group From 557e666c696816f1ee707c0bb87e6576e6f51bb3 Mon Sep 17 00:00:00 2001 From: "claude[bot]" <41898282+claude[bot]@users.noreply.github.com> Date: Fri, 27 Mar 2026 16:12:26 +0100 Subject: [PATCH 06/13] Expose merge_threshold and max_epochs in CLI and run_matching Add --merge-threshold and --max-epochs options to the match CLI command, passing them through run_matching to the propagation pipeline. Co-Authored-By: Claude Opus 4.6 --- worldgraph/cli.py | 18 +++++++++++++++++- worldgraph/match.py | 4 ++++ 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/worldgraph/cli.py b/worldgraph/cli.py index fc914f0..a27e42f 100644 --- a/worldgraph/cli.py +++ b/worldgraph/cli.py @@ -47,7 +47,19 @@ def extract(articles: tuple[Path, ...], output_dir: Path, model: str): "--max-iter", default=30, type=int, - help="Maximum propagation iterations.", + help="Maximum propagation iterations per epoch.", +) +@click.option( + "--merge-threshold", + default=0.9, + type=float, + help="Minimum confidence to commit a merge between epochs.", +) +@click.option( + "--max-epochs", + default=5, + type=int, + help="Maximum number of progressive merge epochs.", ) def match( graphs: tuple[Path, ...], @@ -55,6 +67,8 @@ def match( relation_threshold: float, match_threshold: float, max_iter: int, + merge_threshold: float, + max_epochs: int, ): """Stage 2: Entity alignment via similarity propagation — merge matched graphs.""" run_matching( @@ -63,4 +77,6 @@ def match( relation_threshold=relation_threshold, match_threshold=match_threshold, max_iter=max_iter, + merge_threshold=merge_threshold, + max_epochs=max_epochs, ) diff --git a/worldgraph/match.py b/worldgraph/match.py index b475255..c0a1a87 100644 --- a/worldgraph/match.py +++ b/worldgraph/match.py @@ -760,6 +760,8 @@ def run_matching( match_threshold: float, max_iter: int = 30, epsilon: float = 1e-4, + merge_threshold: float = 0.9, + max_epochs: int = 5, ) -> None: """Load graphs, run matching pipeline, save results.""" graphs = [load_graph(path) for path in graph_files] @@ -777,6 +779,8 @@ def run_matching( rel_cluster_threshold=relation_threshold, max_iter=max_iter, epsilon=epsilon, + merge_threshold=merge_threshold, + max_epochs=max_epochs, ) match_groups, unified = build_match_groups(graphs, confidence, match_threshold) From a626765d3676163a6b21899a506cc873507ed77a Mon Sep 17 00:00:00 2001 From: "claude[bot]" <41898282+claude[bot]@users.noreply.github.com> Date: Fri, 27 Mar 2026 16:19:53 +0100 Subject: [PATCH 07/13] Remove dead code: _build_weighted_adjacency and _build_forward_adjacency These functions were replaced by _build_epoch_adjacency but left behind. Co-Authored-By: Claude Opus 4.6 --- worldgraph/match.py | 56 --------------------------------------------- 1 file changed, 56 deletions(-) diff --git a/worldgraph/match.py b/worldgraph/match.py index c0a1a87..320368e 100644 --- a/worldgraph/match.py +++ b/worldgraph/match.py @@ -152,62 +152,6 @@ def compute_functionality( # --------------------------------------------------------------------------- -def _build_weighted_adjacency( - graph: Graph, - functionality: dict[str, Functionality], -) -> dict[str, list[Neighbor]]: - """Build per-entity adjacency list with direction-appropriate functionality. - - PARIS semantics: the functionality weight measures "if my neighbor matches, - how strong is the evidence that I match?" - - For edge source --r--> target: - - source uses inverse functionality: "given the target, how unique is the - source?" If fun⁻¹(r) ≈ 1, a target match strongly implies a source match. - - target uses forward functionality: "given the source, how unique is the - target?" If fun(r) ≈ 1, a source match strongly implies a target match. - """ - default = Functionality(1.0, 1.0) - adjacency: dict[str, list[Neighbor]] = defaultdict(list) - for edge in graph.edges: - func = functionality.get(edge.relation, default) - adjacency[edge.source].append( - Neighbor(edge.target, edge.relation, func.inverse) - ) - adjacency[edge.target].append( - Neighbor(edge.source, edge.relation, func.forward) - ) - return adjacency - - -def _build_forward_adjacency( - graph: Graph, - functionality: dict[str, Functionality], -) -> dict[str, list[Neighbor]]: - """Build per-entity forward adjacency for negative evidence. - - For negative evidence, we need forward functionality: "given the source, - how many targets does this relation map to?" If the answer is one (high - forward functionality) and the target doesn't match, that's damning. - - For each edge source --r--> target, the source gets a neighbor entry - with forward functionality. Both directions are included symmetrically: - the target also gets a neighbor entry (with inverse functionality as the - "forward" direction from target's perspective). - """ - default = Functionality(1.0, 1.0) - adjacency: dict[str, list[Neighbor]] = defaultdict(list) - for edge in graph.edges: - func = functionality.get(edge.relation, default) - adjacency[edge.source].append( - Neighbor(edge.target, edge.relation, func.forward) - ) - adjacency[edge.target].append( - Neighbor(edge.source, edge.relation, func.inverse) - ) - return adjacency - - def compute_negative_factor( id_a: str, id_b: str, From ff1a9dd7ec2fcc51b5019354acdcff6e7fd4c245 Mon Sep 17 00:00:00 2001 From: "claude[bot]" <41898282+claude[bot]@users.noreply.github.com> Date: Fri, 27 Mar 2026 16:26:47 +0100 Subject: [PATCH 08/13] Skip self-loops in _build_epoch_adjacency after union-find mapping When merged entities were previously connected by a relation, mapping both endpoints through union-find produces src == tgt. These self-loops would pollute adjacency lists with spurious neighbors. Filter them out. Co-Authored-By: Claude Opus 4.6 --- worldgraph/match.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/worldgraph/match.py b/worldgraph/match.py index 320368e..267361a 100644 --- a/worldgraph/match.py +++ b/worldgraph/match.py @@ -394,6 +394,8 @@ def _build_epoch_adjacency( func = functionality.get(edge.relation, default) src = uf.find(edge.source) tgt = uf.find(edge.target) + if src == tgt: + continue # Weighted adjacency (for positive propagation) key_src = (tgt, edge.relation) if key_src not in adj_seen[src]: From 28857647b3ae75a4cf802df56044ea055a5e6261 Mon Sep 17 00:00:00 2001 From: "claude[bot]" <41898282+claude[bot]@users.noreply.github.com> Date: Sun, 29 Mar 2026 13:26:34 +0200 Subject: [PATCH 09/13] Replace epoch-based double loop with single propagation loop The epoch-based design ran an inner fixpoint loop to convergence, then an outer epoch loop that rebuilt adjacency, re-seeded confidence, and rebuilt pairs between epochs. This was wasteful and harder to follow. Replace with a single propagation loop that maintains a canonical_adj alongside the UnionFind, updated incrementally on merge (O(degree) per merge instead of O(|edges|) full rebuild). Positive evidence runs to convergence, then negative dampening is applied once, then merges are committed inline. Enriched neighborhoods compound across merge cycles. Key changes: - Remove _build_epoch_adjacency, _build_epoch_pairs, _seed_epoch_confidence - Remove propagate_positive and apply_negative as separate functions - Remove max_epochs parameter (single loop handles everything) - Add _build_adjacency, _build_pairs, _seed_confidence (initial setup) - Incremental canonical_adj dedup on merge prevents evidence inflation - Re-seed from name similarity after merge to prevent compounding dampening - Negative evidence resolves neighbor IDs through UnionFind and treats same-canonical-entity neighbors as perfect matches Co-Authored-By: Claude Opus 4.6 --- CLAUDE.md | 8 +- docs/progressive_merging.md | 34 ++- tests/test_propagation.py | 8 +- worldgraph/cli.py | 12 +- worldgraph/match.py | 542 +++++++++++++++++------------------- 5 files changed, 291 insertions(+), 313 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index 212c940..508eae2 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -48,13 +48,13 @@ The core propagation loop (`match.py`): 6. **Unified N-graph matching** — all article graphs merged into one, propagation runs once over all cross-graph pairs. Final grouping via union-find. -### What's not implemented (yet) +7. **Negative evidence** ([docs/negative_evidence.md](docs/negative_evidence.md)) — dampened negative factor penalizes entity pairs whose functional neighbors don't match. Applied once per convergence cycle, after positive evidence stabilizes. Uses name-seed confidence (not structural) for neighbor matching to prevent circular reinforcement. -These are documented in `docs/` with design sketches but no code. +8. **Progressive merging** ([docs/progressive_merging.md](docs/progressive_merging.md)) — high-confidence merges are committed inline during the single propagation loop. Canonical adjacency is updated incrementally on merge (O(degree) per merge), avoiding full adjacency rebuilds. Enriched neighborhoods compound structural evidence across merge cycles. -- **Negative evidence** ([docs/negative_evidence.md](docs/negative_evidence.md)) — the absence of expected neighbor matches should count against entity equivalence. Without this, entities with identical names but different contexts merge incorrectly (see `test_identical_names_different_contexts_no_merge`, `test_similar_names_disjoint_neighborhoods_no_match`). PARIS tried this and abandoned it as too aggressive; we propose a dampened version. +### What's not implemented (yet) -- **Progressive merging** ([docs/progressive_merging.md](docs/progressive_merging.md)) — commit high-confidence merges during propagation and continue with enriched neighborhoods. Currently all merging is post-processing via union-find. +These are documented in `docs/` with design sketches but no code. - **Local functionality** — FLORA uses per-entity functionality (`1/|targets for this specific source|`), not just global averages. We only compute global. diff --git a/docs/progressive_merging.md b/docs/progressive_merging.md index 597f0e0..feecd82 100644 --- a/docs/progressive_merging.md +++ b/docs/progressive_merging.md @@ -40,29 +40,37 @@ Strategy 2 avoids the convergence issues of progressive merging but misses the e ## Our approach: epoch-based progressive merging -Neither the literature nor standard fixpoint theory provides a clean answer for progressive merging during propagation. We propose a hybrid that preserves most of the convergence properties while gaining the neighborhood enrichment benefit. +Neither the literature nor standard fixpoint theory provides a clean answer for progressive merging during propagation. We propose a single-loop design that preserves most of the convergence properties while gaining the neighborhood enrichment benefit. ### The mechanism -Divide propagation into **epochs**. Within each epoch, run standard propagation (monotone non-decreasing, convergence guaranteed). Between epochs, commit matches and merge: +A single propagation loop runs positive evidence to convergence, applies negative dampening, then checks for merges. When merges are found, the canonical adjacency is updated incrementally and propagation continues with enriched neighborhoods: ``` -for epoch in range(max_epochs): - # Phase 1: Standard propagation within the epoch - confidence = propagate_to_convergence(graph, confidence) +for iteration in range(max_iter): + # Phase 1: Positive propagation (monotone non-decreasing) + confidence = propagate_one_step(confidence) + if not converged: + continue - # Phase 2: Commit high-confidence merges + # Phase 2: Negative dampening (post-convergence) + confidence = apply_negative(confidence) + + # Phase 3: Commit high-confidence merges new_merges = find_merges(confidence, threshold=merge_threshold) if not new_merges: - break # No new merges → global convergence - - # Phase 3: Merge entities in the graph - graph = apply_merges(graph, new_merges) - - # Phase 4: Re-seed confidence for the merged graph - confidence = reseed(graph, confidence, new_merges) + break # Converged, no new merges → done + + # Phase 4: Update adjacency incrementally, remap pairs/confidence + for a, b in new_merges: + uf.union(a, b) + canonical_adj[uf.find(a)] = dedup(adj[a] + adj[b]) + confidence, pairs = remap_to_canonical(confidence, pairs, uf) + confidence = reseed_from_names(confidence, name_seed) # prevent compounding dampening ``` +The key insight: maintaining a `canonical_adj` alongside the UnionFind, updated incrementally on merge, avoids rebuilding adjacency from scratch each cycle. Each merge costs O(degree), not O(|edges|). + ### What merging means concretely When entities `a` and `b` are merged into entity `ab`: diff --git a/tests/test_propagation.py b/tests/test_propagation.py index daade7e..3840aa7 100644 --- a/tests/test_propagation.py +++ b/tests/test_propagation.py @@ -723,14 +723,14 @@ def test_progressive_merging_enriched_neighborhood(embedder): idf = build_idf(names) assert soft_tfidf("Meridian Tech Corp", "Meridian Corp", idf) < 0.8 - # Premise: A+B merge above merge_threshold in epoch 1 - conf_single = match_graphs(graphs, embedder, max_epochs=1) + # Premise: A+B merge above merge_threshold + conf_single = match_graphs(graphs, embedder, merge_threshold=float("inf")) assert conf_single[(ma.id, mb.id)] >= 0.9, ( f"A-B should merge: {conf_single[(ma.id, mb.id)]:.3f}" ) - # Without progressive merging, C sees only pairwise evidence - conf_progressive = match_graphs(graphs, embedder, max_epochs=5) + # Without progressive merging (merge_threshold=inf), C sees only pairwise evidence + conf_progressive = match_graphs(graphs, embedder) # Progressive merging produces strictly higher confidence for C c_single = max( diff --git a/worldgraph/cli.py b/worldgraph/cli.py index a27e42f..94d4612 100644 --- a/worldgraph/cli.py +++ b/worldgraph/cli.py @@ -47,19 +47,13 @@ def extract(articles: tuple[Path, ...], output_dir: Path, model: str): "--max-iter", default=30, type=int, - help="Maximum propagation iterations per epoch.", + help="Maximum propagation iterations.", ) @click.option( "--merge-threshold", default=0.9, type=float, - help="Minimum confidence to commit a merge between epochs.", -) -@click.option( - "--max-epochs", - default=5, - type=int, - help="Maximum number of progressive merge epochs.", + help="Minimum confidence to commit a progressive merge during propagation.", ) def match( graphs: tuple[Path, ...], @@ -68,7 +62,6 @@ def match( match_threshold: float, max_iter: int, merge_threshold: float, - max_epochs: int, ): """Stage 2: Entity alignment via similarity propagation — merge matched graphs.""" run_matching( @@ -78,5 +71,4 @@ def match( match_threshold=match_threshold, max_iter=max_iter, merge_threshold=merge_threshold, - max_epochs=max_epochs, ) diff --git a/worldgraph/match.py b/worldgraph/match.py index 267361a..85450e8 100644 --- a/worldgraph/match.py +++ b/worldgraph/match.py @@ -155,13 +155,14 @@ def compute_functionality( def compute_negative_factor( id_a: str, id_b: str, - forward_adj: dict[str, list[Neighbor]], + adj: dict[str, list[Neighbor]], rel_sim: dict[tuple[str, str], float], confidence: Confidence, alpha: float = 0.3, floor: float = 0.5, *, rel_threshold: float, + uf: UnionFind, ) -> float: """Compute dampened negative factor for an entity pair. @@ -182,22 +183,24 @@ def compute_negative_factor( neg_a = _one_sided_negative( id_a, id_b, - forward_adj, + adj, rel_sim, confidence, alpha, floor, rel_threshold=rel_threshold, + uf=uf, ) neg_b = _one_sided_negative( id_b, id_a, - forward_adj, + adj, rel_sim, confidence, alpha, floor, rel_threshold=rel_threshold, + uf=uf, ) return max(neg_a, neg_b) @@ -205,13 +208,14 @@ def compute_negative_factor( def _one_sided_negative( id_a: str, id_b: str, - forward_adj: dict[str, list[Neighbor]], + adj: dict[str, list[Neighbor]], rel_sim: dict[tuple[str, str], float], confidence: Confidence, alpha: float, floor: float, *, rel_threshold: float, + uf: UnionFind, ) -> float: """Negative factor from a's perspective. @@ -224,20 +228,30 @@ def _one_sided_negative( Relation similarity is treated as binary (matching or not) using ``rel_threshold`` — the same threshold used by positive propagation and functionality pooling. + + Neighbor entity IDs are resolved through ``uf`` to handle merged + entities whose adjacency entries may reference pre-merge IDs. """ - neighbors_a = forward_adj.get(id_a, []) + neighbors_a = adj.get(id_a, []) if not neighbors_a: return 1.0 total_weight = 0.0 weighted_mismatch = 0.0 for neighbor_a in neighbors_a: + ra = uf.find(neighbor_a.entity_id) + if ra == id_a: + continue match_prob = 0.0 - for neighbor_b in forward_adj.get(id_b, []): + for neighbor_b in adj.get(id_b, []): + rb = uf.find(neighbor_b.entity_id) + if rb == id_b: + continue rs = rel_sim.get((neighbor_a.relation, neighbor_b.relation), 0.0) if rs < rel_threshold: continue - nbr_conf = confidence.get((neighbor_a.entity_id, neighbor_b.entity_id), 0.0) + # Same canonical entity = perfect match (both sides merged). + nbr_conf = 1.0 if ra == rb else confidence.get((ra, rb), 0.0) match_prob += nbr_conf match_prob = min(match_prob, 1.0) total_weight += neighbor_a.func_weight @@ -259,99 +273,6 @@ def build_unified_graph(graphs: list[Graph]) -> Graph: return unified -def propagate_positive( - adjacency: dict[str, list[Neighbor]], - pairs: list[tuple[str, str]], - positive_base: Confidence, - *, - rel_sim: dict[tuple[str, str], float], - rel_threshold: float, - max_iter: int, - epsilon: float, - exp_lambda: float, -) -> Confidence: - """Run the monotone non-decreasing positive fixpoint loop. - - Updates ``positive_base`` in place and returns it. Each iteration - reads from the previous snapshot (double-buffering) and applies the - monotone max rule: new value = max(structural_update, old_value). - """ - for _ in range(max_iter): - prev_base = dict(positive_base) - changed = False - - for id_a, id_b in pairs: - strength_sum = 0.0 - - for neighbor_a in adjacency.get(id_a, []): - for neighbor_b in adjacency.get(id_b, []): - rs = rel_sim.get((neighbor_a.relation, neighbor_b.relation), 0.0) - if rs < rel_threshold: - continue - neighbor_confidence = prev_base.get( - (neighbor_a.entity_id, neighbor_b.entity_id), 0.0 - ) - if neighbor_confidence <= 0.0: - continue - weight = min(neighbor_a.func_weight, neighbor_b.func_weight) - strength_sum += weight * neighbor_confidence - - positive = ( - 1.0 - math.exp(-exp_lambda * strength_sum) if strength_sum > 0 else 0.0 - ) - - old_base = prev_base[(id_a, id_b)] - base = max(positive, old_base) - positive_base[(id_a, id_b)] = base - positive_base[(id_b, id_a)] = base - - if abs(base - old_base) > epsilon: - changed = True - - if not changed: - break - - return positive_base - - -def apply_negative( - positive_base: Confidence, - pairs: list[tuple[str, str]], - forward_adj: dict[str, list[Neighbor]], - rel_sim: dict[tuple[str, str], float], - name_seed: Confidence, - *, - neg_alpha: float, - neg_floor: float, - neg_gate: float, - rel_threshold: float, -) -> Confidence: - """Apply negative dampening as a single post-convergence pass. - - The negative factor uses ``name_seed`` (fixed name similarity) to - check whether neighbors match, preventing circular reinforcement. - Returns a new confidence dict with dampened values. - """ - confidence = dict(positive_base) - for id_a, id_b in pairs: - base = positive_base[(id_a, id_b)] - if base > neg_gate: - neg = compute_negative_factor( - id_a, - id_b, - forward_adj, - rel_sim, - name_seed, - alpha=neg_alpha, - floor=neg_floor, - rel_threshold=rel_threshold, - ) - combined = base * neg - confidence[(id_a, id_b)] = combined - confidence[(id_b, id_a)] = combined - return confidence - - def _build_rel_sim( graph: Graph, relation_embeddings: dict[str, np.ndarray], @@ -371,130 +292,71 @@ def _build_rel_sim( return rel_sim -def _build_epoch_adjacency( +def _build_adjacency( graph: Graph, functionality: dict[str, Functionality], - uf: UnionFind, -) -> tuple[dict[str, list[Neighbor]], dict[str, list[Neighbor]]]: - """Build adjacency lists using union-find canonical reps. - - Merged entities' neighborhoods are unioned: each edge contributes - neighbors keyed by the canonical rep of both endpoints. Duplicate - entries (same canonical neighbor + same relation) are deduplicated - to prevent inflated evidence from merged entities having multiple - copies of structurally identical edges. +) -> dict[str, list[Neighbor]]: + """Build the initial canonical adjacency from graph edges. + + Each edge contributes two entries (one per endpoint). Duplicate + entries (same neighbor + same relation) are deduplicated to prevent + inflated evidence. """ default = Functionality(1.0, 1.0) - # Collect unique (entity, neighbor, relation) triples per direction - adj_seen: dict[str, set[tuple[str, str]]] = defaultdict(set) - fwd_seen: dict[str, set[tuple[str, str]]] = defaultdict(set) + seen: dict[str, set[tuple[str, str]]] = defaultdict(set) adjacency: dict[str, list[Neighbor]] = defaultdict(list) - forward_adj: dict[str, list[Neighbor]] = defaultdict(list) for edge in graph.edges: func = functionality.get(edge.relation, default) - src = uf.find(edge.source) - tgt = uf.find(edge.target) + src, tgt = edge.source, edge.target if src == tgt: continue - # Weighted adjacency (for positive propagation) key_src = (tgt, edge.relation) - if key_src not in adj_seen[src]: - adj_seen[src].add(key_src) + if key_src not in seen[src]: + seen[src].add(key_src) adjacency[src].append(Neighbor(tgt, edge.relation, func.inverse)) key_tgt = (src, edge.relation) - if key_tgt not in adj_seen[tgt]: - adj_seen[tgt].add(key_tgt) + if key_tgt not in seen[tgt]: + seen[tgt].add(key_tgt) adjacency[tgt].append(Neighbor(src, edge.relation, func.forward)) - # Forward adjacency (for negative evidence) - if key_src not in fwd_seen[src]: - fwd_seen[src].add(key_src) - forward_adj[src].append(Neighbor(tgt, edge.relation, func.forward)) - if key_tgt not in fwd_seen[tgt]: - fwd_seen[tgt].add(key_tgt) - forward_adj[tgt].append(Neighbor(src, edge.relation, func.inverse)) - return adjacency, forward_adj - + return dict(adjacency) -def _build_epoch_pairs( - graph: Graph, - uf: UnionFind, -) -> list[tuple[str, str]]: - """Build cross-graph entity pairs between canonical reps. - - Maps original entity IDs through uf.find(), deduplicates, and - skips pairs where all members share a single source graph. - """ - # Collect which graph_ids each canonical rep covers - canon_graphs: dict[str, set[str]] = defaultdict(set) - for eid, node in graph.nodes.items(): - canon_graphs[uf.find(eid)].add(node.graph_id) - canons = sorted(canon_graphs.keys()) +def _build_pairs(graph: Graph) -> list[tuple[str, str]]: + """Build cross-graph entity pairs.""" + graph_ids = {eid: node.graph_id for eid, node in graph.nodes.items()} + entities = sorted(graph.nodes.keys()) pairs: list[tuple[str, str]] = [] - for i, ca in enumerate(canons): - for cb in canons[i + 1 :]: - # Skip if all members of both groups are from the same graph - if len(canon_graphs[ca] | canon_graphs[cb]) == 1: - continue - pairs.append((ca, cb)) + for i, a in enumerate(entities): + for b in entities[i + 1 :]: + if graph_ids[a] != graph_ids[b]: + pairs.append((a, b)) return pairs -def _seed_epoch_confidence( +def _seed_confidence( graph: Graph, idf: dict[str, float], - uf: UnionFind, pairs: list[tuple[str, str]], - prev_confidence: Confidence | None = None, ) -> tuple[Confidence, Confidence]: - """Seed confidence for an epoch using max name similarity across members. + """Seed confidence from name similarity. - For each canonical pair (ca, cb), the name seed is the maximum - soft-TFIDF score across all name pairs from all members of both - groups. The full seed additionally carries forward previous epoch - confidence. - - Returns (full_seed, name_seed) — the full seed is used for positive - propagation, while the name-only seed is used for negative evidence - to prevent circular reinforcement across epochs. + Returns (confidence, name_seed) — initially identical. ``name_seed`` + is kept fixed (modulo remapping on merge) and used by negative evidence + to prevent circular reinforcement. """ - # Build member lists per canonical rep - members: dict[str, list[str]] = defaultdict(list) - for eid in graph.nodes: - members[uf.find(eid)].append(eid) - - full_seed: Confidence = {} + confidence: Confidence = {} name_seed: Confidence = {} - for ca, cb in pairs: - # Max name similarity across all member-pair name comparisons - best_name = 0.0 - for ma in members[ca]: - for mb in members[cb]: - if graph.nodes[ma].graph_id == graph.nodes[mb].graph_id: - continue - for na in graph.nodes[ma].names: - for nb in graph.nodes[mb].names: - best_name = max(best_name, soft_tfidf(na, nb, idf)) - best_name = max(0.0, best_name) - - name_seed[(ca, cb)] = best_name - name_seed[(cb, ca)] = best_name - - # Carry forward previous confidence between any member pairs - best_prev = 0.0 - if prev_confidence: - for ma in members[ca]: - for mb in members[cb]: - best_prev = max( - best_prev, - prev_confidence.get((ma, mb), 0.0), - ) - - seed = max(best_name, best_prev) - full_seed[(ca, cb)] = seed - full_seed[(cb, ca)] = seed - - return full_seed, name_seed + for a, b in pairs: + best = 0.0 + for na in graph.nodes[a].names: + for nb in graph.nodes[b].names: + best = max(best, soft_tfidf(na, nb, idf)) + best = max(0.0, best) + confidence[(a, b)] = best + confidence[(b, a)] = best + name_seed[(a, b)] = best + name_seed[(b, a)] = best + return confidence, name_seed def propagate_similarity( @@ -510,110 +372,228 @@ def propagate_similarity( neg_floor: float = 0.5, neg_gate: float = 0.3, merge_threshold: float = 0.9, - max_epochs: int = 5, ) -> tuple[Confidence, UnionFind]: - """Run epoch-based similarity propagation with progressive merging. + """Run similarity propagation with inline progressive merging. - Each epoch runs the positive fixpoint loop to convergence, applies - negative dampening, then commits high-confidence merges via union-find. - Merged entities' neighborhoods are unioned for subsequent epochs, - allowing evidence from transitively-matched entities to compound. + A single propagation loop alternates positive evidence accumulation + and negative dampening. When pairs exceed ``merge_threshold``, they + are merged via union-find and the canonical adjacency is updated + incrementally — no epoch rebuild required. - With default ``merge_threshold=0.9`` and ``max_epochs=5``, pairs - scoring below 0.9 never trigger progressive merges, so the epoch - loop exits after one epoch — reproducing the previous non-epoch - behavior. + Positive updates use the monotone max rule (new = max(structural, old)) + so positive evidence never decreases. Negative dampening is applied + as a multiplicative factor after each positive pass. - Returns (confidence, union_find) where confidence maps canonical-rep - pairs to scores and union_find tracks all committed merges. + On merge, the canonical adjacency for the new representative is built + by combining and deduplicating the adjacency lists of the merged + entities — O(degree) per merge, not O(|edges|). + + Returns (confidence, union_find) where confidence maps original + entity-ID pairs to scores and union_find tracks all committed merges. """ rel_sim = _build_rel_sim(graph, relation_embeddings) uf = UnionFind() - - # Initialize all entities in the union-find for eid in graph.nodes: uf.find(eid) - confidence: Confidence = {} - # Track the best score seen for each original entity pair across all - # epochs. Enriched neighborhoods in later epochs can strengthen - # negative evidence, but earlier positive evidence should not be lost. - best_confidence: Confidence = {} - prev_epoch_confidence: Confidence | None = None - - for _epoch in range(max_epochs): - adjacency, forward_adj = _build_epoch_adjacency(graph, functionality, uf) - pairs = _build_epoch_pairs(graph, uf) - - if not pairs: - break + canonical_adj = _build_adjacency(graph, functionality) + pairs = _build_pairs(graph) - confidence, name_seed = _seed_epoch_confidence( - graph, idf, uf, pairs, prev_epoch_confidence - ) + if not pairs: + return {}, uf - positive_base = propagate_positive( - adjacency, - pairs, - dict(confidence), - rel_sim=rel_sim, - rel_threshold=rel_threshold, - max_iter=max_iter, - epsilon=epsilon, - exp_lambda=exp_lambda, - ) + confidence, name_seed = _seed_confidence(graph, idf, pairs) - confidence = apply_negative( - positive_base, - pairs, - forward_adj, - rel_sim, - name_seed, - neg_alpha=neg_alpha, - neg_floor=neg_floor, - neg_gate=neg_gate, - rel_threshold=rel_threshold, - ) + # Track which graph_ids each canonical rep covers (for pair filtering). + canon_graphs: dict[str, set[str]] = defaultdict(set) + for eid, node in graph.nodes.items(): + canon_graphs[eid].add(node.graph_id) - # Expand this epoch's canonical-rep scores to original entity - # pairs and merge into best_confidence. - members_now: dict[str, list[str]] = defaultdict(list) - for eid in graph.nodes: - members_now[uf.find(eid)].append(eid) + for _ in range(max_iter): + prev = dict(confidence) + pos_changed = False - for (ca, cb), score in confidence.items(): - if ca == cb: - continue - for ma in members_now.get(ca, [ca]): - for mb in members_now.get(cb, [cb]): - if graph.nodes[ma].graph_id == graph.nodes[mb].graph_id: + # --- Positive propagation --- + for ca, cb in pairs: + strength_sum = 0.0 + for nbr_a in canonical_adj.get(ca, []): + ra = uf.find(nbr_a.entity_id) + if ra == ca: + continue + for nbr_b in canonical_adj.get(cb, []): + rb = uf.find(nbr_b.entity_id) + if rb == cb: continue - old = best_confidence.get((ma, mb), -1.0) - new = max(old, score) - best_confidence[(ma, mb)] = new - best_confidence[(mb, ma)] = new + rs = rel_sim.get((nbr_a.relation, nbr_b.relation), 0.0) + if rs < rel_threshold: + continue + nc = prev.get((ra, rb), 0.0) + if nc <= 0.0: + continue + weight = min(nbr_a.func_weight, nbr_b.func_weight) + strength_sum += weight * nc - # Find new merges above merge_threshold - new_merges = [] - for ca, cb in pairs: - if confidence.get((ca, cb), 0.0) >= merge_threshold: - if uf.find(ca) != uf.find(cb): - new_merges.append((ca, cb)) + positive = ( + 1.0 - math.exp(-exp_lambda * strength_sum) if strength_sum > 0 else 0.0 + ) + old = prev[(ca, cb)] + new_val = max(positive, old) + confidence[(ca, cb)] = new_val + confidence[(cb, ca)] = new_val + if abs(new_val - old) > epsilon: + pos_changed = True + + # Keep iterating positive until convergence before applying + # negative dampening and checking for merges. + if pos_changed: + continue - if not new_merges: - break + # --- Negative dampening (positive has converged) --- + for ca, cb in pairs: + base = confidence[(ca, cb)] + if base > neg_gate: + neg = compute_negative_factor( + ca, + cb, + canonical_adj, + rel_sim, + name_seed, + alpha=neg_alpha, + floor=neg_floor, + rel_threshold=rel_threshold, + uf=uf, + ) + combined = base * neg + confidence[(ca, cb)] = combined + confidence[(cb, ca)] = combined + + # --- Progressive merging --- + new_merges = [ + (ca, cb) + for ca, cb in pairs + if confidence.get((ca, cb), 0.0) >= merge_threshold + and uf.find(ca) != uf.find(cb) + ] - for ca, cb in new_merges: - uf.union(ca, cb) + if new_merges: + # Record pre-merge state for adjacency combination. + pre_merge_canons: set[str] = set() + pre_merge_graphs: dict[str, set[str]] = {} + for ca, cb in new_merges: + pre_merge_canons.add(ca) + pre_merge_canons.add(cb) + pre_merge_graphs[ca] = set(canon_graphs.get(ca, set())) + pre_merge_graphs[cb] = set(canon_graphs.get(cb, set())) + + for ca, cb in new_merges: + uf.union(ca, cb) + + # Group pre-merge canonical reps by their new representative. + merge_groups: dict[str, list[str]] = defaultdict(list) + for old_canon in pre_merge_canons: + merge_groups[uf.find(old_canon)].append(old_canon) + + # Update canonical_adj incrementally: combine + dedup. + for new_canon, old_canons in merge_groups.items(): + combined: list[Neighbor] = [] + for oc in old_canons: + combined.extend(canonical_adj.get(oc, [])) + seen: set[tuple[str, str]] = set() + deduped: list[Neighbor] = [] + for nbr in combined: + canon_nbr = uf.find(nbr.entity_id) + if canon_nbr == new_canon: + continue + key = (canon_nbr, nbr.relation) + if key not in seen: + seen.add(key) + deduped.append( + Neighbor(canon_nbr, nbr.relation, nbr.func_weight) + ) + canonical_adj[new_canon] = deduped + + # Update canon_graphs for merged reps. + for new_canon, old_canons in merge_groups.items(): + combined_graphs: set[str] = set() + for oc in old_canons: + combined_graphs |= pre_merge_graphs.get( + oc, canon_graphs.get(oc, set()) + ) + canon_graphs[new_canon] = combined_graphs + + # Remap pairs to canonical reps, dedup, drop self-pairs. + new_pairs: list[tuple[str, str]] = [] + pair_set: set[tuple[str, str]] = set() + for a, b in pairs: + ra, rb = uf.find(a), uf.find(b) + if ra == rb: + continue + pair = (min(ra, rb), max(ra, rb)) + if pair in pair_set: + continue + if len(canon_graphs.get(ra, set()) | canon_graphs.get(rb, set())) == 1: + continue + pair_set.add(pair) + new_pairs.append(pair) + pairs = new_pairs + + # Remap confidence and name_seed to canonical reps. + new_conf: Confidence = {} + for (a, b), score in confidence.items(): + ra, rb = uf.find(a), uf.find(b) + if ra == rb: + continue + old = new_conf.get((ra, rb), 0.0) + new_conf[(ra, rb)] = max(old, score) + new_conf[(rb, ra)] = max(old, score) + confidence = new_conf + + new_name: Confidence = {} + for (a, b), score in name_seed.items(): + ra, rb = uf.find(a), uf.find(b) + if ra == rb: + continue + old = new_name.get((ra, rb), 0.0) + new_name[(ra, rb)] = max(old, score) + new_name[(rb, ra)] = max(old, score) + name_seed = new_name + + # Re-seed: confidence should never fall below name similarity. + # Without this, negative dampening compounds across merge cycles + # (dampened values get dampened again after the next convergence). + for ca, cb in pairs: + ns = name_seed.get((ca, cb), 0.0) + if ns > confidence.get((ca, cb), 0.0): + confidence[(ca, cb)] = ns + confidence[(cb, ca)] = ns + + if not pairs: + break + # Merges happened — continue propagating with enriched neighborhoods. + continue - prev_epoch_confidence = confidence + # Positive converged, no new merges — done. + break - # Set confidence=1.0 for all pairs within the same UF group - # (they were merged with high confidence during epochs). + # Expand canonical-rep confidence to original entity-ID pairs. members: dict[str, list[str]] = defaultdict(list) for eid in graph.nodes: members[uf.find(eid)].append(eid) + final: Confidence = {} + for (ca, cb), score in confidence.items(): + if ca == cb: + continue + for ma in members.get(ca, [ca]): + for mb in members.get(cb, [cb]): + if graph.nodes[ma].graph_id == graph.nodes[mb].graph_id: + continue + old = final.get((ma, mb), 0.0) + new_val = max(old, score) + final[(ma, mb)] = new_val + final[(mb, ma)] = new_val + + # Merged pairs get 1.0 — they were committed with high confidence. for group_members in members.values(): if len(group_members) < 2: continue @@ -621,10 +601,10 @@ def propagate_similarity( for mb in group_members[i + 1 :]: if graph.nodes[ma].graph_id == graph.nodes[mb].graph_id: continue - best_confidence[(ma, mb)] = 1.0 - best_confidence[(mb, ma)] = 1.0 + final[(ma, mb)] = 1.0 + final[(mb, ma)] = 1.0 - return best_confidence, uf + return final, uf # --------------------------------------------------------------------------- @@ -707,7 +687,6 @@ def run_matching( max_iter: int = 30, epsilon: float = 1e-4, merge_threshold: float = 0.9, - max_epochs: int = 5, ) -> None: """Load graphs, run matching pipeline, save results.""" graphs = [load_graph(path) for path in graph_files] @@ -726,7 +705,6 @@ def run_matching( max_iter=max_iter, epsilon=epsilon, merge_threshold=merge_threshold, - max_epochs=max_epochs, ) match_groups, unified = build_match_groups(graphs, confidence, match_threshold) From 3585307760f4b3c220b446ab4649e81cb738fe72 Mon Sep 17 00:00:00 2001 From: "claude[bot]" <41898282+claude[bot]@users.noreply.github.com> Date: Sun, 29 Mar 2026 13:33:03 +0200 Subject: [PATCH 10/13] Fix negative evidence using wrong functionality direction The refactor to a single canonical_adj dropped the distinction between positive and negative functionality directions. Negative evidence needs forward func for outgoing neighbors and inverse for incoming (the opposite of positive propagation). Add neg_func_weight to Neighbor so each context uses the correct weight. Co-Authored-By: Claude Opus 4.6 --- worldgraph/match.py | 32 ++++++++++++++++++++++++++------ 1 file changed, 26 insertions(+), 6 deletions(-) diff --git a/worldgraph/match.py b/worldgraph/match.py index 85450e8..9827d09 100644 --- a/worldgraph/match.py +++ b/worldgraph/match.py @@ -38,11 +38,22 @@ class Functionality(NamedTuple): class Neighbor(NamedTuple): - """An entry in a node's weighted adjacency list.""" + """An entry in a node's weighted adjacency list. + + ``func_weight`` is used by positive propagation (inverse for outgoing, + forward for incoming — measures how uniquely the neighbor determines + this entity). + + ``neg_func_weight`` is used by negative evidence (forward for outgoing, + inverse for incoming — measures how uniquely this entity determines + the neighbor, so a missing match on a functional relation penalizes + heavily). + """ entity_id: str relation: str func_weight: float + neg_func_weight: float # Type aliases for the main data structures flowing through the pipeline. @@ -254,8 +265,8 @@ def _one_sided_negative( nbr_conf = 1.0 if ra == rb else confidence.get((ra, rb), 0.0) match_prob += nbr_conf match_prob = min(match_prob, 1.0) - total_weight += neighbor_a.func_weight - weighted_mismatch += neighbor_a.func_weight * (1.0 - match_prob) + total_weight += neighbor_a.neg_func_weight + weighted_mismatch += neighbor_a.neg_func_weight * (1.0 - match_prob) if total_weight == 0.0: return 1.0 @@ -313,11 +324,15 @@ def _build_adjacency( key_src = (tgt, edge.relation) if key_src not in seen[src]: seen[src].add(key_src) - adjacency[src].append(Neighbor(tgt, edge.relation, func.inverse)) + adjacency[src].append( + Neighbor(tgt, edge.relation, func.inverse, func.forward) + ) key_tgt = (src, edge.relation) if key_tgt not in seen[tgt]: seen[tgt].add(key_tgt) - adjacency[tgt].append(Neighbor(src, edge.relation, func.forward)) + adjacency[tgt].append( + Neighbor(src, edge.relation, func.forward, func.inverse) + ) return dict(adjacency) @@ -508,7 +523,12 @@ def propagate_similarity( if key not in seen: seen.add(key) deduped.append( - Neighbor(canon_nbr, nbr.relation, nbr.func_weight) + Neighbor( + canon_nbr, + nbr.relation, + nbr.func_weight, + nbr.neg_func_weight, + ) ) canonical_adj[new_canon] = deduped From 03ebf180fc55dcd1417b49e0c7bc1a208d07c4f4 Mon Sep 17 00:00:00 2001 From: Johan Schuijt Date: Sun, 29 Mar 2026 14:10:20 +0200 Subject: [PATCH 11/13] Simplify merge bookkeeping in propagate_similarity - Remove canon_graphs tracking (dead code: all pairs originate cross-graph, so the len==1 filter never triggers; ra==rb already handles self-pairs) - Replace pre_merge_canons/pre_merge_graphs with a simple set comprehension - Extract _remap_confidence helper for the identical confidence/name_seed remapping operations Co-Authored-By: Claude Opus 4.6 (1M context) --- worldgraph/match.py | 84 ++++++++++++++------------------------------- 1 file changed, 25 insertions(+), 59 deletions(-) diff --git a/worldgraph/match.py b/worldgraph/match.py index 9827d09..c608904 100644 --- a/worldgraph/match.py +++ b/worldgraph/match.py @@ -374,6 +374,19 @@ def _seed_confidence( return confidence, name_seed +def _remap_confidence(conf: Confidence, uf: UnionFind) -> Confidence: + """Remap a confidence dict to canonical reps, taking max on collisions.""" + remapped: Confidence = {} + for (a, b), score in conf.items(): + ra, rb = uf.find(a), uf.find(b) + if ra == rb: + continue + old = remapped.get((ra, rb), 0.0) + remapped[(ra, rb)] = max(old, score) + remapped[(rb, ra)] = max(old, score) + return remapped + + def propagate_similarity( graph: Graph, idf: dict[str, float], @@ -419,11 +432,6 @@ def propagate_similarity( confidence, name_seed = _seed_confidence(graph, idf, pairs) - # Track which graph_ids each canonical rep covers (for pair filtering). - canon_graphs: dict[str, set[str]] = defaultdict(set) - for eid, node in graph.nodes.items(): - canon_graphs[eid].add(node.graph_id) - for _ in range(max_iter): prev = dict(confidence) pos_changed = False @@ -491,24 +499,14 @@ def propagate_similarity( ] if new_merges: - # Record pre-merge state for adjacency combination. - pre_merge_canons: set[str] = set() - pre_merge_graphs: dict[str, set[str]] = {} - for ca, cb in new_merges: - pre_merge_canons.add(ca) - pre_merge_canons.add(cb) - pre_merge_graphs[ca] = set(canon_graphs.get(ca, set())) - pre_merge_graphs[cb] = set(canon_graphs.get(cb, set())) - + all_merged = {e for ca, cb in new_merges for e in (ca, cb)} for ca, cb in new_merges: uf.union(ca, cb) - # Group pre-merge canonical reps by their new representative. - merge_groups: dict[str, list[str]] = defaultdict(list) - for old_canon in pre_merge_canons: - merge_groups[uf.find(old_canon)].append(old_canon) - # Update canonical_adj incrementally: combine + dedup. + merge_groups: dict[str, list[str]] = defaultdict(list) + for e in all_merged: + merge_groups[uf.find(e)].append(e) for new_canon, old_canons in merge_groups.items(): combined: list[Neighbor] = [] for oc in old_canons: @@ -532,55 +530,24 @@ def propagate_similarity( ) canonical_adj[new_canon] = deduped - # Update canon_graphs for merged reps. - for new_canon, old_canons in merge_groups.items(): - combined_graphs: set[str] = set() - for oc in old_canons: - combined_graphs |= pre_merge_graphs.get( - oc, canon_graphs.get(oc, set()) - ) - canon_graphs[new_canon] = combined_graphs - - # Remap pairs to canonical reps, dedup, drop self-pairs. - new_pairs: list[tuple[str, str]] = [] + # Remap pairs, confidence, name_seed to canonical reps. pair_set: set[tuple[str, str]] = set() + new_pairs: list[tuple[str, str]] = [] for a, b in pairs: ra, rb = uf.find(a), uf.find(b) if ra == rb: continue pair = (min(ra, rb), max(ra, rb)) - if pair in pair_set: - continue - if len(canon_graphs.get(ra, set()) | canon_graphs.get(rb, set())) == 1: - continue - pair_set.add(pair) - new_pairs.append(pair) + if pair not in pair_set: + pair_set.add(pair) + new_pairs.append(pair) pairs = new_pairs - # Remap confidence and name_seed to canonical reps. - new_conf: Confidence = {} - for (a, b), score in confidence.items(): - ra, rb = uf.find(a), uf.find(b) - if ra == rb: - continue - old = new_conf.get((ra, rb), 0.0) - new_conf[(ra, rb)] = max(old, score) - new_conf[(rb, ra)] = max(old, score) - confidence = new_conf - - new_name: Confidence = {} - for (a, b), score in name_seed.items(): - ra, rb = uf.find(a), uf.find(b) - if ra == rb: - continue - old = new_name.get((ra, rb), 0.0) - new_name[(ra, rb)] = max(old, score) - new_name[(rb, ra)] = max(old, score) - name_seed = new_name + confidence = _remap_confidence(confidence, uf) + name_seed = _remap_confidence(name_seed, uf) # Re-seed: confidence should never fall below name similarity. - # Without this, negative dampening compounds across merge cycles - # (dampened values get dampened again after the next convergence). + # Without this, negative dampening compounds across merge cycles. for ca, cb in pairs: ns = name_seed.get((ca, cb), 0.0) if ns > confidence.get((ca, cb), 0.0): @@ -589,7 +556,6 @@ def propagate_similarity( if not pairs: break - # Merges happened — continue propagating with enriched neighborhoods. continue # Positive converged, no new merges — done. From db440aaa0e7342cca86da92d03f411146ca01892 Mon Sep 17 00:00:00 2001 From: Johan Schuijt Date: Sun, 29 Mar 2026 19:58:13 +0200 Subject: [PATCH 12/13] update docs for other convergence proof --- docs/negative_evidence.md | 48 +++--- docs/progressive_merging.md | 41 +++-- docs/similarity_flooding.md | 88 +++++++++- tests/test_propagation.py | 111 ++++++++++-- worldgraph/match.py | 327 ++++++++++++++---------------------- 5 files changed, 345 insertions(+), 270 deletions(-) diff --git a/docs/negative_evidence.md b/docs/negative_evidence.md index 7ffbc00..8cabfec 100644 --- a/docs/negative_evidence.md +++ b/docs/negative_evidence.md @@ -54,46 +54,49 @@ In knowledge base alignment (PARIS's domain), completeness is somewhat reasonabl FLORA (Peng et al. 2025) explicitly excludes negation from its framework. The "Simple Positive FIS" (Definition 1) requires all variables to be non-decreasing, which is what makes the Knaster-Tarski convergence proof work. Allowing scores to decrease would break monotonicity and void the convergence guarantee. -## Our approach: dampened negative evidence +By switching from Knaster-Tarski (monotone updates) to Banach (contraction mappings) as our convergence framework, this restriction is lifted — scores can decrease, and negative evidence integrates naturally into each iteration. See [similarity_flooding.md](similarity_flooding.md) for the full theoretical comparison. + +## Our approach: integrated negative evidence via damped iteration We need negative evidence but cannot afford PARIS's brittleness. The key insight is that negative evidence should be **weaker and more selective** than positive evidence, reflecting the fundamental asymmetry in our setting: - A match between neighbors is *reliable* positive evidence (two articles independently reporting the same fact) - A *missing* match could mean many things (incomplete coverage, relation phrasing mismatch, extraction error) -### Dampened negative factor +### How it works + +Positive and negative evidence are computed together in each propagation step, feeding into a single score per entity pair. For each pair `(a, b)`, we examine all neighbor pairs `(y, y')` connected via similar relations: -For each entity pair `(a, b)`, compute a negative factor: +- **Positive**: if the neighbor pair's confidence is above 0.5 (likely match), it contributes to `pos_strength`, weighted by inverse functionality — matching neighbors of a functional relation are strong evidence FOR the match. +- **Negative**: if the neighbor pair's confidence is below 0.5 (likely non-match), it contributes to `neg_strength`, weighted by forward functionality — a functional relation whose target doesn't match is evidence AGAINST the match. + +Both are aggregated via exp-sum and combined with the name-similarity seed: ``` -neg(a, b) = PRODUCT_{edge r(a, y)} max( - 1 - alpha × fun(r) × PRODUCT_{edge r'(b, y')} (1 - Pr(y ≡ y')), - floor -) +pos_agg = 1 - exp(-λ × pos_strength) +neg_agg = 1 - exp(-λ × neg_strength) + +seed = name_similarity(a, b) +computed = seed + pos_agg × (1 - seed) - neg_agg × seed ``` -Where: -- `alpha < 1` is a dampening coefficient (e.g. 0.3) that weakens the negative signal relative to PARIS's full-strength version -- `floor` (e.g. 0.5) prevents any single missing match from killing the score entirely -- `fun(r)` is forward functionality — only functional relations generate negative evidence -- The inner product checks whether `y` matches *any* of `b`'s neighbors via similar relations +The seed serves as the baseline. Positive evidence pushes toward 1.0 (proportional to the room above seed), negative evidence pushes toward 0.0 (proportional to the seed itself). With no structural evidence, the score equals the seed. With strong negative evidence and no positive evidence, the score approaches zero. -The dampening addresses the incompleteness problem: even with high forward functionality and no matching target, the penalty is at most `(1 - alpha × 1.0)` per path, clamped to `floor`. +### The 0.5 threshold as a natural gate -### When to apply +The threshold for contributing positive vs negative evidence is 0.5 — the point of maximum uncertainty. A neighbor pair with confidence 0.6 contributes weak positive evidence. One with confidence 0.1 contributes strong negative evidence. One at exactly 0.5 contributes nothing. -Negative evidence should activate only when there is already positive evidence to temper. If a pair has near-zero positive similarity, negative evidence is irrelevant. Apply as: +This replaces the separate "gate" mechanism from the dual-channel design. There is no need for a separate activation threshold — the 0.5 boundary naturally ensures that negative evidence only affects pairs whose neighbors have meaningful non-match signal. -``` -final(a, b) = positive(a, b) × neg(a, b) if positive(a, b) > gate - positive(a, b) otherwise -``` +### Self-correcting dynamics + +Unlike PARIS's one-shot negative factor, our approach is iterative and self-correcting. Consider two entities whose CEO neighbors initially have low name similarity (0.35). In early iterations, `1 - 0.35 = 0.65 > 0.5`, so the CEO pair generates negative evidence for the parent entities. But if the CEO pair has its own structural evidence (e.g. both graduated from the same university), its confidence rises across iterations. Once it crosses 0.5, it switches from generating negative evidence to generating positive evidence. The damped iteration converges to a consistent assignment. -The gate (e.g. 0.3) ensures negative evidence only modulates pairs that are already plausible matches. This prevents wasting computation on the vast majority of pairs that will never match. +This dynamic is impossible with the dual-channel monotone approach, where negative evidence is fixed at seed values to prevent circular reinforcement. Damped iteration allows circular reinforcement, bounded by the contraction property — feedback loops shrink geometrically rather than exploding. -### Convergence implications +### Convergence -Multiplying by a negative factor makes the update non-monotone — a pair's score can decrease between iterations. This breaks FLORA's strict monotone convergence guarantee. The dampening coefficient (`alpha`) and per-path floor bound the magnitude of any single negative adjustment, and scores can never drop below `floor^k` (where k is the number of edges), so they can't collapse to zero. This makes practical convergence likely but not formally guaranteed — oscillation across pairs in circular dependency chains is theoretically possible. If convergence issues arise, `alpha` and `floor` are the tuning knobs. +The damped update `new = (1-α) × old + α × computed` ensures convergence for sparse graphs (see [similarity_flooding.md](similarity_flooding.md) for the full convergence analysis). Negative evidence does not require special treatment — it is part of the same contraction mapping. Each iteration brings the score vector closer to the unique fixed point regardless of whether individual scores go up or down. ### What negative evidence does NOT replace @@ -114,3 +117,4 @@ Negative evidence interacts with several other components: - Suchanek, Abiteboul, Senellart. *PARIS: Probabilistic Alignment of Relations, Instances, and Schema.* VLDB 2011. Section 4 (Equations 4-7), Section 6.3 (experimental evaluation of negative evidence). - Peng, Bonald, Suchanek. *FLORA: Unsupervised Knowledge Graph Alignment by Fuzzy Logic.* 2025. Definition 1 (no-negation constraint), Theorem 1 (convergence requires monotonicity). +- Lizorkin, Velikhov, Grinev, Turdakov. *Accuracy Estimate and Optimization Techniques for SimRank Computation.* PVLDB 2008. (Contraction convergence proof for iterative graph similarity with decay factor.) diff --git a/docs/progressive_merging.md b/docs/progressive_merging.md index feecd82..c6f5d1e 100644 --- a/docs/progressive_merging.md +++ b/docs/progressive_merging.md @@ -38,38 +38,37 @@ This is relevant because it suggests two fundamentally different strategies: Strategy 2 avoids the convergence issues of progressive merging but misses the enriched-neighborhood benefit. -## Our approach: epoch-based progressive merging +## Our approach: progressive merging within damped iteration -Neither the literature nor standard fixpoint theory provides a clean answer for progressive merging during propagation. We propose a single-loop design that preserves most of the convergence properties while gaining the neighborhood enrichment benefit. +We use a single-loop design where propagation runs with damped updates (positive and negative evidence integrated into each step), and merges are committed when the iteration converges. Merged neighborhoods then compound structural evidence in subsequent iterations. ### The mechanism -A single propagation loop runs positive evidence to convergence, applies negative dampening, then checks for merges. When merges are found, the canonical adjacency is updated incrementally and propagation continues with enriched neighborhoods: - ``` for iteration in range(max_iter): - # Phase 1: Positive propagation (monotone non-decreasing) - confidence = propagate_one_step(confidence) + # Damped propagation step (positive + negative in one pass) + for each pair (a, b): + computed = seed + pos_agg(neighbors) × (1 - seed) - neg_agg(neighbors) × seed + confidence(a, b) = (1 - α) × old + α × computed if not converged: continue - # Phase 2: Negative dampening (post-convergence) - confidence = apply_negative(confidence) - - # Phase 3: Commit high-confidence merges + # Commit high-confidence merges new_merges = find_merges(confidence, threshold=merge_threshold) if not new_merges: break # Converged, no new merges → done - # Phase 4: Update adjacency incrementally, remap pairs/confidence + # Update adjacency incrementally, remap pairs/confidence for a, b in new_merges: uf.union(a, b) canonical_adj[uf.find(a)] = dedup(adj[a] + adj[b]) confidence, pairs = remap_to_canonical(confidence, pairs, uf) - confidence = reseed_from_names(confidence, name_seed) # prevent compounding dampening ``` -The key insight: maintaining a `canonical_adj` alongside the UnionFind, updated incrementally on merge, avoids rebuilding adjacency from scratch each cycle. Each merge costs O(degree), not O(|edges|). +Key properties: +- **No separate phases**: positive and negative evidence are computed together in each step, not sequentially. See [negative_evidence.md](negative_evidence.md) for how this works. +- **No reseeding**: the damped update naturally anchors to the seed — there is no compounding dampening effect that requires periodic reseeding. +- **Incremental adjacency**: maintaining a `canonical_adj` alongside the UnionFind avoids rebuilding adjacency from scratch each cycle. Each merge costs O(degree), not O(|edges|). ### What merging means concretely @@ -102,15 +101,15 @@ The merge threshold should be conservative. A false merge during propagation is ### Convergence properties -Within each epoch, propagation converges normally (monotone non-decreasing on a finite lattice). Between epochs, merging changes the graph structure, so the overall process is not a standard fixpoint iteration. +Within each convergence cycle, the damped iteration converges via the contraction mapping property (see [similarity_flooding.md](similarity_flooding.md)). Between cycles, merging changes the graph structure, so the overall process is not a single contraction mapping. -However, the process is still well-behaved: +However, the process is well-behaved: 1. **Merges are irreversible**: once committed, entities stay merged. The set of merged entities grows monotonically. 2. **The graph shrinks**: each merge reduces the entity count by one. The process must terminate in at most N-1 merge steps. -3. **Confidence is non-decreasing across epochs**: `max(confidence(a, x), confidence(b, x)) >= confidence(a, x)` for all `x`. -4. **Termination**: if no epoch produces new merges, the process halts. +3. **Within-cycle convergence**: each cycle converges to the unique fixed point of the current graph's contraction mapping. The fixed point changes when merges alter the graph structure. +4. **Termination**: if no cycle produces new merges, the process halts. -This is not a formal convergence guarantee in the FLORA sense (no Knaster-Tarski applies to the cross-epoch dynamics). But the monotonic reduction in entity count provides a strong termination guarantee, and the conservative merge threshold limits cascade risk. +The conservative merge threshold (0.9) limits cascade risk: only very high-confidence pairs are merged, and enriched neighborhoods from those merges are unlikely to create false matches above the same threshold. ### Interaction with negative evidence @@ -118,9 +117,7 @@ Progressive merging and [negative evidence](negative_evidence.md) interact in tw 1. **Enriched neighborhoods improve negative evidence quality.** After merging, a combined entity has more edges, which means more opportunities for both positive AND negative evidence. A false match candidate that survived against sparse individual neighborhoods may fail against the richer merged neighborhood. -2. **Negative evidence prevents false progressive merges.** If negative evidence runs within each epoch (before the merge step), it can suppress pairs that had high positive scores but contradictory functional relations. This is a safety mechanism against the cascade risk of progressive merging. - -The recommended order within each epoch: propagate positive evidence → apply negative dampening → commit merges above threshold. +2. **Negative evidence prevents false progressive merges.** Because negative evidence is integrated into each propagation step (not applied post-hoc), it suppresses pairs with contradictory functional relations before they ever reach the merge threshold. This is a natural safety mechanism against cascade risk. ## What progressive merging does NOT solve @@ -131,5 +128,5 @@ The recommended order within each epoch: propagate positive evidence → apply n ## References - Suchanek, Abiteboul, Senellart. *PARIS: Probabilistic Alignment of Relations, Instances, and Schema.* VLDB 2011. Section 5.2 (maximal assignment as soft progressive commitment). -- Peng, Bonald, Suchanek. *FLORA: Unsupervised Knowledge Graph Alignment by Fuzzy Logic.* 2025. Theorem 1 (convergence requires monotonicity — why merging during propagation is problematic). +- Peng, Bonald, Suchanek. *FLORA: Unsupervised Knowledge Graph Alignment by Fuzzy Logic.* 2025. Theorem 1 (convergence requires monotonicity — contrast with our contraction-based approach). - Liao, Sabetiansfahani, Bhatt, Ben-Hur. *IsoRankN: Spectral Methods for Global Alignment of Multiple Protein Networks.* Bioinformatics 2009. Sections 2.2-2.4 (star spread as alternative to progressive merging). diff --git a/docs/similarity_flooding.md b/docs/similarity_flooding.md index 8b5df0d..4b96c32 100644 --- a/docs/similarity_flooding.md +++ b/docs/similarity_flooding.md @@ -141,20 +141,90 @@ FLORA's alignment rule requires four functionality terms: `fun(R)`, `fun(R, H)`, Like PARIS, FLORA jointly aligns relations. It discovers asymmetric sub-relation mappings (`r ⊆ r'`) by checking whether facts in `r` have corresponding facts in `r'` for all matched entity pairs, using an `alpha-mean` aggregation (arithmetic mean scaled by a "benefit of the doubt" factor `alpha` to handle incomplete knowledge graphs). +## Our approach: damped fixed-point iteration + +We depart from FLORA's monotone framework. FLORA's Knaster-Tarski convergence proof requires all updates to be non-decreasing, which prevents negative evidence from being applied during iteration (scores can't go down). This forced our earlier design into a dual-channel architecture with separate positive and negative channels — complex and hard to reason about. + +Instead, we use **damped fixed-point iteration**, which guarantees convergence while allowing scores to both increase and decrease. + +### Banach vs Knaster-Tarski + +FLORA's convergence relies on the **Knaster-Tarski fixed point theorem**: on a complete lattice, every monotone function has a fixed point. This requires all score updates to be non-decreasing. Scores can only go up, which is why FLORA explicitly excludes negation. + +The **Banach fixed point theorem** (contraction mapping theorem) takes a different approach: on a complete metric space, every contraction mapping has a **unique** fixed point, and iterated application converges to it from any starting point. A contraction shrinks distances — applying the update to any two score vectors brings them closer together. Convergence is geometric: after k iterations, the error is bounded by `q^k × initial_error`, where `q < 1` is the contraction constant. + +| Property | Knaster-Tarski (FLORA) | Banach (ours) | +|----------|----------------------|---------------| +| **Space** | Complete lattice | Complete metric space | +| **Requirement** | Monotone (non-decreasing) | Contraction (distance-shrinking) | +| **Fixed point** | Exists (least/greatest) | Exists and is **unique** | +| **Monotonicity** | Required | Not required | +| **Scores decrease?** | Never | Yes — negative evidence integrated | +| **Convergence rate** | Not bounded | Geometric: `q^k` | + +### The damped update rule + +Given a score update function `f` that computes a new score from neighbor confidences, the **damped update** is: + +``` +new_score = (1 - α) × old_score + α × f(old_scores) +``` + +where `α ∈ (0, 1)` is the damping factor. Each iteration blends the old score with the newly computed one. This has two effects: + +1. **Smooths oscillation**: even if `f` would produce wild swings (e.g. negative evidence temporarily overcorrecting), the damping limits each step to a fraction `α` of the full change. +2. **Creates contraction**: the composed map `g(x) = (1-α)x + αf(x)` has Lipschitz constant `(1-α) + α·Lip(f)`. When `Lip(f) < 1` — which holds for sparse graphs with functionality-weighted evidence — the composed map is a contraction. + +### Precedents + +**SimRank** (Jeh & Widom, KDD 2002) is the canonical example of a contraction mapping for graph similarity. The SimRank equation `s(a,b) = C/(|I(a)|·|I(b)|) × Σ s(I_i(a), I_j(b))` uses a decay constant `C < 1` that serves as the contraction constant. Lizorkin et al. (PVLDB 2008) formally proved that the error after k iterations is bounded by `C^{k+1}` — exponential convergence with rate equal to the decay factor. This is a direct application of the Banach fixed point theorem. + +**PageRank** (Brin & Page, 1998) uses a damping/teleportation factor `(1-d)` that serves the same role. The iteration `r = d·M·r + (1-d)·v` is a contraction with constant `d` (typically 0.85). The teleportation term anchors the iteration and prevents oscillation, guaranteeing convergence regardless of graph structure. + +Our damped update follows the same pattern. The name-similarity seed acts as our "teleportation" anchor — it prevents scores from drifting arbitrarily and ensures the fixed point reflects both name and structural evidence. + +### Our aggregation formula + +For each entity pair `(a, b)`, we compute structural evidence from matching neighbor pairs: + +``` +seed = name_similarity(a, b) +computed = seed + pos_agg × (1 - seed) - neg_agg × seed +computed = clamp(computed, 0, 1) +new_score = (1 - α) × old_score + α × computed +``` + +Where: +- `pos_agg = 1 - exp(-λ × Σ pos_strengths)`: positive structural evidence, accumulated via exp-sum from neighbor pairs with matching relations whose confidence exceeds 0.5 +- `neg_agg = 1 - exp(-λ × Σ neg_strengths)`: negative structural evidence, from neighbor pairs whose confidence is below 0.5 (weighted by forward functionality) +- The seed serves as a baseline: positive evidence pushes toward 1.0 proportional to the room above seed, negative evidence pushes toward 0.0 proportional to the seed value +- With no structural evidence, the fixpoint equals the seed (name-only matching preserved) + +This replaces the dual-channel architecture (separate positive/negative channels combined via Bayesian log-odds) with a single score that directly integrates both types of evidence. + +### Convergence in practice + +For our sparse entity-relation graphs with functionality weights in [0, 1], the structural function's Lipschitz constant is typically well below 1 — each pair's score depends on a handful of neighbor pairs, each contributing through the saturating exp-sum. With damping `α = 0.5`, convergence is geometric with a practical rate around 0.3–0.5, reaching epsilon = 1e-4 within 10–15 iterations. + +The `epsilon` convergence check and `max_iter` bound provide practical guarantees in all cases. + ## Summary of evolution -| Aspect | Melnik 2002 | PARIS 2011 | FLORA 2025 | -|--------|-------------|------------|------------| -| **Framework** | Ad-hoc fixpoint | Probabilistic | Fuzzy logic | -| **Evidence combination** | Sum + normalize | Noisy-OR (product of complements) | min within rule, max across rules | -| **Relation weighting** | Inverse-product propagation coefficients | [Functionality](functionality.md) (harmonic mean of local fun.) | Functionality + local functionality | -| **Relation alignment** | Same labels only | Joint sub-relation discovery | Joint sub-relation discovery | -| **Convergence** | Empirical (residual check) | Not proven | Proven (Knaster-Tarski) | -| **Dangling entities** | Not addressed | Handled implicitly | Explicit (non-matched entities stay at 0) | -| **Match selection** | Relative similarity + stable marriage | Maximum assignment per entity | Maximum assignment per entity | +| Aspect | Melnik 2002 | PARIS 2011 | FLORA 2025 | Ours | +|--------|-------------|------------|------------|------| +| **Framework** | Ad-hoc fixpoint | Probabilistic | Fuzzy logic | Damped fixpoint | +| **Evidence combination** | Sum + normalize | Noisy-OR (product of complements) | min within rule, max across rules | Exp-sum, seed-as-baseline | +| **Negative evidence** | None | Optional (Eq. 7, abandoned) | Excluded (breaks monotonicity) | Integrated (damping allows decrease) | +| **Relation weighting** | Inverse-product coefficients | [Functionality](functionality.md) (harmonic mean) | Functionality + local functionality | [Functionality](functionality.md) (global) | +| **Relation alignment** | Same labels only | Joint sub-relation discovery | Joint sub-relation discovery | Embedding similarity threshold | +| **Convergence** | Empirical (residual check) | Not proven | Proven (Knaster-Tarski) | Contraction mapping (Banach) | +| **Match selection** | Relative similarity + stable marriage | Maximum assignment per entity | Maximum assignment per entity | Union-find on threshold | ## References - Melnik, Garcia-Molina, Rahm. *Similarity Flooding: A Versatile Graph Matching Algorithm and its Application to Schema Matching.* ICDE 2002. - Suchanek, Abiteboul, Senellart. *PARIS: Probabilistic Alignment of Relations, Instances, and Schema.* PVLDB 2011. - Peng, Bonald, Suchanek. *FLORA: Unsupervised Knowledge Graph Alignment by Fuzzy Logic.* 2025. +- Jeh, Widom. *SimRank: A Measure of Structural-Context Similarity.* KDD 2002. +- Lizorkin, Velikhov, Grinev, Turdakov. *Accuracy Estimate and Optimization Techniques for SimRank Computation.* PVLDB 2008. (Proposition 1: error bound `C^{k+1}`, formal proof of contraction convergence.) +- Brin, Page. *The Anatomy of a Large-Scale Hypertextual Web Search Engine.* WWW 1998. (PageRank damping as contraction.) diff --git a/tests/test_propagation.py b/tests/test_propagation.py index 3840aa7..b863758 100644 --- a/tests/test_propagation.py +++ b/tests/test_propagation.py @@ -18,6 +18,7 @@ from worldgraph.graph import Graph from worldgraph.match import match_graphs +from worldgraph.names import build_idf, soft_tfidf # --------------------------------------------------------------------------- @@ -440,9 +441,9 @@ def test_shared_anchor_does_not_override_name_dissimilarity(embedder): matches = _select_matches(confidence, threshold=0.8) matched_pairs = set(matches) # NovaTech Labs has identical names (seed ~1.0) but the only neighbor - # (founder) doesn't match — negative evidence legitimately reduces - # confidence. With neg_alpha=0.3 the final score can drop below 0.8. - # This is acceptable: the test's purpose is the assertion below. + # (founder) doesn't match — the negative channel propagates founder + # mismatch, and the Bayesian combination can push the final score + # below 0.8. This is acceptable: the test's purpose is below. # sharma/vasquez should NOT match — structural anchor alone # cannot override name dissimilarity @@ -573,10 +574,9 @@ def test_single_graph_produces_no_matches(embedder): assert confidence == {} -def test_positive_evidence_is_monotonically_nondecreasing(embedder): - """Without negative evidence, confidence should never decrease as more - iterations run. Negative evidence (applied when pairs exceed neg_gate) - can reduce scores, so we disable it here to test pure positive propagation.""" +def test_propagation_converges(embedder): + """Both positive and negative channels should converge: running with + more iterations than needed should not change the result.""" g1 = Graph(id="g1") meridian1 = g1.add_entity("Meridian Technologies") dv1 = g1.add_entity("DataVault Inc") @@ -591,16 +591,13 @@ def test_positive_evidence_is_monotonically_nondecreasing(embedder): g2.add_edge(meridian2, dv2, "purchased") g2.add_edge(meridian2, ceo2, "employed") - # Disable negative evidence by setting alpha=0 - prev_conf = match_graphs([g1, g2], embedder, max_iter=1, neg_alpha=0.0) - for n_iter in [2, 5, 10]: - curr_conf = match_graphs([g1, g2], embedder, max_iter=n_iter, neg_alpha=0.0) - for pair, val in curr_conf.items(): - assert val >= prev_conf[pair] - 1e-9, ( - f"Confidence decreased for {pair}: {prev_conf[pair]:.6f} → {val:.6f} " - f"at max_iter={n_iter}" - ) - prev_conf = curr_conf + conf_10 = match_graphs([g1, g2], embedder, max_iter=10) + conf_30 = match_graphs([g1, g2], embedder, max_iter=30) + for pair, val in conf_30.items(): + assert abs(val - conf_10.get(pair, 0.0)) < 1e-9, ( + f"Score changed between max_iter=10 and max_iter=30 for {pair}: " + f"{conf_10.get(pair, 0.0):.6f} → {val:.6f}" + ) # --------------------------------------------------------------------------- @@ -745,3 +742,83 @@ def test_progressive_merging_enriched_neighborhood(embedder): f"Progressive merging should improve C's match: " f"single={c_single:.3f}, progressive={c_progressive:.3f}" ) + + +# --------------------------------------------------------------------------- +# Negative evidence should use structural confidence (not name_seed) +# --------------------------------------------------------------------------- + + +def test_negative_evidence_does_not_over_penalize_structurally_matched_neighbors( + embedder, +): + """Negative evidence should not penalize an entity pair when its + functional neighbors are structurally matched. + + "Meridian Technologies" and "Meridian Tech" share two structural paths: + acquired → DataVault (identical names) and CEO → a person with weak name + similarity but shared sub-neighbor (Stanford University). + + The positive channel discovers the CEO match via Stanford. The negative + channel sees CEO name dissimilarity (~0.7) but Stanford's zero + dissimilarity blocks negative propagation for the CEO pair. The + Bayesian combination divides out the name prior, so only the structural + growth matters — and positive grew more than negative. + """ + g1 = Graph(id="g1") + m1 = g1.add_entity("Meridian Technologies") + dv1 = g1.add_entity("DataVault") + ceo1 = g1.add_entity("Dr. Alice M. Johnson") + uni1 = g1.add_entity("Stanford University") + g1.add_edge(m1, dv1, "acquired") + g1.add_edge(m1, ceo1, "CEO") + g1.add_edge(ceo1, uni1, "graduated from") + + g2 = Graph(id="g2") + m2 = g2.add_entity("Meridian Tech") + dv2 = g2.add_entity("DataVault") + ceo2 = g2.add_entity("A. Johnson") + uni2 = g2.add_entity("Stanford University") + g2.add_edge(m2, dv2, "purchased") + g2.add_edge(m2, ceo2, "CEO") + g2.add_edge(ceo2, uni2, "graduated from") + + # Background to establish CEO as functional (1:1) + bg_graphs = [] + for i, (person, org) in enumerate( + [ + ("Marcus Webb", "Alpha Corp"), + ("Sarah Chen", "Beta Inc"), + ("James Xu", "Gamma LLC"), + ] + ): + bg = Graph(id=f"bg{i}") + p = bg.add_entity(person) + o = bg.add_entity(org) + bg.add_edge(p, o, "CEO") + bg_graphs.append(bg) + + graphs = [g1, g2, *bg_graphs] + + # Premise: CEO name similarity is weak (structural propagation needed) + all_names = [n for g in graphs for node in g.nodes.values() for n in node.names] + idf = build_idf(all_names) + assert soft_tfidf("Dr. Alice M. Johnson", "A. Johnson", idf) < 0.5 + + confidence = match_graphs(graphs, embedder) + + # CEO pair should be structurally matched despite weak names + ceo_score = confidence.get( + (ceo1.id, ceo2.id), confidence.get((ceo2.id, ceo1.id), 0.0) + ) + assert ceo_score > 0.6, ( + f"CEO pair should be structurally matched, got {ceo_score:.3f}" + ) + + # Meridian should not be over-penalized — the CEO targets match + # structurally via Stanford. + meridian_score = confidence.get((m1.id, m2.id), confidence.get((m2.id, m1.id), 0.0)) + assert meridian_score > 0.8, ( + f"Negative evidence over-penalized Meridian: score={meridian_score:.3f} " + f"(CEO structural match={ceo_score:.3f})" + ) diff --git a/worldgraph/match.py b/worldgraph/match.py index c608904..c5307be 100644 --- a/worldgraph/match.py +++ b/worldgraph/match.py @@ -1,14 +1,19 @@ -"""Stage 2: Entity alignment via PARIS-style similarity propagation. - -Entity names are stored as lists on nodes (multi-label). Name similarity -seeds the confidence dict before the iteration loop using the max over all -name pairs, so structural evidence propagates from iteration 1. Relation -similarity is treated as binary -via a single threshold that defines equivalence classes over free-text -relation phrases — above threshold = same relation, below = different. -This threshold is used consistently for functionality pooling, positive -propagation gating, and negative evidence. Exponential sum aggregation, -threshold, merge via union-find. +"""Stage 2: Entity alignment via dual-channel similarity propagation. + +Two independent propagation channels run in parallel: + +- **Positive** (similarity): seeded from name similarity, propagated with + inverse functionality weights. Measures structural evidence FOR a match. +- **Negative** (dissimilarity): seeded from 1 − name similarity, propagated + with forward functionality weights. Measures structural evidence AGAINST. + +Both use the same algorithm (exp-sum aggregation, monotone max). They are +combined via Bayesian log-odds that divides out the shared name-similarity +prior to avoid double-counting. + +Relation similarity is treated as binary via a single threshold that defines +equivalence classes over free-text relation phrases. This threshold is used +consistently for functionality pooling and both propagation channels. """ import math @@ -40,14 +45,11 @@ class Functionality(NamedTuple): class Neighbor(NamedTuple): """An entry in a node's weighted adjacency list. - ``func_weight`` is used by positive propagation (inverse for outgoing, - forward for incoming — measures how uniquely the neighbor determines - this entity). + ``func_weight`` is used by the positive (similarity) channel — inverse + functionality for outgoing, forward for incoming. - ``neg_func_weight`` is used by negative evidence (forward for outgoing, - inverse for incoming — measures how uniquely this entity determines - the neighbor, so a missing match on a functional relation penalizes - heavily). + ``neg_func_weight`` is used by the negative (dissimilarity) channel — + forward functionality for outgoing, inverse for incoming. """ entity_id: str @@ -163,116 +165,33 @@ def compute_functionality( # --------------------------------------------------------------------------- -def compute_negative_factor( - id_a: str, - id_b: str, - adj: dict[str, list[Neighbor]], - rel_sim: dict[tuple[str, str], float], - confidence: Confidence, - alpha: float = 0.3, - floor: float = 0.5, - *, - rel_threshold: float, - uf: UnionFind, +def _combine_bayesian( + pos: float, + neg: float, + prior: float, + clamp: float = 0.01, ) -> float: - """Compute dampened negative factor for an entity pair. + """Combine positive and negative confidence via Bayesian log-odds. - For each neighbor y of a (via relation r), check whether y matches any - neighbor y' of b (via a similar relation r'). If no match is found and - the relation is functional, penalize the pair. + Both channels are seeded from the same name-similarity prior (pos from + ``prior``, neg from ``1 - prior``). To avoid double-counting, the + prior is divided out in log-odds space:: - Both directions are checked independently; the more charitable (higher) - factor is used. This reflects news graph reality: articles cover - different aspects of the same entity, so missing neighbors on one side - is common and should not compound penalties from both directions. + logit(final) = logit(pos) - logit(neg) - logit(prior) - ``rel_threshold`` is the same relation equivalence threshold used by - positive propagation and functionality pooling. + When there is no structural evidence (pos == prior, neg == 1 - prior), + the two structural terms cancel and the result equals the prior. - Returns a value in (0, 1] that multiplies the positive confidence. + Inputs are clamped to ``[clamp, 1 - clamp]`` before taking logit so + that the log-odds stay in a bounded range (±4.6 with the default). """ - neg_a = _one_sided_negative( - id_a, - id_b, - adj, - rel_sim, - confidence, - alpha, - floor, - rel_threshold=rel_threshold, - uf=uf, - ) - neg_b = _one_sided_negative( - id_b, - id_a, - adj, - rel_sim, - confidence, - alpha, - floor, - rel_threshold=rel_threshold, - uf=uf, - ) - return max(neg_a, neg_b) - - -def _one_sided_negative( - id_a: str, - id_b: str, - adj: dict[str, list[Neighbor]], - rel_sim: dict[tuple[str, str], float], - confidence: Confidence, - alpha: float, - floor: float, - *, - rel_threshold: float, - uf: UnionFind, -) -> float: - """Negative factor from a's perspective. - - Computes a functionality-weighted average mismatch across a's neighbors, - then converts to a multiplicative penalty. The weighted average - naturally normalizes for the number of neighbors: an entity with - 5 neighbors, 4 matching and 1 not, gets a mild penalty (20% mismatch), - while an entity with 1 non-matching neighbor gets a strong one (100%). - - Relation similarity is treated as binary (matching or not) using - ``rel_threshold`` — the same threshold used by positive propagation - and functionality pooling. - - Neighbor entity IDs are resolved through ``uf`` to handle merged - entities whose adjacency entries may reference pre-merge IDs. - """ - neighbors_a = adj.get(id_a, []) - if not neighbors_a: - return 1.0 - - total_weight = 0.0 - weighted_mismatch = 0.0 - for neighbor_a in neighbors_a: - ra = uf.find(neighbor_a.entity_id) - if ra == id_a: - continue - match_prob = 0.0 - for neighbor_b in adj.get(id_b, []): - rb = uf.find(neighbor_b.entity_id) - if rb == id_b: - continue - rs = rel_sim.get((neighbor_a.relation, neighbor_b.relation), 0.0) - if rs < rel_threshold: - continue - # Same canonical entity = perfect match (both sides merged). - nbr_conf = 1.0 if ra == rb else confidence.get((ra, rb), 0.0) - match_prob += nbr_conf - match_prob = min(match_prob, 1.0) - total_weight += neighbor_a.neg_func_weight - weighted_mismatch += neighbor_a.neg_func_weight * (1.0 - match_prob) - if total_weight == 0.0: - return 1.0 + def _logit(x: float) -> float: + x = max(clamp, min(1.0 - clamp, x)) + return math.log(x / (1.0 - x)) - avg_mismatch = weighted_mismatch / total_weight - return max(1.0 - alpha * avg_mismatch, floor) + log_odds = _logit(pos) - _logit(neg) - _logit(prior) + return 1.0 / (1.0 + math.exp(-max(-40.0, min(40.0, log_odds)))) def build_unified_graph(graphs: list[Graph]) -> Graph: @@ -352,26 +271,31 @@ def _seed_confidence( graph: Graph, idf: dict[str, float], pairs: list[tuple[str, str]], -) -> tuple[Confidence, Confidence]: - """Seed confidence from name similarity. +) -> tuple[Confidence, Confidence, Confidence]: + """Seed both propagation channels from name similarity. + + Returns (pos_conf, neg_conf, name_sim): - Returns (confidence, name_seed) — initially identical. ``name_seed`` - is kept fixed (modulo remapping on merge) and used by negative evidence - to prevent circular reinforcement. + - ``pos_conf``: seeded from name similarity (positive channel). + - ``neg_conf``: seeded from 1 − name similarity (negative channel). + - ``name_sim``: read-only prior for the Bayesian combination. """ - confidence: Confidence = {} - name_seed: Confidence = {} + pos_conf: Confidence = {} + neg_conf: Confidence = {} + name_sim: Confidence = {} for a, b in pairs: best = 0.0 for na in graph.nodes[a].names: for nb in graph.nodes[b].names: best = max(best, soft_tfidf(na, nb, idf)) best = max(0.0, best) - confidence[(a, b)] = best - confidence[(b, a)] = best - name_seed[(a, b)] = best - name_seed[(b, a)] = best - return confidence, name_seed + pos_conf[(a, b)] = best + pos_conf[(b, a)] = best + neg_conf[(a, b)] = 1.0 - best + neg_conf[(b, a)] = 1.0 - best + name_sim[(a, b)] = best + name_sim[(b, a)] = best + return pos_conf, neg_conf, name_sim def _remap_confidence(conf: Confidence, uf: UnionFind) -> Confidence: @@ -396,28 +320,28 @@ def propagate_similarity( max_iter: int = 30, epsilon: float = 1e-4, exp_lambda: float = 1.0, - neg_alpha: float = 0.3, - neg_floor: float = 0.5, - neg_gate: float = 0.3, merge_threshold: float = 0.9, ) -> tuple[Confidence, UnionFind]: - """Run similarity propagation with inline progressive merging. + """Run dual-channel similarity propagation with progressive merging. + + Two independent channels propagate in the same loop: - A single propagation loop alternates positive evidence accumulation - and negative dampening. When pairs exceed ``merge_threshold``, they - are merged via union-find and the canonical adjacency is updated - incrementally — no epoch rebuild required. + - **pos_conf**: seeded from name similarity, propagated with + ``func_weight`` (inverse functionality). Monotone non-decreasing. + - **neg_conf**: seeded from 1 − name similarity, propagated with + ``neg_func_weight`` (forward functionality). Monotone non-decreasing. - Positive updates use the monotone max rule (new = max(structural, old)) - so positive evidence never decreases. Negative dampening is applied - as a multiplicative factor after each positive pass. + Neither channel reads the other. Both use the same algorithm (exp-sum + aggregation, monotone max rule). They are combined only for merge + decisions and the final output via ``_combine_bayesian``, which divides + out the shared name-similarity prior to avoid double-counting. On merge, the canonical adjacency for the new representative is built by combining and deduplicating the adjacency lists of the merged entities — O(degree) per merge, not O(|edges|). Returns (confidence, union_find) where confidence maps original - entity-ID pairs to scores and union_find tracks all committed merges. + entity-ID pairs to combined scores and union_find tracks all merges. """ rel_sim = _build_rel_sim(graph, relation_embeddings) uf = UnionFind() @@ -430,15 +354,18 @@ def propagate_similarity( if not pairs: return {}, uf - confidence, name_seed = _seed_confidence(graph, idf, pairs) + pos_conf, neg_conf, name_sim = _seed_confidence(graph, idf, pairs) for _ in range(max_iter): - prev = dict(confidence) - pos_changed = False + prev_pos = dict(pos_conf) + prev_neg = dict(neg_conf) + changed = False - # --- Positive propagation --- + # --- Dual-channel propagation --- for ca, cb in pairs: - strength_sum = 0.0 + pos_strength = 0.0 + neg_strength = 0.0 + for nbr_a in canonical_adj.get(ca, []): ra = uf.find(nbr_a.entity_id) if ra == ca: @@ -450,51 +377,53 @@ def propagate_similarity( rs = rel_sim.get((nbr_a.relation, nbr_b.relation), 0.0) if rs < rel_threshold: continue - nc = prev.get((ra, rb), 0.0) - if nc <= 0.0: - continue - weight = min(nbr_a.func_weight, nbr_b.func_weight) - strength_sum += weight * nc - positive = ( - 1.0 - math.exp(-exp_lambda * strength_sum) if strength_sum > 0 else 0.0 + # Only propagate evidence that is "more likely than + # not" — prevents weak signals from bouncing between + # entity pairs and amplifying into false confidence. + pos_nc = prev_pos.get((ra, rb), 0.0) + if pos_nc > 0.5: + pos_strength += ( + min(nbr_a.func_weight, nbr_b.func_weight) * pos_nc + ) + + neg_nc = prev_neg.get((ra, rb), 0.0) + if neg_nc > 0.5: + neg_strength += ( + min(nbr_a.neg_func_weight, nbr_b.neg_func_weight) * neg_nc + ) + + # Exp-sum aggregation + monotone max for both channels. + pos_new = ( + 1.0 - math.exp(-exp_lambda * pos_strength) if pos_strength > 0 else 0.0 ) - old = prev[(ca, cb)] - new_val = max(positive, old) - confidence[(ca, cb)] = new_val - confidence[(cb, ca)] = new_val - if abs(new_val - old) > epsilon: - pos_changed = True - - # Keep iterating positive until convergence before applying - # negative dampening and checking for merges. - if pos_changed: + old_pos = prev_pos[(ca, cb)] + pos_val = max(pos_new, old_pos) + pos_conf[(ca, cb)] = pos_val + pos_conf[(cb, ca)] = pos_val + + neg_new = ( + 1.0 - math.exp(-exp_lambda * neg_strength) if neg_strength > 0 else 0.0 + ) + old_neg = prev_neg[(ca, cb)] + neg_val = max(neg_new, old_neg) + neg_conf[(ca, cb)] = neg_val + neg_conf[(cb, ca)] = neg_val + + if abs(pos_val - old_pos) > epsilon or abs(neg_val - old_neg) > epsilon: + changed = True + + if changed: continue - # --- Negative dampening (positive has converged) --- - for ca, cb in pairs: - base = confidence[(ca, cb)] - if base > neg_gate: - neg = compute_negative_factor( - ca, - cb, - canonical_adj, - rel_sim, - name_seed, - alpha=neg_alpha, - floor=neg_floor, - rel_threshold=rel_threshold, - uf=uf, - ) - combined = base * neg - confidence[(ca, cb)] = combined - confidence[(cb, ca)] = combined - - # --- Progressive merging --- + # --- Progressive merging (on Bayesian combined score) --- new_merges = [ (ca, cb) for ca, cb in pairs - if confidence.get((ca, cb), 0.0) >= merge_threshold + if _combine_bayesian( + pos_conf[(ca, cb)], neg_conf[(ca, cb)], name_sim[(ca, cb)] + ) + >= merge_threshold and uf.find(ca) != uf.find(cb) ] @@ -530,7 +459,7 @@ def propagate_similarity( ) canonical_adj[new_canon] = deduped - # Remap pairs, confidence, name_seed to canonical reps. + # Remap pairs and all three dicts to canonical reps. pair_set: set[tuple[str, str]] = set() new_pairs: list[tuple[str, str]] = [] for a, b in pairs: @@ -543,22 +472,15 @@ def propagate_similarity( new_pairs.append(pair) pairs = new_pairs - confidence = _remap_confidence(confidence, uf) - name_seed = _remap_confidence(name_seed, uf) - - # Re-seed: confidence should never fall below name similarity. - # Without this, negative dampening compounds across merge cycles. - for ca, cb in pairs: - ns = name_seed.get((ca, cb), 0.0) - if ns > confidence.get((ca, cb), 0.0): - confidence[(ca, cb)] = ns - confidence[(cb, ca)] = ns + pos_conf = _remap_confidence(pos_conf, uf) + neg_conf = _remap_confidence(neg_conf, uf) + name_sim = _remap_confidence(name_sim, uf) if not pairs: break continue - # Positive converged, no new merges — done. + # Both channels converged, no new merges — done. break # Expand canonical-rep confidence to original entity-ID pairs. @@ -567,9 +489,14 @@ def propagate_similarity( members[uf.find(eid)].append(eid) final: Confidence = {} - for (ca, cb), score in confidence.items(): + for (ca, cb), pos_score in pos_conf.items(): if ca == cb: continue + score = _combine_bayesian( + pos_score, + neg_conf.get((ca, cb), 0.0), + name_sim.get((ca, cb), 0.5), + ) for ma in members.get(ca, [ca]): for mb in members.get(cb, [cb]): if graph.nodes[ma].graph_id == graph.nodes[mb].graph_id: From 115b5875ea9a020c01a7b8458996df487b928b54 Mon Sep 17 00:00:00 2001 From: Johan Schuijt Date: Sun, 29 Mar 2026 21:01:47 +0200 Subject: [PATCH 13/13] refactor to new convergence --- CLAUDE.md | 10 +- tests/test_propagation.py | 18 ++-- worldgraph/match.py | 197 +++++++++++++++----------------------- 3 files changed, 89 insertions(+), 136 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index 508eae2..d965d21 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -38,17 +38,17 @@ The core propagation loop (`match.py`): 1. **Name-similarity seeding** — Soft TF-IDF + Jaro-Winkler seeds the confidence dict before iteration starts. This gives propagation initial signal to work with. See [docs/name_similarity.md](docs/name_similarity.md). -2. **Relation similarity via sentence embeddings** — relation phrase similarity is a continuous multiplier on propagation paths, not a binary gate. "acquired" ↔ "purchased" (~0.85) contributes proportionally; "acquired" ↔ "located in" (~0.1) contributes almost nothing. This replaces the identical-label requirement in standard SF/PARIS. +2. **Relation similarity via sentence embeddings** — relation phrase similarity is thresholded into equivalence classes. "acquired" ↔ "purchased" (above threshold) are treated as equivalent; "acquired" ↔ "located in" (below) are not. The threshold is used consistently for functionality pooling and propagation gating. -3. **Functionality weighting** — global forward and inverse functionality (1/avg_degree), with similar relation phrases pooled. See [docs/functionality.md](docs/functionality.md). +3. **Functionality weighting** — global forward and inverse functionality (1/avg_degree), with equivalent relation phrases pooled. See [docs/functionality.md](docs/functionality.md). -4. **Exponential sum aggregation** — `1 - exp(-λ × Σ strengths)` where each path contributes `rel_sim × min(func_a, func_b) × neighbor_confidence`. Rewards breadth over single strong paths. +4. **Exponential sum aggregation** — `1 - exp(-λ × Σ strengths)` where each path contributes `min(func_a, func_b) × neighbor_confidence`. Rewards breadth over single strong paths. -5. **Monotone non-decreasing updates** — confidence only goes up, never down. Preserves convergence guarantees (FLORA-style). +5. **Damped fixed-point iteration** — `new = (1-d)*old + d*computed` where computed integrates positive and negative evidence around the name-similarity seed. Converges via contraction (see [docs/similarity_flooding.md](docs/similarity_flooding.md)). 6. **Unified N-graph matching** — all article graphs merged into one, propagation runs once over all cross-graph pairs. Final grouping via union-find. -7. **Negative evidence** ([docs/negative_evidence.md](docs/negative_evidence.md)) — dampened negative factor penalizes entity pairs whose functional neighbors don't match. Applied once per convergence cycle, after positive evidence stabilizes. Uses name-seed confidence (not structural) for neighbor matching to prevent circular reinforcement. +7. **Negative evidence** ([docs/negative_evidence.md](docs/negative_evidence.md)) — integrated directly into the single propagation score. Neighbors with confidence < 0.5 contribute negative evidence weighted by forward functionality, pushing the score toward 0. Damped iteration bounds circular reinforcement geometrically. 8. **Progressive merging** ([docs/progressive_merging.md](docs/progressive_merging.md)) — high-confidence merges are committed inline during the single propagation loop. Canonical adjacency is updated incrementally on merge (O(degree) per merge), avoiding full adjacency rebuilds. Enriched neighborhoods compound structural evidence across merge cycles. diff --git a/tests/test_propagation.py b/tests/test_propagation.py index b863758..0065c7a 100644 --- a/tests/test_propagation.py +++ b/tests/test_propagation.py @@ -441,9 +441,9 @@ def test_shared_anchor_does_not_override_name_dissimilarity(embedder): matches = _select_matches(confidence, threshold=0.8) matched_pairs = set(matches) # NovaTech Labs has identical names (seed ~1.0) but the only neighbor - # (founder) doesn't match — the negative channel propagates founder - # mismatch, and the Bayesian combination can push the final score - # below 0.8. This is acceptable: the test's purpose is below. + # (founder) doesn't match — negative evidence from the founder mismatch + # can push the score below 0.8. This is acceptable: the test's purpose + # is below. # sharma/vasquez should NOT match — structural anchor alone # cannot override name dissimilarity @@ -575,8 +575,8 @@ def test_single_graph_produces_no_matches(embedder): def test_propagation_converges(embedder): - """Both positive and negative channels should converge: running with - more iterations than needed should not change the result.""" + """Propagation should converge: running with more iterations than + needed should not change the result.""" g1 = Graph(id="g1") meridian1 = g1.add_entity("Meridian Technologies") dv1 = g1.add_entity("DataVault Inc") @@ -759,11 +759,9 @@ def test_negative_evidence_does_not_over_penalize_structurally_matched_neighbors acquired → DataVault (identical names) and CEO → a person with weak name similarity but shared sub-neighbor (Stanford University). - The positive channel discovers the CEO match via Stanford. The negative - channel sees CEO name dissimilarity (~0.7) but Stanford's zero - dissimilarity blocks negative propagation for the CEO pair. The - Bayesian combination divides out the name prior, so only the structural - growth matters — and positive grew more than negative. + Propagation discovers the CEO match via Stanford. Even though the CEO + names are dissimilar, the structural evidence from the shared Stanford + neighbor outweighs the negative signal from name mismatch. """ g1 = Graph(id="g1") m1 = g1.add_entity("Meridian Technologies") diff --git a/worldgraph/match.py b/worldgraph/match.py index c5307be..8b36742 100644 --- a/worldgraph/match.py +++ b/worldgraph/match.py @@ -1,19 +1,18 @@ -"""Stage 2: Entity alignment via dual-channel similarity propagation. +"""Stage 2: Entity alignment via damped similarity propagation. -Two independent propagation channels run in parallel: +A single confidence score per entity pair is iteratively refined using +damped fixed-point iteration. Each step computes structural evidence +from matching neighbor pairs (positive: neighbors likely match, weighted +by inverse functionality; negative: neighbors likely don't match, weighted +by forward functionality) and blends it with the previous score via a +damping factor. -- **Positive** (similarity): seeded from name similarity, propagated with - inverse functionality weights. Measures structural evidence FOR a match. -- **Negative** (dissimilarity): seeded from 1 − name similarity, propagated - with forward functionality weights. Measures structural evidence AGAINST. - -Both use the same algorithm (exp-sum aggregation, monotone max). They are -combined via Bayesian log-odds that divides out the shared name-similarity -prior to avoid double-counting. +Name similarity seeds the initial scores and serves as the baseline that +structural evidence modulates up or down. Relation similarity is treated as binary via a single threshold that defines equivalence classes over free-text relation phrases. This threshold is used -consistently for functionality pooling and both propagation channels. +consistently for functionality pooling and propagation gating. """ import math @@ -45,17 +44,17 @@ class Functionality(NamedTuple): class Neighbor(NamedTuple): """An entry in a node's weighted adjacency list. - ``func_weight`` is used by the positive (similarity) channel — inverse - functionality for outgoing, forward for incoming. + ``pos_weight`` weights positive evidence — inverse functionality for + outgoing edges, forward functionality for incoming edges. - ``neg_func_weight`` is used by the negative (dissimilarity) channel — - forward functionality for outgoing, inverse for incoming. + ``neg_weight`` weights negative evidence — forward functionality for + outgoing edges, inverse functionality for incoming edges. """ entity_id: str relation: str - func_weight: float - neg_func_weight: float + pos_weight: float + neg_weight: float # Type aliases for the main data structures flowing through the pipeline. @@ -165,35 +164,6 @@ def compute_functionality( # --------------------------------------------------------------------------- -def _combine_bayesian( - pos: float, - neg: float, - prior: float, - clamp: float = 0.01, -) -> float: - """Combine positive and negative confidence via Bayesian log-odds. - - Both channels are seeded from the same name-similarity prior (pos from - ``prior``, neg from ``1 - prior``). To avoid double-counting, the - prior is divided out in log-odds space:: - - logit(final) = logit(pos) - logit(neg) - logit(prior) - - When there is no structural evidence (pos == prior, neg == 1 - prior), - the two structural terms cancel and the result equals the prior. - - Inputs are clamped to ``[clamp, 1 - clamp]`` before taking logit so - that the log-odds stay in a bounded range (±4.6 with the default). - """ - - def _logit(x: float) -> float: - x = max(clamp, min(1.0 - clamp, x)) - return math.log(x / (1.0 - x)) - - log_odds = _logit(pos) - _logit(neg) - _logit(prior) - return 1.0 / (1.0 + math.exp(-max(-40.0, min(40.0, log_odds)))) - - def build_unified_graph(graphs: list[Graph]) -> Graph: """Combine N article graphs into one. Node IDs are UUIDs — unique across graphs.""" unified = Graph() @@ -244,13 +214,17 @@ def _build_adjacency( if key_src not in seen[src]: seen[src].add(key_src) adjacency[src].append( - Neighbor(tgt, edge.relation, func.inverse, func.forward) + Neighbor( + tgt, edge.relation, pos_weight=func.inverse, neg_weight=func.forward + ) ) key_tgt = (src, edge.relation) if key_tgt not in seen[tgt]: seen[tgt].add(key_tgt) adjacency[tgt].append( - Neighbor(src, edge.relation, func.forward, func.inverse) + Neighbor( + src, edge.relation, pos_weight=func.forward, neg_weight=func.inverse + ) ) return dict(adjacency) @@ -271,17 +245,14 @@ def _seed_confidence( graph: Graph, idf: dict[str, float], pairs: list[tuple[str, str]], -) -> tuple[Confidence, Confidence, Confidence]: - """Seed both propagation channels from name similarity. +) -> tuple[Confidence, Confidence]: + """Seed confidence from name similarity. - Returns (pos_conf, neg_conf, name_sim): - - - ``pos_conf``: seeded from name similarity (positive channel). - - ``neg_conf``: seeded from 1 − name similarity (negative channel). - - ``name_sim``: read-only prior for the Bayesian combination. + Returns (conf, name_sim) where both are initialized from the best + soft-TF-IDF score across all name pairs. ``name_sim`` is kept as a + read-only baseline for the seed-as-baseline update formula. """ - pos_conf: Confidence = {} - neg_conf: Confidence = {} + conf: Confidence = {} name_sim: Confidence = {} for a, b in pairs: best = 0.0 @@ -289,13 +260,11 @@ def _seed_confidence( for nb in graph.nodes[b].names: best = max(best, soft_tfidf(na, nb, idf)) best = max(0.0, best) - pos_conf[(a, b)] = best - pos_conf[(b, a)] = best - neg_conf[(a, b)] = 1.0 - best - neg_conf[(b, a)] = 1.0 - best + conf[(a, b)] = best + conf[(b, a)] = best name_sim[(a, b)] = best name_sim[(b, a)] = best - return pos_conf, neg_conf, name_sim + return conf, name_sim def _remap_confidence(conf: Confidence, uf: UnionFind) -> Confidence: @@ -321,27 +290,27 @@ def propagate_similarity( epsilon: float = 1e-4, exp_lambda: float = 1.0, merge_threshold: float = 0.9, + damping: float = 0.5, ) -> tuple[Confidence, UnionFind]: - """Run dual-channel similarity propagation with progressive merging. + """Run damped similarity propagation with progressive merging. - Two independent channels propagate in the same loop: + A single confidence score per entity pair integrates both positive and + negative structural evidence. Each iteration computes a new score from + neighbor confidences and blends it with the old score via damping:: - - **pos_conf**: seeded from name similarity, propagated with - ``func_weight`` (inverse functionality). Monotone non-decreasing. - - **neg_conf**: seeded from 1 − name similarity, propagated with - ``neg_func_weight`` (forward functionality). Monotone non-decreasing. + computed = seed + pos_agg * (1 - seed) - neg_agg * seed + new = (1 - damping) * old + damping * computed - Neither channel reads the other. Both use the same algorithm (exp-sum - aggregation, monotone max rule). They are combined only for merge - decisions and the final output via ``_combine_bayesian``, which divides - out the shared name-similarity prior to avoid double-counting. + Name similarity (``seed``) is the baseline: positive evidence pushes + toward 1.0, negative evidence pushes toward 0.0. With no structural + evidence the fixpoint equals the seed. On merge, the canonical adjacency for the new representative is built by combining and deduplicating the adjacency lists of the merged entities — O(degree) per merge, not O(|edges|). Returns (confidence, union_find) where confidence maps original - entity-ID pairs to combined scores and union_find tracks all merges. + entity-ID pairs to scores and union_find tracks all merges. """ rel_sim = _build_rel_sim(graph, relation_embeddings) uf = UnionFind() @@ -354,14 +323,12 @@ def propagate_similarity( if not pairs: return {}, uf - pos_conf, neg_conf, name_sim = _seed_confidence(graph, idf, pairs) + conf, name_sim = _seed_confidence(graph, idf, pairs) for _ in range(max_iter): - prev_pos = dict(pos_conf) - prev_neg = dict(neg_conf) + prev = dict(conf) changed = False - # --- Dual-channel propagation --- for ca, cb in pairs: pos_strength = 0.0 neg_strength = 0.0 @@ -378,53 +345,47 @@ def propagate_similarity( if rs < rel_threshold: continue - # Only propagate evidence that is "more likely than - # not" — prevents weak signals from bouncing between - # entity pairs and amplifying into false confidence. - pos_nc = prev_pos.get((ra, rb), 0.0) - if pos_nc > 0.5: - pos_strength += ( - min(nbr_a.func_weight, nbr_b.func_weight) * pos_nc - ) + if ra == rb: + # Neighbors already merged — strongest positive evidence. + pos_strength += min(nbr_a.pos_weight, nbr_b.pos_weight) + continue - neg_nc = prev_neg.get((ra, rb), 0.0) + nc = prev.get((ra, rb), 0.0) + if nc > 0.5: + pos_strength += min(nbr_a.pos_weight, nbr_b.pos_weight) * nc + + neg_nc = 1.0 - nc if neg_nc > 0.5: - neg_strength += ( - min(nbr_a.neg_func_weight, nbr_b.neg_func_weight) * neg_nc - ) + neg_strength += min(nbr_a.neg_weight, nbr_b.neg_weight) * neg_nc - # Exp-sum aggregation + monotone max for both channels. - pos_new = ( + # Exp-sum aggregation + seed-as-baseline combination. + pos_agg = ( 1.0 - math.exp(-exp_lambda * pos_strength) if pos_strength > 0 else 0.0 ) - old_pos = prev_pos[(ca, cb)] - pos_val = max(pos_new, old_pos) - pos_conf[(ca, cb)] = pos_val - pos_conf[(cb, ca)] = pos_val - - neg_new = ( + neg_agg = ( 1.0 - math.exp(-exp_lambda * neg_strength) if neg_strength > 0 else 0.0 ) - old_neg = prev_neg[(ca, cb)] - neg_val = max(neg_new, old_neg) - neg_conf[(ca, cb)] = neg_val - neg_conf[(cb, ca)] = neg_val - if abs(pos_val - old_pos) > epsilon or abs(neg_val - old_neg) > epsilon: + seed = name_sim[(ca, cb)] + computed = seed + pos_agg * (1.0 - seed) - neg_agg * seed + computed = max(0.0, min(1.0, computed)) + + old = prev[(ca, cb)] + new_val = (1.0 - damping) * old + damping * computed + conf[(ca, cb)] = new_val + conf[(cb, ca)] = new_val + + if abs(new_val - old) > epsilon: changed = True if changed: continue - # --- Progressive merging (on Bayesian combined score) --- + # --- Progressive merging (directly on single score) --- new_merges = [ (ca, cb) for ca, cb in pairs - if _combine_bayesian( - pos_conf[(ca, cb)], neg_conf[(ca, cb)], name_sim[(ca, cb)] - ) - >= merge_threshold - and uf.find(ca) != uf.find(cb) + if conf[(ca, cb)] >= merge_threshold and uf.find(ca) != uf.find(cb) ] if new_merges: @@ -453,13 +414,13 @@ def propagate_similarity( Neighbor( canon_nbr, nbr.relation, - nbr.func_weight, - nbr.neg_func_weight, + nbr.pos_weight, + nbr.neg_weight, ) ) canonical_adj[new_canon] = deduped - # Remap pairs and all three dicts to canonical reps. + # Remap pairs and confidence dicts to canonical reps. pair_set: set[tuple[str, str]] = set() new_pairs: list[tuple[str, str]] = [] for a, b in pairs: @@ -472,15 +433,14 @@ def propagate_similarity( new_pairs.append(pair) pairs = new_pairs - pos_conf = _remap_confidence(pos_conf, uf) - neg_conf = _remap_confidence(neg_conf, uf) + conf = _remap_confidence(conf, uf) name_sim = _remap_confidence(name_sim, uf) if not pairs: break continue - # Both channels converged, no new merges — done. + # Converged, no new merges — done. break # Expand canonical-rep confidence to original entity-ID pairs. @@ -489,14 +449,9 @@ def propagate_similarity( members[uf.find(eid)].append(eid) final: Confidence = {} - for (ca, cb), pos_score in pos_conf.items(): + for (ca, cb), score in conf.items(): if ca == cb: continue - score = _combine_bayesian( - pos_score, - neg_conf.get((ca, cb), 0.0), - name_sim.get((ca, cb), 0.5), - ) for ma in members.get(ca, [ca]): for mb in members.get(cb, [cb]): if graph.nodes[ma].graph_id == graph.nodes[mb].graph_id: @@ -537,7 +492,7 @@ def match_graphs( and runs similarity propagation. ``rel_cluster_threshold`` is the single relation equivalence threshold: relation pairs with embedding similarity above this value are treated as the same relation for functionality - pooling, positive propagation gating, and negative evidence. + pooling and propagation gating. Returns the confidence dict. """