From f9e2649c3e371ebce53daf4ef705935d2f14698a Mon Sep 17 00:00:00 2001 From: "claude[bot]" <41898282+claude[bot]@users.noreply.github.com> Date: Fri, 27 Mar 2026 15:02:23 +0100 Subject: [PATCH 1/2] Replace Entity.name with Entity.names for multi-label entity support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Entities can carry multiple names (e.g. "Meridian Technologies" and "Meridian Tech"). Name similarity seeding now computes max(soft_tfidf) across all name pairs, ensuring the closest match is always used. - Node.name: str → Node.names: list[str] - Graph.add_entity() accepts str or list[str] - IDF built from all names across all entities - Graph I/O serializes names list, loads legacy single-name format - Functionality pooling uses first name as representative Closes #27 Co-Authored-By: Claude Opus 4.6 --- tests/test_graph_io.py | 49 +++++++++++++++++++++++++++++++--- tests/test_propagation.py | 56 ++++++++++++++++++++++++++++++++++++++- worldgraph/graph.py | 17 +++++++----- worldgraph/match.py | 23 +++++++++------- 4 files changed, 125 insertions(+), 20 deletions(-) diff --git a/tests/test_graph_io.py b/tests/test_graph_io.py index 6158223..d0d657e 100644 --- a/tests/test_graph_io.py +++ b/tests/test_graph_io.py @@ -1,4 +1,4 @@ -"""Tests for graph save/load round-trip, including per-node graph_id.""" +"""Tests for graph save/load round-trip, including per-node graph_id and multi-label names.""" import json from pathlib import Path @@ -31,9 +31,9 @@ def test_save_load_roundtrip_unified_graph(tmp_path: Path): """Unified graph with nodes from different source graphs preserves graph_id.""" g = Graph(id="unified") # Manually add nodes with different source graph_ids - g.nodes["n1"] = Node(id="n1", graph_id="article-1", name="Alice") - g.nodes["n2"] = Node(id="n2", graph_id="article-2", name="Bob") - g.nodes["n3"] = Node(id="n3", graph_id="unified", name="Carol") + g.nodes["n1"] = Node(id="n1", graph_id="article-1", names=["Alice"]) + g.nodes["n2"] = Node(id="n2", graph_id="article-2", names=["Bob"]) + g.nodes["n3"] = Node(id="n3", graph_id="unified", names=["Carol"]) g.edges.append(Edge(source="n1", target="n2", relation="knows")) path = tmp_path / "unified.json" @@ -52,3 +52,44 @@ def test_save_load_roundtrip_unified_graph(tmp_path: Path): assert loaded.nodes["n1"].graph_id == "article-1" assert loaded.nodes["n2"].graph_id == "article-2" assert loaded.nodes["n3"].graph_id == "unified" + + +def test_save_load_roundtrip_multi_label_names(tmp_path: Path): + """Entities with multiple names survive save/load round-trip.""" + g = Graph(id="article-1") + n1 = g.add_entity(["Meridian Technologies", "Meridian Tech"]) + n2 = g.add_entity("DataVault") + g.add_edge(n1, n2, "acquired") + + path = tmp_path / "g.json" + save_graph(g, path) + + with open(path) as f: + data = json.load(f) + node_by_id = {n["id"]: n for n in data["nodes"]} + assert node_by_id[n1.id]["names"] == ["Meridian Technologies", "Meridian Tech"] + assert node_by_id[n2.id]["names"] == ["DataVault"] + + loaded = load_graph(path) + assert loaded.nodes[n1.id].names == ["Meridian Technologies", "Meridian Tech"] + assert loaded.nodes[n2.id].names == ["DataVault"] + + +def test_load_legacy_single_name_format(tmp_path: Path): + """Loading a graph saved with the old single-name format works.""" + data = { + "id": "legacy", + "nodes": [ + {"id": "n1", "graph_id": "legacy", "name": "Alice"}, + {"id": "n2", "graph_id": "legacy", "name": "Bob"}, + ], + "edges": [{"source": "n1", "target": "n2", "relation": "knows"}], + "matches": [], + } + path = tmp_path / "legacy.json" + with open(path, "w") as f: + json.dump(data, f) + + loaded = load_graph(path) + assert loaded.nodes["n1"].names == ["Alice"] + assert loaded.nodes["n2"].names == ["Bob"] diff --git a/tests/test_propagation.py b/tests/test_propagation.py index 89ae71e..75c31f5 100644 --- a/tests/test_propagation.py +++ b/tests/test_propagation.py @@ -13,6 +13,7 @@ - Name variation with structural reinforcement (the core use case) - Dangling entities get no structural evidence - Exponential sum accumulates evidence from multiple paths (bidirectional > unidirectional) +- Multi-label entities use max similarity across all names during seeding """ from worldgraph.graph import Graph @@ -427,7 +428,9 @@ def test_shared_anchor_does_not_override_name_dissimilarity(embedder): # Premise: name similarity alone is below threshold from worldgraph.names import build_idf, soft_tfidf - names = [n.name for g in [g1, g2, *bg_graphs] for n in g.nodes.values()] + names = [ + name for g in [g1, g2, *bg_graphs] for n in g.nodes.values() for name in n.names + ] idf = build_idf(names) sv_name_sim = soft_tfidf("Dr. Priya Sharma", "Dr. Elena Vasquez", idf) assert sv_name_sim < 0.8 @@ -598,3 +601,54 @@ def test_positive_evidence_is_monotonically_nondecreasing(embedder): f"at max_iter={n_iter}" ) prev_conf = curr_conf + + +# --------------------------------------------------------------------------- +# Multi-label name seeding +# --------------------------------------------------------------------------- + + +def test_multi_label_entity_uses_best_name_pair(embedder): + """An entity with multiple names should seed similarity using the best + name pair across both entities' name lists. + + "Meridian Technologies" stored as names=["Meridian Technologies"] in g1, + and names=["Meridian Tech", "Meridian Technologies"] in g2. The best + pair is "Meridian Technologies"/"Meridian Technologies" (score ~1.0), + not "Meridian Technologies"/"Meridian Tech" (~0.88). + + Without multi-label support, only one name is stored and the closest + pair may be missed, under-estimating similarity.""" + g1 = Graph(id="g1") + m1 = g1.add_entity("Meridian Technologies") + dv1 = g1.add_entity("DataVault") + g1.add_edge(m1, dv1, "acquired") + + g2 = Graph(id="g2") + m2 = g2.add_entity(["Meridian Tech", "Meridian Technologies"]) + dv2 = g2.add_entity("DataVault") + g2.add_edge(m2, dv2, "purchased") + + confidence = match_graphs([g1, g2], embedder) + + # With multi-label, the best name pair is exact match → seed ~1.0 + # Without, if only "Meridian Tech" is stored, seed would be ~0.88 + assert confidence[(m1.id, m2.id)] > 0.8 + + +def test_multi_label_all_names_contribute_to_idf(embedder): + """All names in an entity's name list should contribute to IDF + computation, not just the first.""" + g1 = Graph(id="g1") + m1 = g1.add_entity(["Meridian Technologies", "Meridian Tech"]) + dv1 = g1.add_entity("DataVault") + g1.add_edge(m1, dv1, "acquired") + + g2 = Graph(id="g2") + m2 = g2.add_entity("Meridian Technologies") + dv2 = g2.add_entity("DataVault") + g2.add_edge(m2, dv2, "purchased") + + # Should not raise — multi-label names flow through the pipeline + confidence = match_graphs([g1, g2], embedder) + assert confidence[(m1.id, m2.id)] > 0.8 diff --git a/worldgraph/graph.py b/worldgraph/graph.py index 8838626..8dece87 100644 --- a/worldgraph/graph.py +++ b/worldgraph/graph.py @@ -10,7 +10,7 @@ class Node: id: str graph_id: str - name: str + names: list[str] @dataclass @@ -26,9 +26,11 @@ class Graph: nodes: dict[str, Node] = field(default_factory=dict) edges: list[Edge] = field(default_factory=list) - def add_entity(self, name: str) -> Node: - """Add an entity node with the given name.""" - entity = Node(id=str(uuid.uuid4()), graph_id=self.id, name=name) + def add_entity(self, names: str | list[str]) -> Node: + """Add an entity node with the given name(s).""" + if isinstance(names, str): + names = [names] + entity = Node(id=str(uuid.uuid4()), graph_id=self.id, names=names) self.nodes[entity.id] = entity return entity @@ -47,10 +49,11 @@ def load_graph(path: Path) -> Graph: for node_data in data["nodes"]: node_id = node_data["id"] + raw_names = node_data.get("names") or [node_data["name"]] nodes[node_id] = Node( id=node_id, graph_id=node_data["graph_id"], - name=node_data["name"], + names=raw_names if isinstance(raw_names, list) else [raw_names], ) edges: list[Edge] = [] @@ -74,7 +77,9 @@ def save_graph( """Write graph to JSON, with optional match groups.""" nodes_out = [] for node in graph.nodes.values(): - nodes_out.append({"id": node.id, "graph_id": node.graph_id, "name": node.name}) + nodes_out.append( + {"id": node.id, "graph_id": node.graph_id, "names": node.names} + ) edges_out = [] for edge in graph.edges: diff --git a/worldgraph/match.py b/worldgraph/match.py index f442710..b8d573b 100644 --- a/worldgraph/match.py +++ b/worldgraph/match.py @@ -1,8 +1,9 @@ """Stage 2: Entity alignment via PARIS-style similarity propagation. -Entity names are stored directly on nodes. Name similarity seeds the -confidence dict before the iteration loop, so structural evidence -propagates from iteration 1. Relation similarity is treated as binary +Entity names are stored as lists on nodes (multi-label). Name similarity +seeds the confidence dict before the iteration loop using the max over all +name pairs, so structural evidence propagates from iteration 1. Relation +similarity is treated as binary via a single threshold that defines equivalence classes over free-text relation phrases — above threshold = same relation, below = different. This threshold is used consistently for functionality pooling, positive @@ -116,8 +117,8 @@ def compute_functionality( phrase_pairs: dict[str, list[tuple[str, str]]] = defaultdict(list) for graph in graphs: for edge in graph.edges: - source_name = graph.nodes[edge.source].name - target_name = graph.nodes[edge.target].name + source_name = graph.nodes[edge.source].names[0] + target_name = graph.nodes[edge.target].names[0] phrase_pairs[edge.relation].append((source_name, target_name)) result: dict[str, Functionality] = {} @@ -380,9 +381,11 @@ def propagate_similarity( if graph.nodes[id_a].graph_id == graph.nodes[id_b].graph_id: continue name_sim = max( - 0.0, - soft_tfidf(graph.nodes[id_a].name, graph.nodes[id_b].name, idf), + soft_tfidf(na, nb, idf) + for na in graph.nodes[id_a].names + for nb in graph.nodes[id_b].names ) + name_sim = max(0.0, name_sim) confidence[(id_a, id_b)] = name_sim confidence[(id_b, id_a)] = name_sim pairs.append((id_a, id_b)) @@ -487,7 +490,9 @@ def match_graphs( """ unified = build_unified_graph(graphs) - all_names = [node.name for graph in graphs for node in graph.nodes.values()] + all_names = [ + name for graph in graphs for node in graph.nodes.values() for name in node.names + ] all_relations = sorted({edge.relation for graph in graphs for edge in graph.edges}) idf = build_idf(all_names) @@ -564,7 +569,7 @@ def run_matching( click.echo(f"\n{len(match_groups)} match groups:") for members in match_groups: - names = {unified.nodes[eid].name for eid in members} + names = {n for eid in members for n in unified.nodes[eid].names} click.echo(f" {' / '.join(sorted(names))}") click.echo(f"\nWrote {output_path}") From a224470b320786e5195780b9633484fb9b700635 Mon Sep 17 00:00:00 2001 From: Johan Schuijt Date: Fri, 27 Mar 2026 15:26:30 +0100 Subject: [PATCH 2/2] Remove backward-compatibility shim for legacy single-name format Per review: no old-format files exist and no external users, so the fallback in load_graph and its test are unnecessary shims. Co-Authored-By: Claude Opus 4.6 (1M context) --- tests/test_graph_io.py | 20 -------------------- worldgraph/graph.py | 3 +-- 2 files changed, 1 insertion(+), 22 deletions(-) diff --git a/tests/test_graph_io.py b/tests/test_graph_io.py index d0d657e..ab377e3 100644 --- a/tests/test_graph_io.py +++ b/tests/test_graph_io.py @@ -73,23 +73,3 @@ def test_save_load_roundtrip_multi_label_names(tmp_path: Path): loaded = load_graph(path) assert loaded.nodes[n1.id].names == ["Meridian Technologies", "Meridian Tech"] assert loaded.nodes[n2.id].names == ["DataVault"] - - -def test_load_legacy_single_name_format(tmp_path: Path): - """Loading a graph saved with the old single-name format works.""" - data = { - "id": "legacy", - "nodes": [ - {"id": "n1", "graph_id": "legacy", "name": "Alice"}, - {"id": "n2", "graph_id": "legacy", "name": "Bob"}, - ], - "edges": [{"source": "n1", "target": "n2", "relation": "knows"}], - "matches": [], - } - path = tmp_path / "legacy.json" - with open(path, "w") as f: - json.dump(data, f) - - loaded = load_graph(path) - assert loaded.nodes["n1"].names == ["Alice"] - assert loaded.nodes["n2"].names == ["Bob"] diff --git a/worldgraph/graph.py b/worldgraph/graph.py index 8dece87..fe4320e 100644 --- a/worldgraph/graph.py +++ b/worldgraph/graph.py @@ -49,11 +49,10 @@ def load_graph(path: Path) -> Graph: for node_data in data["nodes"]: node_id = node_data["id"] - raw_names = node_data.get("names") or [node_data["name"]] nodes[node_id] = Node( id=node_id, graph_id=node_data["graph_id"], - names=raw_names if isinstance(raw_names, list) else [raw_names], + names=node_data["names"], ) edges: list[Edge] = []