-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcore.py
More file actions
830 lines (686 loc) · 31 KB
/
core.py
File metadata and controls
830 lines (686 loc) · 31 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
"""
Core implementation of the v2 context-aware row merge algorithm.

The implementation reconstructs sparse rows by searching a graph of partially
overlapping rows. The important v2 behavior is:

- Graph connectivity is based on exact label/value tuple equality.
- Search only traverses rows that stay compatible with the current selected
  state for the row being reconstructed.
- Missing labels are resolved with a weighted best-first search that keeps the
  best score seen for each row and preserves ties.
- Every selected value carries provenance describing which rows led to it.

`main.py` is the thin CLI layer; this file contains the actual merge model,
graph construction, search, and merge orchestration.
"""
import heapq
import json
import logging
import re
from collections import defaultdict
from dataclasses import dataclass
from enum import Enum
from typing import List, Dict, Set
# Module-wide logger shared by every function in this file.
logger = logging.getLogger("CustomLogger")
logger.setLevel(logging.DEBUG)
# Attach the console handler only when the logger has none yet, so a second
# handler is never stacked on top of an existing one.
if not logger.handlers:
    console_handler = logging.StreamHandler()
    console_handler.setLevel(logging.DEBUG)
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    console_handler.setFormatter(formatter)
    logger.addHandler(console_handler)
# constants
# Default gain added by `strength_tuple()` for one exact (label, value) match.
DEFAULT_TUPLE_MATCH_GAIN = 1.0
# Gain added by `strength_selected()` for each candidate (label, value) pair
# that is also present in the selected state.
SELECTED_MATCH_GAIN = 1.5
# Constant cost subtracted for every edge traversed by the best-first search.
PATH_PENALTY = 0.1
# type aliases
Label = str  # label (column) name of a tuple
RowIdT = str  # identifier of a source row
BucketIdT = str
HashcodeT = int  # result of `hash(...)`, used as an index key
class Format(Enum):
    """Format tag stored on a `Row` (`Row.format`); presumably the format of the row's source file."""
    JSON = "json"
    XML = "xml"
    CSV = "csv"
    TEXT = "text"
class Type(Enum):
    """Value-type tag stored on a `Tuple` (`Tuple.value_type`)."""
    INT = "int"
    DOUBLE = "double"
    BOOL = "bool"
    STRING = "string"
    DATE = "date"
    UNKNOWN = "unknown"  # used by the placeholder tuple
class Tokenizer:
    """
    Minimal tokenizer kept for tuple metadata.

    Graph edges and scores in the v2 merge are derived from exact normalized
    values, not from token overlap. Tokens are nevertheless materialized on
    every tuple because they help with debugging and inspection and may feed
    future weighting heuristics.
    """

    @staticmethod
    def tokenize(value: str) -> List[str]:
        """Return the lower-cased, whitespace-separated tokens of `value`."""
        lowered = value.lower()
        return lowered.split()
@dataclass(frozen=True)
class Tuple:
    """A single label/value observation extracted from one source row."""
    row_id: str
    index: int
    path: str
    value: str
    source_value: str
    value_type: Type
    label: Label
    tokens: Dict[str, int]
    tokens_hashes: Set[int]
    weight: float

    @classmethod
    def create(cls, row_id: str, index: int, path: str, value: str,
               source_value: str, value_type: Type,
               label: Label,
               weight: float):
        """
        Build a tuple and derive its token metadata from `value`.

        `tokens` maps each distinct token of `value` to its hash and
        `tokens_hashes` is the set of those hashes.
        """
        distinct_tokens = set(Tokenizer.tokenize(value))  # dedup tokens
        token_to_hash = {token: hash(token) for token in distinct_tokens}
        return cls(
            row_id=row_id,
            index=index,
            path=path,
            value=value,
            source_value=source_value,
            value_type=value_type,
            label=label,
            tokens=token_to_hash,
            tokens_hashes=set(token_to_hash.values()),
            weight=weight,
        )

    def __repr__(self):
        parts = [
            f"row_id='{self.row_id}'",
            f"path='{self.path}'",
            f"value='{self.value}'",
            f"tokens='{self.tokens}'",
            f"source_value='{self.source_value}'",
            f"value_type={self.value_type.name}",
            f"label={self.label}",
            f"weight={self.weight}",
        ]
        return "Tuple(" + ", ".join(parts) + ")"
@dataclass
class Row:
    """Sparse source record: an id plus the tuples extracted for it."""
    id: str
    tuples: List[Tuple]
    format: Format
    file_path: str

    def unique_labels(self) -> Set[Label]:
        """Return the set of labels that occur in this row's tuples."""
        return {t.label for t in self.tuples}

    def find_tuple_by_label(self, label: Label) -> None | Tuple:
        """Return the first tuple carrying `label`, or None when there is none."""
        return next((t for t in self.tuples if t.label == label), None)

    def __repr__(self):
        return (f"Row(id={self.id}, tuples={self.tuples}, "
                f"format={self.format.name}, file_path={self.file_path})")
@dataclass
class DataRow:
    """
    Final merged output row.

    `values` may contain multiple tuples for one label when the search ends in a
    score tie. `provenance` stores one or more row-id paths for each resolved
    label so the output can be explained or audited.
    """
    # Id of the source row this output row was built from (set by `merge()`).
    id: str
    # Sum of the search scores of the labels `merge()` had to resolve.
    score: float
    # Resolved tuples per output label; more than one entry means a score tie.
    values: Dict[Label, List[Tuple]]
    # Row-id paths per output label describing where each value came from.
    provenance: Dict[Label, List[List[str]]]
@dataclass
class Node:
    """Lightweight index entry stored inside a label/value bucket."""
    # Id of the row that owns the indexed tuple.
    row_id: str
    # Position of the tuple inside `Row.tuples` (see `Graph.node_to_tuple`).
    tuple_index: int
    # `Graph.add_tuple` stores `hash(t.value)` here, i.e. the value hash.
    token_hash: int
    value: str  # for debugging only
    source_value: str  # for debugging only
@dataclass
class Bucket:
    """
    Per-label index.

    `nodes` groups index entries by the hash of their normalized value.
    `connected` holds the row-to-row adjacency created by exact shared tuples
    of this label, together with a match counter per pair.
    """
    label: Label
    nodes: Dict[HashcodeT, List[Node]]
    connected: Dict[RowIdT, Dict[RowIdT, int]]  # row_id -> connected row + tuple match count

    def connect(self, row_id1: RowIdT, row_id2: RowIdT):
        """Record a symmetric connection between two distinct rows in this bucket."""
        if row_id1 == row_id2:
            # A row is never connected to itself.
            return
        for origin, target in ((row_id1, row_id2), (row_id2, row_id1)):
            if origin not in self.connected:
                self.connected[origin] = defaultdict(int)
            self.connected[origin][target] += 1
@dataclass
class SearchResult:
    """
    Result of resolving one missing label.

    `tuples` may contain multiple winning tuples when several values achieve the
    same best score. `provenance` is keyed by tuple identity so each winning
    value can keep all paths that support it.
    """
    # Best score reached by any tuple of the target label.
    score: float
    # Winning tuple(s); more than one entry means a tie at `score`.
    tuples: List[Tuple]
    # `provenance_key(t)` -> list of row-id paths that reached tuple `t`.
    provenance: Dict[tuple[str, int, Label, str], List[List[str]]]
def is_placeholder_tuple(t: Tuple) -> bool:
    """Tell whether `t` is the "N/A" sentinel tuple used by some legacy helpers/tests."""
    sentinel = "N/A"
    return (t.row_id, t.value) == (sentinel, sentinel)
def make_placeholder_tuple(label: Label) -> Tuple:
    """Build the legacy "N/A" placeholder tuple that stands for an unresolved `label`."""
    sentinel = "N/A"
    return Tuple.create(
        sentinel,
        0,
        path=sentinel,
        value=sentinel,
        source_value=sentinel,
        value_type=Type.UNKNOWN,
        label=label,
        weight=0.0,
    )
def unique_tuples(tuples: List[Tuple]) -> Dict[tuple[Label, str], Tuple]:
    """
    Index tuples by their exact `(label, value)` identity.

    Placeholder tuples are skipped. The first tuple seen for an identity is
    kept, unless the stored one has row id "N/A", in which case it is replaced.
    """
    by_identity = {}
    for candidate in tuples:
        if not is_placeholder_tuple(candidate):
            identity = (candidate.label, candidate.value)
            stored = by_identity.get(identity)
            if stored is None or stored.row_id == "N/A":
                by_identity[identity] = candidate
    return by_identity
def provenance_key(t: Tuple) -> tuple[str, int, Label, str]:
    """Return the `(row_id, index, label, value)` key that identifies one concrete tuple in provenance maps."""
    row_id, index = t.row_id, t.index
    return row_id, index, t.label, t.value
def tuples_match(t1: Tuple, t2: Tuple) -> bool:
    """Tell whether two tuples share both label and value, which is the graph-merge notion of equality."""
    return (t1.label, t1.value) == (t2.label, t2.value)
def dedupe_tuples(tuples: List[Tuple]) -> List[Tuple]:
    """Keep a single representative tuple for every exact `(label, value)` pair."""
    representatives = unique_tuples(tuples)
    return [*representatives.values()]
def tuples_by_label_values(tuples: List[Tuple]) -> Dict[Label, Set[str]]:
    """Build the `label -> set(values)` view of a tuple list, ignoring placeholders."""
    grouped: Dict[Label, Set[str]] = defaultdict(set)
    for item in tuples:
        if is_placeholder_tuple(item):
            continue
        grouped[item.label].add(item.value)
    return grouped
def tuples_compatible(left: List[Tuple], right: List[Tuple]) -> bool:
    """
    Apply the v2 compatibility rule to two tuple sets.

    The sets are compatible when, for every label present on both sides, the
    two sides share at least one identical value. A shared label whose value
    sets are disjoint makes the sets incompatible.
    """
    values_left = tuples_by_label_values(left)
    values_right = tuples_by_label_values(right)
    shared_labels = values_left.keys() & values_right.keys()
    return all(
        not values_left[shared].isdisjoint(values_right[shared])
        for shared in shared_labels
    )
def overlap_count(left: List[Tuple], right: List[Tuple]) -> int:
    """Return how many exact `(label, value)` pairs the two tuple collections share."""
    left_keys = unique_tuples(left).keys()
    right_keys = unique_tuples(right).keys()
    return len(left_keys & right_keys)
def compatible_with_selected(tuples: List[Tuple], selected: Dict[Label, List[Tuple]]) -> bool:
    """Tell whether a candidate row's tuples are compatible with the current selected state."""
    flattened = []
    for tuples_for_label in selected.values():
        flattened.extend(tuples_for_label)
    return tuples_compatible(tuples, flattened)
def merge_search_results(current: SearchResult | None, candidate: SearchResult | None) -> SearchResult | None:
    """
    Fold `candidate` into `current`, keeping score ties and provenance.

    A missing or empty candidate leaves `current` untouched. Otherwise the
    result with the higher score is returned; on equal scores the tuples of
    both results are combined (deduplicated) and their provenance paths are
    concatenated per key.
    """
    if candidate is None or not candidate.tuples:
        return current

    # The candidate is always returned/merged with deduplicated tuples.
    normalized = SearchResult(
        score=candidate.score,
        tuples=dedupe_tuples(candidate.tuples),
        provenance=candidate.provenance,
    )
    if current is None or normalized.score > current.score:
        return normalized
    if normalized.score < current.score:
        return current

    # Equal scores: keep everything from both sides.
    combined_paths: Dict[tuple[str, int, Label, str], List[List[str]]] = {}
    for source in (current.provenance, normalized.provenance):
        for key, paths in source.items():
            combined_paths.setdefault(key, []).extend(paths)
    return SearchResult(
        score=current.score,
        tuples=dedupe_tuples(current.tuples + normalized.tuples),
        provenance=combined_paths,
    )
def data_row_subsumes(left: DataRow, right: DataRow) -> bool:
    """
    Return True when `left` is a strictly more general ambiguity-preserving row.

    `left` subsumes `right` when, for every label of `right`, the values of
    `right` are a subset of the values of `left`, and at least one label has a
    strictly larger value set on the `left` side. A label that `left` does not
    have at all is treated as an empty value set instead of raising KeyError.

    This is used as a final cleanup step so the merge does not emit both a
    narrower row and a broader row that already contains all of the narrower
    row's values.
    """
    strict = False
    for label, right_tuples in right.values.items():
        left_values = {t.value for t in left.values.get(label, ())}
        right_values = {t.value for t in right_tuples}
        if not right_values.issubset(left_values):
            return False
        if left_values != right_values:
            strict = True
    return strict
def strength_tuple(t1: Tuple, t2: Tuple, gain: float = DEFAULT_TUPLE_MATCH_GAIN) -> float:
    """
    Score a single pair of tuples.

    Only an exact match on label and normalized value counts as evidence in
    v2. A matching pair scores `gain` plus the weights of both tuples; any
    other pair scores `0.0`.
    """
    return gain + t1.weight + t2.weight if tuples_match(t1, t2) else 0.0
def strength_tuples(tuples1: List[Tuple], tuples2: List[Tuple], gain: float = DEFAULT_TUPLE_MATCH_GAIN) -> float:
    """
    Add up the evidence of every exact tuple shared by two rows/states.

    Both inputs are first collapsed to one tuple per `(label, value)` pair;
    each pair present on both sides then contributes one `strength_tuple()`
    score.
    """
    left_index = unique_tuples(tuples1)
    right_index = unique_tuples(tuples2)
    accumulated = 0.0
    for shared_key in left_index.keys() & right_index.keys():
        left_tuple = left_index[shared_key]
        right_tuple = right_index[shared_key]
        accumulated += strength_tuple(left_tuple, right_tuple, gain)
    return accumulated
def best_strength_tuple(t1: Tuple, tuples2: List[Tuple], gain: float = DEFAULT_TUPLE_MATCH_GAIN) -> float:
    """
    Score `t1` against the best-matching tuple of `tuples2`.

    Only tuples with the same label as `t1` are considered. Because matching is
    exact, the result is either the weighted score of a matching tuple or
    `0.0` when nothing matches. The helper is mostly kept for scoring
    utilities.
    """
    same_label_scores = [
        strength_tuple(t1, other, gain)
        for other in tuples2
        if t1.label == other.label
    ]
    return max(same_label_scores) if same_label_scores else 0.0
def strength_selected(tuples: List[Tuple], selected: Dict[Label, List[Tuple]]) -> float:
    """
    Measure the agreement between a candidate row and the selected state.

    Every exact `(label, value)` pair of the candidate that also occurs in the
    selected state adds one `SELECTED_MATCH_GAIN`. This is the "context-aware"
    term of the search: rows that fit the partially reconstructed output are
    preferred over rows that merely overlap locally with the frontier.
    """
    selected_keys = set()
    for tuples_for_label in selected.values():
        selected_keys.update(unique_tuples(tuples_for_label))

    agreement = 0.0
    for candidate_key in unique_tuples(tuples):
        if candidate_key in selected_keys:
            agreement += SELECTED_MATCH_GAIN
    return agreement
def is_perfect(tuples: List[Tuple], selected: Dict[Label, List[Tuple]]) -> bool:
    """
    Tell whether a row is immediately consistent with the selected state.

    The v2 seed phase favors such rows. "Perfect" here simply means compatible
    with the selected tuples; it is not a stricter exact-cover test.
    """
    fits_selected_state = compatible_with_selected(tuples, selected)
    return fits_selected_state
def strength(row1: Row, row2: Row, gain: float = DEFAULT_TUPLE_MATCH_GAIN) -> float:
    """Return the exact-overlap strength of two rows (thin wrapper over `strength_tuples`)."""
    first_tuples, second_tuples = row1.tuples, row2.tuples
    return strength_tuples(first_tuples, second_tuples, gain)
def rows_compatible(row1: Row, row2: Row) -> bool:
    """Tell whether two whole rows satisfy the tuple-level compatibility rule."""
    first_tuples, second_tuples = row1.tuples, row2.tuples
    return tuples_compatible(first_tuples, second_tuples)
class Graph:
    """
    Row graph plus label/value indexes used by the merge search.

    `buckets` holds one `Bucket` per label; `rows` maps row ids to the rows
    that were added with `add_row()`.
    """

    def __init__(self) -> None:
        self.buckets: Dict[Label, Bucket] = {}
        self.rows: Dict[RowIdT, Row] = {}

    def get_nodes_by_label(self, label: Label) -> List[Node]:
        """
        Return all indexed nodes for one label across all normalized values.

        An unknown label yields an empty list.
        """
        if label not in self.buckets:
            return []
        ids = []
        for nodes in self.buckets[label].nodes.values():
            ids += nodes
        return ids

    def get_rows_ids_by_label(self, label: Label) -> Set[str]:
        """
        Return the ids of all rows that contain the given label.

        An unknown label yields an empty set.
        """
        if label not in self.buckets:
            return set()
        ids = set()
        for nodes in self.buckets[label].nodes.values():
            for n in nodes:
                ids.add(n.row_id)
        return ids

    def get_directly_connected_rows_with_scores(self, row: Row) -> Dict[str, float]:
        """
        Return compatible one-hop neighbors and their local overlap scores.

        A neighbor is considered directly connected only if the graph already
        contains an exact shared tuple edge and the two rows remain compatible on
        all overlapping labels.

        Returns a dict `neighbor row id -> strength_tuples() score`; neighbors
        with a score of 0.0 are left out.

        Raises:
            KeyError: if one of the row's labels has no bucket.
            ValueError: if the row turns out to be connected to itself.
        """
        res = {}
        ids = set()
        # Collect every row that shares an exact tuple with `row` in any of
        # the buckets of its labels.
        for label in row.unique_labels():
            bucket = self.buckets[label]
            if row.id in bucket.connected:
                ids.update(bucket.connected[row.id].keys())
        # `Bucket.connect()` skips identical ids, so this is a sanity check.
        if row.id in ids:
            raise ValueError(f"row_id {row.id} connected to itself")
        for id in ids:
            candidate = self.rows[id]
            # A shared tuple is not enough: a conflicting value on another
            # shared label disqualifies the neighbor.
            if not rows_compatible(row, candidate):
                continue
            local_score = strength_tuples(row.tuples, candidate.tuples)
            if local_score > 0.0:
                res[id] = local_score
        return res

    def add_tuple(self, row_id: str, t: Tuple, tuple_index: int) -> None:
        """
        Insert one tuple into the label/value index and connect matching rows.

        Connections are created only for exact matches on both label and
        normalized value.

        `tuple_index` is stored on the node so `node_to_tuple()` can find the
        tuple again inside its row.
        """
        if t.label not in self.buckets:
            self.buckets[t.label] = Bucket(label=t.label, nodes=dict(), connected=dict())
        value_hash = hash(t.value)
        if value_hash not in self.buckets[t.label].nodes:
            self.buckets[t.label].nodes[value_hash] = []
        self.buckets[t.label].nodes[value_hash].append(
            Node(row_id=row_id, tuple_index=tuple_index,
                 token_hash=value_hash,
                 value=t.value,
                 source_value=t.source_value))
        bucket = self.buckets[t.label]
        for n in bucket.nodes[value_hash]:
            # The value comparison guards against two different values that
            # happen to share a hash.
            if n.row_id != row_id and n.value == t.value:
                logger.debug(f'connect label={t.label} ,{n.row_id}->{row_id}')
                bucket.connect(n.row_id, row_id)

    def add_row(self, row: Row) -> None:
        """
        Add a row to the graph and index all of its tuples.

        Raises:
            ValueError: if a row with the same id was already added.
        """
        if row.id in self.rows:
            raise ValueError(f'duplicated row id={row.id}')
        self.rows[row.id] = row
        for index, value in enumerate(row.tuples):
            self.add_tuple(row.id, value, index)

    def node_to_tuple(self, n: Node) -> Tuple:
        """Resolve an index node back to the concrete tuple stored in the row."""
        return self.rows[n.row_id].tuples[n.tuple_index]

    def get_rows_ids_by_labels(self, labels: Set[Label]) -> Set[str]:
        """Return all rows that contain at least one of the requested labels."""
        ids = set()
        for label in labels:
            ids.update(self.get_rows_ids_by_label(label))
        return ids

    def bfs_find_best(self, start_row: str, seed_rows: List[tuple[str, float]], label: Label,
                      selected: Dict[Label, List[Tuple]]) -> SearchResult | None:
        """
        Best-first search for the highest-scoring value of one missing label.

        The queue is seeded with compatible starting rows chosen by
        `find_best_tuple()`. Search then expands through compatible one-hop
        neighbors. Each expansion adds:

        - exact local overlap gain with the frontier row,
        - selected-state gain for agreement with the current partial merge,
        - a constant path penalty for taking another edge.

        The search keeps `best_score_by_row` rather than a simple visited set,
        which is important in v2 because the same row may be reachable through
        different-scoring paths. If multiple tuples for the target label finish
        with the same best score, all of them are returned together with their
        provenance paths.

        Args:
            start_row: id of the row being reconstructed; first element of
                every provenance path.
            seed_rows: `(row id, initial score)` pairs to start from.
            label: the label whose value is being searched for.
            selected: current selected state, `label -> tuples`.

        Returns:
            A `SearchResult` with the best score, the deduplicated winning
            tuples and their paths, or None when no visited row carries
            `label`.
        """
        # Heap entries are (-score, row id, path length, path); the score is
        # negated because `heapq` pops the smallest entry first.
        pq = []
        best_score_by_row = defaultdict(lambda: float("-inf"))
        best_tuple_score = float("-inf")
        best_tuples: List[Tuple] = []
        best_provenance: Dict[tuple[str, int, Label, str], List[List[str]]] = defaultdict(list)
        for row_id, seed_score in seed_rows:
            # Keep only the best seed score when a row is seeded more than once.
            if seed_score < best_score_by_row[row_id]:
                continue
            best_score_by_row[row_id] = seed_score
            heapq.heappush(pq, (-seed_score, row_id, 0, [start_row, row_id]))
        while pq:
            neg_score, current_row_id, path_length, path = heapq.heappop(pq)
            current_score = -neg_score
            # Stale entry: the row was reached with a better score meanwhile.
            if current_score < best_score_by_row[current_row_id]:
                continue
            current_row = self.rows[current_row_id]
            if not compatible_with_selected(current_row.tuples, selected):
                continue
            # Check if current row contains the target label
            for t in current_row.tuples:
                if t.label == label:
                    if current_score > best_tuple_score:
                        # Strictly better: drop the previous winners and paths.
                        best_tuple_score = current_score
                        best_tuples = [t]
                        best_provenance = defaultdict(list)
                        best_provenance[provenance_key(t)].append(path)
                    elif current_score == best_tuple_score:
                        # Tie: keep this tuple next to the existing winners.
                        best_tuples.append(t)
                        best_provenance[provenance_key(t)].append(path)
            # Expand only rows that score above the best answer found so far.
            # This also stops expansion at any row that carries the label.
            if current_score <= best_tuple_score:
                continue
            # Explore connected rows, i.e. rows that share the same labels
            candidates = self.get_directly_connected_rows_with_scores(current_row)
            sorted_candidates = sorted(candidates, key=candidates.get, reverse=True)
            logger.debug(f'connected_row_ids={sorted_candidates}')
            for next_row_id in sorted_candidates:
                if next_row_id != current_row_id:
                    # Never revisit a row that is already on this path.
                    if next_row_id in path:
                        continue
                    next_row = self.rows[next_row_id]
                    if not compatible_with_selected(next_row.tuples, selected):
                        continue
                    edge_gain = candidates[next_row_id]
                    # Bonus for selected matches
                    selected_score = strength_selected(next_row.tuples, selected)
                    # v2 defines the path penalty as a constant cost per traversed edge.
                    total_score = current_score + edge_gain + selected_score - PATH_PENALTY
                    # Queue the row only when this path strictly improves on
                    # the best score it was reached with so far.
                    if total_score > best_score_by_row[next_row_id]:
                        best_score_by_row[next_row_id] = total_score
                        heapq.heappush(pq, (-total_score, next_row_id, path_length + 1, path + [next_row_id]))
        if best_tuple_score == float("-inf"):
            return None
        return SearchResult(
            score=best_tuple_score,
            tuples=dedupe_tuples(best_tuples),
            provenance=dict(best_provenance),
        )

    def find_best_tuple(self, label: Label, source_row_id: str,
                        selected: Dict[Label, List[Tuple]]) -> None | SearchResult:
        """
        Resolve one missing label for a source row.

        Search happens in two stages:

        - First, try compatible high-priority seeds ("perfect" rows).
        - If that does not yield an answer, try directly connected compatible
          neighbors of the source row.

        Both stages feed into `bfs_find_best()`, which performs the actual
        best-first graph search and tie preservation.

        Returns None when `label` has no bucket or when neither stage finds a
        value.
        """
        logger.debug(f'Finding best value for row_id={source_row_id}, label={label}, selected={selected.values()}')
        if label not in self.buckets:
            return None
        best_result = None
        source_row = self.rows[source_row_id]
        # Step 1: Select starting rows based on selected
        # rows that fully match `selected` are called: perfect
        # Perfect rows are BFS starts seeded with their `strength_selected()` score.
        perfect_rows = []
        direct_rows = []
        if len(selected) > 0:
            for candidate in self.rows.values():
                if candidate.id == source_row_id:
                    continue
                if not compatible_with_selected(candidate.tuples, selected):
                    continue
                # Skip rows that share no exact tuple with either the source
                # row or the selected state.
                if overlap_count(candidate.tuples, source_row.tuples) == 0 and \
                        overlap_count(candidate.tuples, [t for values in selected.values() for t in values]) == 0:
                    continue
                seed_score = strength_selected(candidate.tuples, selected)
                # NOTE(review): `is_perfect()` delegates to
                # `compatible_with_selected()`, which already passed above, so
                # the `elif` branch below looks unreachable - confirm.
                if is_perfect(candidate.tuples, selected):
                    perfect_rows.append((candidate.id, seed_score))
                elif overlap_count(candidate.tuples, source_row.tuples) > 0:
                    direct_rows.append((
                        candidate.id,
                        seed_score + strength_tuples(source_row.tuples, candidate.tuples),
                    ))
        logger.debug(f"Perfect rows: {perfect_rows}")
        # check perfect rows first
        if perfect_rows:
            res = self.bfs_find_best(source_row_id, perfect_rows, label, selected)
            best_result = merge_search_results(best_result, res)
            if best_result and len(best_result.tuples) > 0:
                # we found a tuple from perfect rows
                logger.debug(
                    f'Found best value from perfect rows={perfect_rows}. label={label}, row_id={source_row_id}: '
                    f'best result={best_result}')
                return best_result
        # Step2: try directly connected rows
        if not direct_rows:
            candidates = self.get_directly_connected_rows_with_scores(source_row)
            for row_id, score in sorted(candidates.items(), key=lambda item: item[1], reverse=True):
                if not compatible_with_selected(self.rows[row_id].tuples, selected):
                    continue
                # Seed score = local overlap with the source row plus the
                # agreement with the selected state.
                direct_rows.append((
                    row_id,
                    score + strength_selected(self.rows[row_id].tuples, selected),
                ))
        if direct_rows:
            res = self.bfs_find_best(source_row_id, direct_rows, label, selected)
            best_result = merge_search_results(best_result, res)
        if best_result and len(best_result.tuples) > 0:
            logger.debug(
                f'Found best value for label={label}, row={source_row_id}: best_result{best_result}')
        else:
            logger.debug(f'No valid value found for label={label}, row={source_row_id}')
        return best_result

    def debug_info(self) -> None:
        """Emit a compact summary of which rows appear in each label bucket."""
        for label, bucket in self.buckets.items():
            row_ids = set()
            for nodes in bucket.nodes.values():
                for n in nodes:
                    row_ids.add(n.row_id)
            logger.debug(f'bucket {label} has rows: {row_ids}')
def create_graph(rows: List[Row]) -> Graph:
    """Return a new `Graph` in which every given row, and thus every tuple, has been indexed."""
    built = Graph()
    add_row = built.add_row
    for source_row in rows:
        add_row(source_row)
    return built
def merge(graph: Graph,
          columns: List[Label], **kwargs) -> List[DataRow]:
    """
    Merge every eligible source row into one or more output rows.

    The merge starts from a copy of each source row's existing tuples, which
    means source-row values become hard constraints for the rest of the search.
    Missing output labels are resolved one by one with `find_best_tuple()`.

    The result table is keyed by the resolved value sets for the requested
    columns so duplicate reconstructions collapse to the highest-scoring row.
    After table construction a subsumption pass removes rows that are strictly
    dominated by a broader ambiguity-preserving row.

    Args:
        graph: graph holding the source rows.
        columns: output labels, in output order.

    Supported kwargs:
    - `row_filter`: optional collection of source row ids to merge. A missing,
      `None` or empty filter means "merge every row".

    Returns:
        The merged rows that are not subsumed by another merged row.
    """
    # `or ()` makes an explicit `row_filter=None` behave like "no filter"
    # instead of failing on `len(None)`.
    row_filter = kwargs.get("row_filter") or ()

    # Keyed by a tuple of per-column sorted value tuples. A structured key
    # cannot collide when a value itself contains "," or "|".
    table: Dict[tuple, DataRow] = {}
    for row in graph.rows.values():
        if row_filter and row.id not in row_filter:
            continue

        total_score = 0.0
        selected: Dict[Label, List[Tuple]] = defaultdict(list)
        provenance: Dict[Label, List[List[str]]] = defaultdict(list)

        # Seed the selected state with the row's own tuples.
        for t in row.tuples:
            selected[t.label].append(t)
        for label, tuples_for_label in list(selected.items()):
            selected[label] = dedupe_tuples(tuples_for_label)
            for t in selected[label]:
                provenance[label].append([row.id, t.row_id])

        # Resolve the requested labels the row does not provide itself. Each
        # resolved label joins `selected` and constrains the following ones.
        for col in columns:
            if col in selected:
                continue
            res = graph.find_best_tuple(col, row.id, selected)
            logger.debug(f'merge label={col} result={res}')
            logger.debug('=' * 100)
            if res and len(res.tuples) > 0:
                total_score += res.score
                selected[col] = dedupe_tuples(res.tuples)
                for t in selected[col]:
                    # Fall back to a direct two-row path when the search
                    # recorded no path for this tuple.
                    provenance[col].extend(
                        res.provenance.get(provenance_key(t), [[row.id, t.row_id]]))

        resolved_values = {col: selected.get(col, []) for col in columns}
        key = tuple(
            tuple(sorted(t.value for t in resolved_values[col]))
            for col in columns
        )
        # Identical reconstructions collapse to the highest-scoring one.
        if key not in table or total_score > table[key].score:
            output_provenance = {col: provenance.get(col, []) for col in columns}
            table[key] = DataRow(id=row.id, score=total_score, values=resolved_values,
                                 provenance=output_provenance)

    logger.debug("\n".join(
        "|".join(f"[{','.join(col_values)}]" for col_values in key)
        for key in table
    ))

    # Drop every row that is strictly subsumed by another row.
    rows = list(table.values())
    filtered_rows = []
    for row in rows:
        if any(other is not row and data_row_subsumes(other, row) for other in rows):
            continue
        filtered_rows.append(row)
    return filtered_rows
def _doc_to_rows(data) -> List[Row]:
    """
    Turn one parsed entity JSON document into `Row` objects.

    Reads `data["rows"]`; every entry provides `id`, `format`, `file_path` and
    a `tuples` list whose items provide `path`, `value`, `source_value`,
    `value_type` and `label`. All tuples get weight 0.0.
    """
    rows = []
    for row_data in data["rows"]:
        row = Row(
            id=row_data["id"],
            tuples=[],
            format=Format(row_data["format"]),
            file_path=row_data["file_path"],
        )
        row.tuples.extend(
            Tuple.create(
                row_id=row.id,
                index=position,
                path=entry["path"],
                value=entry["value"],
                source_value=entry["source_value"],
                value_type=Type(entry["value_type"]),
                label=entry["label"],
                weight=0.0,
            )
            for position, entry in enumerate(row_data["tuples"])
        )
        rows.append(row)
    return rows
def load_entities(file_paths: List[str]) -> List[Row]:
    """
    Load and concatenate entity rows from one or more JSON files.

    Files are read as UTF-8 so the result does not depend on the platform's
    default text encoding.

    Args:
        file_paths: paths of the JSON documents, processed in order.

    Returns:
        The rows of all documents, in file order.

    Raises:
        OSError: if a file cannot be opened.
        json.JSONDecodeError: if a file does not contain valid JSON.
    """
    rows: List[Row] = []
    for file_path in file_paths:
        with open(file_path, 'r', encoding='utf-8') as json_file:
            data = json.load(json_file)
        rows.extend(_doc_to_rows(data))
    return rows
def _parse_rows(input_text: str, initial_weights=None) -> List[Row]:
    """
    Parse the small `R1=[(L1,V1), ...]` text fixture format used in experiments.

    This helper is mainly for toy examples and tests; production/demo flows use
    `load_entities()` with JSON files.

    Args:
        input_text: text containing zero or more `R<n>=[(L<n>,value), ...]` rows.
        initial_weights: optional `row id -> {tuple index -> weight}` mapping;
            tuples without an entry get weight 0.0.

    Returns:
        One `Row` per matched row definition, in order of appearance.
    """
    # A fresh dict per call avoids sharing one mutable default between calls.
    weights = {} if initial_weights is None else initial_weights
    rows = []
    pattern = re.compile(r"(R\d+)=\[(.*?)\]")  # matches R1=[(L1,V1)]
    tuple_pattern = re.compile(r"\((L\d+),([^)]+)\)")  # matches (L1,V1)
    for match in pattern.finditer(input_text):
        row_id = match.group(1)  # extract row ID, e.g., R1
        tuple_str = match.group(2)  # extract tuples, e.g., (L1,V1), (L2,V2)
        tuples = []
        for index, tuple_match in enumerate(tuple_pattern.finditer(tuple_str)):
            label = tuple_match.group(1)  # extract label, e.g., L1
            value = tuple_match.group(2)  # extract value, e.g., V1
            weight = 0.0
            if row_id in weights and index in weights[row_id]:
                weight = weights[row_id][index]
            tuples.append(Tuple.create(
                row_id=row_id,
                index=index,
                path="",
                value=value,
                source_value=value,
                value_type=Type.STRING,
                # `Label` is an alias for `str`, so the parsed text is the
                # label; `getattr(Label, label)` raised AttributeError.
                label=label,
                weight=weight
            ))
        rows.append(Row(id=row_id, tuples=tuples, format=Format.TEXT, file_path=""))
    return rows