# NHSOS/deduplicator.py at master · narayan-sajeev/NHSOS · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
# processing/deduplicator.py
"""Business deduplication using exact matching only"""

import glob
import os

import pandas as pd

import config
from cleaner import normalize_for_matching


def load_comparison_data():
    """Load existing business names from the HubSpot and target CSV files.

    Reads ``hubspot.csv`` (column ``Associated Company``) and every
    ``targets_*.csv`` (column ``business_name``) found in
    ``config.CSV_DIR``. Missing values in those columns are dropped.

    Loading is best effort: a missing ``hubspot.csv`` is skipped silently,
    and any file that fails to load is reported on stdout and skipped.

    Returns:
        A pair ``(existing_names, sources)`` of equal-length lists, where
        ``sources[i]`` labels the origin of ``existing_names[i]``: either
        ``'hubspot'`` or ``'target_<file name>'``.
    """
    existing_names = []
    sources = []

    # Load HubSpot data
    hubspot_path = os.path.join(config.CSV_DIR, "hubspot.csv")
    if os.path.exists(hubspot_path):
        try:
            hubspot_df = pd.read_csv(hubspot_path)
            company_names = hubspot_df['Associated Company'].dropna().tolist()
            existing_names.extend(company_names)
            sources.extend(['hubspot'] * len(company_names))
            print(f"Loaded {len(company_names)} companies from HubSpot")
        except Exception as e:
            print(f"Could not load HubSpot data: {e}")

    # Load all target files (targets_*.csv).
    # glob returns paths in arbitrary order; sort them so the order of the
    # returned names (and therefore which duplicate entry comes first) is
    # the same on every run and every machine.
    target_pattern = os.path.join(config.CSV_DIR, "targets_*.csv")
    target_files = sorted(glob.glob(target_pattern))

    for target_file in target_files:
        file_name = os.path.basename(target_file)
        try:
            target_df = pd.read_csv(target_file)
            names = target_df['business_name'].dropna().tolist()
            existing_names.extend(names)
            sources.extend([f'target_{file_name}'] * len(names))
            print(f"Loaded {len(names)} businesses from {file_name}")
        except Exception as e:
            print(f"Could not load {target_file}: {e}")

    print(f"\nTotal existing businesses to check against: {len(existing_names)}")
    return existing_names, sources


def create_normalized_lookup(names, sources):
    """
    Group existing names by their normalized form.

    ``names`` and ``sources`` are walked in lockstep. Each name is passed
    through ``normalize_for_matching``; names whose normalized form is
    falsy are left out. The result maps every normalized name to a list of
    ``{'original': name, 'source': source}`` dicts, in input order, so
    entries that share a normalized name are all kept.
    """
    by_normalized = {}
    for original_name, origin in zip(names, sources):
        key = normalize_for_matching(original_name)
        if not key:
            # Nothing usable to match on; leave this entry out.
            continue
        entry = {
            'original': original_name,
            'source': origin
        }
        by_normalized.setdefault(key, []).append(entry)
    return by_normalized


def deduplicate_businesses(df):
    """
    Deduplicate businesses using exact matching on normalized names.

    Each ``business_name`` in ``df`` is normalized with
    ``normalize_for_matching`` and looked up among the normalized names
    returned by ``load_comparison_data``. A row whose normalized name is
    empty or not found is treated as a new prospect.

    The input DataFrame is not modified.

    Args:
        df: DataFrame with a ``business_name`` column.

    Returns:
        A tuple ``(new_prospects_df, matched_existing_df, match_details)``:

        * ``new_prospects_df`` - the unmatched rows of ``df`` with all of
          its columns and a fresh 0..n-1 index.
        * ``matched_existing_df`` - one row per match, with columns
          ``business_name`` and ``matched_name``.
        * ``match_details`` - dict keyed by the original index label of
          each matched row, holding ``matched_name``, ``source`` and
          ``match_type``.

        When no comparison data is available, returns
        ``(df, pd.DataFrame(), {})``.
    """
    # Load comparison data
    existing_names, sources = load_comparison_data()

    if not existing_names:
        print("Warning: No comparison data found. All businesses will be considered new.")
        return df, pd.DataFrame(), {}

    # Create normalized lookup dictionary
    print("\nCreating lookup dictionary for exact matching...")
    lookup_dict = create_normalized_lookup(existing_names, sources)

    # Keep the normalized names in a separate Series rather than a
    # temporary column, so the caller's DataFrame is never mutated and an
    # existing 'normalized' column is not clobbered.
    normalized = df['business_name'].apply(normalize_for_matching)

    # Perform exact matching
    print("Performing exact matches (case-insensitive)...")
    matches = {}
    matched_rows = []
    new_positions = []  # positional indices of rows with no match

    rows = zip(df.index, df['business_name'], normalized)
    for position, (idx, business_name, norm_name) in enumerate(rows):
        if norm_name and norm_name in lookup_dict:
            # Found exact match
            match_info = lookup_dict[norm_name][0]  # Take first match if multiple
            matches[idx] = {
                'matched_name': match_info['original'],
                'source': match_info['source'],
                'match_type': 'exact'
            }
            matched_rows.append({
                'business_name': business_name,
                'matched_name': match_info['original']
            })
        else:
            # No match found
            new_positions.append(position)

    # Select by position so the columns and dtypes of ``df`` survive even
    # when no row is new; the index is reset to a plain 0..n-1 range.
    new_df = df.iloc[new_positions].reset_index(drop=True)
    # Explicit columns keep the frame's shape stable when nothing matched.
    matched_df = pd.DataFrame(matched_rows, columns=['business_name', 'matched_name'])

    print(f"Found {len(matched_df)} exact matches")
    print(f"Found {len(new_df)} new prospects")

    return new_df, matched_df, matches