-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdeduplicate_json.py
More file actions
157 lines (126 loc) · 4.74 KB
/
deduplicate_json.py
File metadata and controls
157 lines (126 loc) · 4.74 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
#!/usr/bin/env python3
"""
Deduplicate entries in trades.json and depth_chart.json files.
Removes duplicate entries based on unique identifiers.
"""
import json
import sys
from typing import Any, Callable, Dict, List
def get_trade_key(trade: Dict[str, Any]) -> tuple:
    """
    Build the identity tuple used to recognise duplicate trades.

    Two trades are considered the same when their date, time_et, exchange,
    price, volume, buyer and seller all match. A missing date, time_et,
    exchange, buyer or seller falls back to ''; a missing price or volume
    falls back to 0.
    """
    # (field name, value used when the field is absent), in key order.
    field_defaults = (
        ('date', ''),
        ('time_et', ''),
        ('exchange', ''),
        ('price', 0),
        ('volume', 0),
        ('buyer', ''),
        ('seller', ''),
    )
    return tuple(
        trade.get(field, default) for field, default in field_defaults
    )
def get_depth_chart_key(entry: Dict[str, Any]) -> tuple:
    """
    Build the identity tuple used to recognise duplicate depth chart entries.

    Two entries are considered the same when their timestamp, price, volume,
    buyer_broker, seller_broker, bid_price, ask_price, bid_size and ask_size
    all match. A missing timestamp, buyer_broker or seller_broker falls back
    to ''; every other missing field falls back to 0.
    """
    # (field name, value used when the field is absent), in key order.
    field_defaults = (
        ('timestamp', ''),
        ('price', 0),
        ('volume', 0),
        ('buyer_broker', ''),
        ('seller_broker', ''),
        ('bid_price', 0),
        ('ask_price', 0),
        ('bid_size', 0),
        ('ask_size', 0),
    )
    return tuple(
        entry.get(field, default) for field, default in field_defaults
    )
def _deduplicate_json_file(
    filename: str,
    key_func: Callable[[Dict[str, Any]], tuple],
) -> int:
    """
    Drop repeated entries from the JSON array stored in ``filename``.

    Entries are compared by ``key_func(entry)``. The first entry seen for
    each key is kept and the original order is preserved. The file is
    rewritten (2-space indent, non-ASCII characters kept as-is) only when at
    least one duplicate was found.

    A missing file, invalid JSON, a top-level value that is not an array, or
    any other error raised while processing is printed and treated as
    "nothing removed"; no exception propagates to the caller.

    Args:
        filename: Path of the JSON file to clean up in place.
        key_func: Maps one entry to the hashable key that identifies it.

    Returns:
        The number of duplicate entries removed, or 0 on any error.
    """
    try:
        with open(filename, 'r', encoding='utf-8') as f:
            entries = json.load(f)

        if not isinstance(entries, list):
            print(f"Error: {filename} is not a JSON array")
            return 0

        original_count = len(entries)
        seen = set()
        unique_entries = []
        for entry in entries:
            key = key_func(entry)
            if key not in seen:
                seen.add(key)
                unique_entries.append(entry)

        duplicates_removed = original_count - len(unique_entries)

        # Leave the file untouched when there is nothing to remove.
        if duplicates_removed > 0:
            with open(filename, 'w', encoding='utf-8') as f:
                json.dump(unique_entries, f, indent=2, ensure_ascii=False)
            print(f" Removed {duplicates_removed} duplicate(s) from {filename}")
            print(f" {original_count} -> {len(unique_entries)} entries")

        return duplicates_removed
    except FileNotFoundError:
        print(f" File {filename} not found, skipping")
        return 0
    except json.JSONDecodeError as e:
        print(f" Error parsing {filename}: {e}")
        return 0
    except Exception as e:
        # Best-effort tool: report the problem and carry on with other files.
        print(f" Error processing {filename}: {e}")
        return 0


def deduplicate_trades(filename: str) -> int:
    """
    Remove duplicate trades from a trades.json file.

    Trades are identified by ``get_trade_key``. Errors are printed rather
    than raised.

    Returns the number of duplicates removed (0 on any error).
    """
    return _deduplicate_json_file(filename, get_trade_key)


def deduplicate_depth_chart(filename: str) -> int:
    """
    Remove duplicate entries from a depth_chart.json file.

    Entries are identified by ``get_depth_chart_key``. Errors are printed
    rather than raised.

    Returns the number of duplicates removed (0 on any error).
    """
    return _deduplicate_json_file(filename, get_depth_chart_key)
def main() -> int:
    """
    Deduplicate the trades and depth chart JSON files.

    Processes ``trades.json`` and ``depth_chart.json`` plus their copies
    under ``dashboard/public/`` (paths are relative to the current working
    directory), then prints how many duplicates were removed in total.

    Returns:
        Always 0. The per-file functions print their own errors and return
        0 for a file they could not process, so the total is never negative
        and a failure is not reflected in the exit status.
    """
    print("Deduplicating JSON files...")

    # (deduplicator, path) pairs, processed in this order.
    jobs = (
        (deduplicate_trades, 'trades.json'),
        (deduplicate_trades, 'dashboard/public/trades.json'),
        (deduplicate_depth_chart, 'depth_chart.json'),
        (deduplicate_depth_chart, 'dashboard/public/depth_chart.json'),
    )
    total_removed = sum(dedupe(path) for dedupe, path in jobs)

    if total_removed > 0:
        print(f"\nTotal duplicates removed: {total_removed}")
    else:
        print("\nNo duplicates found.")

    # The previous `0 if total_removed >= 0 else 1` could never yield 1,
    # because the total is a sum of non-negative counts.
    return 0
# Run only when executed as a script; main()'s return value becomes the
# process exit status.
if __name__ == "__main__":
    sys.exit(main())