-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathextract.py
More file actions
153 lines (126 loc) · 5.17 KB
/
extract.py
File metadata and controls
153 lines (126 loc) · 5.17 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
import sqlite3
from bs4 import BeautifulSoup
import re
import json
# Create database and table with all required fields
conn = sqlite3.connect('products.db')
cursor = conn.cursor()
cursor.execute('''
CREATE TABLE IF NOT EXISTS products (
id INTEGER PRIMARY KEY AUTOINCREMENT,
title TEXT,
price REAL,
discount TEXT,
discount_price REAL,
quantity_ml REAL,
pack_size TEXT
)
''')
conn.commit()
# Read and parse HTML
with open('tesco_product_persistent.html', 'r', encoding='utf-8') as f:
html = f.read()
soup = BeautifulSoup(html, 'html.parser')
# Find all product cards
product_cards = soup.find_all('article', {'data-testid': re.compile(r'ProductCardWrapper')})
def extract_quantity_ml(title, pack_size_text):
"""Extract quantity in ml from title or pack size text"""
# Try to find ml or L in title first
title_match = re.search(r'(\d+(?:\.\d+)?)\s*(ml|L|l)', title, re.IGNORECASE)
if title_match:
value = float(title_match.group(1))
unit = title_match.group(2).lower()
if unit == 'l':
return value * 1000 # Convert L to ml
return value
# Try pack size text
if pack_size_text:
pack_match = re.search(r'(\d+(?:\.\d+)?)\s*(ml|L|l)', pack_size_text, re.IGNORECASE)
if pack_match:
value = float(pack_match.group(1))
unit = pack_match.group(2).lower()
if unit == 'l':
return value * 1000
return value
return None
def extract_price_from_text(text):
"""Extract numeric price from text like '€2.55' or 'was €5.50'"""
price_match = re.search(r'€?\s*(\d+(?:\.\d+)?)', text.replace(',', ''))
if price_match:
return float(price_match.group(1))
return None
def calculate_discount_price(price, discount_text, was_price):
"""Calculate discount price from promotion or was price"""
if was_price:
return price # Current price is the discount price
# Check for "Buy X for €Y" promotions
if discount_text:
buy_match = re.search(r'Buy\s+(\d+)\s+for\s+€?(\d+(?:\.\d+)?)', discount_text, re.IGNORECASE)
if buy_match:
quantity = int(buy_match.group(1))
total_price = float(buy_match.group(2))
return total_price / quantity
return None
products_data = []
for card in product_cards:
# Extract title
title_elem = card.find('div', {'data-testid': re.compile(r'ProductNameTestId')})
title = title_elem.get_text(strip=True) if title_elem else None
if not title:
continue
# Extract current price
price_elem = card.find('div', class_=re.compile(r'ProductPrice'))
price = None
if price_elem:
price_text = price_elem.get_text(strip=True)
price = extract_price_from_text(price_text)
# Extract "was" price (original price before discount)
was_price_elem = card.find('div', {'data-testid': re.compile(r'ProductCardWasPrice')})
was_price = None
if was_price_elem:
was_price_text = was_price_elem.get_text(strip=True)
was_price = extract_price_from_text(was_price_text)
# Extract discount/promotion badge
discount_elem = card.find('div', {'data-testid': re.compile(r'promotionBadge')})
discount = discount_elem.get_text(strip=True) if discount_elem else None
# Calculate discount price
discount_price = calculate_discount_price(price, discount, was_price)
# Extract pack size from aria-hidden description
pack_size = None
aria_title = card.find('div', class_=re.compile(r'AriaProductTitle'))
if aria_title:
pack_size_match = re.search(r'Pack Size.*?<br/>([^&]+)', str(aria_title), re.IGNORECASE)
if pack_size_match:
pack_size = pack_size_match.group(1).strip()
# Clean up HTML entities
pack_size = pack_size.replace('&#8494;', '').strip()
# Extract quantity in ml
quantity_ml = extract_quantity_ml(title, pack_size)
# Insert into database
cursor.execute('''
INSERT INTO products (title, price, discount, discount_price, quantity_ml, pack_size)
VALUES (?, ?, ?, ?, ?, ?)
''', (title, price, discount, discount_price, quantity_ml, pack_size))
products_data.append({
'title': title,
'price': price,
'discount': discount,
'discount_price': discount_price,
'quantity_ml': quantity_ml,
'pack_size': pack_size
})
print(f"Extracted: {title}")
print(f" Price: €{price}")
print(f" Discount: {discount}")
print(f" Discount Price: €{discount_price if discount_price else 'N/A'}")
print(f" Quantity: {quantity_ml}ml" if quantity_ml else " Quantity: N/A")
print(f" Pack Size: {pack_size}" if pack_size else " Pack Size: N/A")
print("-" * 80)
conn.commit()
conn.close()
# Also save to JSON for easy viewing
with open('products_extracted.json', 'w', encoding='utf-8') as f:
json.dump(products_data, f, indent=2, ensure_ascii=False)
print(f"\n✓ Extracted {len(products_data)} products!")
print("✓ Data saved to products.db (SQLite database)")
print("✓ Data also saved to products_extracted.json (JSON file)")