forked from sophiie-ai/hackathon
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathqdrant_manager.py
More file actions
180 lines (149 loc) · 6.03 KB
/
qdrant_manager.py
File metadata and controls
180 lines (149 loc) · 6.03 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
import os
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct
from sentence_transformers import SentenceTransformer
import uuid
from datetime import datetime
from inngest_monitor import track_qdrant_save, track_qdrant_search
# Module-level Qdrant client shared by every function in this file.
# ":memory:" selects the client's in-memory mode; pass a server URL instead
# to talk to a Qdrant server.
# NOTE(review): in-memory data presumably does not survive a process
# restart - confirm before relying on persistence.
qdrant_client = QdrantClient(":memory:")
# Sentence-transformer model used to turn text into embedding vectors. The
# collection created by initialize_qdrant() is sized (384) for this model.
model = SentenceTransformer('all-MiniLM-L6-v2')
# Name of the Qdrant collection that stores the product/ingredient points.
COLLECTION_NAME = "product_ingredients"
def initialize_qdrant():
    """Make sure the product-ingredients collection exists in Qdrant.

    Returns:
        True when the collection already existed or was just created;
        False when any Qdrant call raised (the error is printed, not
        propagated).
    """
    try:
        existing_names = [
            col.name for col in qdrant_client.get_collections().collections
        ]
        if COLLECTION_NAME in existing_names:
            print(f"Collection {COLLECTION_NAME} already exists")
            return True

        # 384 is the embedding size of all-MiniLM-L6-v2.
        vector_settings = VectorParams(size=384, distance=Distance.COSINE)
        qdrant_client.create_collection(
            collection_name=COLLECTION_NAME,
            vectors_config=vector_settings,
        )
        print(f"Created Qdrant collection: {COLLECTION_NAME}")
        return True
    except Exception as e:
        print(f"Error initializing Qdrant: {e}")
        return False
def extract_ingredients_from_content(content, title, max_chars=300):
    """Pull an ingredient-related snippet out of free-form product text.

    This is a simple heuristic: the keywords below are tried in priority
    order (not in order of appearance in the text), and the snippet starts
    at the first occurrence of the first keyword that is present.

    Args:
        content: Product page text. ``None`` or an empty string yields "".
        title: Product title. Currently unused; kept so existing callers
            keep working.
        max_chars: Maximum length of the returned snippet (default 300).

    Returns:
        Up to ``max_chars`` characters of ``content`` starting at the
        matched keyword, or the first ``max_chars`` characters of
        ``content`` when no keyword is found.
    """
    import re

    if not content:
        return ""

    # Common keywords that indicate an ingredients section.
    keywords = ("ingredients", "contains", "composition", "made with", "made from")

    for keyword in keywords:
        # Match case-insensitively against the ORIGINAL text. Searching a
        # lowered copy and slicing the original misaligns the snippet when
        # lower() changes the string length (e.g. "\u0130" becomes two
        # code points).
        match = re.search(re.escape(keyword), content, re.IGNORECASE)
        if match:
            start_idx = match.start()
            return content[start_idx:start_idx + max_chars]

    # No keyword found: fall back to the beginning of the content as context.
    return content[:max_chars]
def save_product_to_qdrant(product_data):
    """Embed a product and upsert it into the Qdrant collection.

    Args:
        product_data: Dict with keys - title, url, content, store,
            ingredients, product_description, groq_analysis. Missing keys
            default to ''. A ``None`` title or content is treated as ''.

    Returns:
        ``(True, ingredients)`` on success, where ``ingredients`` is the
        text that was stored (supplied by the caller or extracted from the
        content); ``(False, error_message)`` when anything raised.
    """
    try:
        # `or ''` also covers keys that are present but explicitly None,
        # which would otherwise break the slice below or put the literal
        # text "None" into the embedding input.
        title = product_data.get('title') or ''
        content = product_data.get('content') or ''

        # Use the supplied ingredients, or fall back to extracting them.
        ingredients = product_data.get('ingredients', '')
        if not ingredients:
            ingredients = extract_ingredients_from_content(content, title)

        # Create embedding for the product (title + ingredients).
        text_to_embed = f"{title} {ingredients}"
        embedding = model.encode(text_to_embed).tolist()

        point = PointStruct(
            id=str(uuid.uuid4()),
            vector=embedding,
            payload={
                "title": title,
                "url": product_data.get('url', ''),
                "store": product_data.get('store', ''),
                "ingredients": ingredients,
                "content": content[:500],  # Store first 500 chars
                "timestamp": datetime.now().isoformat(),
                "product_description": product_data.get('product_description', ''),
                "groq_analysis": product_data.get('groq_analysis', ''),  # Store Groq analysis
            },
        )

        qdrant_client.upsert(
            collection_name=COLLECTION_NAME,
            points=[point],
        )

        # Track with Inngest
        track_qdrant_save(title, product_data.get('store', ''), True)
        return True, ingredients
    except Exception as e:
        print(f"Error saving to Qdrant: {e}")
        # Track error with Inngest
        track_qdrant_save(product_data.get('title', ''), product_data.get('store', ''), False, error=e)
        return False, str(e)
def search_similar_products(query, limit=5):
    """Find stored products whose embedding is closest to ``query``.

    Args:
        query: Search text (product name or ingredients).
        limit: Maximum number of hits to return.

    Returns:
        Whatever ``qdrant_client.search`` returns for the query, or an
        empty list when any step raised (the error is printed and reported
        to the Inngest tracker).
    """
    # NOTE(review): newer qdrant-client releases appear to deprecate
    # `search` in favour of `query_points` - confirm before upgrading.
    try:
        query_vector = model.encode(query).tolist()
        hits = qdrant_client.search(
            collection_name=COLLECTION_NAME,
            query_vector=query_vector,
            limit=limit,
        )
        hit_count = len(hits)
        # Track with Inngest
        track_qdrant_search(query, hit_count, True)
        return hits
    except Exception as e:
        print(f"Error searching Qdrant: {e}")
        # Track error with Inngest
        track_qdrant_search(query, 0, False, error=e)
        return []
def get_all_products():
    """Get all stored products from Qdrant.

    Pages through the collection 100 points at a time, following the
    next-page offset returned by ``scroll`` until it is exhausted, so
    collections with more than 100 points are returned in full.

    Returns:
        A list of every point in the collection, or an empty list when any
        Qdrant call raised (the error is printed, not propagated).
    """
    try:
        products = []
        next_offset = None
        while True:
            # scroll() returns (points, next_page_offset).
            page, next_offset = qdrant_client.scroll(
                collection_name=COLLECTION_NAME,
                limit=100,
                offset=next_offset,
            )
            products.extend(page)
            # Stop on the last page; the empty-page check guards against
            # looping forever if an offset is returned with no points.
            if next_offset is None or not page:
                break
        return products
    except Exception as e:
        print(f"Error getting products: {e}")
        return []
def get_collection_stats():
    """Summarise the product collection.

    Returns:
        A dict with ``total_products``, ``vector_size`` and ``distance``
        read from the collection info, or an empty dict when the lookup
        raised (the error is printed, not propagated).
    """
    try:
        collection_info = qdrant_client.get_collection(collection_name=COLLECTION_NAME)
        stats = {"total_products": collection_info.points_count}
        vector_params = collection_info.config.params.vectors
        stats["vector_size"] = vector_params.size
        stats["distance"] = vector_params.distance
        return stats
    except Exception as e:
        print(f"Error getting stats: {e}")
        return {}