-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathembed.py
More file actions
63 lines (43 loc) · 1.91 KB
/
embed.py
File metadata and controls
63 lines (43 loc) · 1.91 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
from collections import Counter

import cohere #Install with: pip install cohere
import hnswlib #Install with: pip install hnswlib
import pandas as pd
def sort_array_by_frequency(arr):
    """Return the items of *arr* ordered from most to least frequent.

    Every occurrence is kept (the result has the same length as the input).
    Items with the same count keep their original relative order because the
    sort is stable.

    Args:
        arr: Any iterable of hashable items. One-shot iterables such as
            generators are supported.

    Returns:
        A new list with the items sorted by descending occurrence count.
    """
    # Materialise once: the input is walked twice (count, then sort), which
    # would silently yield [] for an iterator that can only be consumed once.
    items = list(arr)
    item_counts = Counter(items)
    return sorted(items, key=item_counts.__getitem__, reverse=True)
class embed:
    """Semantic song recommender backed by Cohere embeddings and an HNSW index.

    On construction the ``text`` column of a CSV dataset is embedded and
    indexed; ``recomend_music`` then embeds a query and returns the songs
    attached to the nearest posts.
    """

    # Single source of truth so documents and queries always share a model.
    _MODEL = 'embed-multilingual-v2.0'
    # Number of nearest posts inspected per query.
    _NEIGHBOURS = 10
    # Maximum number of songs returned per query.
    _MAX_RESULTS = 5

    def __init__(self, key, dataset):
        """Load the dataset, embed every unique post and build the index.

        Args:
            key: Cohere API key.
            dataset: Path (or anything ``pandas.read_csv`` accepts) of a CSV
                with at least the columns ``text``,
                ``musicMeta/musicAuthor`` and ``musicMeta/musicName``.

        Raises:
            ValueError: If the dataset has no usable ``text`` rows.
        """
        self.co = cohere.Client(key)
        posts = pd.read_csv(dataset)
        # Rows without text cannot be embedded; duplicates would only bias
        # the neighbour search towards repeated posts.
        posts = posts.dropna(subset=["text"]).drop_duplicates(subset="text")
        # Re-number rows 0..n-1 so index labels equal row positions.
        self.posts = posts.reset_index(drop=True)

        docs = self.posts["text"].tolist()
        if not docs:
            raise ValueError("dataset contains no rows with a 'text' value")

        # Get the document embeddings
        doc_embs = self.co.embed(texts=docs, model=self._MODEL).embeddings
        self.posts["embeddings"] = doc_embs

        # Create a search index; the dimension is taken from the embeddings
        # themselves instead of being hard-coded for one model.
        self.search_index = hnswlib.Index(space='ip', dim=len(doc_embs[0]))
        self.search_index.init_index(max_elements=len(doc_embs), ef_construction=512, M=64)
        self.search_index.add_items(doc_embs, list(range(len(doc_embs))))

    def recomend_music(self, input):
        """Recommend songs for a free-text query.

        The query is embedded, its nearest posts are looked up, and the songs
        of those posts are ranked by how many of the neighbours use them.

        Args:
            input: Query text. (The name shadows the builtin but is kept for
                backward compatibility with keyword callers.)

        Returns:
            A list of at most five distinct ``"<author>: <name>"`` strings,
            most frequent first; ties keep nearest-neighbour order.
        """
        query_emb = self.co.embed(texts=[input], model=self._MODEL).embeddings
        # Never ask for more neighbours than there are indexed posts.
        k = min(self._NEIGHBOURS, len(self.posts))
        labels = self.search_index.knn_query(query_emb, k=k)[0][0]

        authors = self.posts["musicMeta/musicAuthor"]
        names = self.posts["musicMeta/musicName"]
        # f-string formatting stringifies both fields, so a missing or
        # non-string author no longer raises TypeError.
        musicas = [
            f"{authors.iloc[pos]}: {names.iloc[pos]}"
            for pos in map(int, labels)
        ]

        # most_common de-duplicates while keeping the frequency ranking;
        # going through set() would throw that ordering away.
        ranked = Counter(musicas).most_common(self._MAX_RESULTS)
        return [song for song, _ in ranked]