-
Notifications
You must be signed in to change notification settings - Fork 4
Expand file tree
/
Copy pathwvlinear.py
More file actions
72 lines (61 loc) · 2.59 KB
/
wvlinear.py
File metadata and controls
72 lines (61 loc) · 2.59 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
from itertools import combinations

from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from sklearn.metrics.pairwise import cosine_similarity
# Automatically searches for approximate word equations (first + second - third = result), like those in wvarith, using combinations of words from a given list.
# For variety, it limits how many of the kept equations may reuse the same word.
# Name of the model file (output of wvgen.py); loaded from the "model/" directory
modelf = "mariomodel.bin"
# Name of the word-list file, read from the "list/" directory.
# Entries are separated by "," or by newlines with no commas
wordf = "mariochars.txt"
# Maximum number of equations printed
num_outputs = 15
def filter_results(outputs, max_outputs=None, max_repeats=4):
    """Sort equations by similarity and drop overly repetitive ones.

    Each entry of ``outputs`` is a ``(words, similarity)`` pair, where
    ``words`` is the list of words making up one equation. Entries are
    visited from most to least similar while counting how often each word
    has been seen. An entry containing a word seen more than ``max_repeats``
    times is dropped, but only while dropping it still leaves at least
    ``max_outputs`` entries, so no more entries are removed than necessary.

    Occurrences are counted for every visited entry, including the ones
    that end up dropped.

    Note: ``outputs`` itself is sorted in place (descending similarity).

    Args:
        outputs: list of ``(words, similarity)`` pairs.
        max_outputs: number of entries the caller intends to keep. Defaults
            to the module-level ``num_outputs``.
        max_repeats: how many times a word may appear before entries that
            contain it become candidates for removal.

    Returns:
        A new list with the surviving entries, most similar first. It may
        still be longer than ``max_outputs``; truncation is left to the
        caller.
    """
    if max_outputs is None:
        max_outputs = num_outputs
    # Sort by result similarity, best first (the sort is stable for ties)
    outputs.sort(key=lambda entry: entry[1], reverse=True)
    # Track occurrences of each word
    seen = {}
    # Don't delete more than necessary. Clamp at zero: with fewer entries
    # than requested there is nothing to spare, so nothing may be dropped.
    to_delete = max(0, len(outputs) - max_outputs)
    kept = []
    # Filter to prevent too many repeats of the same word
    for entry in outputs:
        words = entry[0]
        for word in words:
            seen[word] = seen.get(word, 0) + 1
        unique_enough = all(seen[word] <= max_repeats for word in words)
        # Keep it if it's unique enough or if no more deletions are allowed
        if unique_enough or to_delete == 0:
            kept.append(entry)
        else:
            to_delete -= 1
    return kept
def approx_linear(model, words):
    """Print the best approximate "first + second - third = result" equations.

    Every 3-word combination of ``words`` (taken in list order) is sent to
    ``model.wv.most_similar_cosmul`` with the first two words as positives
    and the third as the negative, asking for the top 3 matches. All
    candidate equations are then passed through ``filter_results``, cut
    down to ``num_outputs`` entries and printed, one equation per line,
    with the similarity rounded to 3 decimals.

    Args:
        model: object exposing ``wv.most_similar_cosmul`` (the script passes
            a loaded gensim ``Word2Vec`` model).
        words: list of words to combine; each should be known to the model.

    Returns:
        None. The equations are written to standard output.
    """
    outputs = []
    # Iterate through all equations; combinations() yields each triple of
    # list positions once, so a position is never used twice in an equation
    for first, second, third in combinations(words, 3):
        # Find equation results
        results = model.wv.most_similar_cosmul(
            positive=[first, second], negative=[third], topn=3)
        # Iterate over what was actually returned instead of indexing 0..2,
        # so a model that yields fewer than 3 matches does not raise
        outputs.extend(
            ([first, second, third, match], score)
            for match, score in results)
    # Filter lowest similarity repeats
    outputs = filter_results(outputs)
    # Delete least similar
    for (first, second, third, match), score in outputs[:num_outputs]:
        print(first, "+", second, "-", third,
              "=", match+" :", round(score, 3))
if __name__ == '__main__':
    # Script entry point: load the model and the word list, then print
    # the best equations found among the listed words.
    model = Word2Vec.load("model/"+modelf)
    # Get words from file; the with-block closes it even if reading fails
    with open("list/"+wordf, "r") as w:
        raw = w.read()
    # Entries are separated by "," or by newlines, so treat both the same.
    # Handle multi-word expressions, assuming MWETokenizer separator=' ' in wvgen.py
    words = [' '.join(word_tokenize(x))
             for x in raw.replace("\n", ",").split(",")]
    # Drop repeated entries (keeping first-seen order) so the same word
    # cannot end up twice in one equation
    words = list(dict.fromkeys(words))
    # Remove words not in vocabulary. Membership is tested on the keyed
    # vectors directly rather than through the `vocab` attribute, which
    # newer gensim releases no longer provide.
    words = [x for x in words if x in model.wv]
    approx_linear(model, words)