-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdistance_morph.py
More file actions
133 lines (117 loc) · 3.09 KB
/
distance_morph.py
File metadata and controls
133 lines (117 loc) · 3.09 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
'''
python version of distance_morph.c
will also read in similarity dataset and compute Spearman coefficient
'''
from __future__ import print_function

import sys

import numpy as np
max_size = 2000  # max length of strings
N = 40  # number of closest words that will be shown
max_w = 50  # max length of vocabulary entries


def print_usage():
    """Print the command-line help text."""
    print("Usage: python distance_morph.py <FILE> <word or word+lemma or lemma>")
    print("where FILE contains word projections in the BINARY FORMAT,")
    print("word compares just the word embedding,")
    print("and word+lemma compares the word embedding concatenated with a weighted average of its lemma vectors")
    print("Note: just use word+lemma parameter to read whole vector of lemma file")


def parse_embeddings(lines, word_lemma_opt):
    """Parse embedding lines into a vocabulary and a row-normalised matrix.

    The first line is a header of whitespace-separated integers:
    ``<words> <size> [<word_size>]``.  Every following non-blank line is
    ``<token> <size floats>``.

    Args:
        lines: iterable of text or byte lines (for example an open file).
        word_lemma_opt: comparison mode.  ``"word"`` zeroes every component
            from index ``word_size`` on, ``"lemma"`` zeroes every component
            before ``word_size``, and any other value keeps the full vector.
            Every mode except ``"word+lemma"`` requires the third header
            number.

    Returns:
        ``(vocab, M)`` where ``vocab`` is the list of tokens in file order
        and ``M`` has one unit-length row per token.  A row that is all
        zeros after masking is left as zeros instead of being divided by a
        zero norm.

    Raises:
        SystemExit: if the header is incomplete or a vector line does not
            agree with the header.
    """
    vocab = []
    M = None
    words = 0
    size = 0
    word_size = 0

    for line in lines:
        fields = line.split()

        # Grab the numbers from the header of the file.
        if M is None:
            header_nums = [int(x) for x in fields]
            if len(header_nums) < 2:
                sys.exit("Passed a file with not enough header arguments")
            words, size = header_nums[0], header_nums[1]
            M = np.zeros((words, size))
            if word_lemma_opt != "word+lemma":
                if len(header_nums) < 3:
                    sys.exit("Passed a file with not enough header arguments")
                word_size = header_nums[2]
            continue

        if not fields:
            # Tolerate blank lines (typically a trailing newline at EOF).
            continue

        word = fields[0]
        # The file is read in binary mode, so on Python 3 tokens arrive as
        # bytes and must be decoded to compare equal to typed queries.  On
        # Python 2 ``bytes is str`` and the token is kept untouched.
        if isinstance(word, bytes) and not isinstance(word, str):
            word = word.decode("utf-8")

        row = len(vocab)
        if row >= words:
            sys.exit("File contains more vectors than the %d declared in its header" % words)

        vector = np.array([float(x) for x in fields[1:]])
        if len(vector) != size:
            sys.exit("Vector for '%s' has %d components, header declares %d"
                     % (word, len(vector), size))

        # Clamp so that a nonsensical header value behaves like the original
        # element-wise comparison against ``word_size``.
        split = min(max(word_size, 0), size)
        if word_lemma_opt == "word":
            # Only compare the word part: blank out the lemma part.
            vector[split:] = 0.0
        elif word_lemma_opt == "lemma":
            # Only compare the lemma part: blank out the word part.
            vector[:split] = 0.0

        # Normalize; skip all-zero vectors so they do not turn into NaN.
        norm = np.linalg.norm(vector)
        if norm > 0:
            vector /= norm

        vocab.append(word)
        M[row] = vector

    if M is None:
        sys.exit("Passed a file with not enough header arguments")

    # Drop rows the header promised but the file never supplied, so every
    # remaining row has a vocabulary entry.
    return vocab, M[:len(vocab)]


def load_embeddings(file_name, word_lemma_opt):
    """Read ``file_name`` and return ``parse_embeddings`` of its lines.

    The file is opened in binary mode as in the original script and is
    closed even when parsing fails.
    """
    with open(file_name, "rb") as f:
        return parse_embeddings(f, word_lemma_opt)


def lookup_query(tokens, index_of):
    """Map query tokens to vocabulary rows, reporting each one on stdout.

    Args:
        tokens: the whitespace-split words the user typed.
        index_of: dict mapping a vocabulary token to its row index.

    Returns:
        The list of row indices, or ``None`` as soon as a token is not in
        the vocabulary (tokens after the unknown one are not reported).
    """
    rows = []
    for token in tokens:
        position = index_of.get(token, -1)
        print("\nWord: ", token, " Position in vocabulary: ", position)
        if position == -1:
            print("Out of dictionary word!")
            return None
        rows.append(position)
    return rows


def nearest_words(M, vocab, query_rows, top_n=N):
    """Return the ``top_n`` vocabulary entries most similar to the query.

    The query vector is the normalised sum of the rows in ``query_rows``
    (a row listed twice counts twice).  The score is the dot product of
    that vector with each row of ``M``.

    Args:
        M: 2-D array with one vector per vocabulary entry.
        vocab: tokens, ``vocab[i]`` belonging to ``M[i]``.
        query_rows: row indices of the query words; these rows are never
            returned as neighbours.
        top_n: number of results.

    Returns:
        A list of exactly ``top_n`` ``(word, score)`` pairs in descending
        score order, ties keeping vocabulary order.  Only scores strictly
        greater than -1 qualify; unused slots are filled with ``("", -1)``.
    """
    rows = np.asarray(list(query_rows), dtype=int)
    query = M[rows].sum(axis=0)
    norm = np.linalg.norm(query)
    if norm > 0:
        query = query / norm

    scores = M.dot(query)
    excluded = set(rows.tolist())

    # A stable sort on the negated scores gives descending order while
    # keeping the earlier vocabulary entry first among equal scores, which
    # is what the strict ``dist > bestd[a]`` insertion used to produce.
    ranking = np.argsort(-scores, kind="mergesort")

    best = []
    for row in ranking:
        if len(best) >= top_n:
            break
        row = int(row)
        if row in excluded:
            continue
        score = scores[row]
        if not score > -1:
            # Scores are descending with NaN last, so nothing further on
            # can qualify either.
            break
        best.append((vocab[row], float(score)))

    best.extend([("", -1)] * (top_n - len(best)))
    return best


def read_query(prompt):
    """Read one line from stdin on both Python 2 and Python 3."""
    try:
        reader = raw_input  # Python 2
    except NameError:
        reader = input  # Python 3
    return reader(prompt)


def main(argv=None):
    """Load the embeddings named on the command line and answer queries.

    Args:
        argv: argument vector; defaults to ``sys.argv``.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) < 3:
        print_usage()
        sys.exit(0)

    file_name = argv[1]
    word_lemma_opt = argv[2]
    vocab, M = load_embeddings(file_name, word_lemma_opt)

    # First occurrence wins, matching the original front-to-back scan.
    index_of = {}
    for position, word in enumerate(vocab):
        index_of.setdefault(word, position)

    while True:
        try:
            st1 = read_query("Enter word or sentence (EXIT to break): ")
        except EOFError:
            # End of input is treated like an explicit EXIT.
            break
        if st1 == "EXIT":
            break

        st = st1.split()
        if not st:
            # Nothing typed: ask again instead of scoring an empty query.
            continue

        rows = lookup_query(st, index_of)
        if rows is None:
            continue

        print("\n Word Cosine distance\n------------------------------------------------------------------------")
        for word, dist in nearest_words(M, vocab, rows, N):
            print("%50s\t\t%f" % (word, dist))


if __name__ == "__main__":
    main()