forked from danlou/LMMS
-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathemb_lemmas.py
More file actions
executable file
·47 lines (34 loc) · 1.67 KB
/
emb_lemmas.py
File metadata and controls
executable file
·47 lines (34 loc) · 1.67 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
import logging
import argparse
from nltk.corpus import wordnet as wn
import fastText
# Configure the root logger at import time so every message emitted by this
# script is shown (DEBUG and above) with a timestamp and its severity level.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%d-%b-%y %H:%M:%S',
    level=logging.DEBUG,
)
def get_senselemma(sensekey):
    """Return the part of *sensekey* that precedes its first '%' character.

    If the sense key contains no '%', it is returned unchanged.
    """
    # replicating method used in SenseVSM
    lemma_part, _sep, _rest = sensekey.partition('%')
    return lemma_part
if __name__ == '__main__':
    # Script entry point: load a fastText model, look up one vector per
    # WordNet sense key (using only the lemma part of the key), and write the
    # vectors to a text file, one "<sensekey> <v1> <v2> ..." line per key.

    # Command line: the fastText model path is optional, the output path is not.
    cli = argparse.ArgumentParser(description='Creates static word embeddings for WordNet synsets (lemmas only).')
    cli.add_argument('-ft_path', help='Path to fastText vectors', required=False,
                     default='external/fasttext/crawl-300d-2M-subword.bin')
    cli.add_argument('-out_path', help='Path to resulting lemma vectors', required=True)
    args = cli.parse_args()

    logging.info('Loading fastText model ...')
    ft_model = fastText.load_model(args.ft_path)

    logging.info('Creating lemma embeddings ...')
    entries = []  # (sensekey, vector) pairs, one per lemma of every synset
    for synset_idx, synset in enumerate(wn.all_synsets()):
        # The vector is looked up by the lemma portion of the sense key,
        # but stored under the full sense key.
        entries.extend(
            (key, ft_model.get_word_vector(get_senselemma(key)))
            for key in (lemma.key() for lemma in synset.lemmas())
        )

        # Progress report every 10000 synsets.
        if synset_idx % 10000 == 0:
            logging.info('at synset %d', synset_idx)

    # Order the output by sense key.
    entries.sort(key=lambda entry: entry[0])

    logging.info('Writing lemma embeddings ...')
    with open(args.out_path, 'w') as out_f:
        for key, vector in entries:
            # Each component is rounded to 6 decimal places before writing.
            components = (str(round(value, 6)) for value in vector.tolist())
            out_f.write('%s %s\n' % (key, ' '.join(components)))

    logging.info('Done')