-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathw2v.py
More file actions
85 lines (72 loc) · 2.41 KB
/
w2v.py
File metadata and controls
85 lines (72 loc) · 2.41 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
import MeCab
import gensim
import numpy
import os
class TopicCorpus():
    """Assign news titles to topics by comparing summed word vectors.

    A title is tokenised with MeCab, every token is looked up in a word
    embedding model, and the (weighted) vectors are summed into one title
    vector.  That vector is compared against the vectors stored in a second
    "topic" model: if the nearest topic is similar enough the title joins
    it, otherwise the title vector is registered as a new topic.

    Environment variables read by ``__init__``:
        NLP_MODEL_PATH: directory containing ``ja.bin`` and ``topic.bin``.
        MECAB_DIC_PATH: dictionary directory handed to MeCab via ``-d``.
    """

    # Length of the title vectors built by getNewsVector.
    # NOTE(review): assumed to equal the vector size of both loaded models
    # -- confirm against the model files.
    VECTOR_SIZE = 300

    def __init__(self):
        """Load both models and the MeCab tagger from the environment.

        Raises:
            KeyError: if NLP_MODEL_PATH or MECAB_DIC_PATH is not set.
        """
        self.modelPath = os.environ['NLP_MODEL_PATH']
        self.wordModelPath = self.modelPath+'/ja.bin'
        self.topicModelPath = self.modelPath+'/topic.bin'
        # Load the word model and the topic model (the topic space).
        self.wordModel = gensim.models.Word2Vec.load(self.wordModelPath)
        self.topicModel = gensim.models.Word2Vec.load(self.topicModelPath)
        # Set up MeCab.
        mecabPath = os.environ['MECAB_DIC_PATH']
        self.mecab = MeCab.Tagger("-d "+mecabPath)
        # Similarity threshold above which a title joins an existing topic.
        self.threshold = 0.9

    def getNewsVector(self, newsTitle):
        """Return the sum of the weighted word vectors of ``newsTitle``.

        Args:
            newsTitle: text to tokenise with MeCab.

        Returns:
            A numpy array of length ``VECTOR_SIZE``.  Tokens missing from
            the word model contribute nothing, so a title with no known
            tokens yields an all-zero vector.
        """
        topicVector = numpy.zeros(self.VECTOR_SIZE)
        # The first node of the chain is skipped, and the loop stops on the
        # last node (the one with no successor) without processing it.
        node = self.mecab.parseToNode(newsTitle).next
        while node is not None and node.next is not None:
            # Vectorise the word, weight it and add it to topicVector.
            # Field 6 of the comma-separated feature string is used as the
            # lookup key (presumably the base form -- depends on the
            # MeCab dictionary in use).
            word = node.feature.split(",")[6]
            score = self.wordScore(node)
            try:
                wordVector = self.wordModel[word]*score
            except KeyError:
                # Out-of-vocabulary word: contributes a zero vector.
                wordVector = numpy.zeros(self.VECTOR_SIZE)
            topicVector = topicVector + wordVector
            node = node.next
        return topicVector

    def updateTopicVector(self, newsVector, TopicID):
        """Fold ``newsVector`` into the stored vector of ``TopicID``.

        Not implemented yet: this is currently a no-op.  The intent is to
        update the existing topic vector with the newly added title vector
        (e.g. by averaging the two).

        Args:
            newsVector: vector of the title that joined the topic.
            TopicID: key of the topic in the topic model.
        """
        pass

    def addNewTopic(self, newsVector):
        """Register ``newsVector`` as a new topic and return its ID.

        Args:
            newsVector: vector to store for the new topic.

        Returns:
            The new topic ID as an ``int``; it is stored in the topic model
            under its string form.
        """
        # NOTE(review): the reason for the offset of 7 is not visible in
        # this file -- confirm before changing it.
        newTopicID = len(self.topicModel.wv.vocab)-7
        self.topicModel.wv.add(str(newTopicID), newsVector)
        return newTopicID

    def getTopicID(self, newsTitle):
        """Return the topic ID for ``newsTitle``, creating a topic if needed.

        Args:
            newsTitle: text of the news title.

        Returns:
            The key (``str``) of the nearest existing topic when the
            absolute value of its score exceeds ``self.threshold``;
            otherwise the ``int`` ID of a newly added topic.
        """
        newsVector = self.getNewsVector(newsTitle)
        # most_similar yields a list of (topic key, score) pairs; only the
        # single nearest topic is requested.
        nearestTopic = self.topicModel.most_similar(
            positive=[newsVector], negative=[], topn=1)
        topicID, similarity = nearestTopic[0]
        if abs(similarity) > self.threshold:
            # Pass the topic key (not the score) to the update hook.
            self.updateTopicVector(newsVector, topicID)
            return topicID
        return self.addNewTopic(newsVector)

    def wordScore(self, node):
        """Return the weight applied to the vector of ``node``'s word.

        Every word currently gets weight 1.  A sketched (disabled) scheme
        keyed on field 1 of ``node.feature`` would weight proper nouns
        ("固有名詞") by 2, drop full stops and case particles ("句点",
        "格助詞") with weight 0, and give everything else 1.

        Args:
            node: MeCab node of the word (unused at the moment).
        """
        return 1
if __name__ == "__main__":
    # Script entry point: print a topic ID for every line of
    # <NLP_MODEL_PATH>/newsList.txt, then persist the topic model, which
    # may have gained new topics along the way.
    tc = TopicCorpus()
    # The context manager guarantees the file is closed even if topic
    # assignment raises part-way through the list.
    with open(tc.modelPath+"/newsList.txt") as newsFile:
        # Each line (one title, trailing newline included) is passed on
        # unchanged.
        for newsTitle in newsFile:
            topicID = tc.getTopicID(newsTitle)
            print(topicID)
    tc.topicModel.save(tc.topicModelPath)