makeTopic/w2v.py at master · topicnote/makeTopic · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
import MeCab
import gensim
import numpy
import os


class TopicCorpus():
	"""Assign news titles to topics using a word model and a topic model.

	A title is tokenized with MeCab, each token is looked up in the word
	model, and the resulting vectors are summed into one title vector.  That
	vector is compared with the vectors stored in the topic model: when the
	closest entry is similar enough the title joins that topic, otherwise the
	title vector is stored as a new topic.

	Construction reads the environment variables ``NLP_MODEL_PATH`` (directory
	holding ``ja.bin`` and ``topic.bin``) and ``MECAB_DIC_PATH`` (dictionary
	directory handed to MeCab via ``-d``); a missing variable raises
	``KeyError``.
	"""

	# Length of the vectors built by getNewsVector.
	# NOTE(review): assumed to equal the dimensionality of both models - confirm.
	VECTOR_SIZE = 300
	# Subtracted from the topic-vocabulary size when numbering a new topic.
	# NOTE(review): presumably the topic model ships with 7 entries that are
	# not numbered topics - confirm against how topic.bin was created.
	TOPIC_ID_OFFSET = 7

	def __init__(self):
		self.modelPath = os.environ['NLP_MODEL_PATH']
		self.wordModelPath = self.modelPath+'/ja.bin'
		self.topicModelPath = self.modelPath+'/topic.bin'
		# Load the word model and the topic model (the "topic space").
		self.wordModel = gensim.models.Word2Vec.load(self.wordModelPath)
		self.topicModel = gensim.models.Word2Vec.load(self.topicModelPath)
		# Set up the MeCab tokenizer with the configured dictionary.
		mecabPath = os.environ['MECAB_DIC_PATH']
		self.mecab = MeCab.Tagger("-d "+mecabPath)
		# Similarity a title must exceed to be merged into an existing topic.
		self.threshold = 0.9

	def getNewsVector(self, newsTitle):
		"""Return the weighted sum of the word vectors of ``newsTitle``.

		Tokens missing from the word model contribute nothing, so a title
		without any known token yields an all-zero vector.
		"""
		topicVector = numpy.zeros(self.VECTOR_SIZE)
		node = self.mecab.parseToNode(newsTitle)
		# The first node returned by the parser is skipped, and so is the
		# last one (the node whose ``next`` is None).
		node = node.next
		while node is not None and node.next is not None:
			# Field 6 of the comma-separated feature string is the lookup key.
			# NOTE(review): presumably the base form in an IPADIC-style
			# dictionary - confirm for the dictionary in use.
			word = node.feature.split(",")[6]
			score = self.wordScore(node)
			try:
				wordVector = self.wordModel.wv[word]
			except KeyError:
				# Out-of-vocabulary token: leave the running sum unchanged.
				pass
			else:
				topicVector = topicVector + wordVector*score
			node = node.next
		return topicVector

	def updateTopicVector(self, newsVector, TopicID):
		"""Fold ``newsVector`` into the stored vector of ``TopicID``.

		Not implemented yet: the call is currently a no-op.
		"""
		# TODO: update the existing topic vector with the elements of the
		# newly assigned news vector (an earlier draft averaged the two).
		pass

	def addNewTopic(self, newsVector):
		"""Store ``newsVector`` as a new topic and return its integer ID.

		The ID is the current topic-vocabulary size minus
		``TOPIC_ID_OFFSET``; the vector is stored under ``str(ID)``.
		"""
		newTopicID = len(self.topicModel.wv.vocab)-self.TOPIC_ID_OFFSET
		self.topicModel.wv.add(str(newTopicID), newsVector)
		return newTopicID

	def getTopicID(self, newsTitle):
		"""Return the topic ID for ``newsTitle``, creating a topic if needed.

		NOTE: an existing topic is returned as the string key stored in the
		topic model, while a newly created topic is returned as an int.
		"""
		newsVector = self.getNewsVector(newsTitle)
		# most_similar yields a list of (topic key, similarity) pairs.
		nearestTopic = self.topicModel.wv.most_similar(
			positive=[newsVector], negative=[], topn=1)
		if nearestTopic:
			topicID, similarity = nearestTopic[0]
			# NOTE(review): abs() also accepts strongly negative
			# similarities as a match - confirm this is intended.
			if abs(similarity) > self.threshold:
				# Pass the topic key, not the similarity score.
				self.updateTopicVector(newsVector, topicID)
				return topicID
		# Nothing close enough (or the topic model returned no candidates).
		return self.addNewTopic(newsVector)

	def wordScore(self, node):
		"""Return the weight applied to the vector of ``node``; always 1."""
		# TODO: planned part-of-speech weighting - 2 for proper nouns,
		# 0 for punctuation and case particles, 1 for everything else.
		return 1

if __name__ == "__main__":
	# Assign every title listed in newsList.txt to a topic, print the topic
	# IDs one per line, then persist the (possibly extended) topic model.
	tc = TopicCorpus()

	# NOTE(review): the file is read with the platform default encoding -
	# confirm newsList.txt matches it.
	with open(tc.modelPath+"/newsList.txt") as newsFile:
		for line in newsFile:
			# Drop the line terminator so it is not fed to the tokenizer.
			newsTitle = line.rstrip("\r\n")
			if not newsTitle.strip():
				# A blank line would only create an all-zero topic.
				continue
			topicID = tc.getTopicID(newsTitle)
			print(topicID)

	tc.topicModel.save(tc.topicModelPath)