-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathentity_service.py
More file actions
104 lines (87 loc) · 3.59 KB
/
entity_service.py
File metadata and controls
104 lines (87 loc) · 3.59 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import tornado.ioloop
from tornado.web import Application, RequestHandler, asynchronous
from tornado.ioloop import IOLoop
from ner import Ner
import time
import json
import sys
import logging
from conf import INLINKS_THRESHOLD
# Main class
class NerService(tornado.web.RequestHandler):
    """Tornado request handler exposing the ``Ner`` extractor over HTTP.

    ``GET`` processes one ``text`` query argument; ``POST`` processes a
    newline-separated batch of texts sent in the request body.
    """

    # Values of the ``debug`` query argument that are treated as "off".
    _FALSY_FLAGS = frozenset(("", "0", "false", "no", "off"))

    def initialize(self, concepts_inlinks, stopwords, inlinks_threshold=400,
                   MAX_WORDS=400, MAX_CHARS=2000):
        """Store the request limits and build the ``Ner`` instance.

        Args:
            concepts_inlinks: forwarded unchanged to ``Ner``.
            stopwords: forwarded unchanged to ``Ner``.
            inlinks_threshold: default threshold, used when a GET request
                does not supply its own ``inlinks_threshold`` argument.
            MAX_WORDS: word count above which GET adds a warning; also
                forwarded to ``Ner`` as ``max_words``.
            MAX_CHARS: character count above which GET adds a warning.
        """
        # NOTE(review): Tornado documents initialize() as being called for
        # each request, so a new Ner is built per request - confirm that
        # Ner construction is cheap.
        self.MAX_WORDS = MAX_WORDS
        self.MAX_CHARS = MAX_CHARS
        self.inlinks_threshold = inlinks_threshold
        self.ner = Ner(concepts_inlinks, stopwords,
                       inlinks_threshold=inlinks_threshold,
                       max_words=MAX_WORDS)

    def get(self):
        """Extract concepts from the ``text`` query argument.

        Query arguments:
            text: required; the text to analyse.
            inlinks_threshold: optional integer overriding the default.
            debug: optional flag; when set to anything other than an
                empty string, ``0``, ``false``, ``no`` or ``off`` the full
                result (including warnings) is returned instead of only
                the concept names.

        Raises:
            tornado.web.HTTPError: 400 when ``inlinks_threshold`` is not
                an integer.
        """
        # Get parameters
        raw_threshold = self.get_argument(
            "inlinks_threshold", default=self.inlinks_threshold)
        try:
            inlinks_threshold = int(raw_threshold)
        except (TypeError, ValueError):
            # A malformed client value is a client error, not a server error.
            raise tornado.web.HTTPError(
                400, "inlinks_threshold must be an integer")
        self.ner.inlinks_threshold = inlinks_threshold

        text = self.get_argument("text")

        # get_argument returns a string, so "false" or "0" would be truthy
        # if tested directly; interpret the flag explicitly instead.
        debug_arg = self.get_argument("debug", default="")
        debug = debug_arg.strip().lower() not in self._FALSY_FLAGS

        # Collect warnings for requests that exceed the configured limits
        warning = []
        if len(text) > self.MAX_CHARS:
            warning.append('Only the first %d chars will be processed. This request is over this limit.' % self.MAX_CHARS)
        if len(text.split(' ')) > self.MAX_WORDS:
            warning.append('Only the first %d words will be processed. This request is over this limit.' % self.MAX_WORDS)

        result = self.ner.fetch_entities(text)

        # Do not echo the input text back in the response; tolerate a
        # result that carries no 'text' entry.
        result.pop('text', None)

        # If there are warnings, attach them to the output
        if warning:
            result['warnings'] = warning

        # NOTE(review): warnings are only visible in the debug output; the
        # default response carries the concept names alone.
        if debug:
            self.write(result)
        else:
            self.write({"concepts": list(result["results"].keys())})

    def post(self):
        """Extract concepts for every non-empty line of the request body.

        Each line is split on tabs and only the first field is analysed.
        The response is ``{"response": [{"text": ..., "concepts": [...]}]}``
        with one entry per non-empty line, in input order.

        Raises:
            tornado.web.HTTPError: 400 when the body is not valid UTF-8.
        """
        try:
            body = str(self.request.body, 'utf8')
        except UnicodeDecodeError:
            raise tornado.web.HTTPError(
                400, "request body must be UTF-8 encoded")

        results = []
        for line in body.split('\n'):
            if not line:
                continue
            # Only the first tab-separated field is the text to analyse.
            text = line.split('\t')[0]
            concepts = self.ner.fetch_entities(text)
            concept_names = list(concepts['results'].keys())
            results.append({"text": text, "concepts": concept_names})
        self.write({"response": results})

    def __format_post_result(self, response):
        """Return the concept names of a JSON response joined by ``;;``.

        Args:
            response: JSON document whose ``results`` entry is keyed by
                concept name.

        Returns:
            The concept names joined with ``;;``, or an empty string when
            there are none.
        """
        response_dict = json.loads(response)
        # Iterating the mapping yields its keys; joining nothing gives "".
        return ";;".join(response_dict['results'])
logging.basicConfig(stream=sys.stdout, level=logging.INFO)

# Fail fast on a missing or invalid port instead of discovering it only
# after the data files have been loaded.
if len(sys.argv) < 2:
    sys.exit("usage: %s PORT" % sys.argv[0])
try:
    port = int(sys.argv[1])
except ValueError:
    sys.exit("PORT must be an integer, got %r" % sys.argv[1])

# Data structures filled from the data files below and handed to the handler
concepts_inlinks = {}
stopwords = set()

# Restrictions
MAX_WORDS = 400
MAX_CHARS = MAX_WORDS * 50

logging.info("Loading concepts...")
skipped_lines = 0
with open('data/pagelinks_all.tsv', encoding='utf-8', errors='ignore') as concepts_file:
    # Iterate the file lazily rather than materialising every line first.
    for concept in concepts_file:
        parts = concept.split('\t')
        if len(parts) < 2:
            # Blank or tab-less line: nothing to map, so skip it.
            skipped_lines += 1
            continue
        # NOTE(review): the value is stored as the raw second field (a
        # string, including the trailing newline when it is the last
        # column) - presumably Ner converts it; confirm.
        concepts_inlinks[parts[0]] = parts[1]
if skipped_lines:
    logging.warning("%d malformed line(s) skipped in data/pagelinks_all.tsv.", skipped_lines)
logging.info("%s concepts loaded.", len(concepts_inlinks))

logging.info("Loading stopwords...")
with open('data/stopwords.txt', encoding='utf-8', errors='ignore') as sw_file:
    for sw in sw_file:
        stopwords.add(sw.replace('\n', '').lower())
logging.info("%s stopwords loaded.", len(stopwords))

logging.info("Concept service Started")

# Run application: a single route whose handler receives the loaded data
# and limits through NerService.initialize().
app = tornado.web.Application([
    (r"/", NerService, dict(concepts_inlinks=concepts_inlinks,
                            stopwords=stopwords,
                            inlinks_threshold=INLINKS_THRESHOLD,
                            MAX_WORDS=MAX_WORDS,
                            MAX_CHARS=MAX_CHARS))])
app.listen(port)
# Blocks here, serving requests until the process is stopped.
IOLoop.instance().start()