-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathfromMongo.py
More file actions
133 lines (108 loc) · 3.68 KB
/
fromMongo.py
File metadata and controls
133 lines (108 loc) · 3.68 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
# -*- coding: utf-8 -*-
import json
import re ,string
import db_config
import unicodedata
from redirect_indexing_new import index_main
from py2neo import Graph,Node,Relationship,authenticate
# Register the Neo4j credentials from db_config for localhost:7474, then open
# a Graph handle. `graph` is not referenced again in the visible part of this
# script.
authenticate("localhost:7474",db_config.username,db_config.password)
graph = Graph()
import urllib
import pymongo
from srmse import text
# MongoDB handles: a client built with default connection settings, the
# "cron-dbpedia" database, its redirects collection and its instance-types
# collection.
client = pymongo.MongoClient()
mdb = client["cron-dbpedia"]
col = mdb["redirects_en"]
instance_col = mdb["instance-types-transitive_en"]
# NOTE(review): instance_list is not referenced again in the visible code;
# instance_col is queried directly instead.
instance_list = instance_col.find()
instance_count = instance_col.find().count()
print instance_count
# Append-mode log; the script writes one "_id" line to it per indexed document.
index_log = open("indexlogFromMongo.txt","a")
doc_list = col.find(no_cursor_timeout=True) #To avoid pymongo.errors.CursorNotFound
# Number of documents matched by the redirects cursor.
doc_count = doc_list.count()
def splittingCamelCase(word):
    """Split a CamelCase identifier into lower-case, space-separated words.

    A word is either a run of lower-case letters optionally led by a single
    capital, or a run of capitals that is followed by another capital or by
    the end of the string (an acronym). Characters matching neither pattern,
    such as digits, are dropped.
    """
    matches = re.finditer(r'[A-Z]?[a-z]+|[A-Z]+(?=[A-Z]|$)', word)
    return " ".join(match.group(0).lower() for match in matches)
def remove_punctuations(s):
    """Replace punctuation (except '.') with spaces and normalise whitespace.

    Every character of ``string.punctuation`` other than '.' is turned into a
    space; runs of whitespace are then collapsed to one space and the result
    is stripped.

    Args:
        s: the text to clean (iterated character by character).

    Returns:
        The cleaned string.

    Raises:
        Whatever error the processing hit (e.g. ``TypeError`` when ``s`` is
        not iterable), after printing an error marker.
    """
    # '.' is deliberately kept out of the exclusion set.
    exclude = set(string.punctuation.replace(".", ""))
    try:
        output = "".join(" " if ch in exclude else ch for ch in s)
        return re.sub(r'\s+', " ", output).strip()
    except Exception:
        # Single-argument parenthesised form parses on Python 2 and 3 alike.
        print("[ERROR] in remove_punctuations()")
        # Re-raise the original error so its type and message are not lost;
        # callers that catch Exception keep working.
        raise
def link_filter(original_link):
    """Normalise a resource/ontology name into a lower-case search string.

    Steps, in order: turn ",_" into a space and "#dot#" into ".", pass the
    text through ``text.removeStopWords`` (from srmse; presumably strips stop
    words, its behaviour is not visible here), replace punctuation via
    ``remove_punctuations``, drop every character whose code point is outside
    32..127, then strip and lower-case.

    This is best-effort: if any step raises, the link is returned as it stood
    after the last step that succeeded.
    """
    try:
        original_link = original_link.replace(",_", " ").replace("#dot#", ".")
        original_link = text.removeStopWords(original_link)
        original_link = remove_punctuations(original_link)
        # Build the filtered text explicitly: str(filter(...)) only yields the
        # filtered string on Python 2; on Python 3 it yields a filter repr.
        kept = "".join(ch for ch in original_link if 31 < ord(ch) < 128)
        original_link = str(kept).strip().lower()
        return original_link
    except Exception:
        # Deliberate fallback: hand back whatever has been produced so far.
        return original_link
# Main indexing pass. For every redirect document this builds one batch made
# of a "redirect": 1 entry for the target page (carrying all redirect names)
# plus one "redirect": 0 entry per redirect name, sends the batch to
# index_main() and logs the document "_id".
try:
    # Iterate the cursor directly. Indexing it (doc_list[ind]) runs a separate
    # skip-based query per document, making the full pass quadratic.
    # NOTE(review): a cursor error now ends the pass instead of skipping one
    # document as the indexed fetch did.
    for doc in doc_list:
        print("------------------------------------------------------")
        link = doc["_id"].replace("http://dbpedia#dot#org/resource/", "").encode("utf-8")
        # Template pages are not indexed; check before the ontology lookup so
        # no query is spent on a document that is skipped anyway.
        if "Template:" in link:
            continue

        doc_dic_batch = {}
        try:
            instance_doc = instance_col.find_one({"_id": doc["_id"]})
            doc_dic_batch["property"] = [
                link_filter(
                    splittingCamelCase(
                        type_url.replace("http://dbpedia.org/ontology/", "")
                    )
                )
                for type_url in instance_doc["dbc"]
            ]
        except Exception:
            # Best-effort: no instance document, no "dbc" field or a failed
            # lookup all fall back to a single empty property.
            doc_dic_batch["property"] = [""]

        print(link)
        filtered_link = link_filter(link)
        print("after filter %s" % (filtered_link,))

        filtered_redirects_link = [
            link_filter(redirect_link.replace("http://dbpedia.org/resource/", ""))
            for redirect_link in doc["from"]
        ]
        doc_dic_batch["real_link"] = filtered_link
        doc_dic_batch["from_link"] = filtered_redirects_link
        doc_dic_batch["redirect"] = 1

        # The target-page entry goes first, followed by one per redirect name.
        list_of_doc = [doc_dic_batch]
        for indi_link in filtered_redirects_link:
            list_of_doc.append({
                "from_link": indi_link,
                "redirect": 0,
                "real_link": filtered_link,
                "property": doc_dic_batch["property"],
            })

        index_main(list_of_doc)
        index_log.write(doc["_id"].encode("utf-8") + "\n")
        print("=========================================================")
finally:
    # The cursor was opened with no_cursor_timeout=True, so it must be closed
    # explicitly; the log file is closed here too so its buffer is flushed.
    doc_list.close()
    index_log.close()