-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathindexer.py
More file actions
119 lines (99 loc) · 3.27 KB
/
indexer.py
File metadata and controls
119 lines (99 loc) · 3.27 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
# Assignment 3 Indexer
from pathlib import Path
import nltk, json, os, sys, requests, re
from bs4 import BeautifulSoup
# Inverted index: token -> list of (docID, frequency, weight) tuples,
# one tuple per document containing the token (appended by updateIndex).
index = {}
# Document list. main() replaces it with the JSON file paths, and updateIndex
# uses a path's position in this list as that document's docID.
files = []
def extractFileContent(f):
    """Load one crawled JSON document and parse its HTML payload.

    f -- path of a JSON file holding an object whose ``"content"`` entry is
         the page markup.

    Returns the parsed ``BeautifulSoup`` tree, or ``''`` when the file cannot
    be opened, is not valid JSON, has no ``"content"`` entry, or the markup
    cannot be parsed.
    """
    try:
        # The context manager closes the handle even if json.load fails.
        # The handle is deliberately not stored in the global ``files`` list:
        # that list holds paths and is looked up by path in updateIndex.
        with open(f, 'r', encoding='utf-8') as handle:
            data = json.load(handle)
        return BeautifulSoup(data["content"], "html.parser")
    except Exception as err:
        # Best effort: one unreadable document must not abort the whole run,
        # but it is reported instead of being dropped silently.
        print(f'skipping {f}: {err!r}', file=sys.stderr)
        return ''
def getFilePaths(directory='/Users/ovyabarani/Desktop/assignment3/ANALYST'):
    """Return the paths of every ``.json`` file found under ``directory``.

    directory -- root folder to search recursively; defaults to the corpus
                 location this script was originally written for.

    Returns a list of path strings in sorted order, so that repeated runs
    over the same corpus list the documents in the same order (glob order
    is otherwise filesystem dependent).
    """
    return sorted(str(path) for path in Path(directory).glob('**/*.json'))
def tokenize(content, file):
    """Count the alphabetic words of a parsed page and attach weights.

    content -- parsed page exposing ``get_text()``; the ``''`` that
               extractFileContent returns on failure is also accepted.
    file    -- path of the document, used only in the failure message.

    Returns a dict mapping each lower-cased word to ``[frequency, weight]``;
    the weight slot starts at 0 and is filled in by findSpecialWords.
    Returns an empty dict when there is nothing to tokenize or when
    tokenizing fails, so callers always receive the same type.
    """
    # A failed extraction yields '' (no get_text), so there is nothing to index.
    if not hasattr(content, 'get_text'):
        return {}
    try:
        wordcounts = {}
        for word in nltk.word_tokenize(content.get_text()):
            # Keep purely alphabetic tokens longer than one character.
            if word.isalpha() and len(word) > 1:
                wordcounts.setdefault(word.lower(), [0, 0])[0] += 1
        return findSpecialWords(wordcounts, content)
    except Exception as err:
        # Best effort per document, but make the failure visible: a missing
        # tokenizer resource would otherwise silently empty the whole index.
        print(f'could not tokenize {file}: {err!r}', file=sys.stderr)
        return {}
def stripTags(words:list):
    """Return the lower-cased alphabetic tokens found inside the given tags.

    words -- parsed tag objects, each exposing ``get_text()``.

    The tokens of all tags are returned in one flat list, in tag order.
    """
    collected = []
    for tag in words:
        plain = tag.get_text().lower()
        collected.extend(tok for tok in nltk.word_tokenize(plain) if tok.isalpha())
    return collected
def findSpecialWords(wordcounts, content):
    """Add importance weight to words that occur inside prominent markup.

    wordcounts -- dict of word -> [frequency, weight]; updated in place.
    content    -- parsed page supporting ``find_all``.

    Each occurrence of an already-counted word inside a matching tag adds
    that tag group's bonus to the word's weight slot. Words not present in
    ``wordcounts`` are ignored. Returns the same ``wordcounts`` dict.
    """
    # (tag selector, bonus per occurrence), applied in this order.
    boosts = (
        ('title', 15),                    # page titles
        (re.compile('^h[1-2]$'), 10),     # h1-h2
        (re.compile('^h[3-6]$'), 5),      # h3-h6
        (['b', 'strong'], 1),             # bolded words
    )
    for selector, bonus in boosts:
        for word in stripTags(content.find_all(selector)):
            entry = wordcounts.get(word)
            if entry is not None:
                entry[1] += bonus
    return wordcounts
def updateIndex(filepath, content, tokens):
    """Add one document's token statistics to the global inverted index.

    filepath -- the document's entry in the global ``files`` list; its
                position there becomes the docID.
    content  -- unused here.
    tokens   -- mapping of token -> [frequency, weight]; an empty iterable
                adds nothing.

    For every token a ``(docID, frequency, weight)`` posting is appended to
    ``index[token]``.
    """
    print(f'updating index for {str(filepath)}')
    docID = files.index(filepath)
    for term in tokens:
        stats = tokens[term]
        posting = (docID, stats[0], stats[1])
        index.setdefault(term, []).append(posting)
def update(output_path='indexjson.json'):
    """Write the global index to disk as JSON and print summary statistics.

    output_path -- destination file; defaults to ``indexjson.json``.

    Prints the number of entries in ``files``, the number of unique tokens
    in ``index`` and the size of the written file in kilobytes
    (bytes / 1000).
    """
    # The with-block closes the file itself; no explicit close() is needed.
    with open(output_path, 'w') as out:
        json.dump(index, out)
    print(f'number of documents: {len(files)}')
    print(f'number of unique tokens: {len(index)}')
    # Measured only after the file is closed, so buffered data is on disk
    # and the reported size is complete.
    size = os.path.getsize(output_path) / 1000
    print(f'size of index: {size} kb')
def main():
    """Index every document returned by getFilePaths, then write the index.

    Rebinds the global ``files`` list, processes each entry in turn
    (extract, tokenize, add to the index) and finally calls update() once.
    """
    global files
    files = getFilePaths()
    for path in files:
        soup = extractFileContent(path)
        word_stats = tokenize(soup, path)
        updateIndex(path, soup, word_stats)
    update()
# Script entry point: build the index only when this file is run directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()