e-learning-generic/siteindex.py at master · engagelab/e-learning-generic · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
#!/usr/bin/python
from HTMLParser import HTMLParser
from os.path import join
from os import walk
import json
import re

# --- Indexer configuration -------------------------------------------------

# Root directory that is walked recursively for section files.
fPath = 'views/'
# Not referenced by the code in this file; kept so the module's names stay stable.
fList = []
# Tags whose attributes are inspected for keyword metadata.
metaTags = ['meta']
# Tags whose text content is added to the free-text index.
dataTags = ['p', 'strong', 'span', 'h1', 'h2', 'h3', 'h4']
# Not referenced by the code in this file; kept so the module's names stay stable.
anchorableTags = ['section', 'aside']
# word -> list of {'extract': ..., 'code': [section, module, language]}
freetext_dictionary = {}
# keyword -> list of {'path': ...}
meta_dictionary = {}
# Words shorter than this are not indexed.
minWordLength = 4
# Texts of up to this many words are used whole as the extract; longer texts
# are cut down to a window centred on the indexed word.
extractWordLength = 20
# Only <meta> tags whose name attribute equals this value are indexed.
metaNameToSearch = "module"
# Directory of a module's sections; group 1 is the module number.
# Raw string so "\d" reaches the regex engine unchanged; accepts multi-digit
# module numbers and either path separator.
moduleRegex = r"views[\\/]m(\d+)[\\/]sections"
# Section file name; group 1 is the section number, group 2 the two-letter
# language code. The dot is escaped and the pattern is anchored at the end so
# names such as "s1_en.html.bak" are not picked up.
sectionRegex = r"s(\d+)_([a-z]{2})\.html$"

# Example of an indexed tag (its name must equal metaNameToSearch):
#   <meta name="module" content="one,two,three,four">

# create a subclass for the parser and override the handler methods
class MyHTMLParser(HTMLParser):
    """Collect free-text words and <meta> keywords from HTML section files.

    Call set_path() before feeding each file so that the entries recorded
    while parsing are tagged with that file's path, section, module and
    language. Results are accumulated in the module-level dictionaries
    freetext_dictionary and meta_dictionary.
    """

    # True after a start tag listed in dataTags; cleared by the next text chunk.
    takeTheData = False
    # Path of the file currently being parsed (stored with meta keywords).
    currentPath = ""
    # No longer used by the parsing logic; kept so the attribute still exists.
    parsingEnabled = False
    # Keywords of the most recently indexed <meta> tag of the current file.
    currentContent = []
    currentModule = ""
    currentSection = ""
    currentLanguage = ""

    def handle_starttag(self, tag, attrs):
        """Arm text collection for data tags and index keyword <meta> tags.

        tag   -- tag name as reported by HTMLParser
        attrs -- list of (name, value) attribute pairs
        """
        if tag in dataTags:
            self.takeTheData = True
        if tag not in metaTags:
            return
        # Look the attributes up by name so the result does not depend on
        # whether "name" or "content" is written first in the tag.
        attributes = dict(attrs)
        if attributes.get('name') != metaNameToSearch:
            return
        value = attributes.get('content')
        if value is None:
            # No content attribute, or one written without a value.
            return
        # Trim the blanks around each keyword and drop empty items, so
        # "one, two" is indexed as "one" and "two".
        keywordlist = [keyword.strip() for keyword in value.split(",")]
        keywordlist = [keyword for keyword in keywordlist if keyword]
        self.currentContent = keywordlist
        print('   Found metadata: ' + value)
        for word in keywordlist:
            self.add_to_meta_dictionary(word)

    def handle_data(self, data):
        """Index the words of a text chunk that follows a data tag.

        Only the first text chunk after a start tag listed in dataTags is
        indexed: the flag is cleared after every chunk, whether it was
        indexed or not.
        """
        if self.takeTheData:
            # split() without arguments separates on any whitespace, so words
            # divided by newlines or tabs are not fused into one entry.
            wordlist = data.split()
            for idx, word in enumerate(wordlist):
                lowered = word.lower()
                if len(lowered) >= minWordLength:
                    extract = self.create_extract(lowered, idx, wordlist)
                    self.add_to_freetext_dictionary(lowered, extract)
        self.takeTheData = False

    def add_to_freetext_dictionary(self, word, extract):
        """Record one occurrence of word together with its extract.

        The entry's 'code' is [section, module, language] of the current file.
        """
        newEntry = {
            'extract': extract,
            'code': [self.currentSection, self.currentModule, self.currentLanguage],
        }
        freetext_dictionary.setdefault(word, []).append(newEntry)

    def add_to_meta_dictionary(self, word):
        """Record that the current file carries the meta keyword word."""
        newEntry = {'path': self.currentPath}
        meta_dictionary.setdefault(word, []).append(newEntry)

    def create_extract(self, word, index, wordlist):
        """Return the words surrounding position index of wordlist as a string.

        word     -- the indexed word (not needed to build the extract)
        index    -- position of the word in wordlist
        wordlist -- all words of the text chunk

        A list of at most extractWordLength words is returned whole.
        Otherwise the extract starts extractWordLength // 2 words before
        index (clamped to the start of the list) and holds at most
        extractWordLength + 1 words.
        """
        if len(wordlist) <= extractWordLength:
            return " ".join(wordlist)
        # The window has to be read from its own start position; reading from
        # position 0 would return the beginning of the text for every word.
        start = max(index - extractWordLength // 2, 0)
        return " ".join(wordlist[start:start + extractWordLength + 1])

    def set_path(self, path, section, module, language):
        """Set the file context used to tag the entries recorded from now on."""
        self.currentPath = path
        self.currentModule = module
        self.currentSection = section
        self.currentLanguage = language
        self.currentContent = []
        print('Opening file: ' + path + ' Section: ' + section + ' for Module: ' + self.currentModule)

# instantiate the parser
parser = MyHTMLParser()

# Walk the views tree and feed every section file into the parser.
smatch = re.compile(sectionRegex)
mmatch = re.compile(moduleRegex)
for (dirpath, dirnames, filenames) in walk(fPath):
    for fname in filenames:
        s = smatch.match(fname)
        if not s:
            continue
        theFile = join(dirpath, fname)
        m = mmatch.match(dirpath)
        if not m:
            # A section-like file outside a module's sections directory has
            # no module number; skip it instead of failing on m.group(1).
            print('Skipping file outside a module directory: ' + theFile)
            continue
        # Read inside a with-block so the handle is closed right away.
        with open(theFile, 'r') as f:
            fr = f.read()
        # Drop any unparsed remainder of the previous file so it cannot be
        # prepended to this file's markup.
        parser.reset()
        parser.set_path(theFile, s.group(1), m.group(1), s.group(2))
        parser.feed(fr)

# write out to a JSON file
with open('json/freetext_dictionary.json', 'w') as fp:
    json.dump(freetext_dictionary, fp)
with open('json/meta_dictionary.json', 'w') as fp:
    json.dump(meta_dictionary, fp)