-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathlanguageprocessing.py
More file actions
79 lines (64 loc) · 1.83 KB
/
Copy pathlanguageprocessing.py
File metadata and controls
79 lines (64 loc) · 1.83 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
import json
import string
from nltk.stem import PorterStemmer
def extract_words(text):
    """Count the stemmed words occurring in ``text``.

    The text has its punctuation replaced via ``remove_punctuation``, is
    lower-cased and split on whitespace; every resulting token is then
    reduced with a ``PorterStemmer`` before being counted.

    Args:
        text: The string to analyse.

    Returns:
        A dict mapping each stem to the number of times it occurred.
    """
    tokens = remove_punctuation(text).lower().split()
    stemmer = PorterStemmer()
    stems = [stemmer.stem(token) for token in tokens]

    counts = {}
    for stem in stems:
        counts[stem] = counts.get(stem, 0) + 1
    return counts
def remove_stopwords(word_dict, stopwords):
    """Delete stopwords and purely numeric tokens from ``word_dict`` in place.

    Args:
        word_dict: Dict keyed by word; it is mutated by this call.
        stopwords: Iterable of words that must be removed.

    Returns:
        None. The result is the mutated ``word_dict``.
    """
    # Build the lookup set once: ``stopwords`` may be a list, in which case
    # every ``in`` test inside the loop would be a linear scan.
    blocked = frozenset(stopwords)

    # Collect first and delete afterwards, because a dict must not change
    # size while it is being iterated.
    discard = [
        word for word in word_dict
        if word in blocked or word.isnumeric()
    ]
    for word in discard:
        del word_dict[word]
def remove_punctuation(s):
    """Return ``s`` with every punctuation character replaced by a space.

    The characters replaced are the ASCII punctuation in
    ``string.punctuation`` plus a few typographic marks: right guillemet,
    horizontal ellipsis, curly double and single quotes, em dash and en dash.
    Each one becomes exactly one space, so the length of the string is
    preserved.

    Args:
        s: The string to clean.

    Returns:
        A new string; ``s`` itself is not modified.
    """
    typographic = '\u00bb\u2026\u201c\u201d\u2014\u2013\u2018\u2019'
    # Concatenate rather than ``string.punctuation.join([...])``: join would
    # use the punctuation string as a *separator*, which includes it only when
    # the list has at least two items (and then repeats it needlessly).
    exclusions = string.punctuation + typographic
    table = {ord(c): " " for c in exclusions}
    return s.translate(table)
def load_stopwords(path, encoding="utf-8"):
    """Read a stopword list from a text file holding one word per line.

    Surrounding whitespace is stripped from every line, blank lines are
    ignored and duplicates are collapsed.

    Args:
        path: Path of the file to read.
        encoding: Text encoding used to decode the file. Given explicitly so
            that the result does not depend on the platform's default.

    Returns:
        A sorted list of the distinct stopwords found in the file.
    """
    stopwords = set()
    with open(path, 'r', encoding=encoding) as f:
        for line in f:
            word = line.strip()
            # A blank line would otherwise register "" as a stopword.
            if word:
                stopwords.add(word)
    # Sorting makes the returned order deterministic; a bare list(set) is not.
    return sorted(stopwords)
def save_word_dict(word_dict, path):
    """Write ``word_dict`` to ``path`` as indented JSON with sorted keys.

    Args:
        word_dict: The JSON-serializable dict to store.
        path: Destination file; it is opened in write mode, so existing
            content is overwritten.
    """
    dump_options = {
        "sort_keys": True,
        "indent": 4,
        "separators": (',', ': '),
    }
    with open(path, 'w') as sink:
        json.dump(word_dict, sink, **dump_options)
def read_word_dict(file):
    """Parse the JSON document stored at ``file`` and return the result.

    Args:
        file: Path of the JSON file to read.

    Returns:
        Whatever object the JSON document decodes to.
    """
    with open(file, 'r') as handle:
        loaded = json.load(handle)
    return loaded
def count_all_words(files):
    """Sum the per-file word counts of several word-dict files.

    Args:
        files: Iterable of paths, each readable by ``read_word_dict``.

    Returns:
        A dict mapping every word seen in any file to the total of its
        counts across all files.
    """
    totals = {}
    for path in files:
        for word, count in read_word_dict(path).items():
            totals[word] = totals.get(word, 0) + count
    return totals
def cleanup_all_words(all_words):
    """Prune rare and mostly-numeric entries from ``all_words`` in place.

    An entry is removed when its count equals 1, or when more than half of
    the characters of the word are digits.

    Args:
        all_words: Dict mapping word -> count; it is mutated by this call.

    Returns:
        None.
    """
    def _should_drop(token):
        # Words seen exactly once are dropped without looking at their text.
        if all_words[token] == 1:
            return True
        digit_total = sum(1 for ch in token if ch.isdigit())
        return digit_total > 0.5 * len(token)

    # Decide first, delete second: the dict cannot shrink during iteration.
    doomed = [token for token in all_words if _should_drop(token)]
    for token in doomed:
        del all_words[token]
if __name__ == '__main__':
    # Smoke run: write a one-entry word dict to a JSON file in the current
    # working directory.
    word_dict = dict(bleh=1)
    save_word_dict(word_dict=word_dict, path="bleh.json")