-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpreprocessing.py
More file actions
72 lines (59 loc) · 1.78 KB
/
preprocessing.py
File metadata and controls
72 lines (59 loc) · 1.78 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
#from nltk import PorterStemmer
from stemming.porter2 import stem
from sets import Set
import os
import sys
import cPickle as pickle
#stem()
#stemmer.stem()
# Build the vocabulary: the sorted list of distinct stemmed tokens found in
# the .txt emails under enron1/ham and enron1/spam, then pickle it to dict.txt.
ham_folder = os.listdir("enron1/ham")
spam_folder = os.listdir("enron1/spam")

# The builtin set replaces the deprecated sets.Set; the result is converted to
# a sorted list below, so the pickled output is unchanged.
vocabulary = set()
for folder, email_names in (('enron1/ham/', ham_folder),
                            ('enron1/spam/', spam_folder)):
    for email_name in email_names:
        if not email_name.endswith('.txt'):
            continue
        # 'with' closes each email right after reading it, so a large corpus
        # does not pile up open file handles.
        with open(folder + email_name) as f:
            text = f.read()
        for word in text.split():
            # errors='ignore' drops bytes that cannot be decoded, so the
            # stemmer always receives a unicode string.
            word = unicode(word, errors='ignore')
            vocabulary.add(stem(word))

# Sorting gives every stem a fixed position; the bag-of-words vectors built
# later in this script use that position as the stem's column.
dictionary = sorted(vocabulary)

with open('dict.txt', 'wb') as dict_file:
    pickle.dump(dictionary, dict_file)
# Create bag-of-words count vectors for the ham emails and pickle them to
# ham_vec.txt. Each vector has one slot per entry of `dictionary`; slot i
# counts how many tokens of the email stem to dictionary[i].

# Map every stem to its column once. Calling dictionary.index() per token
# rescans the whole list each time, which makes the pass quadratic.
word_index = dict((stemmed, i) for i, stemmed in enumerate(dictionary))

ham = []
for ham_email in ham_folder:
    if not ham_email.endswith('.txt'):
        continue
    email_vec = [0] * len(dictionary)
    # 'with' guarantees the email is closed once it has been read.
    with open('enron1/ham/' + ham_email) as f:
        text = f.read()
    for word in text.split():
        # Same normalisation as when the dictionary was built, so every
        # stem is expected to be present in word_index.
        word = unicode(word, errors='ignore')
        email_vec[word_index[stem(word)]] += 1
    ham.append(email_vec)

with open('ham_vec.txt', 'wb') as ham_file:
    pickle.dump(ham, ham_file)
# Create bag-of-words count vectors for the spam emails and pickle them to
# spam_vec.txt. Each vector has one slot per entry of `dictionary`; slot i
# counts how many tokens of the email stem to dictionary[i].

# Map every stem to its column once. Calling dictionary.index() per token
# rescans the whole list each time, which makes the pass quadratic.
word_index = dict((stemmed, i) for i, stemmed in enumerate(dictionary))

spam = []
for spam_email in spam_folder:
    if not spam_email.endswith('.txt'):
        continue
    email_vec = [0] * len(dictionary)
    # 'with' guarantees the email is closed once it has been read.
    with open('enron1/spam/' + spam_email) as f:
        text = f.read()
    for word in text.split():
        # Same normalisation as when the dictionary was built, so every
        # stem is expected to be present in word_index.
        word = unicode(word, errors='ignore')
        email_vec[word_index[stem(word)]] += 1
    spam.append(email_vec)

with open('spam_vec.txt', 'wb') as spam_file:
    pickle.dump(spam, spam_file)