-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpreprocess.py
More file actions
54 lines (41 loc) · 1.76 KB
/
preprocess.py
File metadata and controls
54 lines (41 loc) · 1.76 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
import numpy as np
import matplotlib.pyplot as plt
import nltk
from datetime import datetime
import itertools
import operator
import csv
import sys
# Pre-processing configuration.

# Markers wrapped around every sentence before it is tokenised.
sentenceStartToken, sentenceEndToken = "SENTENCE_START", "SENTENCE_END"

# Placeholder substituted for every word that is not in the vocabulary.
unknownToken = "UNKNOWN"

# Total vocabulary size, counting the slot taken by unknownToken.
vocabularySize = 8000
# Read the first column of every CSV row (the comment text), lower-cased.
with open('dataset/reddit-comments.csv') as f:
    reader = csv.reader(f, skipinitialspace=True)
    # Skip the header row. The builtin next() works on Python 2.6+ and 3,
    # unlike the Python-2-only reader.next(); the default keeps an empty file
    # from raising StopIteration.
    next(reader, None)
    comments = []
    for row in reader:
        if not row:
            # csv.reader yields [] for blank lines; row[0] would raise IndexError.
            continue
        text = row[0]
        if isinstance(text, bytes):
            # Python 2: csv returns byte strings, so decode them to unicode.
            # Python 3: rows are already str and must not be decoded again.
            text = text.decode('utf-8')
        comments.append(text.lower())
# NOTE(review): on Python 3, open() above uses the locale's default encoding;
# pass encoding='utf-8' explicitly once Python 2 support is dropped.

# Split each comment into sentences and flatten into one stream of sentences.
sentences = itertools.chain.from_iterable(
    nltk.sent_tokenize(comment) for comment in comments
)
# Wrap every sentence in the start/end marker tokens.
sentences = [
    "%s %s %s" % (sentenceStartToken, x, sentenceEndToken) for x in sentences
]
# Split every sentence into word tokens and count how often each token occurs.
tokenizedSentences = [nltk.word_tokenize(x) for x in sentences]
wordFrequency = nltk.FreqDist(itertools.chain.from_iterable(tokenizedSentences))

# Keep the (vocabularySize - 1) most frequent tokens; the last vocabulary slot
# is taken by unknownToken, which stands in for every other word.
vocabulary = wordFrequency.most_common(vocabularySize - 1)
indexToWord = [word for word, _count in vocabulary]
indexToWord.append(unknownToken)
wordToIndex = {word: index for index, word in enumerate(indexToWord)}

# Replace out-of-vocabulary words so that every remaining token has an index.
tokenizedSentences = [
    [w if w in wordToIndex else unknownToken for w in sentence]
    for sentence in tokenizedSentences
]

# Inputs are each sentence without its last token; targets are the same
# sentence shifted left by one token.
# dtype=object is required because sentences differ in length: NumPy >= 1.24
# raises ValueError when asked to infer an array from ragged nested lists.
# NOTE(review): if every sentence happens to have the same length this yields
# a 2-D object array rather than a 2-D integer array.
X_train = np.asarray(
    [[wordToIndex[w] for w in sent[:-1]] for sent in tokenizedSentences],
    dtype=object,
)
y_train = np.asarray(
    [[wordToIndex[w] for w in sent[1:]] for sent in tokenizedSentences],
    dtype=object,
)
# Save the preprocessed arrays and vocabulary mappings to disk.
# np.save writes binary data, so each file is opened in 'wb': text mode 'w'
# raises TypeError on Python 3 and can corrupt the output through newline
# translation on Windows under Python 2. The with-blocks also guarantee the
# handles are flushed and closed, which the original never did.
with open('saved-states/X_train.npy', 'wb') as X_outfile:
    np.save(X_outfile, X_train)

with open('saved-states/y_train.npy', 'wb') as y_outfile:
    np.save(y_outfile, y_train)

with open('saved-states/indexToWord.npy', 'wb') as indexToWord_outfile:
    np.save(indexToWord_outfile, indexToWord)

# wordToIndex is a dict, so np.save stores it as a pickled object array;
# reading it back needs np.load(..., allow_pickle=True) followed by .item().
with open('saved-states/wordToIndex.npy', 'wb') as wordToIndex_outfile:
    np.save(wordToIndex_outfile, wordToIndex)