-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmodel_creation.py
More file actions
104 lines (90 loc) · 3.54 KB
/
model_creation.py
File metadata and controls
104 lines (90 loc) · 3.54 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
import os
import random
from collections import Counter
from string import punctuation

import numpy as np
from keras import Sequential
from keras.layers import Embedding, LSTM, Dense
from keras.preprocessing import sequence
# Vocabulary size: the ranking built from the training data only assigns
# ranks strictly below this value, and it is also the input dimension of
# the Embedding layer.
words = 15_000

# Words shorter than this many characters are dropped while reading reviews.
word_min_length = 3

# Every encoded review is padded to this many entries.
max_words_per_review = 1_500
def extract_reviews(file):
    """Load the negative and positive reviews under ``file`` and shuffle them.

    Reads ``<file>/neg`` (label 0) and ``<file>/pos`` (label 1) through
    ``extract_reviews_per_sentiment`` and shuffles reviews and labels
    together, so every review keeps its own label.

    Args:
        file: Path of the directory that contains the ``neg`` and ``pos``
            sub-directories.

    Returns:
        A pair ``(X, Y)`` where ``X`` is a tuple of tokenised reviews (each
        a list of words) and ``Y`` is a list with the matching labels.
        Both are empty when neither sub-directory holds any review.
    """
    neg_X, neg_Y = extract_reviews_per_sentiment(os.path.join(file, "neg"), 0)
    pos_X, pos_Y = extract_reviews_per_sentiment(os.path.join(file, "pos"), 1)

    total = list(zip(neg_X + pos_X, neg_Y + pos_Y))
    if not total:
        # zip(*[]) produces nothing, so the two-way unpacking below would
        # raise ValueError; return empty results of the usual types instead.
        return (), []

    random.shuffle(total)
    X, Y = zip(*total)
    return X, list(Y)
def extract_reviews_per_sentiment(file, label, min_length=None):
    """Read every review in a directory and tokenise it.

    Each file is read as UTF-8, stripped of ASCII punctuation, split on
    whitespace, filtered by word length and lower-cased.

    Args:
        file: Path of the directory whose entries are the review files.
        label: Label attached to every review read from this directory.
        min_length: Minimum number of characters a word must have to be
            kept. Defaults to the module-level ``word_min_length``.

    Returns:
        A pair ``(X, Y)``: ``X`` is a list with one list of words per file
        and ``Y`` is a list containing ``label`` once per file.
    """
    if min_length is None:
        min_length = word_min_length

    # The translation table does not depend on the file; build it once.
    strip_punctuation = str.maketrans('', '', punctuation)

    X = []
    Y = []
    for name in os.listdir(file):
        # The context manager closes the handle even if reading fails.
        with open(os.path.join(file, name), encoding="utf8") as f:
            text = f.read()
        # The length check is applied to the word before lower-casing.
        tokens = [
            word.lower()
            for word in text.translate(strip_punctuation).split()
            if len(word) >= min_length
        ]
        X.append(tokens)
        Y.append(label)
    return X, Y
# Load the shuffled reviews and labels for the training and testing sets
X_train, Y_train = extract_reviews("train")
X_test, Y_test = extract_reviews("test")

# Count how often each word occurs across the training reviews
word_counts = Counter()
for review in X_train:
    word_counts.update(review)

# Order the vocabulary from most to least frequent and map each word to its
# 1-based rank, keeping only ranks strictly below `words`
total_words = sorted(word_counts, key=word_counts.get, reverse=True)
ranking = {
    word: num
    for num, word in enumerate(total_words, 1)
    if num < words
}
def vectorize(text_data, ranking):
    """Encode tokenised reviews as lists of word ranks.

    Args:
        text_data: Iterable of reviews, each an iterable of words.
        ranking: Mapping from word to its integer rank.

    Returns:
        A list with one list of ranks per review. Words that are missing
        from ``ranking`` (or whose rank is 0) are left out.
    """
    encoded = []
    for review in text_data:
        ranks = (ranking.get(token, 0) for token in review)
        encoded.append([rank for rank in ranks if rank != 0])
    return encoded
# Encode the training and testing data
X_train_vectorized = vectorize(X_train, ranking)
X_test_vectorized = vectorize(X_test, ranking)

# Pad the reviews with zeros to ensure that they are all the same length
# (max_words_per_review)
X_train = sequence.pad_sequences(X_train_vectorized, maxlen=max_words_per_review)
X_test = sequence.pad_sequences(X_test_vectorized, maxlen=max_words_per_review)

# Keras wants array-like targets; a plain Python list of ints is rejected by
# some versions, so convert the labels once up front.
Y_train = np.asarray(Y_train)
Y_test = np.asarray(Y_test)

# Define the model's layers. Word ranks are all below `words` and 0 is the
# padding value, so `words` is a sufficient input dimension for the embedding.
embedding_size = 32
model = Sequential()
model.add(Embedding(words, embedding_size, input_length=max_words_per_review))
model.add(LSTM(100))
model.add(Dense(1, activation='sigmoid'))

# Specify loss and optimization
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

# Hold out the first batch_size reviews as validation data and train the
# model on the remaining ones
batch_size = 64
num_epochs = 3
X_val, Y_val = X_train[:batch_size], Y_train[:batch_size]
X_training, Y_training = X_train[batch_size:], Y_train[batch_size:]
model.fit(X_training, Y_training, validation_data=(X_val, Y_val),
          batch_size=batch_size, epochs=num_epochs)

# Judge model accuracy based on the testing data, and save the model
scores = model.evaluate(X_test, Y_test)
print('Test Accuracy:', scores[1])
model.save("sentiment_analysis_2.h5")

# Save the word ranking next to the model so new text can be encoded the
# same way. The dict is stored as a pickled object array; read it back with
# np.load("ranking.npy", allow_pickle=True).item().
np.save("ranking.npy", ranking)