-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsentiment_analysis.py
More file actions
135 lines (101 loc) · 4.83 KB
/
sentiment_analysis.py
File metadata and controls
135 lines (101 loc) · 4.83 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
# -*- coding: utf-8 -*-
"""SciBERT demo.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1HUoQiRKgwlTfZv44ZT8vrJqtSY9B2sXE
"""
import numpy as np
import pandas as pd
import pickle
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, Model,Input
from tensorflow.keras.layers import *
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix,f1_score,classification_report
from prediction import predict
from model import sentiment_model
# --- Data loading ---------------------------------------------------------
# Twitter sentiment dataset (train / validation splits) hosted on GitHub.
train_path = 'https://raw.githubusercontent.com/malinphy/datasets/main/tweet_sentiment_extraction/twitter_sentiment_analysis/twitter_training.csv'
test_path = 'https://raw.githubusercontent.com/malinphy/datasets/main/tweet_sentiment_extraction/twitter_sentiment_analysis/twitter_validation.csv'
# The CSVs have no header row; rows with missing values are dropped from the
# training split only.
train_df = pd.read_csv(train_path,header = None).dropna().reset_index(drop= True)
test_df = pd.read_csv(test_path,header = None).reset_index(drop= True)
# Columns are positional: 0 = tweet id, 1 = entity, 2 = sentiment label,
# 3 = tweet text.
train_df = train_df.rename(columns={0: 'tweet_id', 1: 'entity',2:'sentiment',3:'content'})
test_df = test_df.rename(columns={0: 'tweet_id', 1: 'entity',2:'sentiment',3:'content'})
# --- Tokenisation ---------------------------------------------------------
vocab_size = 45000  # max vocabulary kept by the vectorizer
embed_dim = 32      # embedding width (also hard-coded inside sentiment_model)
input_len = 170     # padded sequence length (duplicated literally below)
# Maps raw tweet text to fixed-length integer token sequences.
vectorize_layer = tf.keras.layers.TextVectorization(
    max_tokens=vocab_size,
    output_mode='int',
    output_sequence_length=170
)
# Vocabulary is built from the training tweets only.
vectorize_layer.adapt(train_df['content'])
train_tokens = vectorize_layer(train_df['content'])
test_tokens = vectorize_layer(test_df['content'])
# NOTE(review): this rebinds input_len from the int 170 to a scalar tf tensor;
# predict() further down uses this module-level value inside tf.reshape.
input_len = tf.shape(train_tokens)[1]
corpus_size = len(vectorize_layer.get_vocabulary())
input_len  # notebook cell echo — has no effect when run as a script
# --- Labels ---------------------------------------------------------------
# Encode the sentiment strings to integer class ids (4 classes, per the
# Dense(4) head and inverse_transform([0,1,2,3]) below).
LE = LabelEncoder()
train_encoded_labels = LE.fit_transform(train_df['sentiment'])
test_encoded_labels = LE.transform(test_df['sentiment'])
def sentiment_model(input_len = 170, vocab_size = 45000, embed_dim = 32, num_classes = 4):
    """Build the tweet-sentiment classifier.

    Architecture: Embedding -> Flatten -> three ReLU Dense layers
    (128/64/32) -> softmax head.

    The sizes were previously hard-coded locals; they are now parameters
    with the same values as defaults, so existing callers
    (``sentiment_model()``) are unaffected.

    Args:
        input_len: padded token-sequence length accepted by the model.
        vocab_size: embedding vocabulary size.
        embed_dim: embedding vector width.
        num_classes: number of output sentiment classes.

    Returns:
        An uncompiled ``tf.keras`` Model mapping ``(batch, input_len)``
        integer token ids to ``(batch, num_classes)`` class probabilities.
    """
    # NOTE(review): this definition shadows the `sentiment_model` imported
    # from `model` at the top of the file.
    input_layer = Input(shape=(input_len,), name = 'input_layer')
    emb_layer = Embedding(vocab_size, embed_dim, name = 'embedding_layer')(input_layer)
    flat_layer = Flatten(name = 'Flatten_layer')(emb_layer)
    d1_layer = Dense(128,activation = 'relu',name = 'd1_layer')(flat_layer)
    d2_layer = Dense(64,activation = 'relu',name = 'd2_layer')(d1_layer)
    d3_layer = Dense(32,activation = 'relu',name = 'd3_layer')(d2_layer)
    final_layer = Dense(num_classes,activation = 'softmax',name = 'final_layer')(d3_layer)
    return Model(inputs = input_layer, outputs = final_layer)
# --- Model: build, load pre-trained weights, evaluate ----------------------
model = sentiment_model()
model.compile(
    loss = tf.keras.losses.SparseCategoricalCrossentropy(),
    optimizer = tf.keras.optimizers.Adam(),
    metrics = ['accuracy']
)
# Training is disabled; pre-trained weights are loaded from disk instead.
# history = model.fit(
# train_tokens,
# train_encoded_labels,
# epochs = 10,
# # validation_split = 0.2
# )
# model.save_weights('sentiment_model.h5')
model.load_weights('sentiment_model.h5')
# top_k(..., k=1)[1] yields the argmax class index per test tweet.
preds = tf.math.top_k(model.predict(test_tokens), k=1 )[1]
# Notebook cell echo — the confusion matrix result is computed but discarded.
confusion_matrix(test_encoded_labels, preds)
print('F1 SCORE',f1_score(test_encoded_labels, preds, average= 'micro'))
# inverse_transform([0,1,2,3]) recovers the original label strings for the
# classification report's row names.
print(classification_report(test_encoded_labels, preds, target_names = list(LE.inverse_transform([0,1,2,3])) ))
# --- Persist preprocessing artefacts ---------------------------------------
# A Keras TextVectorization layer cannot be pickled whole, so its config and
# weights are pickled separately and the layer is rebuilt on load.
# NOTE(review): these open(...) handles are never closed (leaked); the same
# pattern is used inside predict() below.
pickle.dump({'config': vectorize_layer.get_config(),
             'weights': vectorize_layer.get_weights()}
            , open("tv_layer.pkl", "wb"))
# Round-trip check: rebuild the vectorizer from the pickle just written.
from_disk = pickle.load(open("tv_layer.pkl", "rb"))
new_v = TextVectorization.from_config(from_disk['config'])
# You have to call `adapt` with some dummy data (BUG in Keras)
new_v.adapt(tf.data.Dataset.from_tensor_slices(["xyz"]))
new_v.set_weights(from_disk['weights'])
# Save / reload the fitted LabelEncoder the same way.
output = open('LE.pkl', 'wb')
pickle.dump(LE, output)
output.close()
pkl_file = open('LE.pkl', 'rb')
le_departure = pickle.load(pkl_file)
pkl_file.close()
# Smoke test: tokenize one sample tweet with the reloaded vectorizer
# (result is bound but otherwise unused).
new_test = (new_v('I mentioned on Facebook that I was struggling for motivation to go for a run the other day, which has been translated by Tom’s great auntie as ‘Hayley can’t get out of bed’ and told to his grandma, who now thinks I’m a lazy, terrible person 🤣'))
def predict(x):
    """Classify a single raw tweet string and return its sentiment label.

    On every call this rebuilds the saved TextVectorization layer and
    LabelEncoder from their pickles on disk, reloads the weights of the
    module-level ``model``, then runs one forward pass.

    Args:
        x: raw tweet text (a single string).

    Returns:
        The predicted label string, one of the 4 classes known to the
        saved LabelEncoder.

    NOTE(review): this definition shadows the `predict` imported from
    `prediction` at the top of the file.
    """
    # Fix: the original used `pickle.load(open(...))`, leaking the file
    # handle; context managers close the files deterministically.
    with open("tv_layer.pkl", "rb") as tv_file:
        from_disk = pickle.load(tv_file)
    new_v = TextVectorization.from_config(from_disk['config'])
    # You have to call `adapt` with some dummy data (BUG in Keras)
    new_v.adapt(tf.data.Dataset.from_tensor_slices(["xyz"]))
    new_v.set_weights(from_disk['weights'])
    # Restore the fitted LabelEncoder used to map class id -> label string.
    with open('LE.pkl', 'rb') as le_file:
        le_departure = pickle.load(le_file)
    # Refresh the module-level model's weights from disk before inferring.
    model.load_weights('sentiment_model.h5')
    test_sent = new_v(x)
    # input_len here is the module-level value (a scalar tf tensor after the
    # tokenisation section above); reshape to a batch of one sequence.
    test_sent = tf.reshape(test_sent, shape = (1, input_len))
    y = np.argmax(model.predict(test_sent))
    return (le_departure.inverse_transform([y]))[0]
# Example inference on one tweet (notebook cell echo — result is discarded).
predict('I mentioned on Facebook that I was struggling for motivation to go for a run the other day, which has been translated by Tom’s great auntie as ‘Hayley can’t get out of bed’ and told to his grandma, who now thinks I’m a lazy, terrible person 🤣')