-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsentiment_analysis.py
More file actions
135 lines (101 loc) · 4.83 KB
/
sentiment_analysis.py
File metadata and controls
135 lines (101 loc) · 4.83 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
# -*- coding: utf-8 -*-
"""SciBERT demo.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1HUoQiRKgwlTfZv44ZT8vrJqtSY9B2sXE
"""
import numpy as np
import pandas as pd
import pickle
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, Model,Input
from tensorflow.keras.layers import *
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix,f1_score,classification_report
from prediction import predict
from model import sentiment_model
# --- Data loading ---------------------------------------------------------
# Twitter sentiment dataset (train / validation splits) hosted on GitHub.
train_path = 'https://raw.githubusercontent.com/malinphy/datasets/main/tweet_sentiment_extraction/twitter_sentiment_analysis/twitter_training.csv'
test_path = 'https://raw.githubusercontent.com/malinphy/datasets/main/tweet_sentiment_extraction/twitter_sentiment_analysis/twitter_validation.csv'
# The CSVs have no header row; rows with missing values are dropped from the
# training split only.
train_df = pd.read_csv(train_path,header = None).dropna().reset_index(drop= True)
test_df = pd.read_csv(test_path,header = None).reset_index(drop= True)
# Columns are positional: 0 = tweet id, 1 = entity, 2 = sentiment label,
# 3 = tweet text.
train_df = train_df.rename(columns={0: 'tweet_id', 1: 'entity',2:'sentiment',3:'content'})
test_df = test_df.rename(columns={0: 'tweet_id', 1: 'entity',2:'sentiment',3:'content'})
# --- Tokenisation ---------------------------------------------------------
vocab_size = 45000  # max vocabulary kept by the vectorizer
embed_dim = 32      # embedding width (also hard-coded inside sentiment_model)
input_len = 170     # padded sequence length (duplicated literally below)
# Maps raw tweet text to fixed-length integer token sequences.
vectorize_layer = tf.keras.layers.TextVectorization(
    max_tokens=vocab_size,
    output_mode='int',
    output_sequence_length=170
)
# Vocabulary is built from the training tweets only.
vectorize_layer.adapt(train_df['content'])
train_tokens = vectorize_layer(train_df['content'])
test_tokens = vectorize_layer(test_df['content'])
# NOTE(review): this rebinds input_len from the int 170 to a scalar tf tensor;
# predict() further down uses this module-level value inside tf.reshape.
input_len = tf.shape(train_tokens)[1]
corpus_size = len(vectorize_layer.get_vocabulary())
input_len  # notebook cell echo — has no effect when run as a script
# --- Labels ---------------------------------------------------------------
# Encode the sentiment strings to integer class ids (4 classes, per the
# Dense(4) head and inverse_transform([0,1,2,3]) below).
LE = LabelEncoder()
train_encoded_labels = LE.fit_transform(train_df['sentiment'])
test_encoded_labels = LE.transform(test_df['sentiment'])
def sentiment_model(input_len = 170, vocab_size = 45000, embed_dim = 32, num_classes = 4):
    """Build the tweet-sentiment classifier.

    Architecture: Embedding -> Flatten -> three ReLU Dense layers
    (128/64/32) -> softmax head.

    The sizes were previously hard-coded locals; they are now parameters
    with the same values as defaults, so existing callers
    (``sentiment_model()``) are unaffected.

    Args:
        input_len: padded token-sequence length accepted by the model.
        vocab_size: embedding vocabulary size.
        embed_dim: embedding vector width.
        num_classes: number of output sentiment classes.

    Returns:
        An uncompiled ``tf.keras`` Model mapping ``(batch, input_len)``
        integer token ids to ``(batch, num_classes)`` class probabilities.
    """
    # NOTE(review): this definition shadows the `sentiment_model` imported
    # from `model` at the top of the file.
    input_layer = Input(shape=(input_len,), name = 'input_layer')
    emb_layer = Embedding(vocab_size, embed_dim, name = 'embedding_layer')(input_layer)
    flat_layer = Flatten(name = 'Flatten_layer')(emb_layer)
    d1_layer = Dense(128,activation = 'relu',name = 'd1_layer')(flat_layer)
    d2_layer = Dense(64,activation = 'relu',name = 'd2_layer')(d1_layer)
    d3_layer = Dense(32,activation = 'relu',name = 'd3_layer')(d2_layer)
    final_layer = Dense(num_classes,activation = 'softmax',name = 'final_layer')(d3_layer)
    return Model(inputs = input_layer, outputs = final_layer)
# --- Model: build, load pre-trained weights, evaluate ----------------------
model = sentiment_model()
model.compile(
    loss = tf.keras.losses.SparseCategoricalCrossentropy(),
    optimizer = tf.keras.optimizers.Adam(),
    metrics = ['accuracy']
)
# Training is disabled; pre-trained weights are loaded from disk instead.
# history = model.fit(
# train_tokens,
# train_encoded_labels,
# epochs = 10,
# # validation_split = 0.2
# )
# model.save_weights('sentiment_model.h5')
model.load_weights('sentiment_model.h5')
# top_k(..., k=1)[1] yields the argmax class index per test tweet.
preds = tf.math.top_k(model.predict(test_tokens), k=1 )[1]
# Notebook cell echo — the confusion matrix result is computed but discarded.
confusion_matrix(test_encoded_labels, preds)
print('F1 SCORE',f1_score(test_encoded_labels, preds, average= 'micro'))
# inverse_transform([0,1,2,3]) recovers the original label strings for the
# classification report's row names.
print(classification_report(test_encoded_labels, preds, target_names = list(LE.inverse_transform([0,1,2,3])) ))
# --- Persist preprocessing artefacts ---------------------------------------
# A Keras TextVectorization layer cannot be pickled whole, so its config and
# weights are pickled separately and the layer is rebuilt on load.
# NOTE(review): these open(...) handles are never closed (leaked); the same
# pattern is used inside predict() below.
pickle.dump({'config': vectorize_layer.get_config(),
             'weights': vectorize_layer.get_weights()}
            , open("tv_layer.pkl", "wb"))
# Round-trip check: rebuild the vectorizer from the pickle just written.
from_disk = pickle.load(open("tv_layer.pkl", "rb"))
new_v = TextVectorization.from_config(from_disk['config'])
# You have to call `adapt` with some dummy data (BUG in Keras)
new_v.adapt(tf.data.Dataset.from_tensor_slices(["xyz"]))
new_v.set_weights(from_disk['weights'])
# Save / reload the fitted LabelEncoder the same way.
output = open('LE.pkl', 'wb')
pickle.dump(LE, output)
output.close()
pkl_file = open('LE.pkl', 'rb')
le_departure = pickle.load(pkl_file)
pkl_file.close()
# Smoke test: tokenize one sample tweet with the reloaded vectorizer
# (result is bound but otherwise unused).
new_test = (new_v('I mentioned on Facebook that I was struggling for motivation to go for a run the other day, which has been translated by Tom’s great auntie as ‘Hayley can’t get out of bed’ and told to his grandma, who now thinks I’m a lazy, terrible person 🤣'))
def predict(x):
    """Classify a single raw tweet string and return its sentiment label.

    On every call this rebuilds the saved TextVectorization layer and
    LabelEncoder from their pickles on disk, reloads the weights of the
    module-level ``model``, then runs one forward pass.

    Args:
        x: raw tweet text (a single string).

    Returns:
        The predicted label string, one of the 4 classes known to the
        saved LabelEncoder.

    NOTE(review): this definition shadows the `predict` imported from
    `prediction` at the top of the file.
    """
    # Fix: the original used `pickle.load(open(...))`, leaking the file
    # handle; context managers close the files deterministically.
    with open("tv_layer.pkl", "rb") as tv_file:
        from_disk = pickle.load(tv_file)
    new_v = TextVectorization.from_config(from_disk['config'])
    # You have to call `adapt` with some dummy data (BUG in Keras)
    new_v.adapt(tf.data.Dataset.from_tensor_slices(["xyz"]))
    new_v.set_weights(from_disk['weights'])
    # Restore the fitted LabelEncoder used to map class id -> label string.
    with open('LE.pkl', 'rb') as le_file:
        le_departure = pickle.load(le_file)
    # Refresh the module-level model's weights from disk before inferring.
    model.load_weights('sentiment_model.h5')
    test_sent = new_v(x)
    # input_len here is the module-level value (a scalar tf tensor after the
    # tokenisation section above); reshape to a batch of one sequence.
    test_sent = tf.reshape(test_sent, shape = (1, input_len))
    y = np.argmax(model.predict(test_sent))
    return (le_departure.inverse_transform([y]))[0]
# Example inference on one tweet (notebook cell echo — result is discarded).
predict('I mentioned on Facebook that I was struggling for motivation to go for a run the other day, which has been translated by Tom’s great auntie as ‘Hayley can’t get out of bed’ and told to his grandma, who now thinks I’m a lazy, terrible person 🤣')