Skip to content

Commit 840bf06

Browse files
committed
functions_celia
1 parent 70b8ce1 commit 840bf06

1 file changed

Lines changed: 71 additions & 2 deletions

File tree

toolkit/plot.py

Lines changed: 71 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,11 @@
22
import pandas as pd
33
import numpy as np
44
from typing import Union
5-
5+
import plotly.graph_objs as go
6+
from plotly import tools
7+
from collections import defaultdict
8+
import plotly.offline as py
9+
from wordcloud import STOPWORDS
610

711
def plot_multiclass_prediction_image(df, row_index: int, X_test: Union[pd.DataFrame, np.ndarray], prediction_col: str = 'Top Prediction', label_col: str = 'Label'):
812
'''
@@ -39,4 +43,69 @@ def plot_multiclass_prediction_image(df, row_index: int, X_test: Union[pd.DataFr
3943
df.loc[row_index, df.dtypes != object].plot.bar(ax = ax[1])
4044
plt.xticks(rotation = 45, ha = 'right')
4145
plt.title('Probabilities of Each Class')
42-
plt.show()
46+
plt.show()
47+
48+
49+
50+
def plot_ngrams(df, target: str, text: str, n_gram: int):
    '''
    Plot the most frequent words / n-grams of a text column, split by a binary target.

    Rows where df[target] == 0 and rows where df[target] == 1 are processed
    separately. For each group the text is lower-cased, split on single spaces,
    stripped of empty tokens and wordcloud STOPWORDS, and turned into n-grams of
    n_gram consecutive tokens. The 50 most frequent n-grams of each group are
    drawn as horizontal bar charts, side by side.

    Parameters:
    - df: Dataframe with a text variable.
    - target: Column of the dataframe where the target variable is located (rows are selected with values 0 and 1).
    - text: Column of the dataframe where the text is located. Missing values are skipped.
    - n_gram: Number of consecutive words whose frequency we want to visualize. Must be >= 1.

    Returns:
    - None. The figure is displayed through plotly.offline.iplot.

    Raises:
    - ValueError: if n_gram is smaller than 1.
    '''
    if n_gram < 1:
        raise ValueError("n_gram must be a positive integer")

    def generate_ngrams(sentence, size):
        # Tokenize on single spaces; drop empty tokens and stopwords.
        tokens = [
            token
            for token in sentence.lower().split(" ")
            if token != "" and token not in STOPWORDS
        ]
        # zip over `size` shifted copies of the token list yields every run of
        # `size` consecutive tokens (and nothing when there are fewer tokens).
        shifted = [tokens[i:] for i in range(size)]
        return [" ".join(ngram) for ngram in zip(*shifted)]

    def top_ngrams(sentences):
        # Count every n-gram occurring in the given column of sentences.
        freq_dict = defaultdict(int)
        for sent in sentences.dropna():
            for ngram in generate_ngrams(sent, n_gram):
                freq_dict[ngram] += 1
        # Ascending stable sort followed by a reversal: highest counts first,
        # ties in reverse insertion order (same ordering as before).
        ranked = sorted(freq_dict.items(), key=lambda item: item[1])[::-1]
        # Passing the column names to the constructor keeps this valid even
        # when no n-gram was found (empty table with the expected columns).
        return pd.DataFrame(ranked, columns=["word", "wordcount"]).head(50)

    def horizontal_bar_chart(freq_table, color):
        # Values are reversed so the most frequent n-gram ends up at the top
        # of the horizontal bar chart.
        return go.Bar(
            y=freq_table["word"].values[::-1],
            x=freq_table["wordcount"].values[::-1],
            showlegend=False,
            orientation='h',
            marker=dict(
                color=color,
            ),
        )

    # One trace per target value.
    trace0 = horizontal_bar_chart(top_ngrams(df[df[target] == 0][text]), 'blue')
    trace1 = horizontal_bar_chart(top_ngrams(df[df[target] == 1][text]), 'blue')

    # Creating two subplots
    fig = tools.make_subplots(rows=1, cols=2, vertical_spacing=0.04,
                              subplot_titles=["Frequent words from text with label 0",
                                              "Frequent words from text with label 1"])
    fig.append_trace(trace0, 1, 1)
    fig.append_trace(trace1, 1, 2)
    fig['layout'].update(height=1200, width=900, paper_bgcolor='rgb(233,233,233)', title="Word Count Plots")
    py.iplot(fig, filename='word-plots')

0 commit comments

Comments
 (0)