Skip to content

Commit 1112e08

Browse files
committed
ngrams
1 parent 0b468af commit 1112e08

1 file changed

Lines changed: 69 additions & 0 deletions

File tree

toolkit/plot_ngrams.py

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
def plot_ngrams(df, target: str, text: str, n_gram: int):
    """Plot the most frequent n-grams of a text column, split by a binary target.

    Two horizontal bar charts are drawn side by side: the left one for rows
    where ``df[target] == 0`` and the right one for rows where
    ``df[target] == 1``. Each chart shows the 50 most frequent n-grams of
    that subset, most frequent at the top.

    Texts are lower-cased and split on single spaces; empty tokens and tokens
    found in ``wordcloud.STOPWORDS`` are dropped before the n-grams are built.

    Parameters:
    - df: Dataframe with a text variable.
    - target: Column of the dataframe where the target variable is located.
      Only rows whose value equals 0 or 1 are used.
    - text: Column of the dataframe where the text is located. Entries that
      are not strings (e.g. NaN for missing text) are skipped.
    - n_gram: Number of consecutive words whose frequency we want to
      visualize.

    Returns:
    - None. The figure is rendered with ``plotly.offline.iplot``.
    """
    from collections import Counter

    import pandas as pd
    import plotly.graph_objs as go
    import plotly.offline as py
    from plotly import tools
    from wordcloud import STOPWORDS

    top_n = 50  # number of n-grams shown per chart

    def generate_ngrams(sentence, size):
        """Return the list of `size`-grams of `sentence` after stopword removal."""
        tokens = [
            tok
            for tok in sentence.lower().split(" ")
            if tok != "" and tok not in STOPWORDS
        ]
        # Zipping the token list against its own shifted copies yields every
        # window of `size` consecutive tokens.
        windows = zip(*[tokens[i:] for i in range(size)])
        return [" ".join(window) for window in windows]

    def horizontal_bar_chart(freq_df, color):
        """Build a horizontal bar trace from a word / wordcount dataframe."""
        return go.Bar(
            # Reversed so that the most frequent n-gram ends up at the top.
            y=freq_df["word"].values[::-1],
            x=freq_df["wordcount"].values[::-1],
            showlegend=False,
            orientation='h',
            marker=dict(
                color=color,
            ),
        )

    def frequency_trace(texts, color):
        """Count n-grams over `texts` and return the bar trace of the top ones."""
        counts = Counter()
        for sentence in texts:
            # Missing text shows up as NaN/None; it has no n-grams to count.
            if not isinstance(sentence, str):
                continue
            counts.update(generate_ngrams(sentence, n_gram))
        ranked = sorted(counts.items(), key=lambda item: item[1])[::-1]
        # Naming the columns in the constructor keeps this valid when `ranked`
        # is empty; assigning `.columns` afterwards raises on an empty frame.
        table = pd.DataFrame(ranked, columns=["word", "wordcount"])
        return horizontal_bar_chart(table.head(top_n), color)

    # One trace per target value.
    trace0 = frequency_trace(df[df[target] == 0][text], 'blue')
    trace1 = frequency_trace(df[df[target] == 1][text], 'blue')

    # Creating two subplots
    fig = tools.make_subplots(rows=1, cols=2, vertical_spacing=0.04,
                              subplot_titles=["Frequent words from text with label 0",
                                              "Frequent words from text with label 1"])
    fig.append_trace(trace0, 1, 1)
    fig.append_trace(trace1, 1, 2)
    fig['layout'].update(height=1200, width=900, paper_bgcolor='rgb(233,233,233)', title="Word Count Plots")
    py.iplot(fig, filename='word-plots')

0 commit comments

Comments
 (0)