Skip to content

Commit dcf7803

Browse files
committed
2 parents 9514160 + c84b7da commit dcf7803

2 files changed

Lines changed: 14 additions & 15 deletions

File tree

toolkit/data_processing.py

Lines changed: 4 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -169,18 +169,13 @@ def buffdescribe(df, stats=['mean', 'median', 'std']):
169169
describe = concatenado.sort_values(by=["MISSINGS (%)"], ascending=True)
170170

171171
# Descriptive statistics
172-
numeric_cols = df.select_dtypes(include=['int', 'float']).columns.tolist()
173-
numeric_stats = df[numeric_cols].agg(stats)
174-
numeric_stats.columns = [col + '_' + stat for col in numeric_stats.columns for stat in stats]
172+
agg = df.agg(stats).transpose()
175173

176174
# Merge in a new dataframe
177-
describe = pd.merge(concatenado, numeric_stats, left_index=True, right_index=True, how='left')
178-
describe.rename(columns={'mean': 'MEAN',
179-
'median': 'MEDIAN', 'std': 'STD'}, inplace=True),
175+
describe = pd.merge(concatenado, agg, left_index=True, right_index=True, how='left')
180176

181177
return describe
182178

183-
184179
def clean_text(df, column:str, language:str, target:str, filename:str='data_processed.csv'):
185180

186181
'''
@@ -206,7 +201,7 @@ def clean_text(df, column:str, language:str, target:str, filename:str='data_proc
206201
df[column] = df[column].str.replace(r'\s*@\w+', '', regex=True)
207202

208203
# Remove punctuation marks and convert to lowercase
209-
signos = re.compile("(\.)|(\;)|(\:)|(\!)|(\?)|(\¿)|(\@)|(\,)|(\")|(\()|(\))|(\[)|(\])|(\d+)")
204+
signos = re.compile(r"(\.)|(\;)|(\:)|(\!)|(\?)|(\¿)|(\@)|(\,)|(\")|(\()|(\))|(\[)|(\])|(\d+)")
210205

211206
def signs_tweets(tweet):
212207
return signos.sub('', tweet.lower())
@@ -220,8 +215,8 @@ def remove_links(df):
220215
df[column] = df[column].apply(remove_links)
221216

222217
# Remove stopwords
218+
stopwords = set(stopwords.words(language))
223219
def remove_stopwords(df):
224-
stopwords = set(stopwords.words(language))
225220
return " ".join([word for word in df.split() if word not in stopwords])
226221
df[column] = df[column].apply(remove_stopwords)
227222

toolkit/plot.py

Lines changed: 10 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -3,7 +3,7 @@
33
import numpy as np
44
from typing import Union
55
import plotly.graph_objs as go
6-
from plotly import tools
6+
from plotly import subplots
77
from collections import defaultdict
88
import plotly.offline as py
99
from wordcloud import STOPWORDS
@@ -64,7 +64,6 @@ def plot_ngrams(df, target:str, text:str, n_gram:int):
6464
----------
6565
- Bar chart representing the frequency of words, sorted from highest to lowest, divided by target variable.
6666
'''
67-
6867
df1 = df[df[target] ==1]
6968
df0 = df[df[target] ==0]
7069

@@ -106,14 +105,18 @@ def horizontal_bar_chart(df, color):
106105
trace1 = horizontal_bar_chart(fd_sorted.head(50), 'blue')
107106

108107
# Creating two subplots
109-
fig = tools.make_subplots(rows=1, cols=2, vertical_spacing=0.04,
108+
fig = subplots.make_subplots(rows=1, cols=2, vertical_spacing=0.04,
110109
subplot_titles=["Frequent words from text with label 0",
111110
"Frequent words from text with label 1"])
112-
fig.append_trace(trace0, 1, 1)
113-
fig.append_trace(trace1, 1, 2)
111+
fig.add_trace(trace0, 1, 1)
112+
fig.add_trace(trace1, 1, 2)
114113
fig['layout'].update(height=1200, width=900, paper_bgcolor='rgb(233,233,233)', title="Word Count Plots")
115114
py.iplot(fig, filename='word-plots')
116115

116+
<<<<<<< HEAD
117+
return fig
118+
119+
=======
117120
def sunburst(df, interior:str, exterior:str, col_num:str, title:str):
118121
'''
119122
This is a Plotly Graph similar to pie chart but with two levels, interior is for columns which have one or two unique values, and
@@ -164,4 +167,5 @@ def wordcloudviz(column):
164167
plt.imshow(wordcloud)
165168
plt.axis("off")
166169
plt.tight_layout(pad=0)
167-
plt.show()
170+
plt.show()
171+
>>>>>>> 0f8d8abe7a26aac02e768b21b22a1d2e58bd6d30

0 commit comments

Comments
 (0)