Skip to content

Commit 28d3992

Browse files
committed
celia
1 parent 119f66d commit 28d3992

2 files changed

Lines changed: 11 additions & 14 deletions

File tree

toolkit/data_processing.py

Lines changed: 4 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -169,18 +169,13 @@ def buffdescribe(df, stats=['mean', 'median', 'std']):
169169
describe = concatenado.sort_values(by=["MISSINGS (%)"], ascending=True)
170170

171171
# Descriptive statistics
172-
numeric_cols = df.select_dtypes(include=['int', 'float']).columns.tolist()
173-
numeric_stats = df[numeric_cols].agg(stats)
174-
numeric_stats.columns = [col + '_' + stat for col in numeric_stats.columns for stat in stats]
172+
agg = df.agg(stats).transpose()
175173

176174
# Merge in a new dataframe
177-
describe = pd.merge(concatenado, numeric_stats, left_index=True, right_index=True, how='left')
178-
describe.rename(columns={'mean': 'MEAN',
179-
'median': 'MEDIAN', 'std': 'STD'}, inplace=True),
175+
describe = pd.merge(concatenado, agg, left_index=True, right_index=True, how='left')
180176

181177
return describe
182178

183-
184179
def clean_text(df, column:str, language:str, target:str, filename:str='data_processed.csv'):
185180

186181
'''
@@ -206,7 +201,7 @@ def clean_text(df, column:str, language:str, target:str, filename:str='data_proc
206201
df[column] = df[column].str.replace(r'\s*@\w+', '', regex=True)
207202

208203
# Remove punctuation marks and convert to lowercase
209-
signos = re.compile("(\.)|(\;)|(\:)|(\!)|(\?)|(\¿)|(\@)|(\,)|(\")|(\()|(\))|(\[)|(\])|(\d+)")
204+
signos = re.compile(r"(\.)|(\;)|(\:)|(\!)|(\?)|(\¿)|(\@)|(\,)|(\")|(\()|(\))|(\[)|(\])|(\d+)")
210205

211206
def signs_tweets(tweet):
212207
return signos.sub('', tweet.lower())
@@ -220,8 +215,8 @@ def remove_links(df):
220215
df[column] = df[column].apply(remove_links)
221216

222217
# Remove stopwords
218+
stopwords = set(stopwords.words(language))
223219
def remove_stopwords(df):
224-
stopwords = set(stopwords.words(language))
225220
return " ".join([word for word in df.split() if word not in stopwords])
226221
df[column] = df[column].apply(remove_stopwords)
227222

toolkit/plot.py

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
import numpy as np
44
from typing import Union
55
import plotly.graph_objs as go
6-
from plotly import tools
6+
from plotly import subplots
77
from collections import defaultdict
88
import plotly.offline as py
99
from wordcloud import STOPWORDS
@@ -62,7 +62,6 @@ def plot_ngrams(df, target:str, text:str, n_gram:int):
6262
----------
6363
- Bar chart representing the frequency of words, sorted from highest to lowest, divided by target variable.
6464
'''
65-
6665
df1 = df[df[target] ==1]
6766
df0 = df[df[target] ==0]
6867

@@ -104,10 +103,13 @@ def horizontal_bar_chart(df, color):
104103
trace1 = horizontal_bar_chart(fd_sorted.head(50), 'blue')
105104

106105
# Creating two subplots
107-
fig = tools.make_subplots(rows=1, cols=2, vertical_spacing=0.04,
106+
fig = subplots.make_subplots(rows=1, cols=2, vertical_spacing=0.04,
108107
subplot_titles=["Frequent words from text with label 0",
109108
"Frequent words from text with label 1"])
110-
fig.append_trace(trace0, 1, 1)
111-
fig.append_trace(trace1, 1, 2)
109+
fig.add_trace(trace0, 1, 1)
110+
fig.add_trace(trace1, 1, 2)
112111
fig['layout'].update(height=1200, width=900, paper_bgcolor='rgb(233,233,233)', title="Word Count Plots")
113112
py.iplot(fig, filename='word-plots')
113+
114+
return fig
115+

0 commit comments

Comments
 (0)