celia

celiacnavarro · celiacnavarro · commit 28d3992e4e32 · 2023-02-22T18:46:53.000+01:00
diff --git a/toolkit/data_processing.py b/toolkit/data_processing.py
@@ -169,18 +169,13 @@ def buffdescribe(df,  stats=['mean', 'median', 'std']):
     describe = concatenado.sort_values(by=["MISSINGS (%)"], ascending=True)
 
     # Descriptive statistics
-    numeric_cols = df.select_dtypes(include=['int', 'float']).columns.tolist()
-    numeric_stats = df[numeric_cols].agg(stats)
-    numeric_stats.columns = [col + '_' + stat for col in numeric_stats.columns for stat in stats]
+    agg = df.agg(stats).transpose()
 
     # Merge in a new dataframe
-    describe = pd.merge(concatenado, numeric_stats, left_index=True, right_index=True, how='left')
-    describe.rename(columns={'mean': 'MEAN',
-                   'median': 'MEDIAN', 'std': 'STD'}, inplace=True),
+    describe = pd.merge(concatenado, agg, left_index=True, right_index=True, how='left')
 
     return describe
 
-
 def clean_text(df, column:str, language:str, target:str, filename:str='data_processed.csv'):
     
     ''' 
@@ -206,7 +201,7 @@ def clean_text(df, column:str, language:str, target:str, filename:str='data_proc
     df[column] = df[column].str.replace(r'\s*@\w+', '', regex=True)
 
     # Remove punctuation marks and convert to lowercase
-    signos = re.compile("(\.)|(\;)|(\:)|(\!)|(\?)|(\¿)|(\@)|(\,)|(\")|(\()|(\))|(\[)|(\])|(\d+)")
+    signos = re.compile(r"(\.)|(\;)|(\:)|(\!)|(\?)|(\¿)|(\@)|(\,)|(\")|(\()|(\))|(\[)|(\])|(\d+)")
 
     def signs_tweets(tweet):
         return signos.sub('', tweet.lower())
@@ -220,8 +215,8 @@ def remove_links(df):
     df[column] = df[column].apply(remove_links)
 
     # Remove stopwords
+    stopwords = set(stopwords.words(language))
     def remove_stopwords(df):
-        stopwords = set(stopwords.words(language))
         return " ".join([word for word in df.split() if word not in stopwords])
     df[column] = df[column].apply(remove_stopwords)
 
diff --git a/toolkit/plot.py b/toolkit/plot.py
@@ -3,7 +3,7 @@
 import numpy as np
 from typing import Union
 import plotly.graph_objs as go
-from plotly import tools
+from plotly import subplots
 from collections import defaultdict
 import plotly.offline as py
 from wordcloud import STOPWORDS
@@ -62,7 +62,6 @@ def plot_ngrams(df, target:str, text:str, n_gram:int):
     ----------
     - Bar chart representing the frequency of words, sorted from highest to lowest, divided by target variable.
     '''
-
     df1 = df[df[target] ==1]
     df0 = df[df[target] ==0]
 
@@ -104,10 +103,13 @@ def horizontal_bar_chart(df, color):
     trace1 = horizontal_bar_chart(fd_sorted.head(50), 'blue')
 
     # Creating two subplots
-    fig = tools.make_subplots(rows=1, cols=2, vertical_spacing=0.04,
+    fig = subplots.make_subplots(rows=1, cols=2, vertical_spacing=0.04,
                           subplot_titles=["Frequent words from text with label 0", 
                                           "Frequent words from text with label 1"])
-    fig.append_trace(trace0, 1, 1)
-    fig.append_trace(trace1, 1, 2)
+    fig.add_trace(trace0, 1, 1)
+    fig.add_trace(trace1, 1, 2)
     fig['layout'].update(height=1200, width=900, paper_bgcolor='rgb(233,233,233)', title="Word Count Plots")
     py.iplot(fig, filename='word-plots')
+
+    return fig
+