33import re
44from datetime import datetime
55from typing import List
6-
6+ from nltk .corpus import stopwords
7+ from nltk .stem .snowball import SnowballStemmer
8+ import cv2
9+ import os
10+ from skimage .io import imread
711
812def extract_date (df , date_column_name ):
913 '''
@@ -112,4 +116,183 @@ def log_transform_data(df: pd.DataFrame, col_ignore: List[str]) -> pd.DataFrame:
112116 numeric = df_rest .select_dtypes (include = np .number ).apply (np .log1p )
113117 non_numeric = df_rest .select_dtypes (exclude = np .number )
114118
115- return pd .concat ([numeric , non_numeric , df_ignore ], axis = 1 )
119+ return pd .concat ([numeric , non_numeric , df_ignore ], axis = 1 )
120+
121+
122+
def buffdescribe(df, stats=None):
    '''
    Function to facilitate a first exploration of a dataframe's data by concentrating the most relevant information

    Params:
        - df: Dataframe
        - stats: Descriptive statistics to calculate (a name, or a list of names accepted by
          DataFrame.agg). Default: Mean, Median, and Standard Deviation

    Returns: Dataframe indexed by the original column names (index name "COL_N"), sorted by
    ascending percentage of missing values, with the following columns:
        - DATA_TYPE: Data type of each column
        - MISSINGS (%): Percentage of null values in each column
        - NOT_NULL: Total number of non-null values in each column
        - UNIQUE_VALUES: Number of unique (non-null) values of each column
        - CARDIN (%): Percentage of unique values (cardinality)
        - One upper-cased column per selected statistic (default: MEAN, MEDIAN, STD). These are
          filled for the int/float columns and left as NaN for every other column.
    '''

    # Normalise `stats` to a fresh list: avoids a shared mutable default and
    # guarantees that agg() returns a frame (one row per statistic).
    if stats is None:
        stats = ['mean', 'median', 'std']
    elif isinstance(stats, str) or callable(stats):
        stats = [stats]
    else:
        stats = list(stats)

    n_rows = len(df)
    unique_counts = df.nunique()

    # One row per original column; `.values` keeps positional alignment even
    # if the dataframe has unusual column labels.
    summary = pd.DataFrame(
        {
            "DATA_TYPE": df.dtypes.values,
            "MISSINGS (%)": (df.isnull().sum() * 100 / n_rows).round(2).values,
            "NOT_NULL": df.count().values,
            "UNIQUE_VALUES": unique_counts.values,
            "CARDIN (%)": (unique_counts * 100 / n_rows).round(2).values,
        },
        index=pd.Index(df.columns.values, name="COL_N"),
    )

    # Sort values by missing percentage (stable, so ties keep the original column order)
    summary = summary.sort_values(by="MISSINGS (%)", ascending=True, kind="stable")

    # Descriptive statistics
    numeric_cols = df.select_dtypes(include=['int', 'float']).columns.tolist()
    if numeric_cols:
        # agg() yields statistics as rows and columns as columns; transpose so
        # the result is indexed by column name and can be joined to `summary`.
        numeric_stats = df[numeric_cols].agg(stats).T
        numeric_stats.columns = [str(name).upper() for name in numeric_stats.columns]
    else:
        # No numeric columns: still expose the statistic columns, all NaN.
        stat_names = [getattr(stat, '__name__', str(stat)).upper() for stat in stats]
        numeric_stats = pd.DataFrame(index=summary.index, columns=stat_names, dtype=float)

    # Left join keeps every original column; non-numeric ones get NaN statistics
    return summary.join(numeric_stats, how='left')
179+
180+
def clean_text(df, column: str, language: str, target: str, filename: str = 'data_processed.csv'):
    '''
    Function to preprocess and clean a dataframe with text as a preliminary step for Natural Language Processing

    Params:
        - df: Dataframe (it is not modified; the cleaning is done on a copy)
        - column: The name of the column in which the text is located (str)
        - language: The language in which the text is written (str) in ENGLISH (e.g. 'spanish', 'english').
          It is passed to both stopwords.words() and SnowballStemmer().
        - target: The name of the column in which the target to be predicted is located
        - filename: Name for the processed dataframe to be saved (written with DataFrame.to_csv)

    Returns:
        - df_processed: Dataframe after cleaning. It contains only the text variable and the target variable
    '''

    # Drop rows without text (they cannot be lower-cased or tokenised) and
    # duplicated texts. Working on a copy keeps the caller's dataframe intact.
    df = df.dropna(subset=[column]).drop_duplicates(subset=column).copy()

    # Remove mentions (@)
    df[column] = df[column].str.replace(r'\s*@\w+', '', regex=True)

    # Punctuation marks and digit runs to strip (raw string: no invalid escapes)
    signos = re.compile(r'[.;:!?¿@,"()\[\]]|\d+')

    # Build these once instead of once per row. The local name must differ
    # from the imported `stopwords` module, otherwise the lookup below would
    # hit an unbound local variable.
    stop_words = set(stopwords.words(language))
    stemmer = SnowballStemmer(language)

    def process_text(text):
        # Lowercase and strip punctuation/digits
        text = signos.sub('', text.lower())
        # Replace every token containing 'http' with the '{link}' placeholder
        words = ['{link}' if 'http' in word else word for word in text.split()]
        # Drop stopwords, then stem what is left
        return " ".join([stemmer.stem(word) for word in words if word not in stop_words])

    df[column] = df[column].apply(process_text)

    # Save processed data
    df_processed = df[[column, target]]
    df_processed.to_csv(filename)

    return df_processed
236+
237+
238+
def load_imgs(path, im_size: int):
    '''
    Function to load a directory of images and resize them for training a Convolutional Neural Network (CNN) model.
    IMPORTANT: Images must be divided into subdirectories according to the target
    (e.g. one directory for dog photos and another for cat photos).
    It can be used for both binary and categorical classification.

    Args:
        - path: Path where the subdirectories with the images are located.
        - im_size: Size to which we want to resize the image (e.g. 32).

    Returns:
        - df: Dataframe with the names of the images and the category to which they belong (target).
        - X_train: Array with the image data loaded after resizing.
        - y_train: Array with the target values.

    Notes:
        - Each immediate subdirectory of `path` is one class. Classes are numbered
          0..n-1 following the alphabetical order of the subdirectory names.
        - Only files ending in .jpg, .jpeg or .png (case-insensitive) are loaded;
          any other file, and any file placed directly in `path`, is ignored.
        - df, X_train and y_train always have the same length and the same order.
    '''

    image_extensions = ('.jpg', '.jpeg', '.png')

    # Create a dictionary with the target values. Only directories are classes;
    # sorting makes the label assignment reproducible across runs/platforms.
    class_names = sorted(
        entry for entry in os.listdir(path)
        if os.path.isdir(os.path.join(path, entry))
    )
    class_names_label = {class_name: i for i, class_name in enumerate(class_names)}

    filenames = []
    X = []
    y = []

    # Walk each class directory on its own so the label comes from the
    # directory being walked, not from a substring match on the path.
    for class_name, label in class_names_label.items():
        for subdir, dirs, files in os.walk(os.path.join(path, class_name)):
            dirs.sort()  # deterministic traversal of nested folders
            for file in sorted(files):
                if not file.lower().endswith(image_extensions):
                    continue
                # Read the image in color.
                image = imread(os.path.join(subdir, file))
                # Resize the image.
                smallimage = cv2.resize(image, (im_size, im_size))
                # Record name, pixels and label together so the three outputs stay aligned.
                filenames.append(file)
                X.append(smallimage)
                y.append(label)

    # 1. Dataframe with the names of the images and the category to which they belong.
    df = pd.DataFrame({
        'filename': filenames,
        'category': y
    })

    # 2. Array with the image data loaded after resizing.
    X_train = np.array(X)

    # 3. Array with the y values (target).
    y_train = np.array(y)

    return df, X_train, y_train
0 commit comments