Skip to content

Commit 70b8ce1

Browse files
committed
functions_celia
1 parent 1112e08 commit 70b8ce1

1 file changed

Lines changed: 185 additions & 2 deletions

File tree

toolkit/data_processing.py

Lines changed: 185 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,11 @@
33
import re
44
from datetime import datetime
55
from typing import List
6-
6+
from nltk.corpus import stopwords
7+
from nltk.stem.snowball import SnowballStemmer
8+
import cv2
9+
import os
10+
from skimage.io import imread
711

812
def extract_date(df, date_column_name):
913
'''
@@ -112,4 +116,183 @@ def log_transform_data(df: pd.DataFrame, col_ignore: List[str]) -> pd.DataFrame:
112116
numeric = df_rest.select_dtypes(include=np.number).apply(np.log1p)
113117
non_numeric = df_rest.select_dtypes(exclude=np.number)
114118

115-
return pd.concat([numeric, non_numeric, df_ignore], axis = 1)
119+
return pd.concat([numeric, non_numeric, df_ignore], axis = 1)
120+
121+
122+
123+
def buffdescribe(df, stats=('mean', 'median', 'std')):
    '''
    Function to facilitate a first exploration of a dataframe's data by concentrating the most relevant information

    Params:
        - df: Dataframe
        - stats: Descriptive statistics to calculate, given as names accepted by DataFrame.agg
                 (a single name may be passed as a plain string).
                 Default: Mean, Median, and Standard Deviation

    Returns: Dataframe indexed by the column names of the original df, sorted by ascending
    percentage of missing values, with the following columns:
        - DATA_TYPE: Data type of each column
        - MISSINGS (%): Percentage of null values in each column
        - NOT_NULL: Total number of non-null values in each column
        - UNIQUE_VALUES: Number of unique values of each column
        - CARDIN (%): Percentage of unique values (cardinality)
        - One upper-cased column per selected statistic (default: MEAN, MEDIAN, STD), filled in for
          numeric variables (integers and floats of any width) and NaN for the rest.
          These columns are omitted when the df has no numeric column or no statistic is requested.
    '''

    # A bare string would otherwise be split into characters by list()
    stats = [stats] if isinstance(stats, str) else list(stats)

    n_rows = len(df)
    unique_counts = df.nunique()

    # One row per original column; .values is used so every metric is placed positionally
    summary = pd.DataFrame(
        {
            "DATA_TYPE": df.dtypes.values,
            "MISSINGS (%)": (df.isnull().sum() * 100 / n_rows).round(2).values,
            "NOT_NULL": df.count().values,
            "UNIQUE_VALUES": unique_counts.values,
            "CARDIN (%)": (unique_counts * 100 / n_rows).round(2).values,
        },
        index=pd.Index(df.columns, name="COL_N"),
    )

    # Sort values by missing percentage (stable sort keeps the original column order on ties)
    describe = summary.sort_values(by=["MISSINGS (%)"], ascending=True, kind="mergesort")

    # Descriptive statistics: timedeltas are excluded explicitly because pandas treats them as 'number'
    numeric_df = df.select_dtypes(include=['number'], exclude=['timedelta'])
    if numeric_df.shape[1] == 0 or not stats:
        return describe

    # agg() yields one row per statistic; transpose so rows are the original column names
    numeric_stats = numeric_df.agg(stats).T
    numeric_stats.columns = [str(name).upper() for name in numeric_stats.columns]

    # Left merge on the index keeps the sorted order and leaves NaN for non-numeric columns
    return pd.merge(describe, numeric_stats, left_index=True, right_index=True, how='left')
179+
180+
181+
def clean_text(df, column:str, language:str, target:str, filename:str='data_processed.csv'):
    '''
    Function to preprocess and clean a dataframe with text as a preliminary step for Natural Language Processing

    The steps applied to the text column are, in order: drop rows with duplicated text, remove mentions (@user),
    lowercase, remove punctuation marks and digits, replace every word containing 'http' with the token '{link}',
    remove stopwords and apply the Snowball stemmer.

    Params:
        - df: Dataframe. It is not modified; the cleaning is done on a copy
        - column: The name of the column in which the text is located (str)
        - language: The language in which the text is written (str) in ENGLISH (e.g. 'spanish', 'english')
        - target: The name of the column in which the target to be predicted is located
        - filename: Name for the processed dataframe to be saved (written with DataFrame.to_csv)

    Returns:
        - df_processed: Dataframe after cleaning. It contains only the text variable and the target variable
    '''

    # Build the language resources once, up front, instead of once per row
    stop_words = set(stopwords.words(language))
    stemmer = SnowballStemmer(language)

    # Punctuation marks and digits to strip
    signs = re.compile(r'[.;:!?¿@,"()\[\]]|\d+')

    # Remove duplicated texts; copy so the caller's dataframe is left untouched
    df = df.drop_duplicates(subset=column).copy()

    # Missing texts are treated as empty strings so the string operations below do not fail
    text = df[column].fillna('')

    # Remove mentions (@)
    text = text.str.replace(r'\s*@\w+', '', regex=True)

    def _clean_tweet(tweet):
        # Lowercase and remove punctuation marks / digits
        tweet = signs.sub('', tweet.lower())
        # Links lose their punctuation above but still contain 'http', so they are caught here
        words = ['{link}' if 'http' in word else word for word in tweet.split()]
        # Drop stopwords, then stem what is left
        return " ".join(stemmer.stem(word) for word in words if word not in stop_words)

    df[column] = text.apply(_clean_tweet)

    # Save processed data
    df_processed = df[[column, target]]
    df_processed.to_csv(filename)

    return df_processed
236+
237+
238+
239+
def load_imgs(path, im_size:int):
    '''
    Function to load a directory of images and resize them for training a Convolutional Neural Network (CNN) model.
    IMPORTANT: Images must be divided into subdirectories according to the target
    (e.g. one directory for dog photos and another for cat photos).
    It can be used for both binary and categorical classification.

    Each immediate subdirectory of path is one class. Classes are numbered from 0 following the
    alphabetical order of the subdirectory names. Files that are not .jpg, .jpeg or .png
    (case-insensitive), and files placed directly in path, are ignored.

    Args:
        - path: Path where the subdirectories with the images are located.
        - im_size: Size to which we want to resize the image (e.g. 32).

    Returns:
        - df: Dataframe with the names of the images and the category to which they belong (target).
        - X_train: Array with the image data loaded after resizing.
        - y_train: Array with the target values.
    '''

    image_extensions = ('.jpg', '.jpeg', '.png')

    filenames = []
    X = []
    y = []

    # Create a dictionary with the target values.
    # Only directories are classes; sorting makes the label of each class reproducible.
    class_names = sorted(
        entry for entry in os.listdir(path)
        if os.path.isdir(os.path.join(path, entry))
    )
    class_names_label = {class_name: i for i, class_name in enumerate(class_names)}

    # Walk each class directory separately so the label comes from the directory being walked,
    # not from a substring match against the full path.
    for class_name, label in class_names_label.items():
        for subdir, dirs, files in os.walk(os.path.join(path, class_name)):
            # Sorting in place makes os.walk visit nested folders in a stable order.
            dirs.sort()
            for file in sorted(files):
                if not file.lower().endswith(image_extensions):
                    continue
                # Read the image in color.
                image = imread(os.path.join(subdir, file))
                # Resize the image.
                smallimage = cv2.resize(image, (im_size, im_size))
                # Name, pixels and target are appended together so the three outputs stay aligned.
                filenames.append(file)
                X.append(smallimage)
                y.append(label)

    # 1. Dataframe with the names of the images and the category to which they belong.
    df = pd.DataFrame({
        'filename': filenames,
        'category': y
    })

    # 2. Array with the image data loaded after resizing.
    X_train = np.array(X)

    # 3. Array with the y values (target).
    y_train = np.array(y)

    return df, X_train, y_train

0 commit comments

Comments
 (0)