def clean_text(df, column: str, language: str, target: str, filename: str = 'data_processed.csv'):
    """
    Preprocess and clean a dataframe's text column as a preliminary step for
    Natural Language Processing.

    Steps, applied in this order:
      1. Drop rows whose text is duplicated.
      2. Strip @mentions.
      3. Lowercase, then strip punctuation marks and digits.
      4. Replace every word containing 'http' with the '{link}' placeholder.
      5. Drop stopwords of the given language.
      6. Stem the remaining words with the Snowball stemmer.

    Note: ``df`` is modified in place (duplicated rows are dropped and
    ``column`` is overwritten with the cleaned text).

    Params:
        - df: Dataframe
        - column: The name of the column in which the text is located (str)
        - language: The language in which the text is written (str), given in
          ENGLISH (e.g. 'spanish', 'english')
        - target: The name of the column in which the target to be predicted
          is located
        - filename: Path of the CSV file the processed dataframe is saved to

    Returns:
        - df_processed: Dataframe after cleaning. It contains only the text
          variable and the target variable

    Raises:
        Whatever nltk raises when ``language`` is not supported or the
        stopwords corpus is not installed (presumably ValueError / OSError /
        LookupError -- confirm against the installed nltk version).
    """
    import re
    from nltk.corpus import stopwords
    from nltk.stem.snowball import SnowballStemmer

    mention_re = re.compile(r'\s*@\w+')
    # Raw string + character class: no invalid escape sequences, and every
    # double quote is removed (not only those followed by a space).
    signs_re = re.compile(r'[.;:!?¿@,"()\[\]]|\d+')

    # Built once, before touching df: an invalid language fails early and the
    # stopword list is not re-read for every row. The distinct local name
    # avoids shadowing the imported `stopwords` corpus reader.
    stop_words = set(stopwords.words(language))
    stemmer = SnowballStemmer(language)

    def clean_row(text):
        # Missing / non-text cells (e.g. NaN) are passed through untouched.
        if not isinstance(text, str):
            return text
        text = mention_re.sub('', text)
        text = signs_re.sub('', text.lower())
        # Punctuation is already gone, so links look like 'http//...';
        # a plain substring check is enough to spot them.
        words = ['{link}' if 'http' in word else word for word in text.split()]
        return " ".join(stemmer.stem(word) for word in words if word not in stop_words)

    # Remove duplicated texts, then clean every remaining row in one pass.
    df.drop_duplicates(subset=column, inplace=True)
    df[column] = df[column].apply(clean_row)

    # Save processed data
    df_processed = df[[column, target]]
    df_processed.to_csv(filename)

    return df_processed
0 commit comments