Skip to content

Commit 6436bb4

Browse files
committed
clean_text
1 parent 5adf56d commit 6436bb4

1 file changed

Lines changed: 62 additions & 0 deletions

File tree

toolkit/clean_text.py

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
def clean_text(df, column: str, language: str, target: str, filename: str = 'data_processed.csv'):
    """
    Preprocess and clean a dataframe text column as a preliminary step for
    Natural Language Processing.

    The following steps are applied, in order, to ``df[column]``:

    1. Rows whose text duplicates an earlier row are dropped.
    2. Mentions (``@user``), together with any whitespace right before
       them, are removed.
    3. The text is lowercased, and digits and the punctuation marks
       ``. ; : ! ? ¿ @ , " ( ) [ ]`` are removed.
    4. Every whitespace-separated token containing ``http`` is replaced by
       the literal placeholder ``{link}``.
    5. Stop words of ``language`` are removed.
    6. Every remaining token is stemmed with NLTK's Snowball stemmer.

    Note: ``df`` is modified in place (duplicates are dropped and ``column``
    is overwritten with the cleaned text).

    Params:
    - df: Dataframe
    - column: The name of the column in which the text is located (str)
    - language: The language in which the text is written (str), given in
      ENGLISH (e.g. 'spanish', 'english'). It is passed unchanged to both
      ``stopwords.words`` and ``SnowballStemmer``.
    - target: The name of the column in which the target to be predicted is
      located
    - filename: Name of the CSV file in which the processed dataframe is
      saved

    Returns:
    - df_processed: Dataframe after cleaning. It contains only the text
      variable and the target variable
    """
    import re

    from nltk.corpus import stopwords
    from nltk.stem.snowball import SnowballStemmer

    # Load the language resources before touching ``df`` so that a bad
    # ``language`` fails early instead of leaving ``df`` half-processed.
    # The stop-word set is built once here; binding it to a name other than
    # ``stopwords`` avoids shadowing the imported corpus reader.
    stop_words = frozenset(stopwords.words(language))
    stemmer = SnowballStemmer(language)

    # Punctuation marks and runs of digits to strip from the text.
    signs = re.compile(r'[.;:!?¿@,"()\[\]]|\d+')

    def strip_signs(text):
        # Lowercase first, then drop punctuation and digits.
        return signs.sub('', text.lower())

    def replace_links(text):
        # Punctuation is already gone at this point (e.g. "https//..."),
        # so links are detected by the remaining "http" substring.
        return " ".join('{link}' if 'http' in word else word for word in text.split())

    def remove_stopwords(text):
        return " ".join(word for word in text.split() if word not in stop_words)

    def stem_words(text):
        return " ".join(stemmer.stem(word) for word in text.split())

    # Remove duplicated texts
    df.drop_duplicates(subset=column, inplace=True)

    # Remove mentions (@)
    df[column] = df[column].str.replace(r'\s*@\w+', '', regex=True)

    # Remove punctuation marks and convert to lowercase
    df[column] = df[column].apply(strip_signs)

    # Replace links with a placeholder
    df[column] = df[column].apply(replace_links)

    # Remove stopwords
    df[column] = df[column].apply(remove_stopwords)

    # Apply stemmer
    df[column] = df[column].apply(stem_words)

    # Save processed data
    df_processed = df[[column, target]]
    df_processed.to_csv(filename)

    return df_processed

0 commit comments

Comments
 (0)