Skip to content

Commit e466aad

Browse files
committed
2 parents 6e86752 + e6c22ee commit e466aad

8 files changed

Lines changed: 247 additions & 2 deletions

File tree

test/test_convert_to_numeric.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
import pandas as pd
2+
from to_numeric import convert_to_numeric
3+
4+
def test_convert_to_numeric():
    """Check that a column of digit strings ends up with an integer dtype."""
    df = pd.DataFrame({'A': ['1', '2', '3'], 'B': ['4.5', '5.6', '6.7'], 'C': ['a', 'b', 'c']})

    # The return value is ignored on purpose: the test relies on the
    # function mutating ``df`` in place.
    convert_to_numeric(df, 'A')

    # ``assert value, int`` would only test truthiness (``int`` being the
    # assertion message), so check the dtype and the values explicitly.
    # The dtype helper is used instead of ``isinstance(..., int)`` because
    # pandas stores NumPy integers, which are not ``int`` subclasses.
    assert pd.api.types.is_integer_dtype(df['A'])
    assert df['A'].tolist() == [1, 2, 3]

test/test_heatmap_corr.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
import numpy as np
2+
import matplotlib.pyplot as plt
3+
import seaborn as sns
4+
import pandas as pd
5+
from heatmap_corr import heatmap
6+
7+
def test_heatmap():
    """Smoke-test ``heatmap`` on a small, fully numeric DataFrame."""
    df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9], 'target': [10, 20, 30]})

    try:
        result = heatmap(df, 2, 'target', None)
        # Without an assertion the test could only fail on an exception.
        assert result is not None
    finally:
        # Close the figure so it does not leak into other tests.
        plt.close('all')
11+
12+
13+

test/test_scrap.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
from selenium import webdriver
2+
from selenium.webdriver.common.by import By
3+
import requests
4+
from bs4 import BeautifulSoup
5+
from selenium import webdriver
6+
import os
7+
import time
8+
import io
9+
from PIL import Image
10+
from scrap import image_scrap
11+
import shutil
12+
13+
14+
def test_image_scrap():
    """Scrape ``n`` images and check that ``./my_images`` holds exactly ``n`` files.

    NOTE(review): this test needs network access and a local Chrome driver,
    so it is an integration test rather than a unit test.
    """
    url = 'https://www.google.com/search?q=perros+bonitos&tbm=isch&ved=2ahUKEwiCpOG3z6n9AhVFV6QEHY7KBa0Q2-cCegQIABAA&oq=perros+bonitos&gs_lcp=CgNpbWcQAzIFCAAQgAQyBQgAEIAEMgUIABCABDIFCAAQgAQyBQgAEIAEMgUIABCABDIFCAAQgAQyBAgAEB4yBAgAEB4yBAgAEB5QwAlY6hFg6hJoAHAAeACAAYgBiAHJBpIBAzcuMpgBAKABAaoBC2d3cy13aXotaW1nwAEB&sclient=img&ei=YUz2Y8LvFMWukdUPjpWX6Ao&bih=849&biw=1600&rlz=1C5CHFA_enCA951CA951'
    n = 5
    download_dir = './my_images'

    try:
        image_scrap(url, n)

        assert os.path.exists(download_dir)
        assert len(os.listdir(download_dir)) == n
    finally:
        # Always clean up, even when the scraper or an assertion fails, so a
        # leftover folder cannot skew the file count of the next run.
        shutil.rmtree(download_dir, ignore_errors=True)

test/test_sunburst.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
from sunburst import sunburst
2+
import plotly.graph_objs as go
3+
import plotly.express as px
4+
5+
def test_sunburst():
    """Verify the title and fixed 800x600 size of the figure returned by ``sunburst``."""
    chart_title = 'My Sunburst Chart'
    data = dict([
        ('Category', ['Fruit', 'Fruit', 'Vegetable', 'Vegetable', 'Vegetable']),
        ('Subcategory', ['Apple', 'Orange', 'Carrot', 'Tomato', 'Cucumber']),
        ('Value', [20, 30, 40, 15, 25]),
    ])

    figure = sunburst(data, 'Category', 'Subcategory', 'Value', chart_title)

    layout = figure.layout
    assert layout.title.text == chart_title
    assert layout.width == 800
    assert layout.height == 600

toolkit/data_analysis.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,3 +54,26 @@ def read_csv_zip(zip_file, csv_file, sep=';'):
5454

5555
return df
5656

57+
def chi_squared_test(df, feature, target):
    """
    Perform a chi-squared test of independence between two categorical variables.

    Params:
        - df: A DataFrame containing the variables of interest.
        - feature: Name of the column holding the independent variable.
        - target: Name of the column holding the dependent variable.

    Returns:
        - chi2: The chi-squared statistic obtained in the test.
        - p: The p-value obtained in the test.
    """
    # The docstring must be the first statement of the function; placed after
    # the imports it would be a discarded string and ``__doc__`` would be None.
    import scipy.stats as stats
    import pandas as pd

    # Create a contingency table from the independent and dependent variable data.
    contingency_table = pd.crosstab(df[feature], df[target])

    # Perform the test; degrees of freedom and expected counts are not returned.
    chi2, p, _dof, _expected = stats.chi2_contingency(contingency_table)

    return chi2, p

toolkit/data_processing.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -298,3 +298,33 @@ def load_imgs(path, im_size:int):
298298
y_train = np.array(y)
299299

300300
return df, X_train, y_train
301+
302+
def convert_to_numeric(df, column: str):
    '''
    Convert the numeric strings of one column to int or float, in place.

    Values that cannot be parsed as a number are replaced by NaN
    (``errors='coerce'``) instead of raising.

    df -> dataframe we are working with; it is modified in place

    column -> name of the column to convert to numeric. Must be 'str'

    Return:

        The same dataframe, with the column already converted
    '''
    # Convert the whole Series in one vectorized call rather than invoking
    # pd.to_numeric once per element through ``apply``.
    df[column] = pd.to_numeric(df[column], errors='coerce')
    return df
317+
318+
def _exponential_smooth(data, alpha):
    """
    Exponentially smooth a dataset so its values are less 'rigid'.

    The exponentially weighted mean of ``data`` is computed, then the first
    and last entries of the result are overwritten with the corresponding
    original values, so both endpoints match the input exactly.

    :param data: pandas object exposing ``ewm`` and ``iloc``
    :param alpha: weight factor to weight recent values more
    :return: the smoothed data, with the original endpoints restored
    """
    result = data.ewm(alpha=alpha).mean()

    # Pin both endpoints to the unsmoothed values (first, then last).
    for position in (0, -1):
        result.iloc[position] = data.iloc[position]

    return result

toolkit/machine_learning.py

Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,14 @@
33
import pandas as pd
44
import numpy as np
55
from typing import List, Union
6+
from selenium.webdriver.common.by import By
7+
import requests
8+
from bs4 import BeautifulSoup
9+
from selenium import webdriver
10+
import os
11+
import time
12+
import io
13+
from PIL import Image
614

715
def balance_binary_target(df, strategy='smote', minority_ratio=None, visualize=False):
816
"""
@@ -193,3 +201,97 @@ def load_model_zip(zip_file, model_file):
193201
model = pickle.load(file)
194202

195203
return model
204+
205+
def image_scrap(url, n: int):
    '''
    Scrape a Google Images results page with Selenium and save up to n images
    into a folder called 'my_images'.

    A Chrome driver is required: the function looks for a file named
    'chromedriver.exe' in the current working directory
    (see https://chromedriver.chromium.org/).

    Parameters
    ----------
    url -> full link of the Google Images results page to scrape.

    n -> number of images you want to have in the folder. Must be 'int'

    Return
    ----------

    Nothing. As a side effect, a folder called 'my_images' is created next to
    this module's file (not in the working directory) and filled with the
    downloaded images, named '0.jpg', '1.jpg', ...
    '''
    driver_path = os.path.join(os.getcwd(), "chromedriver.exe")

    # NOTE(review): passing the driver path positionally is only accepted by
    # older Selenium releases -- confirm against the pinned version.
    wd = webdriver.Chrome(driver_path)

    def get_images_from_google(url, wd, delay, max_images):
        """Collect up to ``max_images`` distinct full-size image URLs."""

        def scroll_down(wd):
            wd.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(delay)

        wd.get(url)

        # Click the button at this fixed XPath before the results are read.
        # NOTE(review): presumably the consent dialog -- confirm.
        wd.find_element(By.XPATH, '/html/body/c-wiz/div/div/div/div[2]/div/div[3]/div/div/form/div/div/button').click()

        image_urls = set()
        skips = 0

        while len(image_urls) + skips < max_images:
            scroll_down(wd)

            thumbnails = wd.find_elements(By.CLASS_NAME, "Q4LuWd")

            for img in thumbnails[len(image_urls) + skips:max_images]:
                try:
                    img.click()
                    time.sleep(delay)
                except Exception:
                    # Best effort: a thumbnail that cannot be clicked is
                    # skipped, but Ctrl-C / SystemExit still propagate.
                    continue

                images = wd.find_elements(By.CLASS_NAME, "n3VNCb")
                for image in images:
                    src = image.get_attribute('src')

                    if src in image_urls:
                        # Already collected: widen the window by one so the
                        # requested number of images can still be reached.
                        max_images += 1
                        skips += 1
                        break

                    if src and 'http' in src:
                        image_urls.add(src)
                        print(f"Found {len(image_urls)}")

        return image_urls

    def download_image(download_path, url, file_name):
        """Download one image and store it as a JPEG inside ``download_path``."""
        try:
            image_content = requests.get(url, timeout=30).content
            image = Image.open(io.BytesIO(image_content))

            # JPEG cannot store alpha or palette modes; normalise to RGB.
            if image.mode != "RGB":
                image = image.convert("RGB")

            # Join with a separator so the file lands inside the folder
            # instead of beside it as 'my_images0.jpg'.
            file_path = os.path.join(download_path, file_name)

            with open(file_path, "wb") as f:
                image.save(f, "JPEG")

            print("Success")
        except Exception as e:
            # Best effort: one failed download must not abort the others.
            print('FAILED -', e)

    try:
        urls = get_images_from_google(url, wd, 1, n)

        module_dir = os.path.dirname(os.path.abspath(__file__))
        download_dir = os.path.join(module_dir, "my_images")
        os.makedirs(download_dir, exist_ok=True)

        for i, image_url in enumerate(urls):
            download_image(download_dir, image_url, str(i) + ".jpg")
    finally:
        # Always release the browser, even if scraping raised.
        wd.quit()

toolkit/plot.py

Lines changed: 28 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import matplotlib.pyplot as plt
2+
import seaborn as sns
23
import pandas as pd
34
import numpy as np
45
from typing import Union
@@ -140,7 +141,7 @@ def sunburst(df, interior:str, exterior:str, col_num:str, title:str):
140141
fig = go.Figure()
141142
fig = px.sunburst(df, path=[interior, exterior], values=col_num, template = 'plotly_dark')
142143
fig.update_layout(width=800, height=600, title = title)
143-
fig.show()
144+
return fig
144145

145146
def wordcloudviz(column):
146147
import matplotlib.pyplot as plt
@@ -193,4 +194,29 @@ def plot_cumulative_variance_ratio(pca, n_features):
193194
plt.ylabel('Cumulative Variance Ratio')
194195

195196
# Show the plot
196-
plt.show()
197+
plt.show()
198+
199+
def heatmap(df, n: int, target: str, columns=None):
    '''
    Heatmap showing the correlation between the target and the numerical
    columns of the dataset that correlate most with it.

    df -> must be the dataset we are working with
    n -> number of columns to show, chosen as the n largest correlations with
         the target (the target itself is one of them)
    target -> name of the target column, must be 'str' and numeric
    columns -> columns of df to consider (e.g. df.columns); when None, all
               columns of df are used

    Return:
        The seaborn heatmap axes, drawn with the YlOrBr colour map and two
        decimals, restricted to the n selected columns
    '''
    # ``columns`` is optional: the previous ``columns:None`` was only an
    # annotation, so callers were forced to pass None explicitly.
    if columns is None:
        columns = df.columns

    # Keep numeric columns only; recent pandas raises in ``corr`` when
    # non-numeric columns are present.
    numeric = df[columns].select_dtypes(include=np.number)

    cols = numeric.corr().nlargest(n, target)[target].index

    cm = np.corrcoef(df[cols].values.T)

    plt.figure(figsize=(20, 10))
    hm = sns.heatmap(cm, cbar=True, annot=True, cmap='YlOrBr', fmt='.2f', yticklabels=cols.values, xticklabels=cols.values)
    return hm

0 commit comments

Comments
 (0)