Skip to content

Commit 12d7b37

Browse files
committed
web_miguel
1 parent 76fcece commit 12d7b37

3 files changed

Lines changed: 146 additions & 2 deletions

File tree

toolkit/data_processing.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -298,3 +298,19 @@ def load_imgs(path, im_size:int):
298298
y_train = np.array(y)
299299

300300
return df, X_train, y_train
301+
302+
def convert_to_numeric(df, column: str):
    '''
    Convert every numeric-looking string in a column to int/float, turning
    anything unparseable (and existing NaN values) into NaN.

    df -> dataframe we are working with

    column -> name of the column we want to convert to numeric. Must be 'str'

    Return:

    The same dataframe, with the column converted in place.
    '''
    # Vectorized pd.to_numeric over the whole Series replaces the original
    # element-by-element .apply(lambda ...) — same result, one C-level pass
    # instead of one Python call per value.
    df[column] = pd.to_numeric(df[column], errors='coerce')
    return df

toolkit/machine_learning.py

Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,14 @@
33
import pandas as pd
44
import numpy as np
55
from typing import List, Union
6+
from selenium.webdriver.common.by import By
7+
import requests
8+
from bs4 import BeautifulSoup
9+
from selenium import webdriver
10+
import os
11+
import time
12+
import io
13+
from PIL import Image
614

715
def balance_binary_target(df, strategy='smote', minority_ratio=None, visualize=False):
816
"""
@@ -193,3 +201,97 @@ def load_model_zip(zip_file, model_file):
193201
model = pickle.load(file)
194202

195203
return model
204+
205+
def image_scrap(url, n: int):
    '''
    Scrape a Google Images results page and download the first n images into
    a new folder called 'my_images'.

    Uses selenium, so a Chrome driver is required: download it from
    [chrome](https://chromedriver.chromium.org/), name it 'chromedriver'
    and place it in the same path as the script or jupyter notebook.

    Parameters
    ----------
    url -> Google Images web link, must be the full URL.

    n -> number of images you want to have in the folder. Must be 'int'

    Return
    ----------

    Creates a folder called 'my_images' with n images; returns None.
    '''
    current_dir = os.getcwd()
    driver_path = os.path.join(current_dir, "chromedriver.exe")

    # NOTE(review): passing the driver path positionally was removed in
    # Selenium 4 — newer versions need
    # webdriver.Chrome(service=Service(driver_path)). Confirm the pinned
    # selenium version before changing.
    wd = webdriver.Chrome(driver_path)

    def get_images_from_google(url, wd, delay, max_images):
        # Collect up to max_images distinct full-size image URLs.
        def scroll_down(wd):
            wd.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(delay)

        wd.get(url)

        # Dismiss the initial consent / "accept" button.
        # NOTE(review): this absolute XPath is brittle and breaks whenever
        # Google changes its markup — verify it still matches.
        wd.find_element(By.XPATH, '/html/body/c-wiz/div/div/div/div[2]/div/div[3]/div/div/form/div/div/button').click()

        image_urls = set()
        skips = 0  # thumbnails skipped because their URL was a duplicate

        while len(image_urls) + skips < max_images:
            scroll_down(wd)

            thumbnails = wd.find_elements(By.CLASS_NAME, "Q4LuWd")

            for img in thumbnails[len(image_urls) + skips:max_images]:
                try:
                    img.click()
                    time.sleep(delay)
                except Exception:
                    # Stale or obscured thumbnail: best-effort, skip it.
                    continue

                images = wd.find_elements(By.CLASS_NAME, "n3VNCb")
                for image in images:
                    if image.get_attribute('src') in image_urls:
                        # Duplicate URL: widen the window so we still
                        # end up with n distinct images.
                        max_images += 1
                        skips += 1
                        break

                    if image.get_attribute('src') and 'http' in image.get_attribute('src'):
                        image_urls.add(image.get_attribute('src'))
                        print(f"Found {len(image_urls)}")

        return image_urls

    def download_image(download_path, url, file_name):
        # Fetch one image URL and save it as JPEG inside download_path.
        try:
            image_content = requests.get(url).content
            image_file = io.BytesIO(image_content)
            image = Image.open(image_file)
            # BUGFIX: the original used 'download_path + file_name', which
            # produced paths like '.../my_images0.jpg' OUTSIDE the created
            # folder; os.path.join saves inside it as intended.
            file_path = os.path.join(download_path, file_name)

            with open(file_path, "wb") as f:
                image.save(f, "JPEG")

            print("Success")
        except Exception as e:
            print('FAILED -', e)

    urls = get_images_from_google(url, wd, 1, n)

    # Images are saved next to this module, not the cwd used for the driver.
    current_dir = os.path.dirname(os.path.abspath(__file__))
    download_dir = os.path.join(current_dir, "my_images")

    if not os.path.exists(download_dir):
        os.makedirs(download_dir)

    for i, img_url in enumerate(urls):
        download_image(download_dir, img_url, str(i) + ".jpg")

    wd.quit()

toolkit/plot.py

Lines changed: 28 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import matplotlib.pyplot as plt
2+
import seaborn as sns
23
import pandas as pd
34
import numpy as np
45
from typing import Union
@@ -140,7 +141,7 @@ def sunburst(df, interior:str, exterior:str, col_num:str, title:str):
140141
fig = go.Figure()
141142
fig = px.sunburst(df, path=[interior, exterior], values=col_num, template = 'plotly_dark')
142143
fig.update_layout(width=800, height=600, title = title)
143-
fig.show()
144+
return fig
144145

145146
def wordcloudviz(column):
146147
import matplotlib.pyplot as plt
@@ -193,4 +194,29 @@ def plot_cumulative_variance_ratio(pca, n_features):
193194
plt.ylabel('Cumulative Variance Ratio')
194195

195196
# Show the plot
196-
plt.show()
197+
plt.show()
198+
199+
def heatmap(df, n: int, target: str, columns=None):
    '''
    Heatmap showing the correlation between the target and the n numerical
    columns of the dataset most correlated with it.

    df -> must be the dataset we are working with
    n -> number of columns we want to correlate with the target
    target -> name of the column of the target, must be 'str'
    columns -> optional subset of columns to consider; defaults to every
               column of the dataframe (df.columns)

    Return:
    Heatmap with YlOrBr colour and two decimals, restricted to the n columns
    that correlate most with our target
    '''
    # BUGFIX: the original signature was 'columns:None' — a bare annotation
    # with no default — so calling heatmap(df, n, target) raised TypeError
    # even though the body handles columns is None. 'columns=None' makes the
    # parameter genuinely optional, as the docstring and body intend.
    if columns is None:
        columns = df.columns

    # Index of the n columns whose correlation with the target is largest
    # (includes the target itself, which correlates 1.0 with itself).
    cols = df[columns].corr().nlargest(n, target)[target].index

    # Correlation matrix restricted to those columns.
    cm = np.corrcoef(df[cols].values.T)

    plt.figure(figsize=(20, 10))
    hm = sns.heatmap(cm, cbar=True, annot=True, cmap='YlOrBr', fmt='.2f',
                     yticklabels=cols.values, xticklabels=cols.values)
    return hm

0 commit comments

Comments
 (0)