Skip to content

Commit 12d7b37

Browse files
committed
web_miguel
1 parent 76fcece commit 12d7b37

3 files changed

Lines changed: 146 additions & 2 deletions

File tree

toolkit/data_processing.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -298,3 +298,19 @@ def load_imgs(path, im_size:int):
298298
y_train = np.array(y)
299299

300300
return df, X_train, y_train
301+
302+
def convert_to_numeric(df, column: str):
    '''
    Convert every numeric-looking string in a column to int/float, turning
    anything unparseable (and existing NaN values) into NaN.

    df -> dataframe we are working with

    column -> name of the column we want to convert to numeric. Must be 'str'

    Return:

    The same dataframe, with the column converted in place.
    '''
    # Vectorized pd.to_numeric over the whole Series replaces the original
    # element-by-element .apply(lambda ...) — same result, one C-level pass
    # instead of one Python call per value.
    df[column] = pd.to_numeric(df[column], errors='coerce')
    return df

toolkit/machine_learning.py

Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,14 @@
33
import pandas as pd
44
import numpy as np
55
from typing import List, Union
6+
from selenium.webdriver.common.by import By
7+
import requests
8+
from bs4 import BeautifulSoup
9+
from selenium import webdriver
10+
import os
11+
import time
12+
import io
13+
from PIL import Image
614

715
def balance_binary_target(df, strategy='smote', minority_ratio=None, visualize=False):
816
"""
@@ -193,3 +201,97 @@ def load_model_zip(zip_file, model_file):
193201
model = pickle.load(file)
194202

195203
return model
204+
205+
def image_scrap(url, n: int):
    '''
    Scrape a Google Images results page and download the first n images into
    a new folder called 'my_images'.

    Uses selenium, so a Chrome driver is required: download it from
    [chrome](https://chromedriver.chromium.org/), name it 'chromedriver'
    and place it in the same path as the script or jupyter notebook.

    Parameters
    ----------
    url -> Google Images web link, must be the full URL.

    n -> number of images you want to have in the folder. Must be 'int'

    Return
    ----------

    Creates a folder called 'my_images' with n images; returns None.
    '''
    current_dir = os.getcwd()
    driver_path = os.path.join(current_dir, "chromedriver.exe")

    # NOTE(review): passing the driver path positionally was removed in
    # Selenium 4 — newer versions need
    # webdriver.Chrome(service=Service(driver_path)). Confirm the pinned
    # selenium version before changing.
    wd = webdriver.Chrome(driver_path)

    def get_images_from_google(url, wd, delay, max_images):
        # Collect up to max_images distinct full-size image URLs.
        def scroll_down(wd):
            wd.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(delay)

        wd.get(url)

        # Dismiss the initial consent / "accept" button.
        # NOTE(review): this absolute XPath is brittle and breaks whenever
        # Google changes its markup — verify it still matches.
        wd.find_element(By.XPATH, '/html/body/c-wiz/div/div/div/div[2]/div/div[3]/div/div/form/div/div/button').click()

        image_urls = set()
        skips = 0  # thumbnails skipped because their URL was a duplicate

        while len(image_urls) + skips < max_images:
            scroll_down(wd)

            thumbnails = wd.find_elements(By.CLASS_NAME, "Q4LuWd")

            for img in thumbnails[len(image_urls) + skips:max_images]:
                try:
                    img.click()
                    time.sleep(delay)
                except Exception:
                    # Stale or obscured thumbnail: best-effort, skip it.
                    continue

                images = wd.find_elements(By.CLASS_NAME, "n3VNCb")
                for image in images:
                    if image.get_attribute('src') in image_urls:
                        # Duplicate URL: widen the window so we still
                        # end up with n distinct images.
                        max_images += 1
                        skips += 1
                        break

                    if image.get_attribute('src') and 'http' in image.get_attribute('src'):
                        image_urls.add(image.get_attribute('src'))
                        print(f"Found {len(image_urls)}")

        return image_urls

    def download_image(download_path, url, file_name):
        # Fetch one image URL and save it as JPEG inside download_path.
        try:
            image_content = requests.get(url).content
            image_file = io.BytesIO(image_content)
            image = Image.open(image_file)
            # BUGFIX: the original used 'download_path + file_name', which
            # produced paths like '.../my_images0.jpg' OUTSIDE the created
            # folder; os.path.join saves inside it as intended.
            file_path = os.path.join(download_path, file_name)

            with open(file_path, "wb") as f:
                image.save(f, "JPEG")

            print("Success")
        except Exception as e:
            print('FAILED -', e)

    urls = get_images_from_google(url, wd, 1, n)

    # Images are saved next to this module, not the cwd used for the driver.
    current_dir = os.path.dirname(os.path.abspath(__file__))
    download_dir = os.path.join(current_dir, "my_images")

    if not os.path.exists(download_dir):
        os.makedirs(download_dir)

    for i, img_url in enumerate(urls):
        download_image(download_dir, img_url, str(i) + ".jpg")

    wd.quit()

toolkit/plot.py

Lines changed: 28 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import matplotlib.pyplot as plt
2+
import seaborn as sns
23
import pandas as pd
34
import numpy as np
45
from typing import Union
@@ -140,7 +141,7 @@ def sunburst(df, interior:str, exterior:str, col_num:str, title:str):
140141
fig = go.Figure()
141142
fig = px.sunburst(df, path=[interior, exterior], values=col_num, template = 'plotly_dark')
142143
fig.update_layout(width=800, height=600, title = title)
143-
fig.show()
144+
return fig
144145

145146
def wordcloudviz(column):
146147
import matplotlib.pyplot as plt
@@ -193,4 +194,29 @@ def plot_cumulative_variance_ratio(pca, n_features):
193194
plt.ylabel('Cumulative Variance Ratio')
194195

195196
# Show the plot
196-
plt.show()
197+
plt.show()
198+
199+
def heatmap(df, n: int, target: str, columns=None):
    '''
    Heatmap showing the correlation between the target and the n numerical
    columns of the dataset most correlated with it.

    df -> must be the dataset we are working with
    n -> number of columns we want to correlate with the target
    target -> name of the column of the target, must be 'str'
    columns -> optional subset of columns to consider; defaults to every
               column of the dataframe (df.columns)

    Return:
    Heatmap with YlOrBr colour and two decimals, restricted to the n columns
    that correlate most with our target
    '''
    # BUGFIX: the original signature was 'columns:None' — a bare annotation
    # with no default — so calling heatmap(df, n, target) raised TypeError
    # even though the body handles columns is None. 'columns=None' makes the
    # parameter genuinely optional, as the docstring and body intend.
    if columns is None:
        columns = df.columns

    # Index of the n columns whose correlation with the target is largest
    # (includes the target itself, which correlates 1.0 with itself).
    cols = df[columns].corr().nlargest(n, target)[target].index

    # Correlation matrix restricted to those columns.
    cm = np.corrcoef(df[cols].values.T)

    plt.figure(figsize=(20, 10))
    hm = sns.heatmap(cm, cbar=True, annot=True, cmap='YlOrBr', fmt='.2f',
                     yticklabels=cols.values, xticklabels=cols.values)
    return hm

0 commit comments

Comments
 (0)