Merge branch 'dev' of https://github.com/TheBridgeMachineLearningPythonLibrary/MachineLearningToolKit into dev

seenstevo · seenstevo · commit e466aad29aed · 2023-02-22T21:12:30.000+01:00
diff --git a/test/test_convert_to_numeric.py b/test/test_convert_to_numeric.py
@@ -0,0 +1,12 @@
+import pandas as pd
+from to_numeric import convert_to_numeric
+
+def test_convert_to_numeric():
+    
+    df = pd.DataFrame({'A': ['1', '2', '3'], 'B': ['4.5', '5.6', '6.7'], 'C': ['a', 'b', 'c']})
+    
+    
+    convert_to_numeric(df, 'A')
+    
+    
+    assert df['A'][0],int
diff --git a/test/test_heatmap_corr.py b/test/test_heatmap_corr.py
@@ -0,0 +1,13 @@
+import numpy as np 
+import matplotlib.pyplot as plt
+import seaborn as sns
+import pandas as pd
+from heatmap_corr import heatmap
+
+def test_heatmap():
+    df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9], 'target': [10, 20, 30]})
+
+    heatmap(df, 2, 'target', None)
+    
+
+    
diff --git a/test/test_scrap.py b/test/test_scrap.py
@@ -0,0 +1,24 @@
+from selenium import webdriver
+from selenium.webdriver.common.by import By
+import requests
+from bs4 import BeautifulSoup
+from selenium import webdriver
+import os
+import time
+import io
+from PIL import Image
+from scrap import image_scrap
+import shutil
+
+
+def test_image_scrap():
+    url = 'https://www.google.com/search?q=perros+bonitos&tbm=isch&ved=2ahUKEwiCpOG3z6n9AhVFV6QEHY7KBa0Q2-cCegQIABAA&oq=perros+bonitos&gs_lcp=CgNpbWcQAzIFCAAQgAQyBQgAEIAEMgUIABCABDIFCAAQgAQyBQgAEIAEMgUIABCABDIFCAAQgAQyBAgAEB4yBAgAEB4yBAgAEB5QwAlY6hFg6hJoAHAAeACAAYgBiAHJBpIBAzcuMpgBAKABAaoBC2d3cy13aXotaW1nwAEB&sclient=img&ei=YUz2Y8LvFMWukdUPjpWX6Ao&bih=849&biw=1600&rlz=1C5CHFA_enCA951CA951'
+    n = 5
+    image_scrap(url,n)
+
+    download_dir = './my_images'
+    
+    assert os.path.exists(download_dir)
+    assert len(os.listdir(download_dir)) == n
+
+    shutil.rmtree(download_dir) 
diff --git a/test/test_sunburst.py b/test/test_sunburst.py
@@ -0,0 +1,15 @@
+from sunburst import sunburst
+import plotly.graph_objs as go
+import plotly.express as px
+
+def test_sunburst():
+    df = {
+        'Category': ['Fruit', 'Fruit', 'Vegetable', 'Vegetable', 'Vegetable'],
+        'Subcategory': ['Apple', 'Orange', 'Carrot', 'Tomato', 'Cucumber'],
+        'Value': [20, 30, 40, 15, 25] }
+    
+    fig = sunburst(df, 'Category', 'Subcategory', 'Value', 'My Sunburst Chart')   
+    
+    assert fig.layout.title.text == 'My Sunburst Chart'
+    assert fig.layout.width == 800
+    assert fig.layout.height == 600
diff --git a/toolkit/data_analysis.py b/toolkit/data_analysis.py
@@ -54,3 +54,26 @@ def read_csv_zip(zip_file, csv_file, sep=';'):
 
     return df
 
+def chi_squared_test(df, feature, target):
+    import scipy.stats as stats
+    import pandas as pd
+    """
+    This function performs a chi-squared test of independence between two categorical variables.
+
+    Params:
+    - df: A DataFrame containing the variables of interest.
+    - feature: The independent variable to be analyzed.
+    - target: The dependent variable to compare the independent variable with.
+
+    Returns:
+    - chi2: The chi-squared value obtained in the test.
+    - p: The p-value obtained in the test.
+    """
+    # Create a contingency table from the independent and dependent variable data.
+    contingency_table = pd.crosstab(df[feature], df[target])
+
+    # Perform a chi-squared test of independence between the variables.
+    chi2, p, dof, expected = stats.chi2_contingency(contingency_table)
+
+    # Return the chi-squared value and p-value obtained in the test.
+    return chi2, p
diff --git a/toolkit/data_processing.py b/toolkit/data_processing.py
@@ -298,3 +298,33 @@ def load_imgs(path, im_size:int):
     y_train = np.array(y)
     
     return df, X_train, y_train
+
+def convert_to_numeric(df,column:str):
+    '''
+    This function converts every numeric string in the given column to int or float; values that cannot be parsed become NaN.
+
+    df -> dataframe we are working with
+
+    column -> name of the column we want to convert to numeric. Must be 'str'
+
+    Return:
+
+    The same DataFrame, with the given column converted to numeric
+
+    '''
+    df[column] = df[column].apply(lambda x: pd.to_numeric(x, errors = 'coerce'))
+    return df
+
+def _exponential_smooth(data, alpha):
+    """
+    Function that exponentially smooths dataset so values are less 'rigid'
+    :param alpha: weight factor to weight recent values more
+    """
+
+    smoothed_data = data.ewm(alpha=alpha).mean()
+
+    # Force the first and last values of the smoothed data to be the same as the original data
+    smoothed_data.iloc[0] = data.iloc[0]
+    smoothed_data.iloc[-1] = data.iloc[-1]
+
+    return smoothed_data    
diff --git a/toolkit/machine_learning.py b/toolkit/machine_learning.py
@@ -3,6 +3,14 @@
 import pandas as pd
 import numpy as np
 from typing import List, Union
+from selenium.webdriver.common.by import By
+import requests
+from bs4 import BeautifulSoup
+from selenium import webdriver
+import os
+import time
+import io
+from PIL import Image
 
 def balance_binary_target(df, strategy='smote', minority_ratio=None, visualize=False):
     """
@@ -193,3 +201,97 @@ def load_model_zip(zip_file, model_file):
             model = pickle.load(file)
 
     return model
+
+def image_scrap(url, n:int):
+	'''
+	Function to scrape Google Images and download the n images we want; it creates a new folder called 'my_images'.
+
+	Because this function uses selenium, a Chrome driver is required.
+	You must have the Chrome driver to run it [chrome](https://chromedriver.chromium.org/), file name = 'chromedriver', downloaded into the same path as the script or Jupyter notebook.
+
+	Parameters
+	----------
+	url -> Google Images search link; must be the complete URL.
+
+	n -> number of images you want to have in the folder. Must be 'int'
+	
+	Return
+	----------
+
+	Folder called 'my_images' with n images, which you can view as many times as you want
+	
+	'''
+	current_dir = os.getcwd()
+	driver_path = os.path.join(current_dir, "chromedriver.exe")
+
+	wd = webdriver.Chrome(driver_path)
+
+	def get_images_from_google(url, wd, delay, max_images):
+		def scroll_down(wd):
+			wd.execute_script("window.scrollTo(0, document.body.scrollHeight);")
+			time.sleep(delay)
+
+		url = url
+		wd.get(url)
+
+		loadMore = wd.find_element(By.XPATH, '/html/body/c-wiz/div/div/div/div[2]/div/div[3]/div/div/form/div/div/button').click()
+
+		image_urls = set()
+		skips = 0
+
+		while len(image_urls) + skips < max_images:
+			scroll_down(wd)
+
+			thumbnails = wd.find_elements(By.CLASS_NAME, "Q4LuWd")
+
+			for img in thumbnails[len(image_urls) + skips:max_images]:
+				try:
+					img.click()
+					time.sleep(delay)
+				except:
+					continue
+
+				images = wd.find_elements(By.CLASS_NAME, "n3VNCb")
+				for image in images:
+					if image.get_attribute('src') in image_urls:
+						max_images += 1
+						skips += 1
+						break
+
+					if image.get_attribute('src') and 'http' in image.get_attribute('src'):
+						image_urls.add(image.get_attribute('src'))
+						print(f"Found {len(image_urls)}")
+
+		return image_urls
+
+
+	def download_image(download_path, url, file_name):
+		try:
+			image_content = requests.get(url).content
+			image_file = io.BytesIO(image_content)
+			image = Image.open(image_file)
+			file_path = download_path + file_name
+
+			with open(file_path, "wb") as f:
+				image.save(f, "JPEG")
+
+			print("Success")
+		except Exception as e:
+			print('FAILED -', e)
+
+
+	urls = get_images_from_google(url,wd, 1, n)
+	
+	
+	current_dir = os.path.dirname(os.path.abspath(__file__))
+
+	download_dir = os.path.join(current_dir, "my_images")
+	
+
+	if not os.path.exists(download_dir):
+		os.makedirs(download_dir)
+
+	for i, url in enumerate(urls):
+			download_image(download_dir, url, str(i) + ".jpg")
+
+	wd.quit()
diff --git a/toolkit/plot.py b/toolkit/plot.py
@@ -1,4 +1,5 @@
 import matplotlib.pyplot as plt
+import seaborn as sns
 import pandas as pd
 import numpy as np
 from typing import Union
@@ -140,7 +141,7 @@ def sunburst(df, interior:str, exterior:str, col_num:str, title:str):
     fig = go.Figure()
     fig = px.sunburst(df, path=[interior, exterior], values=col_num, template = 'plotly_dark')
     fig.update_layout(width=800, height=600, title = title)
-    fig.show()
+    return fig
 
 def wordcloudviz(column):
     import matplotlib.pyplot as plt
@@ -193,4 +194,29 @@ def plot_cumulative_variance_ratio(pca, n_features):
     plt.ylabel('Cumulative Variance Ratio')
 
     # Show the plot
-    plt.show()
+    plt.show()
+
+def heatmap(df, n:int,target:str,columns:None):
+    '''
+    Heatmap which shows the correlation of the numerical columns of the dataset with the target, where you can choose a specific number of columns
+    
+    df -> must be the dataset we are working with
+    n -> number of columns we want to correlate with the target
+    target -> name of the column of the target, must be 'str'
+    columns -> columns of the dataset to consider (e.g. df.columns); if None, all the columns of df are used
+
+    Return:
+    Heatmap with YlOrBr colour and two decimals, only with the n columns which correlate most with our target
+    
+    '''
+
+    if columns is None:
+        columns = df.columns
+    
+    cols = df[columns].corr().nlargest(n,target)[target].index
+
+    cm = np.corrcoef(df[cols].values.T) 
+
+    plt.figure(figsize=(20,10))
+    hm = sns.heatmap(cm, cbar=True, annot=True, cmap='YlOrBr', fmt='.2f', yticklabels=cols.values, xticklabels=cols.values)
+    return hm