Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions backend/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@

__pycache__/

# Python env
*env/
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
39 changes: 39 additions & 0 deletions backend/algorithm/correlated.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
import numpy as np

def correlated(df, threshold=0.7):
    """Report highly correlated numeric feature pairs in *df*.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset to analyse; non-numeric columns are ignored.
    threshold : float, optional
        Absolute correlation above which a feature is flagged
        (default 0.7, preserving the original hard-coded cutoff).

    Returns
    -------
    list[str]
        Human-readable findings, one message per entry.  The same
        information is also printed to stdout as a side effect.
    """
    # Pearson correlation over numeric columns only (passing non-numeric
    # columns to DataFrame.corr raises in pandas >= 2.0).
    corr_matrix = df.corr(numeric_only=True)
    instr = []

    # Robustness: with fewer than two numeric columns there are no pairs,
    # and max() over the empty off-diagonal would silently yield NaN.
    if corr_matrix.shape[0] < 2:
        instr.append("Not enough numeric columns to compute pairwise correlations.\n")
        return instr

    # Scan the strict lower triangle so each pair is inspected once;
    # flag the later column of any pair whose |corr| exceeds the threshold.
    high_corr_features = set()
    for i in range(len(corr_matrix.columns)):
        for j in range(i):
            if abs(corr_matrix.iloc[i, j]) > threshold:
                high_corr_features.add(corr_matrix.columns[i])

    # Highest correlation among distinct pairs: mask everything except the
    # strict upper triangle, then take the max of the surviving values.
    mask = np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool_)
    max_corr = corr_matrix.where(mask).stack().max()
    print("Maximum correlation value among any two values:", max_corr)
    instr.append("Maximum correlation value among any two distinct values: " + str(max_corr) + "\n")

    if len(high_corr_features) > 0:
        print("There are highly correlated features in the dataset.")
        print("Number of highly correlated features:", len(high_corr_features))
        print("Highly correlated features:", high_corr_features)
        instr.extend([
            "There are highly correlated features in the dataset.\n",
            "Number of highly correlated features: " + str(len(high_corr_features)) + "\n",
            "Highly correlated features: " + str(high_corr_features) + "\n",
        ])
    else:
        print("There are no highly correlated features in the dataset.")
        instr.append("There are no highly correlated features in the dataset.\n")

    return instr
16 changes: 16 additions & 0 deletions backend/algorithm/duplicate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
def duplicated(df):
    """Build a human-readable report about duplicate rows in *df*.

    Returns a multi-line string describing how many duplicate rows exist
    and at which index labels, or a single-line message when there are
    none.  The first occurrence of each row is not counted as a duplicate
    (pandas ``DataFrame.duplicated`` default).
    """
    dup_mask = df.duplicated()
    if dup_mask.any():
        report_lines = [
            "Duplicate examples are present in the dataset.\n",
            "Number of duplicate examples: " + str(dup_mask.sum()) + "\n",
            "Indices of duplicate examples: " + str(df.index[dup_mask].tolist()) + "\n",
        ]
        return "".join(report_lines)
    return "There are no duplicate examples in the dataset.\n"
34 changes: 34 additions & 0 deletions backend/algorithm/imbalance.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
def imbalance(df):
    """Detect class imbalance in the categorical columns of *df*.

    A column is flagged when its rarest class occurs less than 10% as
    often as its most common class.  Numeric columns, identifier-like
    columns ('Date', 'Time', 'Name') and columns whose values are all
    unique (e.g. IDs) are skipped.

    Returns
    -------
    str
        Multi-line report; the same text is printed to stdout as a side
        effect.
    """
    imb = False
    nu = 1
    s = ''
    skip_cols = ('Date', 'Time', 'Name')
    for col in df.columns:
        # FIX: the original compared dtypes against the string list
        # ['int', 'float32', 'float', 'c'], which misses widths such as
        # int32/float16 and contains the meaningless entry 'c'.
        # dtype.kind in 'ifc' excludes every integer/float/complex width.
        if col in skip_cols or df[col].dtype.kind in 'ifc':
            continue
        class_counts = df[col].value_counts()
        # Robustness: an all-NaN column yields empty counts.
        if class_counts.empty:
            continue
        ratio = class_counts.min() / class_counts.max()
        if ratio >= 0.1:
            continue
        # All-unique values are identifiers, not class labels.
        if len(df[col].unique()) == len(df):
            continue
        print(f"{nu}) Class imbalance detected in column '{col}' with ", end='')
        s += f"{nu}) Class imbalance detected in column '{col}' with "
        imb = True
        print("Class imbalance ratio:", round(ratio, 10), '\n')
        s += "Class imbalance ratio:" + str(round(ratio, 10)) + '\n'
        nu += 1

    if imb:
        print("Potential mitigation strategies:")
        print("- Use appropriate sampling techniques to balance the classes.")
        print("- Use appropriate evaluation metrics like F1 score, precision, recall, etc. as accuracy is not a good metric for imbalanced datasets.")
        print("- Use appropriate regularization techniques like class weights to combat bias.")
        s += ("Potential mitigation strategies:\n"
              "- Use appropriate sampling techniques to balance the classes.\n"
              "- Use appropriate evaluation metrics like F1 score, precision, recall, etc. as accuracy is not a good metric for imbalanced datasets.\n"
              "- Use appropriate regularization techniques like class weights to combat bias.\n")
    else:
        print("There is no class imbalance in the dataset.")
        print("No mitigation strategies are required.")
        s += "There is no class imbalance in the dataset.\nNo mitigation strategies are required.\n"
    return s
79 changes: 79 additions & 0 deletions backend/algorithm/sp_missingvalue.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
def SpecialMissingValues(df):
    """Report placeholder ('special') missing values and real NaNs in *df*.

    Looks for common textual placeholders ('-', 'n/a', 'N/A', 'NA', '--',
    '?') and for genuine NaN values, and builds a report that includes
    counts, percentages, per-column breakdowns, and suggested pandas
    cleaning code for the user to run.

    Returns
    -------
    str
        Multi-line human-readable report.
    """
    s = ''
    special_missing_values = ['-', 'n/a', 'N/A', 'NA', '--', '?']
    # Count each placeholder across the whole frame.
    pres = {val: int(df.isin([val]).sum().sum()) for val in special_missing_values}
    total_special = sum(pres.values())
    total_cells = df.shape[0] * df.shape[1]

    if total_special:
        s += ("There are special missing values in the dataset.\n\n"
              f"Number of special missing values: {total_special}\n\n"
              f"Percentage of special missing values: {round(total_special / total_cells * 100, 2)} %\n\n"
              "Special missing values in each column:\n\n")
        for col in df.columns:
            col_count = df[col].isin(special_missing_values).sum()
            if col_count:
                s += f"{col}: {col_count}\n"

        # Suggested cleaning code appended to the report.
        sugg = ''
        # BUG FIX: the original wrote `sugg + '...'` (result discarded), so
        # this first suggestion line was silently dropped from the report.
        sugg += '# for each column, replace the special missing values with NaN values.\n'
        sugg += 'for col in df.columns:\n'
        sugg += '    df[col] = df[col].replace(special_missing_values, np.nan)\n'
        sugg += '# check for NaN values again\n'
        sugg += 'df.isnull().sum().sum()\n'

        sugg += '# for each column, replace the NaN values with the mean of the column if the column is numeric.\n'
        sugg += 'for col in df.columns:\n'
        sugg += '    if df[col].dtype == np.float64 or df[col].dtype == np.int64:\n'
        sugg += '        df[col] = df[col].fillna(df[col].mean())\n'
        sugg += '# check for NaN values again\n'
        sugg += 'df.isnull().sum().sum()\n'

        sugg += '# for each column, replace the NaN values with the mode of the column if the column is categorical.\n'
        sugg += 'for col in df.columns:\n'
        # BUG FIX: suggested code referenced np.object, which was removed in
        # NumPy 1.24 (the version pinned in requirements.txt); the builtin
        # `object` is the supported spelling.
        sugg += '    if df[col].dtype == object:\n'
        sugg += '        df[col] = df[col].fillna(df[col].mode()[0])\n'
        sugg += '# check for NaN values again\n'

        s += sugg
    else:
        s += "There are no special missing values in the dataset.\n \n"

    # Second pass: genuine NaN values.
    if df.isnull().values.any():
        nan_total = df.isnull().sum().sum()
        s += ("There are NaN values in the dataset.\n\n"
              f"Number of NaN values: {nan_total}\n\n"
              f"Percentage of NaN values: {round(nan_total / total_cells * 100, 2)} %\n\n"
              "NaN values in each column:\n\n")
        for col in df.columns:
            nan_count = df[col].isnull().sum()
            if nan_count:
                s += f"{col}: {nan_count}\n"

        sugg_2 = ''
        sugg_2 += '# for each column, replace the NaN values with the mean of the column if the column is numeric.\n'
        sugg_2 += 'for col in df.columns:\n'
        sugg_2 += '    if df[col].dtype == np.float64 or df[col].dtype == np.int64:\n'
        sugg_2 += '        df[col] = df[col].fillna(df[col].mean())\n'
        # BUG FIX: same np.object -> object correction as above.
        sugg_2 += '    elif df[col].dtype == object:\n'
        sugg_2 += '        df[col] = df[col].fillna(df[col].mode()[0])\n'
        sugg_2 += '# check for NaN values again\n'
        sugg_2 += 'df.isnull().sum().sum()\n'
        s += sugg_2
    else:
        s += "\nThere are no NaN values in the dataset."

    return s
149 changes: 10 additions & 139 deletions backend/app.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,17 @@
# pip install numpy panda seaborn matplotlib flask flask_cors
from flask import Flask, request, jsonify, send_file
from flask_cors import CORS
import pandas as pd, numpy as np, io, base64, matplotlib.pyplot as plt, seaborn as sns

import matplotlib
from algorithm.sp_missingvalue import *
from algorithm.correlated import *
from algorithm.duplicate import *
from algorithm.imbalance import *
app = Flask(__name__)
CORS(app)

matplotlib.use('Agg')
def generate_heatmap(df):
corr = df.corr()
corr = df.corr(numeric_only=True)
plt.imshow(corr, cmap='coolwarm', interpolation='none')
plt.xticks(range(len(corr.columns)), corr.columns, rotation=90)
plt.yticks(range(len(corr.columns)), corr.columns)
Expand Down Expand Up @@ -59,140 +63,6 @@ def generate_bargraph_nan_values(df):
return img_base64


def SpecialMissingValues(df):
    """Report placeholder ('special') missing values and real NaNs in *df*.

    Looks for common textual placeholders ('-', 'n/a', 'N/A', 'NA', '--',
    '?') and for genuine NaN values, and builds a report that includes
    counts, percentages, per-column breakdowns, and suggested pandas
    cleaning code for the user to run.

    Returns
    -------
    str
        Multi-line human-readable report.
    """
    s = ''
    special_missing_values = ['-', 'n/a', 'N/A', 'NA', '--', '?']
    # Count each placeholder across the whole frame.
    pres = {val: int(df.isin([val]).sum().sum()) for val in special_missing_values}
    total_special = sum(pres.values())
    total_cells = df.shape[0] * df.shape[1]

    if total_special:
        s += ("There are special missing values in the dataset.\n\n"
              f"Number of special missing values: {total_special}\n\n"
              f"Percentage of special missing values: {round(total_special / total_cells * 100, 2)} %\n\n"
              "Special missing values in each column:\n\n")
        for col in df.columns:
            col_count = df[col].isin(special_missing_values).sum()
            if col_count:
                s += f"{col}: {col_count}\n"

        # Suggested cleaning code appended to the report.
        sugg = ''
        # BUG FIX: the original wrote `sugg + '...'` (result discarded), so
        # this first suggestion line was silently dropped from the report.
        sugg += '# for each column, replace the special missing values with NaN values.\n'
        sugg += 'for col in df.columns:\n'
        sugg += '    df[col] = df[col].replace(special_missing_values, np.nan)\n'
        sugg += '# check for NaN values again\n'
        sugg += 'df.isnull().sum().sum()\n'

        sugg += '# for each column, replace the NaN values with the mean of the column if the column is numeric.\n'
        sugg += 'for col in df.columns:\n'
        sugg += '    if df[col].dtype == np.float64 or df[col].dtype == np.int64:\n'
        sugg += '        df[col] = df[col].fillna(df[col].mean())\n'
        sugg += '# check for NaN values again\n'
        sugg += 'df.isnull().sum().sum()\n'

        sugg += '# for each column, replace the NaN values with the mode of the column if the column is categorical.\n'
        sugg += 'for col in df.columns:\n'
        # BUG FIX: suggested code referenced np.object, removed in NumPy 1.24;
        # the builtin `object` is the supported spelling.
        sugg += '    if df[col].dtype == object:\n'
        sugg += '        df[col] = df[col].fillna(df[col].mode()[0])\n'
        sugg += '# check for NaN values again\n'

        s += sugg
    else:
        s += "There are no special missing values in the dataset.\n \n"

    # Second pass: genuine NaN values.
    if df.isnull().values.any():
        nan_total = df.isnull().sum().sum()
        s += ("There are NaN values in the dataset.\n\n"
              f"Number of NaN values: {nan_total}\n\n"
              f"Percentage of NaN values: {round(nan_total / total_cells * 100, 2)} %\n\n"
              "NaN values in each column:\n\n")
        for col in df.columns:
            nan_count = df[col].isnull().sum()
            if nan_count:
                s += f"{col}: {nan_count}\n"

        sugg_2 = ''
        sugg_2 += '# for each column, replace the NaN values with the mean of the column if the column is numeric.\n'
        sugg_2 += 'for col in df.columns:\n'
        sugg_2 += '    if df[col].dtype == np.float64 or df[col].dtype == np.int64:\n'
        sugg_2 += '        df[col] = df[col].fillna(df[col].mean())\n'
        # BUG FIX: same np.object -> object correction as above.
        sugg_2 += '    elif df[col].dtype == object:\n'
        sugg_2 += '        df[col] = df[col].fillna(df[col].mode()[0])\n'
        sugg_2 += '# check for NaN values again\n'
        sugg_2 += 'df.isnull().sum().sum()\n'
        s += sugg_2
    else:
        s += "\nThere are no NaN values in the dataset."

    return s

def correlated(df, threshold=0.7):
    """Report highly correlated numeric feature pairs in *df*.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset to analyse; non-numeric columns are ignored.
    threshold : float, optional
        Absolute correlation above which a feature is flagged
        (default 0.7, preserving the original hard-coded cutoff).

    Returns
    -------
    list[str]
        Human-readable findings, one message per entry.  The same
        information is also printed to stdout as a side effect.
    """
    # BUG FIX: df.corr() without numeric_only raises on non-numeric
    # columns in pandas >= 2.0; restrict to numeric columns explicitly.
    corr_matrix = df.corr(numeric_only=True)
    instr = []

    # Robustness: with fewer than two numeric columns there are no pairs,
    # and max() over the empty off-diagonal would silently yield NaN.
    if corr_matrix.shape[0] < 2:
        instr.append("Not enough numeric columns to compute pairwise correlations.\n")
        return instr

    # Scan the strict lower triangle so each pair is inspected once;
    # flag the later column of any pair whose |corr| exceeds the threshold.
    high_corr_features = set()
    for i in range(len(corr_matrix.columns)):
        for j in range(i):
            if abs(corr_matrix.iloc[i, j]) > threshold:
                high_corr_features.add(corr_matrix.columns[i])

    # BUG FIX: np.bool was removed in NumPy >= 1.24 (the pinned version);
    # np.bool_ is the supported spelling.  Mask everything except the
    # strict upper triangle, then take the max of the surviving values.
    mask = np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool_)
    max_corr = corr_matrix.where(mask).stack().max()
    print("Maximum correlation value among any two values:", max_corr)
    instr.append("Maximum correlation value among any two distinct values: " + str(max_corr) + "\n")

    if len(high_corr_features) > 0:
        print("There are highly correlated features in the dataset.")
        print("Number of highly correlated features:", len(high_corr_features))
        print("Highly correlated features:", high_corr_features)
        instr.extend([
            "There are highly correlated features in the dataset.\n",
            "Number of highly correlated features: " + str(len(high_corr_features)) + "\n",
            "Highly correlated features: " + str(high_corr_features) + "\n",
        ])
    else:
        print("There are no highly correlated features in the dataset.")
        instr.append("There are no highly correlated features in the dataset.\n")

    return instr

def duplicated(df):
    """Build a human-readable report about duplicate rows in *df*.

    Returns a multi-line string describing how many duplicate rows exist
    and at which index labels, or a single-line message when there are
    none.  The first occurrence of each row is not counted as a duplicate
    (pandas ``DataFrame.duplicated`` default).
    """
    dup_mask = df.duplicated()
    if dup_mask.any():
        report_lines = [
            "Duplicate examples are present in the dataset.\n",
            "Number of duplicate examples: " + str(dup_mask.sum()) + "\n",
            "Indices of duplicate examples: " + str(df.index[dup_mask].tolist()) + "\n",
        ]
        return "".join(report_lines)
    return "There are no duplicate examples in the dataset.\n"


@app.route('/upload', methods=['POST'])
Expand All @@ -209,9 +79,10 @@ def upload():
results['bargraph_sp_miss'] = generate_bargraph_special_missing_values(df)
results['bargraph_nan'] = generate_bargraph_nan_values(df)
results['duplicates'] = duplicated(df)
results['imbalance'] = imbalance(df)
j = jsonify(results)
print(j)
return j
# print(df)
return results

if __name__ == '__main__':
app.run(debug= True)
Expand Down
Binary file removed backend/req.txt
Binary file not shown.
7 changes: 7 additions & 0 deletions backend/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
Flask==2.2.3
flask_cors==3.0.10
jsonschema==4.17.3
matplotlib==3.7.1
numpy==1.24.2
pandas==1.5.3
seaborn==0.12.2
Binary file removed backend/requirements_backend.txt
Binary file not shown.
7 changes: 6 additions & 1 deletion frontend/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -20,4 +20,9 @@

npm-debug.log*
yarn-debug.log*
yarn-error.log*
yarn-error.log*

__pycache__/

# Python env
*env
Loading