Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions backend/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@

__pycache__/

# Python env
*env/
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
39 changes: 39 additions & 0 deletions backend/algorithm/correlated.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
import numpy as np

def correlated(df, threshold=0.7):
    """Report highly correlated numeric feature pairs in *df*.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset to analyse; non-numeric columns are ignored.
    threshold : float, optional
        Absolute correlation above which a feature is flagged
        (default 0.7, preserving the original hard-coded cutoff).

    Returns
    -------
    list[str]
        Human-readable findings, one message per entry.  The same
        information is also printed to stdout as a side effect.
    """
    # Pearson correlation over numeric columns only (passing non-numeric
    # columns to DataFrame.corr raises in pandas >= 2.0).
    corr_matrix = df.corr(numeric_only=True)
    instr = []

    # Robustness: with fewer than two numeric columns there are no pairs,
    # and max() over the empty off-diagonal would silently yield NaN.
    if corr_matrix.shape[0] < 2:
        instr.append("Not enough numeric columns to compute pairwise correlations.\n")
        return instr

    # Scan the strict lower triangle so each pair is inspected once;
    # flag the later column of any pair whose |corr| exceeds the threshold.
    high_corr_features = set()
    for i in range(len(corr_matrix.columns)):
        for j in range(i):
            if abs(corr_matrix.iloc[i, j]) > threshold:
                high_corr_features.add(corr_matrix.columns[i])

    # Highest correlation among distinct pairs: mask everything except the
    # strict upper triangle, then take the max of the surviving values.
    mask = np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool_)
    max_corr = corr_matrix.where(mask).stack().max()
    print("Maximum correlation value among any two values:", max_corr)
    instr.append("Maximum correlation value among any two distinct values: " + str(max_corr) + "\n")

    if len(high_corr_features) > 0:
        print("There are highly correlated features in the dataset.")
        print("Number of highly correlated features:", len(high_corr_features))
        print("Highly correlated features:", high_corr_features)
        instr.extend([
            "There are highly correlated features in the dataset.\n",
            "Number of highly correlated features: " + str(len(high_corr_features)) + "\n",
            "Highly correlated features: " + str(high_corr_features) + "\n",
        ])
    else:
        print("There are no highly correlated features in the dataset.")
        instr.append("There are no highly correlated features in the dataset.\n")

    return instr
16 changes: 16 additions & 0 deletions backend/algorithm/duplicate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
def duplicated(df):
    """Build a human-readable report about duplicate rows in *df*.

    Returns a multi-line string describing how many duplicate rows exist
    and at which index labels, or a single-line message when there are
    none.  The first occurrence of each row is not counted as a duplicate
    (pandas ``DataFrame.duplicated`` default).
    """
    dup_mask = df.duplicated()
    if dup_mask.any():
        report_lines = [
            "Duplicate examples are present in the dataset.\n",
            "Number of duplicate examples: " + str(dup_mask.sum()) + "\n",
            "Indices of duplicate examples: " + str(df.index[dup_mask].tolist()) + "\n",
        ]
        return "".join(report_lines)
    return "There are no duplicate examples in the dataset.\n"
34 changes: 34 additions & 0 deletions backend/algorithm/imbalance.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
def imbalance(df):
    """Detect class imbalance in the categorical columns of *df*.

    A column is flagged when its rarest class occurs less than 10% as
    often as its most common class.  Numeric columns, identifier-like
    columns ('Date', 'Time', 'Name') and columns whose values are all
    unique (e.g. IDs) are skipped.

    Returns
    -------
    str
        Multi-line report; the same text is printed to stdout as a side
        effect.
    """
    imb = False
    nu = 1
    s = ''
    skip_cols = ('Date', 'Time', 'Name')
    for col in df.columns:
        # FIX: the original compared dtypes against the string list
        # ['int', 'float32', 'float', 'c'], which misses widths such as
        # int32/float16 and contains the meaningless entry 'c'.
        # dtype.kind in 'ifc' excludes every integer/float/complex width.
        if col in skip_cols or df[col].dtype.kind in 'ifc':
            continue
        class_counts = df[col].value_counts()
        # Robustness: an all-NaN column yields empty counts.
        if class_counts.empty:
            continue
        ratio = class_counts.min() / class_counts.max()
        if ratio >= 0.1:
            continue
        # All-unique values are identifiers, not class labels.
        if len(df[col].unique()) == len(df):
            continue
        print(f"{nu}) Class imbalance detected in column '{col}' with ", end='')
        s += f"{nu}) Class imbalance detected in column '{col}' with "
        imb = True
        print("Class imbalance ratio:", round(ratio, 10), '\n')
        s += "Class imbalance ratio:" + str(round(ratio, 10)) + '\n'
        nu += 1

    if imb:
        print("Potential mitigation strategies:")
        print("- Use appropriate sampling techniques to balance the classes.")
        print("- Use appropriate evaluation metrics like F1 score, precision, recall, etc. as accuracy is not a good metric for imbalanced datasets.")
        print("- Use appropriate regularization techniques like class weights to combat bias.")
        s += ("Potential mitigation strategies:\n"
              "- Use appropriate sampling techniques to balance the classes.\n"
              "- Use appropriate evaluation metrics like F1 score, precision, recall, etc. as accuracy is not a good metric for imbalanced datasets.\n"
              "- Use appropriate regularization techniques like class weights to combat bias.\n")
    else:
        print("There is no class imbalance in the dataset.")
        print("No mitigation strategies are required.")
        s += "There is no class imbalance in the dataset.\nNo mitigation strategies are required.\n"
    return s
79 changes: 79 additions & 0 deletions backend/algorithm/sp_missingvalue.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
def SpecialMissingValues(df):
    """Report placeholder ('special') missing values and real NaNs in *df*.

    Looks for common textual placeholders ('-', 'n/a', 'N/A', 'NA', '--',
    '?') and for genuine NaN values, and builds a report that includes
    counts, percentages, per-column breakdowns, and suggested pandas
    cleaning code for the user to run.

    Returns
    -------
    str
        Multi-line human-readable report.
    """
    s = ''
    special_missing_values = ['-', 'n/a', 'N/A', 'NA', '--', '?']
    # Count each placeholder across the whole frame.
    pres = {val: int(df.isin([val]).sum().sum()) for val in special_missing_values}
    total_special = sum(pres.values())
    total_cells = df.shape[0] * df.shape[1]

    if total_special:
        s += ("There are special missing values in the dataset.\n\n"
              f"Number of special missing values: {total_special}\n\n"
              f"Percentage of special missing values: {round(total_special / total_cells * 100, 2)} %\n\n"
              "Special missing values in each column:\n\n")
        for col in df.columns:
            col_count = df[col].isin(special_missing_values).sum()
            if col_count:
                s += f"{col}: {col_count}\n"

        # Suggested cleaning code appended to the report.
        sugg = ''
        # BUG FIX: the original wrote `sugg + '...'` (result discarded), so
        # this first suggestion line was silently dropped from the report.
        sugg += '# for each column, replace the special missing values with NaN values.\n'
        sugg += 'for col in df.columns:\n'
        sugg += '    df[col] = df[col].replace(special_missing_values, np.nan)\n'
        sugg += '# check for NaN values again\n'
        sugg += 'df.isnull().sum().sum()\n'

        sugg += '# for each column, replace the NaN values with the mean of the column if the column is numeric.\n'
        sugg += 'for col in df.columns:\n'
        sugg += '    if df[col].dtype == np.float64 or df[col].dtype == np.int64:\n'
        sugg += '        df[col] = df[col].fillna(df[col].mean())\n'
        sugg += '# check for NaN values again\n'
        sugg += 'df.isnull().sum().sum()\n'

        sugg += '# for each column, replace the NaN values with the mode of the column if the column is categorical.\n'
        sugg += 'for col in df.columns:\n'
        # BUG FIX: suggested code referenced np.object, which was removed in
        # NumPy 1.24 (the version pinned in requirements.txt); the builtin
        # `object` is the supported spelling.
        sugg += '    if df[col].dtype == object:\n'
        sugg += '        df[col] = df[col].fillna(df[col].mode()[0])\n'
        sugg += '# check for NaN values again\n'

        s += sugg
    else:
        s += "There are no special missing values in the dataset.\n \n"

    # Second pass: genuine NaN values.
    if df.isnull().values.any():
        nan_total = df.isnull().sum().sum()
        s += ("There are NaN values in the dataset.\n\n"
              f"Number of NaN values: {nan_total}\n\n"
              f"Percentage of NaN values: {round(nan_total / total_cells * 100, 2)} %\n\n"
              "NaN values in each column:\n\n")
        for col in df.columns:
            nan_count = df[col].isnull().sum()
            if nan_count:
                s += f"{col}: {nan_count}\n"

        sugg_2 = ''
        sugg_2 += '# for each column, replace the NaN values with the mean of the column if the column is numeric.\n'
        sugg_2 += 'for col in df.columns:\n'
        sugg_2 += '    if df[col].dtype == np.float64 or df[col].dtype == np.int64:\n'
        sugg_2 += '        df[col] = df[col].fillna(df[col].mean())\n'
        # BUG FIX: same np.object -> object correction as above.
        sugg_2 += '    elif df[col].dtype == object:\n'
        sugg_2 += '        df[col] = df[col].fillna(df[col].mode()[0])\n'
        sugg_2 += '# check for NaN values again\n'
        sugg_2 += 'df.isnull().sum().sum()\n'
        s += sugg_2
    else:
        s += "\nThere are no NaN values in the dataset."

    return s
149 changes: 10 additions & 139 deletions backend/app.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,17 @@
# pip install numpy panda seaborn matplotlib flask flask_cors
from flask import Flask, request, jsonify, send_file
from flask_cors import CORS
import pandas as pd, numpy as np, io, base64, matplotlib.pyplot as plt, seaborn as sns

import matplotlib
from algorithm.sp_missingvalue import *
from algorithm.correlated import *
from algorithm.duplicate import *
from algorithm.imbalance import *
app = Flask(__name__)
CORS(app)

matplotlib.use('Agg')
def generate_heatmap(df):
corr = df.corr()
corr = df.corr(numeric_only=True)
plt.imshow(corr, cmap='coolwarm', interpolation='none')
plt.xticks(range(len(corr.columns)), corr.columns, rotation=90)
plt.yticks(range(len(corr.columns)), corr.columns)
Expand Down Expand Up @@ -59,140 +63,6 @@ def generate_bargraph_nan_values(df):
return img_base64


def SpecialMissingValues(df):
    """Report placeholder ('special') missing values and real NaNs in *df*.

    Looks for common textual placeholders ('-', 'n/a', 'N/A', 'NA', '--',
    '?') and for genuine NaN values, and builds a report that includes
    counts, percentages, per-column breakdowns, and suggested pandas
    cleaning code for the user to run.

    Returns
    -------
    str
        Multi-line human-readable report.
    """
    s = ''
    special_missing_values = ['-', 'n/a', 'N/A', 'NA', '--', '?']
    # Count each placeholder across the whole frame.
    pres = {val: int(df.isin([val]).sum().sum()) for val in special_missing_values}
    total_special = sum(pres.values())
    total_cells = df.shape[0] * df.shape[1]

    if total_special:
        s += ("There are special missing values in the dataset.\n\n"
              f"Number of special missing values: {total_special}\n\n"
              f"Percentage of special missing values: {round(total_special / total_cells * 100, 2)} %\n\n"
              "Special missing values in each column:\n\n")
        for col in df.columns:
            col_count = df[col].isin(special_missing_values).sum()
            if col_count:
                s += f"{col}: {col_count}\n"

        # Suggested cleaning code appended to the report.
        sugg = ''
        # BUG FIX: the original wrote `sugg + '...'` (result discarded), so
        # this first suggestion line was silently dropped from the report.
        sugg += '# for each column, replace the special missing values with NaN values.\n'
        sugg += 'for col in df.columns:\n'
        sugg += '    df[col] = df[col].replace(special_missing_values, np.nan)\n'
        sugg += '# check for NaN values again\n'
        sugg += 'df.isnull().sum().sum()\n'

        sugg += '# for each column, replace the NaN values with the mean of the column if the column is numeric.\n'
        sugg += 'for col in df.columns:\n'
        sugg += '    if df[col].dtype == np.float64 or df[col].dtype == np.int64:\n'
        sugg += '        df[col] = df[col].fillna(df[col].mean())\n'
        sugg += '# check for NaN values again\n'
        sugg += 'df.isnull().sum().sum()\n'

        sugg += '# for each column, replace the NaN values with the mode of the column if the column is categorical.\n'
        sugg += 'for col in df.columns:\n'
        # BUG FIX: suggested code referenced np.object, removed in NumPy 1.24;
        # the builtin `object` is the supported spelling.
        sugg += '    if df[col].dtype == object:\n'
        sugg += '        df[col] = df[col].fillna(df[col].mode()[0])\n'
        sugg += '# check for NaN values again\n'

        s += sugg
    else:
        s += "There are no special missing values in the dataset.\n \n"

    # Second pass: genuine NaN values.
    if df.isnull().values.any():
        nan_total = df.isnull().sum().sum()
        s += ("There are NaN values in the dataset.\n\n"
              f"Number of NaN values: {nan_total}\n\n"
              f"Percentage of NaN values: {round(nan_total / total_cells * 100, 2)} %\n\n"
              "NaN values in each column:\n\n")
        for col in df.columns:
            nan_count = df[col].isnull().sum()
            if nan_count:
                s += f"{col}: {nan_count}\n"

        sugg_2 = ''
        sugg_2 += '# for each column, replace the NaN values with the mean of the column if the column is numeric.\n'
        sugg_2 += 'for col in df.columns:\n'
        sugg_2 += '    if df[col].dtype == np.float64 or df[col].dtype == np.int64:\n'
        sugg_2 += '        df[col] = df[col].fillna(df[col].mean())\n'
        # BUG FIX: same np.object -> object correction as above.
        sugg_2 += '    elif df[col].dtype == object:\n'
        sugg_2 += '        df[col] = df[col].fillna(df[col].mode()[0])\n'
        sugg_2 += '# check for NaN values again\n'
        sugg_2 += 'df.isnull().sum().sum()\n'
        s += sugg_2
    else:
        s += "\nThere are no NaN values in the dataset."

    return s

def correlated(df, threshold=0.7):
    """Report highly correlated numeric feature pairs in *df*.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset to analyse; non-numeric columns are ignored.
    threshold : float, optional
        Absolute correlation above which a feature is flagged
        (default 0.7, preserving the original hard-coded cutoff).

    Returns
    -------
    list[str]
        Human-readable findings, one message per entry.  The same
        information is also printed to stdout as a side effect.
    """
    # BUG FIX: df.corr() without numeric_only raises on non-numeric
    # columns in pandas >= 2.0; restrict to numeric columns explicitly.
    corr_matrix = df.corr(numeric_only=True)
    instr = []

    # Robustness: with fewer than two numeric columns there are no pairs,
    # and max() over the empty off-diagonal would silently yield NaN.
    if corr_matrix.shape[0] < 2:
        instr.append("Not enough numeric columns to compute pairwise correlations.\n")
        return instr

    # Scan the strict lower triangle so each pair is inspected once;
    # flag the later column of any pair whose |corr| exceeds the threshold.
    high_corr_features = set()
    for i in range(len(corr_matrix.columns)):
        for j in range(i):
            if abs(corr_matrix.iloc[i, j]) > threshold:
                high_corr_features.add(corr_matrix.columns[i])

    # BUG FIX: np.bool was removed in NumPy >= 1.24 (the pinned version);
    # np.bool_ is the supported spelling.  Mask everything except the
    # strict upper triangle, then take the max of the surviving values.
    mask = np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool_)
    max_corr = corr_matrix.where(mask).stack().max()
    print("Maximum correlation value among any two values:", max_corr)
    instr.append("Maximum correlation value among any two distinct values: " + str(max_corr) + "\n")

    if len(high_corr_features) > 0:
        print("There are highly correlated features in the dataset.")
        print("Number of highly correlated features:", len(high_corr_features))
        print("Highly correlated features:", high_corr_features)
        instr.extend([
            "There are highly correlated features in the dataset.\n",
            "Number of highly correlated features: " + str(len(high_corr_features)) + "\n",
            "Highly correlated features: " + str(high_corr_features) + "\n",
        ])
    else:
        print("There are no highly correlated features in the dataset.")
        instr.append("There are no highly correlated features in the dataset.\n")

    return instr

def duplicated(df):
    """Build a human-readable report about duplicate rows in *df*.

    Returns a multi-line string describing how many duplicate rows exist
    and at which index labels, or a single-line message when there are
    none.  The first occurrence of each row is not counted as a duplicate
    (pandas ``DataFrame.duplicated`` default).
    """
    dup_mask = df.duplicated()
    if dup_mask.any():
        report_lines = [
            "Duplicate examples are present in the dataset.\n",
            "Number of duplicate examples: " + str(dup_mask.sum()) + "\n",
            "Indices of duplicate examples: " + str(df.index[dup_mask].tolist()) + "\n",
        ]
        return "".join(report_lines)
    return "There are no duplicate examples in the dataset.\n"


@app.route('/upload', methods=['POST'])
Expand All @@ -209,9 +79,10 @@ def upload():
results['bargraph_sp_miss'] = generate_bargraph_special_missing_values(df)
results['bargraph_nan'] = generate_bargraph_nan_values(df)
results['duplicates'] = duplicated(df)
results['imbalance'] = imbalance(df)
j = jsonify(results)
print(j)
return j
# print(df)
return results

if __name__ == '__main__':
app.run(debug= True)
Expand Down
Binary file removed backend/req.txt
Binary file not shown.
7 changes: 7 additions & 0 deletions backend/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
Flask==2.2.3
flask_cors==3.0.10
jsonschema==4.17.3
matplotlib==3.7.1
numpy==1.24.2
pandas==1.5.3
seaborn==0.12.2
Binary file removed backend/requirements_backend.txt
Binary file not shown.
7 changes: 6 additions & 1 deletion frontend/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -20,4 +20,9 @@

npm-debug.log*
yarn-debug.log*
yarn-error.log*
yarn-error.log*

__pycache__/

# Python env
*env
Loading