diff --git a/backend/.gitignore b/backend/.gitignore new file mode 100644 index 00000000..5235e02b --- /dev/null +++ b/backend/.gitignore @@ -0,0 +1,5 @@ + +__pycache__/ + +# Python env +*env/ diff --git a/backend/algorithm/__pycache__/correlated.cpython-310.pyc b/backend/algorithm/__pycache__/correlated.cpython-310.pyc new file mode 100644 index 00000000..5128803e Binary files /dev/null and b/backend/algorithm/__pycache__/correlated.cpython-310.pyc differ diff --git a/backend/algorithm/__pycache__/correlated.cpython-311.pyc b/backend/algorithm/__pycache__/correlated.cpython-311.pyc new file mode 100644 index 00000000..f3906ea8 Binary files /dev/null and b/backend/algorithm/__pycache__/correlated.cpython-311.pyc differ diff --git a/backend/algorithm/__pycache__/duplicate.cpython-310.pyc b/backend/algorithm/__pycache__/duplicate.cpython-310.pyc new file mode 100644 index 00000000..836a6b9a Binary files /dev/null and b/backend/algorithm/__pycache__/duplicate.cpython-310.pyc differ diff --git a/backend/algorithm/__pycache__/duplicate.cpython-311.pyc b/backend/algorithm/__pycache__/duplicate.cpython-311.pyc new file mode 100644 index 00000000..3a551b7c Binary files /dev/null and b/backend/algorithm/__pycache__/duplicate.cpython-311.pyc differ diff --git a/backend/algorithm/__pycache__/sp_missingvalue.cpython-310.pyc b/backend/algorithm/__pycache__/sp_missingvalue.cpython-310.pyc new file mode 100644 index 00000000..aaab02ef Binary files /dev/null and b/backend/algorithm/__pycache__/sp_missingvalue.cpython-310.pyc differ diff --git a/backend/algorithm/__pycache__/sp_missingvalue.cpython-311.pyc b/backend/algorithm/__pycache__/sp_missingvalue.cpython-311.pyc new file mode 100644 index 00000000..400c19bc Binary files /dev/null and b/backend/algorithm/__pycache__/sp_missingvalue.cpython-311.pyc differ diff --git a/backend/algorithm/correlated.py b/backend/algorithm/correlated.py new file mode 100644 index 00000000..073bf5e0 --- /dev/null +++ 
import numpy as np


def correlated(df, threshold=0.7):
    """Report highly correlated numeric feature pairs in *df*.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset to analyse; only numeric columns are considered
        (``corr(numeric_only=True)``).
    threshold : float, optional
        Absolute correlation above which a feature is flagged.
        Default 0.7, matching the previously hard-coded value.

    Returns
    -------
    list[str]
        Human-readable report lines, each terminated with a newline.
    """
    corr_matrix = df.corr(numeric_only=True)
    instr = []

    # Flag every column whose correlation with an *earlier* column exceeds
    # the threshold; scanning only the lower triangle (j < i) examines each
    # pair exactly once and skips the diagonal.
    high_corr_features = set()
    for i in range(len(corr_matrix.columns)):
        for j in range(i):
            if abs(corr_matrix.iloc[i, j]) > threshold:
                high_corr_features.add(corr_matrix.columns[i])

    # Largest off-diagonal correlation: keep only the strict upper triangle
    # (k=1 excludes the all-ones diagonal), then take the max of what's left.
    upper = np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool_)
    max_corr = corr_matrix.where(upper).stack().max()
    print("Maximum correlation value among any two values:", max_corr)
    instr.append("Maximum correlation value among any two distinct values: " + str(max_corr) + "\n")

    if len(high_corr_features) > 0:
        print("There are highly correlated features in the dataset.")
        print("Number of highly correlated features:", len(high_corr_features))
        print("Highly correlated features:", high_corr_features)
        instr.extend([
            "There are highly correlated features in the dataset.\n",
            "Number of highly correlated features: " + str(len(high_corr_features)) + "\n",
            "Highly correlated features: " + str(high_corr_features) + "\n",
        ])
    else:
        print("There are no highly correlated features in the dataset.")
        instr.append("There are no highly correlated features in the dataset.\n")

    return instr
def duplicated(df):
    """Return a newline-terminated text report on duplicate rows in *df*.

    Parameters
    ----------
    df : pandas.DataFrame

    Returns
    -------
    str
        Either the count and index list of duplicate rows, or a message
        stating there are none.
    """
    duplicates = df.duplicated()
    instr = ''
    if any(duplicates):
        instr += "Duplicate examples are present in the dataset.\n"
        instr += "Number of duplicate examples: " + str(duplicates.sum()) + "\n"
        instr += "Indices of duplicate examples: " + str(df.index[duplicates].tolist()) + "\n"
    else:
        instr += "There are no duplicate examples in the dataset.\n"
    return instr


def imbalance(df):
    """Return a text report on class imbalance in the columns of *df*.

    A column is flagged when its rarest class is less than 10% as frequent
    as its most common class. Known identifier-like columns ('Date',
    'Time', 'Name'), numeric dtypes, and all-unique columns are skipped.

    Parameters
    ----------
    df : pandas.DataFrame

    Returns
    -------
    str
        Per-column imbalance findings followed by mitigation advice, or a
        message that no imbalance was found.
    """
    imb = False
    nu = 1  # running number of the finding in the report
    s = ''
    for col in df.columns:
        class_counts = df[col].value_counts()
        # NOTE(review): dtype-vs-string comparison ('int', 'float32', 'float',
        # 'c') relies on numpy dtype equality semantics — kept as-is to
        # preserve which columns are excluded.
        if class_counts.min() / class_counts.max() < 0.1 and not col in ['Date', 'Time', 'Name'] and not df[col].dtype in ['int', 'float32', 'float', 'c']:
            # Columns where every value is unique (e.g. IDs) are not
            # meaningful class labels — skip them.
            if len(df[col].unique()) == len(df):
                continue
            print(f"{nu}) Class imbalance detected in column '{col}' with ", end='')
            s += f"{nu}) Class imbalance detected in column '{col}' with "
            imb = True
            print("Class imbalance ratio:", round(class_counts.min() / class_counts.max(), 10), '\n')
            s += "Class imbalance ratio:" + str(round(class_counts.min() / class_counts.max(), 10)) + '\n'
            nu += 1

    if imb:
        print("Potential mitigation strategies:")
        print("- Use appropriate sampling techniques to balance the classes.")
        print("- Use appropriate evaluation metrics like F1 score, precision, recall, etc. as accuracy is not a good metric for imbalanced datasets.")
        print("- Use appropriate regularization techniques like class weights to combat bias.")
        s += '''Potential mitigation strategies:
        - Use appropriate sampling techniques to balance the classes.
        - Use appropriate evaluation metrics like F1 score, precision, recall, etc. as accuracy is not a good metric for imbalanced datasets.
        - Use appropriate regularization techniques like class weights to combat bias.\n'''
    else:
        print("There is no class imbalance in the dataset.")
        print("No mitigation strategies are required.")
        s += "There is no class imbalance in the dataset.\nNo mitigation strategies are required.\n"
    return s


def SpecialMissingValues(df):
    """Return a text report on sentinel missing values and NaNs in *df*.

    Scans for sentinel strings commonly used to mark missing data
    ('-', 'n/a', 'N/A', 'NA', '--', '?') and for real NaN values, reporting
    counts, percentages, and per-column breakdowns. When problems are
    found, the report also embeds ready-to-paste cleanup code suggestions.

    Parameters
    ----------
    df : pandas.DataFrame

    Returns
    -------
    str
        The combined findings + suggested-code report.
    """
    s = ''
    special_missing_values = ['-', 'n/a', 'N/A', 'NA', '--', '?']
    pres = {val: 0 for val in special_missing_values}
    for val in special_missing_values:
        # Total occurrences of this sentinel across the whole frame.
        pres[val] = df.isin([val]).sum().sum()
    sugg = ''
    if any(pres.values()):
        s += f'''There are special missing values in the dataset.\n
        Number of special missing values: {sum(pres.values())}\n
        Percentage of special missing values: {round(sum(pres.values()) / (df.shape[0] * df.shape[1]) * 100, 2)} %\n
        Special missing values in each column:\n
        '''
        for col in df.columns:
            if df[col].isin(special_missing_values).sum():
                s += f'''{col}: {df[col].isin(special_missing_values).sum()}\n'''

        # Suggested cleanup code.
        # BUGFIX: the first line previously used `sugg +` (result discarded),
        # so the leading explanatory comment never reached the report.
        sugg += '# for each column, replace the special missing values with NaN values.\n'
        sugg += 'for col in df.columns:\n'
        sugg += '    df[col] = df[col].replace(special_missing_values, np.nan)\n'
        sugg += '# check for NaN values again\n'
        sugg += 'df.isnull().sum().sum()\n'

        sugg += '# for each column, replace the NaN values with the mean of the column if the column is numeric.\n'
        sugg += 'for col in df.columns:\n'
        sugg += '    if df[col].dtype == np.float64 or df[col].dtype == np.int64:\n'
        sugg += '        df[col] = df[col].fillna(df[col].mean())\n'
        sugg += '# check for NaN values again\n'
        sugg += 'df.isnull().sum().sum()\n'

        # BUGFIX: suggested code used `np.object`, which was removed in
        # numpy 1.24 (the version pinned in requirements.txt) — the advice
        # would crash if followed. Plain `object` is the correct spelling.
        sugg += '# for each column, replace the NaN values with the mode of the column if the column is categorical.\n'
        sugg += 'for col in df.columns:\n'
        sugg += '    if df[col].dtype == object:\n'
        sugg += '        df[col] = df[col].fillna(df[col].mode()[0])\n'
        sugg += '# check for NaN values again\n'

        s += sugg
    else:
        s += "There are no special missing values in the dataset.\n \n"

    # Independently check for genuine NaN values.
    sugg_2 = ''
    if df.isnull().values.any():
        s += f'''There are NaN values in the dataset.\n
        Number of NaN values: {df.isnull().sum().sum()}\n
        Percentage of NaN values: {round(df.isnull().sum().sum() / (df.shape[0] * df.shape[1]) * 100, 2)} %\n
        NaN values in each column:\n
        '''
        for col in df.columns:
            if df[col].isnull().sum():
                s += f'''{col}: {df[col].isnull().sum()}\n'''
        sugg_2 += '# for each column, replace the NaN values with the mean of the column if the column is numeric.\n'
        sugg_2 += 'for col in df.columns:\n'
        sugg_2 += '    if df[col].dtype == np.float64 or df[col].dtype == np.int64:\n'
        sugg_2 += '        df[col] = df[col].fillna(df[col].mean())\n'
        sugg_2 += '    elif df[col].dtype == object:\n'
        sugg_2 += '        df[col] = df[col].fillna(df[col].mode()[0])\n'
        sugg_2 += '# check for NaN values again\n'
        sugg_2 += 'df.isnull().sum().sum()\n'
        s += sugg_2
    else:
        s += "\nThere are no NaN values in the dataset."

    return s
df.columns: - # print(col, ":", df[col].isin(special_missing_values).sum()) - # else: - # print("There are no special missing values in the dataset.") - s += f'''There are special missing values in the dataset.\n - Number of special missing values: {sum(pres.values())}\n - Percentage of special missing values: {round(sum(pres.values()) / (df.shape[0] * df.shape[1]) * 100, 2)} %\n - Special missing values in each column:\n - ''' - for col in df.columns: - if df[col].isin(special_missing_values).sum(): - s += f'''{col}: {df[col].isin(special_missing_values).sum()}\n''' - - # Refactorting : - sugg + '# for each column, replace the special missing values with NaN values.\n' - sugg += 'for col in df.columns:\n' - sugg += ' df[col] = df[col].replace(special_missing_values, np.nan)\n' - sugg += '# check for NaN values again\n' - sugg += 'df.isnull().sum().sum()\n' - - sugg += '# for each column, replace the NaN values with the mean of the column if the column is numeric.\n' - sugg += 'for col in df.columns:\n' - sugg += ' if df[col].dtype == np.float64 or df[col].dtype == np.int64:\n' - sugg += ' df[col] = df[col].fillna(df[col].mean())\n' - sugg += '# check for NaN values again\n' - sugg += 'df.isnull().sum().sum()\n' - - sugg += '# for each column, replace the NaN values with the mode of the column if the column is categorical.\n' - sugg += 'for col in df.columns:\n' - sugg += ' if df[col].dtype == np.object:\n' - sugg += ' df[col] = df[col].fillna(df[col].mode()[0])\n' - sugg += '# check for NaN values again\n' - - s += sugg - else: - s += "There are no special missing values in the dataset.\n \n" - - # Check for NaN values - sugg_2 = '' - if df.isnull().values.any(): - s += f'''There are NaN values in the dataset.\n - Number of NaN values: {df.isnull().sum().sum()}\n - Percentage of NaN values: {round(df.isnull().sum().sum() / (df.shape[0] * df.shape[1]) * 100, 2)} %\n - NaN values in each column:\n - ''' - for col in df.columns: - if df[col].isnull().sum(): - s += 
f'''{col}: {df[col].isnull().sum()}\n''' - # Refactoring : - sugg_2 += '# for each column, replace the NaN values with the mean of the column if the column is numeric.\n' - - sugg_2 += 'for col in df.columns:\n' - sugg_2 += ' if df[col].dtype == np.float64 or df[col].dtype == np.int64:\n' - sugg_2 += ' df[col] = df[col].fillna(df[col].mean())\n' - sugg_2 += ' elif df[col].dtype == np.object:\n' - sugg_2 += ' df[col] = df[col].fillna(df[col].mode()[0])\n' - sugg_2 += '# check for NaN values again\n' - sugg_2 += 'df.isnull().sum().sum()\n' - s += sugg_2 - - - - else: - s += "\nThere are no NaN values in the dataset." - - return s - -def correlated(df): - # Compute the correlation matrix - corr_matrix = df.corr() - instr = [] - - # Identify highly correlated features - high_corr_features = set() - for i in range(len(corr_matrix.columns)): - for j in range(i): - if abs(corr_matrix.iloc[i, j]) > 0.7: # Change the threshold value as needed - colname = corr_matrix.columns[i] - high_corr_features.add(colname) - # Highest cvalue of correlation on non-diagonal elements - max_corr = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool)).stack().max() - print("Maximum correlation value among any two values:", max_corr) - # instr += "Maximum correlation value among any two values: " + str(max_corr) + "\n" - instr.append("Maximum correlation value among any two distinct values: " + str(max_corr) + "\n") - - # print giving info - if len(high_corr_features) > 0: - print("There are highly correlated features in the dataset.") - print("Number of highly correlated features:", len(high_corr_features)) - print("Highly correlated features:", high_corr_features) - # instr += "There are highly correlated features in the dataset.\n" - # instr += "Number of highly correlated features: " + str(len(high_corr_features)) + "\n" - # instr += "Highly correlated features: " + str(high_corr_features) + "\n" - instr.extend(["There are highly correlated features in the 
dataset.\n", "Number of highly correlated features: " + str(len(high_corr_features)) + "\n", "Highly correlated features: " + str(high_corr_features) + "\n"]) - - - - - else: - print("There are no highly correlated features in the dataset.") - # instr += "There are no highly correlated features in the dataset.\n" - instr.append("There are no highly correlated features in the dataset.\n") - - return instr - -def duplicated(df): - duplicates = df.duplicated() - instr = '' - if any(duplicates): - # print("Duplicate examples are present in the dataset.") - # print("Number of duplicate examples:", duplicates.sum()) - # print("Indices of duplicate examples:", df.index[duplicates].tolist()) - instr += "Duplicate examples are present in the dataset.\n" - instr += "Number of duplicate examples: " + str(duplicates.sum()) + "\n" - instr += "Indices of duplicate examples: " + str(df.index[duplicates].tolist()) + "\n" - - else: - # print("There are no duplicate examples in the dataset.") - instr += "There are no duplicate examples in the dataset.\n" - - return instr @app.route('/upload', methods=['POST']) @@ -209,9 +79,10 @@ def upload(): results['bargraph_sp_miss'] = generate_bargraph_special_missing_values(df) results['bargraph_nan'] = generate_bargraph_nan_values(df) results['duplicates'] = duplicated(df) + results['imbalance'] = imbalance(df) j = jsonify(results) - print(j) - return j + # print(df) + return results if __name__ == '__main__': app.run(debug= True) diff --git a/backend/req.txt b/backend/req.txt deleted file mode 100644 index e2c2902f..00000000 Binary files a/backend/req.txt and /dev/null differ diff --git a/backend/requirements.txt b/backend/requirements.txt index e69de29b..c05a1323 100644 --- a/backend/requirements.txt +++ b/backend/requirements.txt @@ -0,0 +1,7 @@ +Flask==2.2.3 +flask_cors==3.0.10 +jsonschema==4.17.3 +matplotlib==3.7.1 +numpy==1.24.2 +pandas==1.5.3 +seaborn==0.12.2 diff --git a/backend/requirements_backend.txt 
b/backend/requirements_backend.txt deleted file mode 100644 index e2c2902f..00000000 Binary files a/backend/requirements_backend.txt and /dev/null differ diff --git a/frontend/.gitignore b/frontend/.gitignore index 24cdedf8..c57060b6 100644 --- a/frontend/.gitignore +++ b/frontend/.gitignore @@ -20,4 +20,9 @@ npm-debug.log* yarn-debug.log* -yarn-error.log* \ No newline at end of file +yarn-error.log* + +__pycache__/ + +# Python env +*env \ No newline at end of file diff --git a/frontend/src/App.js b/frontend/src/App.js index e709cf21..c10d4cb9 100644 --- a/frontend/src/App.js +++ b/frontend/src/App.js @@ -22,7 +22,7 @@ function App() { const formData = new FormData(); formData.append('file', selectedFile); - axios.post('http://localhost:5000/upload', formData) + axios.post('http://127.0.0.1:5000/upload', formData) .then(response => { setAnalysisData(response.data); setHeatmapData(response.data.heatmap); @@ -125,6 +125,14 @@ function App() { )} +