From db12165c8b1ebed4774da1f2a5d6ea1a303a9bb6 Mon Sep 17 00:00:00 2001 From: brandonkw123 <54588120+brandonkw123@users.noreply.github.com> Date: Sat, 5 Sep 2020 18:21:39 -0400 Subject: [PATCH 1/2] Add files via upload --- .../KMeansClustering.py | 130 ++++++++++++++ .../KNearestNeighborClassifier.py | 67 ++++++++ .../NearestNeighborClassifier.py | 112 ++++++++++++ .../README.md | 97 +++++++++++ .../KNearestNeighborClassifier.cpython-37.pyc | Bin 0 -> 1500 bytes .../NearestNeighborClassifier.cpython-37.pyc | Bin 0 -> 2756 bytes .../NearestNeighborClassifier.cpython-38.pyc | Bin 0 -> 2761 bytes .../ckd.csv | 159 ++++++++++++++++++ 8 files changed, 565 insertions(+) create mode 100644 Summer-2020-Data-Analysis-Project-master/KMeansClustering.py create mode 100644 Summer-2020-Data-Analysis-Project-master/KNearestNeighborClassifier.py create mode 100644 Summer-2020-Data-Analysis-Project-master/NearestNeighborClassifier.py create mode 100644 Summer-2020-Data-Analysis-Project-master/README.md create mode 100644 Summer-2020-Data-Analysis-Project-master/__pycache__/KNearestNeighborClassifier.cpython-37.pyc create mode 100644 Summer-2020-Data-Analysis-Project-master/__pycache__/NearestNeighborClassifier.cpython-37.pyc create mode 100644 Summer-2020-Data-Analysis-Project-master/__pycache__/NearestNeighborClassifier.cpython-38.pyc create mode 100644 Summer-2020-Data-Analysis-Project-master/ckd.csv diff --git a/Summer-2020-Data-Analysis-Project-master/KMeansClustering.py b/Summer-2020-Data-Analysis-Project-master/KMeansClustering.py new file mode 100644 index 0000000..31e4484 --- /dev/null +++ b/Summer-2020-Data-Analysis-Project-master/KMeansClustering.py @@ -0,0 +1,130 @@ +# ============================================================================= +# KMeansClustering.py +# Name: Alycia Wong and Brandon Wong +# Date: June 2020 +# Description: Process and graph a CSV file containing biomedical data that +# relates hemoglobin levels, glucose levels, and chronic kidney disease 
(CKD). +# Randomly generate up to 10 centroids without issue. Each centroid will have a +# classification. The nearest centroid to a point will determine the point's +# classicfication (decide what to do if the distances are equal yourself). +# Create random test cases until centroids stop mocing and determine whether +# each case is likely to have CKD depending on the classification of the +# nearest centroid. +# Bonus: Create lines roughly separating each centroid group +# ============================================================================= + +# ============================================================================= +# Import statements +# ============================================================================= +import matplotlib.pyplot as plt +import numpy as np +import NearestNeighborClassifier as NNC +from scipy.spatial import KDTree as kdt + +# ============================================================================= +# Functions +# ============================================================================= +# randomCentroids function takes in an integer number of clusters to be +# generated. +# OR asks for k number of integer clusters +# Outputs a 2D array filled with random values between 0-1. The +# first column represents glucose and the second column represents hemoglobin. +# There are k number of rows representing the number of centroids and the +# classification of each centroid (i.e.: row index = classification value). +# OR you can have a third column with the classification value. +def randomCentroids(k): + return np.random.rand(k,2) + +# assignCentroids function takes in an array of normalized x (hemoglobin) and y +# (glucose) values from the CSV file and the randomly generated array of +# centroids from randomCentroids. Using the findDistance function from +# NearestNeighborClassifier, points are assigned the same classification as the +# nearest centroid. 
A 2D array of the normalized data and its classification +# are returned. +def assignToCentroids(normArr, centArr): + return kdt(centArr).query(normArr)[1] +# print(assignToCentroids(NNC.normalizeData(NNC.openCSVFile('ckd.csv')).paras, np.array([[.5, .5],[.25,.25]]))) + +# updateCentroids function inputs the 2D array of centroid locations and of +# classified and normalized CSV data. The average x (hemo) and y (gluc) +# positions of all data points for each classifications are found and an +# updated 2D array with these average cartesian points as the location for the +# new centroids is returned along with the original cartesian points. +#avg of all 1s will be new cent, avg of all 0s will be new cent + +def updateCentroids(centArr, classArr, normArr): + upCentArr = centArr.copy() + for i in range(len(centArr[:,0])): + upCentArr[i,0] = np.mean(normArr.gluc[classArr==i]) + upCentArr[i,1] = np.mean(normArr.hemo[classArr==i]) + return upCentArr +# centArr = np.array([[0.5, 0.5], [.25, .25]]) +# print(updateCentroids( +# centArr, assignToCentroids( +# NNC.normalizeData(NNC.openCSVFile('ckd.csv')).paras, centArr), +# NNC.normalizeData(NNC.openCSVFile('ckd.csv')) +# )) +# print(centArr) + +# iterate void function can either +# a) input information and iterate the original information until centArr ~ +# upCentArr +def iterate(normArr, centArr): + # classArr = np.zeros(len(normArr.gluc)) + classArr = assignToCentroids(normArr, centArr) + upCentArr = updateCentroids(centArr, classArr, normArr) + # print(classArr) + if (upCentArr != centArr).any(): + centArr = upCentArr + return iterate(normArr, centArr) + return centArr +print(iterate( + NNC.normalizeData(NNC.openCSVFile('ckd.csv')), np.array([[.5, .5],[.25,.25]]) + )) + +# graphClusters void function takes in a 1D and a 2D numpy array to graph. The +# 1D array of centroid locations and classifactions have distinct points on the +# graph. 
The 2D array graphs points of normalized CSV data and colors them the +# same color as their corresponding centroids. A legend is generated in a +# reasonable position. +# Bonus: Create lines roughly separating each centroid group +def graphClusters(): + + return + +# dataAnalysis void function takes in the original parsed CSV classifications +# and the final classifications of the data based on K-means clustering (use of +# centroids) and compares the two to find false/true positives/negatives. +# Note: This should only run when there are two centroids (i.e.: k = 2) +# False positive: Percentage of non-CKD were incorrectly labelled by K-Means as +# being in the CKD cluster +# True positive (sensitivity): Percentage of CKD patients were correctly +# labeled by K-Means +# False negative: Percentage of non-CKD were incorrectly labelled by K-Means as +# being in the CKD cluster +# True negative (specificity): Percentage of non-CKD patients were correctly +# labelled by K-Means +# Note: True positive (~93 %) + False positive (~7%) = 100% +# Note: True Negative (~100%) + False negative (~0%) = 100% +def dataAnalysis(): + return +# ============================================================================= +# Main Script +# ============================================================================= +# mainDriver function takes in nothing and graphs both the orginial CSV file, +# the k number of nearest neighbors, and the test case. This function returns +# 0. +def mainDriver(): + # Open the CSV file using the parsing method from + # NearestNeighborClassifier. No input, outputs 2D numpy array. + NNC.openCSVFile + + # Normalize data using method from NearestNeighborClassifier. Input and + # outputs a 2D numpy array + NNC.normalizeData() + + # Graph CSV file using method from NearestNeighborClassifier. Input 2D + # numpy array. Void function. 
+ NNC.graphCSVFile() + + return 0 \ No newline at end of file diff --git a/Summer-2020-Data-Analysis-Project-master/KNearestNeighborClassifier.py b/Summer-2020-Data-Analysis-Project-master/KNearestNeighborClassifier.py new file mode 100644 index 0000000..5f53890 --- /dev/null +++ b/Summer-2020-Data-Analysis-Project-master/KNearestNeighborClassifier.py @@ -0,0 +1,67 @@ +# ============================================================================= +# KNearestNeighborClassifier.py +# Name: Alycia Wong and Brandon Wong +# Date: June 2020 +# Description: Process and graph a CSV file containing biomedical data that +# relates hemoglobin levels, glucose levels, and chronic kidney disease (CKD). +# Create a random test case and determine whether the case is +# likely to have CKD depending on the mode of the classifications of the +# k number of nearest points. +# ============================================================================= + +# ============================================================================= +# Import statements +# ============================================================================= +import matplotlib.pyplot as plt +import numpy as np +import NearestNeighborClassifier as NNC +from statistics import mode + +# ============================================================================= +# Functions +# ============================================================================= +# findDistanceArray inputs a numpy array, a random point, and an integer k and +# uses the findDistance function from NearestNeighborClassifier. The function +# outputs a 1D array containing the k number of nearst points to the random +# test case. 
+def findDistanceArray(normArr, testCase, k): + distArr = np.zeros(normArr.len) + for i in range(len(distArr)): + distArr[i] = NNC.findDistance(normArr.hemo[i], normArr.gluc[i], testCase[1], testCase[0]) + kindex = np.argsort(distArr)[:k] + return kindex + +# graphKNearestNeighbor void function takes in two 1D and one 2D numpy arrays +# to graph. One of the 1D arrays is a random testCase with its own distinct +# points. The other 1D array is used to circle the k number of points closest +# to the test case. The 2D array contains information parsed from the CSV +# column. The first column (hemoglobin) is graphed as the x-axis and the second +# column (glucose) as the y-axis. The third column (classification) determines +# the color of the points. A legend is generated in a reasonable position. +def graphKNearestNeighbor(testCase, normArr, k): + kindex = findDistanceArray(normArr, testCase, k) + NNC.graphCSVFile(normArr) + plt.scatter(testCase[1], testCase[0], + c = ('b' if mode(normArr.disease[kindex])==0 else 'r'), + label = 'Test Case', + marker = "x") + plt.scatter(normArr.hemo[kindex], normArr.gluc[kindex], + c='y', label = 'Nearest neighbor(s)') + print("butts") + plt.legend(fontsize="small") + plt.show() + return + +# ============================================================================= +# Main Script +# ============================================================================= +# mainDriver function takes in nothing and graphs both the orginial CSV file, +# the k number of nearest neighbors, and the test case. 
This function returns +# 0.5 +def mainDriver(): + val = int(input("How many neighbors are you looking for: ")) + test = NNC.createTestCase() + normal = NNC.normalizeData(NNC.openCSVFile('ckd.csv')) + graphKNearestNeighbor(test, normal, val) + return 0 +mainDriver() \ No newline at end of file diff --git a/Summer-2020-Data-Analysis-Project-master/NearestNeighborClassifier.py b/Summer-2020-Data-Analysis-Project-master/NearestNeighborClassifier.py new file mode 100644 index 0000000..cb298ce --- /dev/null +++ b/Summer-2020-Data-Analysis-Project-master/NearestNeighborClassifier.py @@ -0,0 +1,112 @@ +# ============================================================================= +# NearestNeighborClassifier.py +# Name: Alycia Wong and Brandon Wong +# Date: June 2020 +# Description: Process and graph a CSV file containing biomedical data that +# relates hemoglobin levels, glucose levels, and chronic kidney disease (CKD). +# Create n number of random test cases and determine whether the case is +# likely to have CKD depending on the classification of the nearest point. 
+# ============================================================================= + +# ============================================================================= +# Import statements +# ============================================================================= +import matplotlib.pyplot as plt +import numpy as np + +# ============================================================================= +# Classes +# ============================================================================= +class Butts: + def __init__(self, data): + self.gluc = data[:,0] + self.hemo = data[:,1] + self.disease = data[:,2] + self.len = len(data) + self.all = data[:,:3] + self.paras = data[:,:2] + self.shape = np.shape(data) + self.colmax = np.amax(data, axis = 0) + self.colmin = np.amin(data, axis = 0) + +# ============================================================================= +# Functions +# ============================================================================= +# Parses in file and turns it into Butts class of data +def openCSVFile(fileName): + return Butts(np.genfromtxt(fileName, delimiter=',',skip_header=1)) + +# Takes in butts class +# Loops over data normalizing it for every row +# returns normalized butts class data +def normalizeData(dataArr): + normArr = np.zeros(dataArr.shape) + for i in range(len(normArr)): + normArr[i] = (dataArr.all[i] - dataArr.colmin) / (dataArr.colmax - dataArr.colmin) + return Butts(normArr) + +# graphCSVFile void function takes in a 2D numpy array and graphs with the +# first column (hemoglobin) as the x-axis and second column (glucose) as the +# y-axis. The third column (classification) is used to determine the color of +# the points on the graph. 
+def graphCSVFile(normArr): + plt.scatter(normArr.hemo[normArr.disease==0], normArr.gluc[normArr.disease==0], + c='b', label='No CKD' ) + plt.scatter(normArr.hemo[normArr.disease==1], normArr.gluc[normArr.disease==1], + c='r', label='CKD') + plt.title('Hemoglobin and Glucose levels') + plt.xlabel('Hemoglobin') + plt.ylabel('Glucose') + return +# findDistance function is either: +# a) takes in an array and a point and returns an array of distances or the +# minimum distance or +# B) takes in cartesian coordinates and uses a simple use of the distance +# formula to return the distance between the two points. +def findDistance(x1, y1, x2, y2): + return np.sqrt((x1-x2)**2+(y1-y2)**2) + +# createTestCase function creates two random test cases (hemoglobin and +# glucose) from 0-1 and: +# creates a new 1D array with the two points +# return the points raw +def createTestCase(): + return np.random.rand(2) + +# nearestNeighborIndex takes in the test case point and returns the index of the +# nearest point to the test case +def nearestNeighborIndex(testCase, normArr): + distArr = np.zeros(normArr.len) + for i in range(len(distArr)): + distArr[i] = findDistance(normArr.hemo[i], normArr.gluc[i], testCase[1], testCase[0]) + nni = distArr.argmin() + return nni + +# graphNearestNeighbor void function takes in a 2D numpy array (and a cartesian +# coordinate depending on createTestCase) and graphs the first column +# (hemoglobin) as the x-axis and the second column (glucose) as the y-axis +# the third column (classification) determines the color of the points. A +# randomly generated test case is graphed as a distinct point with a +# line connecting it to the nearest neighbor whose classification it takes on. +# A legend is generated in a reasonable position. 
+def graphNearestNeighbor(testCase, normArr): + nni = nearestNeighborIndex(testCase, normArr) + graphCSVFile(normArr) + plt.scatter(testCase[1], testCase[0], + c = ('b' if normArr.disease[nni]==0 else 'r'), + label = 'Test Case', + marker = "x") + plt.plot([testCase[1], normArr.hemo[nni]], [testCase[0], normArr.gluc[nni]], 'k-') + plt.legend() + plt.show() + return + +# ============================================================================= +# Main Script +# ============================================================================= +# mainDriver function takes in no inputs and graphs both the orginial CSV +# file and the test case. This function returns 0. +def mainDriver(): + graphNearestNeighbor(createTestCase(), normalizeData(openCSVFile('ckd.csv'))) + return 0 +# mainDriver() \ No newline at end of file diff --git a/Summer-2020-Data-Analysis-Project-master/README.md b/Summer-2020-Data-Analysis-Project-master/README.md new file mode 100644 index 0000000..d1d53d0 --- /dev/null +++ b/Summer-2020-Data-Analysis-Project-master/README.md @@ -0,0 +1,97 @@ +# Summer-2020-ML-Project + +# Nearest Neighbor Classifier Script Description: +Process and graph a CSV file containing biomedical data that relates hemoglobin levels, glucose levels, and chronic kidney disease (CKD). +Create n number of random test cases and determine whether the case is likely to have CKD depending on the classification of the nearest point. + +# Nearest Neighbor Classifier Function Descriptions: +openCSVFile function takes in no arguments and parses/organizes data from a CSV file into a 2-D numpy array with the columns being: +hemoglobin, glucose, classification and each row being a case. + +normalizeData function takes in a 2D numpy array and +scales down the first and second columns to range from 0-1 and +outputs a 2D array with the normalized data. 
+ +graphCSVFile void function takes in a 2D numpy array and graphs with: +the first column (hemoglobin) as the x-axis and second column (glucose) as the y-axis. +The third column (classification) is used to determine the color of the points on the graph. + +findDistance function takes in cartesian coordinates and +uses the distance formula +to return the distance between the two points. + +createTestCase function creates two random test values (hemoglobin and glucose) from 0-1 and +creates/returns a new 1D array with the two points. + +graphNearestNeighbor void function takes in a 2D numpy array (and a cartesian +coordinate depending on createTestCase) and +graphs the first column (hemoglobin) as the x-axis and the second column (glucose) as the y-axis. +The third column (classification) determines the color of the points. +A randomly generated test case is graphed as a distinct point with a line connecting it to the nearest neighbor whose classification it takes on. +A legend is generated in a reasonable position. + +mainDriver function takes in no inputs and graphs both the original CSV file and the test case. +This function returns 0. + +# K Nearest Neighbor Classifier Script Description: + +Process and graph a CSV file containing biomedical data that relates hemoglobin levels, glucose levels, and chronic kidney disease (CKD). +Create n number of random test cases and determine whether the case is likely to have CKD depending on the mode of the classifications of the k number of nearest points. + +# K Nearest Neighbor Classifier Functions Descriptions: + +findDistanceArray inputs a numpy array, a random point, and an integer k and +uses the findDistance function from NearestNeighborClassifier. +The function outputs a 1D array containing the k number of nearest points to the random test case. + +graphKNearestNeighbor void function takes in two 1D and one 2D numpy arrays to graph. 
+One of the 1D arrays is a random testCase with its own distinct points. +The other 1D array is used to circle the k number of points closest to the test case. +The 2D array contains information parsed from the CSV column. +The first column (hemoglobin) is graphed as the x-axis and the second column (glucose) as the y-axis. +The third column (classification) determines the color of the points. +A legend is generated in a reasonable position. + +mainDriver function takes in nothing and graphs both the original CSV file, the k number of nearest neighbors, and the test case. +This function returns 0. + +# K Means Clustering Script Description: + +Process and graph a CSV file containing biomedical data that relates hemoglobin levels, glucose levels, and chronic kidney disease (CKD). +Randomly generate up to 10 centroids without issue. +Each centroid will have a classification. +The nearest centroid to a point will determine the point's classification (decide what to do if the distances are equal yourself). +Create random test cases until centroids stop moving and determine whether each case is likely to have CKD depending on the classification of the nearest centroid. + +# K Means Clustering Functions Descriptions: + +randomCentroids function takes in an integer number of clusters to be generated. +OR asks for k number of integer clusters +Outputs a 2D array filled with random values between 0-1. +The first column represents glucose and the second column represents hemoglobin. +There are k number of rows representing the number of centroids and the classification of each centroid (i.e.: row index = classification value). +OR you can have a third column with the classification value. + +assignToCentroids function takes in an array of normalized x (hemoglobin) and y (glucose) values from the CSV file and the randomly generated array of centroids from randomCentroids. 
+Using the findDistance function from NearestNeighborClassifier, points are assigned the same classification as the nearest centroid. +A 2D array of the normalized data and its classification are returned. + +updateCentroids function inputs the 2D array of centroid locations and of classified and normalized CSV data. +The average x (hemo) and y (gluc) positions of all data points for each classification are found and +an updated 2D array with these average cartesian points as the location for the new centroids is returned along with the original cartesian points. + +iterate void function can either +a) input information and iterate the original information until centArr ~ upCentArr +b) don't input any information and run by itself. Similar to a main script +The function causes the centroids to reassign points and update the centroid until the centroids do not move. + +graphClusters void function takes in a 1D and a 2D numpy array to graph. +The 1D array of centroid locations and classifications have distinct points on the graph. +The 2D array graphs points of normalized CSV data and colors them the same color as their corresponding centroids. +A legend is generated in a reasonable position. + +dataAnalysis void function takes in the original parsed CSV classifications and the final classifications of the data based on K-means clustering (use of centroids) and +compares the two to find false/true positives/negatives. + +mainDriver function takes in nothing and graphs both the original CSV file, the k number of nearest neighbors, and the test case. +This function returns 0. 
diff --git a/Summer-2020-Data-Analysis-Project-master/__pycache__/KNearestNeighborClassifier.cpython-37.pyc b/Summer-2020-Data-Analysis-Project-master/__pycache__/KNearestNeighborClassifier.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..14a61c6cbb1b28751cf21378d2680a221c1a70ac GIT binary patch literal 1500 zcma)6O>Y}T7@pbLFFQ_X+5!zo$P$;VP^zlpRE3J*2qBP6r2v&I^D*9;IGgN;nHi_C zwLTfR%-XH0N=S_Ke$33fGyA^J^Uf#R+iimJ=Rbda_j`ws zKiuTS@Zfw5(Vs&JB4|zuI`HV)X9K@k3kD&qmq$fBh_T0n{}UOsL?A-gwunRwyCt`l z1iSQvB(3K-kMt92{h|^wWp0WQLSje{qJMxg_eedN{&eV)$h#k5zUlhsRYq zEo5o*@X5@aRpszxS`<<}QPsIj&BF%|AAEn*d%9UYaPxy>Zur4b$@7`c^uZ?bpy1j_ z_5TCE=u58f^L?3(&qkH%B_K}~@z+apV$dEu!;!Aei~j#nbO z4mMv<;Eou?0^>a*V$l-qYahEUv9(;E+ZG+MeckvsTPw#x0_=DC$`P-@-qMk!YTo(^ ziFenqT_w!Y)CM^p$=rqoR~Hfp&gOSE1?rZY3hnD8@zmSytb0&{U^F#G+dvmQ&q4C| zv;vOVTqd2SGRkq>cE*ZN&Uz=`e46E;*d#YL(kV9pOS#5tSR=t8)p94h3fo|!veMWv zmt$E9>+7@XS>mg07l7s^YR9QtPC0h1)opn8&E~e(u}!e0-9Hmx*QU4VJ?hbz-i04y zC8p|q?4>Ti4OgxJ(u3#^p$swTH`kF1V!Ru+A`1%ZLjr-WMc;ZR1lnFrmdw2ksJq{v z?|)W3>lVD6y{w_`f^^+kHSOkAg%%liPb>APJCD)}aX;0U>kf@9bNL5ZIhmTKjDww2 zNp7SAuEA~Fbs*1CZPyNVtD4AiQ{*NYpvo?JZhhCFHe5%)UafY)v$E+!^^wc<1-)aG zs6*8USW7x~x8P=yS0>L!XbtDtaBbClh_FF9EhaPN_;2_Av#Zu=wDGPOyb@We8}!#i man)~bd*ufkU>i*1L%g2TTt0pSVcJ3OAOmo%cqhIahyMc6e1D_LOsPIrV!SSzgjZDUk1xd>lSLe&3_+PfRpCls{ho^Yd?) zJnwJn92XzTJygApMtFi{Ud~oHt8{-Qc;E>x{1=|^5BN$b0ujFORw5CJ8e~nxq7E60 zh8Tmai*Ydl*$|WB0_2#O5*H!I#k9BtIU(3%uQ~G?dwH!6wa0}+xreG}&`58E32%i9 zCOB64!pEw>hWEFP(H%0SLNY*ILe;;c8DNp{USiogzvd0R_Q?nzjqQlyN9nrd)iMlbYm=YUn=&tLO{7{TT3SDoh4qsxvtch$iMG7x*-&pLJ&A>R(zj8k z%t)e_7F91Tn%o9jX6rT(iAnzS)H~Sg&0kBmSAWz}>D4ur6k_#ZA(vIUBUhKp&UP*f zqgNm8n$5CUeY~CLQaw^-S9Z+8&4ruo#iggi(K+hRE#td$i$#*{>Qv7SE$8w?8>#-k zVM``TYSWVG#^ze7ma;_abUl@7zPD@Rb~`On({7XVdK!(#rroc>F5@@Orr6%3!^Hft zX$vsHx+2MCyKNioc3z6@jP&t#`}uZ~RWs@eKIVdQs<}*KbTb4&O-jsMze9IMg=B>4 z5Jy(VGpvg{C!h-dm486+4q9*6=gQqKw{;=2G*1x~JEphNUVBp}LaL_kzN{wj5!=|1 z#kwkU(>K6Fyq;#Vg#(;(&5o5lSu8#N=^M;n$2#Yt$J8wJk&CqLh!o$WW4uFvw9Aj! 
zKI`8yz5$>Q@LTqtUEhc6-EiQ@(v7ZyGTj;~wD&-r0l(utfA^_p0B(0M)@s%(kHbBw zO06dGeH#Kx8&XkR5>XuJ%6|fno6>CPZF|jV4;rD zc(~2+MDQwLXD!1smaa#M?bz}#6!8(G zHHVR?54Dag=C28@csdm3QPlj6^Oq?5b-PR zig!fF=PgA`n!G|upr0!PLh}BXmhYnW7a$kj@@zdV#B!=lQgo!-if{v9e~q?7>Tw}( zJA-j3bmxc2^=EeWtDPgtZ)I)+lBf2gOrnVAcCw>nV&oHKpCu$(V#+a}?m!~c+u#!)@FAaW>^pXWQ9hm@){UG*sOT{g0Odh2-~+#6)qo$?j=cR~ z5Fl`V%Yntc5Y5<~L4AnLDpq|&8U>>QfV=xBQ5#73&CteXcm{Xv?&&b#nEDtCtX~vw zQEjF;n{~e{gzP_rwI&W%jJv^*`xem~1%hHf3dFDI7#s?Tf$wvJSM`8+!n;1@!sFv2 z1UT?D9DR$cTf#3m{e2*Yy$Z?tFdSM_s9?82nUuK8kcM0z{PUez^>-W;tOhsSvKfpr8ynw zOd>rCyr-MxFZkeHvUCKyN18*_39M!u%g#M(&tT~O*LYx9j8|)haY%|WqWl1}9n?`7 zG43jc=!H4J$qn)v!R&zknsEo2y;^5W%y;xo1)IQqNfCNowsh~pDgREKY~)E=xR+lQ zu-1<5a7ulPUUSl3%o9UV$kH{u!=%}$k}g;_EVlFBZiP~!YPAz(=qjQPs?HBWP~{eV j`ahkC?^I9pU3yGhl*U;cPEN&9+=ws4wK$9?j0Y#7!I&E!-e&qo8(EH>V(Nli0`u1%v=eTfj&mtVLR0d3U97 zY1cO7qJR?g`xA7^H2jGpLu^Y=@P z{pC7i!h?Met=NQ+O!Aavd|mL;_SXGJObY3}V$vIk^+5VEc*WL38OjQDMMkm;9m$%U zg09MGIRjmjv+@JzDLE%EKu^ny@)GonyGjTXuAN1$^Lt(b>UY@JKCE+m%% zGd<~Hrf-7>JH`|(aveQn`)I3Z#h(y;OqA?3rf!O>tk0TfZ^ZBoKlSm3Z-nvse8Axi zJ)F;^|9UjTAA9Saz70%xTVNjBlNw(BIM# z)VcNJy`-p5Tfd#|w5-3SvaYSjq)>68te2|XdU2ZCpcm`7up;l-pxBCg3KO$<&xWmT zN*cW+FMCN|7uGLSx@mnGoA?dWA7ZU{e>}SL;@M7?Dg9J;JE~@8l#8o5Y6Fh@H=`s}BN7{88r zE=myL3 zFYdCJ3kJ}28{>_7wTw78P`X>_S$x|DI?mflQ!&v~Bf|w zPkIIuo~}nFY~*7^T%f(0FbGJ~7soGU(VQXi@wlW;6N7n15##V|}Y=9z7S2ZS)z zZPfRAU?Vf!gTsKkDGW%bKf?s;<@tNuo6jdp@snJty#~As_Yc8Q@CHXgq{Em<1SS78 z5x=2huq{*zfi&PE9T0Iu$D;x{+AxxUEnmgvw}rkfydgzIdc3oiZ+{hR0^*N-vMqT%u9lzhZnx_k!6m_fNCxDDvPnQ-7asI<1_a;w-a z0pz={X~MB9r0&WXi}EbgS{5hynof3=&fr}*_9HY+e5lXb3t4O^7iqGAYnm(@mfEGv z2Ki3b+b{7-)xTVcig%e;2U-^gVW|v@KK-A;M0d*@{5!g5++V+G9tE>=Q5e;t*{Bi) HkQe?1GD1bi literal 0 HcmV?d00001 diff --git a/Summer-2020-Data-Analysis-Project-master/ckd.csv b/Summer-2020-Data-Analysis-Project-master/ckd.csv new file mode 100644 index 0000000..8c30b6b --- /dev/null +++ b/Summer-2020-Data-Analysis-Project-master/ckd.csv @@ -0,0 +1,159 @@ +Glucose,Hemoglobin,Class +117,11.2,1 +70,9.5,1 
+380,10.8,1 +157,5.6,1 +173,7.7,1 +95,9.8,1 +264,12.5,1 +70,10,1 +253,10.5,1 +163,9.8,1 +129,9.1,1 +133,10.3,1 +76,7.1,1 +280,13,1 +210,16.1,1 +219,10.4,1 +295,9.2,1 +118,11.4,1 +224,8.1,1 +128,8.2,1 +118,12,1 +105,11.1,1 +288,7.9,1 +273,8.3,1 +122,12.6,1 +303,10.4,1 +102,8.7,1 +107,8.3,1 +117,10,1 +239,9.5,1 +94,9.9,1 +129,8.1,1 +252,11.2,1 +255,7.3,1 +253,10.9,1 +214,10.9,1 +490,11.5,1 +163,7.9,1 +241,9.6,1 +214,9.4,1 +106,8.6,1 +424,12.6,1 +176,3.1,1 +140,15,0 +70,17,0 +82,15.9,0 +119,15.4,0 +99,13,0 +121,13.6,0 +131,14.5,0 +91,14,0 +98,13.9,0 +104,16.1,0 +131,14.1,0 +122,17,0 +118,15.5,0 +117,16.2,0 +132,14.4,0 +97,14.2,0 +133,13.2,0 +122,13.9,0 +121,15,0 +111,14.3,0 +96,13.8,0 +139,14.8,0 +125,16.5,0 +123,15.7,0 +112,14.5,0 +140,16.3,0 +130,15.5,0 +123,14.6,0 +100,16.9,0 +94,16,0 +81,14.7,0 +93,16.6,0 +124,14.9,0 +89,16.7,0 +125,16.8,0 +91,13.5,0 +127,15.1,0 +96,16.9,0 +128,13.1,0 +122,17.1,0 +128,15.2,0 +137,13.6,0 +81,13.9,0 +102,13.2,0 +132,13.7,0 +104,17.3,0 +131,15.6,0 +102,15,0 +120,17.4,0 +105,15.7,0 +109,13.9,0 +130,15.9,0 +100,14,0 +109,15.8,0 +120,13.4,0 +80,14.1,0 +130,13.5,0 +99,17.7,0 +134,14.2,0 +92,14,0 +132,17.8,0 +88,13.3,0 +100,14.3,0 +130,13.4,0 +95,15,0 +111,16.2,0 +106,14.4,0 +97,13.5,0 +108,17.8,0 +99,13.6,0 +83,17.5,0 +109,15,0 +86,13.6,0 +102,14.6,0 +95,15,0 +87,17.1,0 +107,13.6,0 +117,13,0 +88,17.2,0 +105,14.7,0 +70,13.7,0 +89,15,0 +118,14.8,0 +81,15,0 +125,17.4,0 +82,14.9,0 +107,13.6,0 +83,16.2,0 +79,17.6,0 +109,15,0 +133,13.7,0 +111,16.3,0 +74,15.1,0 +88,16.4,0 +97,13.8,0 +78,16.1,0 +113,15.3,0 +75,16.8,0 +119,13.9,0 +132,15.4,0 +113,16.5,0 +100,16.4,0 +93,16.7,0 +94,15.5,0 +112,17,0 +99,15,0 +85,15.6,0 +133,14.8,0 +117,13,0 +137,14.1,0 +140,15.7,0 +75,16.5,0 +100,15.8,0 +114,14.2,0 +131,15.8,0 From 9ea6086d66bbb319a46358fa2c24058d7e84e8bf Mon Sep 17 00:00:00 2001 From: awong15 <46977254+awong15@users.noreply.github.com> Date: Sun, 6 Sep 2020 10:10:01 -0400 Subject: [PATCH 2/2] Changed folder name --- .../KMeansClustering.py | 258 
+++++++------- .../KNearestNeighborClassifier.py | 132 ++++---- .../NearestNeighborClassifier.py | 222 ++++++------ .../README.md | 0 .../KNearestNeighborClassifier.cpython-37.pyc | Bin .../NearestNeighborClassifier.cpython-37.pyc | Bin .../NearestNeighborClassifier.cpython-38.pyc | Bin .../ckd.csv | 318 +++++++++--------- 8 files changed, 465 insertions(+), 465 deletions(-) rename {Summer-2020-Data-Analysis-Project-master => Summer-2020-Data-Analysis-Project-Brandon-Branch}/KMeansClustering.py (97%) rename {Summer-2020-Data-Analysis-Project-master => Summer-2020-Data-Analysis-Project-Brandon-Branch}/KNearestNeighborClassifier.py (97%) rename {Summer-2020-Data-Analysis-Project-master => Summer-2020-Data-Analysis-Project-Brandon-Branch}/NearestNeighborClassifier.py (97%) rename {Summer-2020-Data-Analysis-Project-master => Summer-2020-Data-Analysis-Project-Brandon-Branch}/README.md (100%) rename {Summer-2020-Data-Analysis-Project-master => Summer-2020-Data-Analysis-Project-Brandon-Branch}/__pycache__/KNearestNeighborClassifier.cpython-37.pyc (100%) rename {Summer-2020-Data-Analysis-Project-master => Summer-2020-Data-Analysis-Project-Brandon-Branch}/__pycache__/NearestNeighborClassifier.cpython-37.pyc (100%) rename {Summer-2020-Data-Analysis-Project-master => Summer-2020-Data-Analysis-Project-Brandon-Branch}/__pycache__/NearestNeighborClassifier.cpython-38.pyc (100%) rename {Summer-2020-Data-Analysis-Project-master => Summer-2020-Data-Analysis-Project-Brandon-Branch}/ckd.csv (91%) diff --git a/Summer-2020-Data-Analysis-Project-master/KMeansClustering.py b/Summer-2020-Data-Analysis-Project-Brandon-Branch/KMeansClustering.py similarity index 97% rename from Summer-2020-Data-Analysis-Project-master/KMeansClustering.py rename to Summer-2020-Data-Analysis-Project-Brandon-Branch/KMeansClustering.py index 31e4484..0edbcf2 100644 --- a/Summer-2020-Data-Analysis-Project-master/KMeansClustering.py +++ b/Summer-2020-Data-Analysis-Project-Brandon-Branch/KMeansClustering.py @@ 
-1,130 +1,130 @@ -# ============================================================================= -# KMeansClustering.py -# Name: Alycia Wong and Brandon Wong -# Date: June 2020 -# Description: Process and graph a CSV file containing biomedical data that -# relates hemoglobin levels, glucose levels, and chronic kidney disease (CKD). -# Randomly generate up to 10 centroids without issue. Each centroid will have a -# classification. The nearest centroid to a point will determine the point's -# classicfication (decide what to do if the distances are equal yourself). -# Create random test cases until centroids stop mocing and determine whether -# each case is likely to have CKD depending on the classification of the -# nearest centroid. -# Bonus: Create lines roughly separating each centroid group -# ============================================================================= - -# ============================================================================= -# Import statements -# ============================================================================= -import matplotlib.pyplot as plt -import numpy as np -import NearestNeighborClassifier as NNC -from scipy.spatial import KDTree as kdt - -# ============================================================================= -# Functions -# ============================================================================= -# randomCentroids function takes in an integer number of clusters to be -# generated. -# OR asks for k number of integer clusters -# Outputs a 2D array filled with random values between 0-1. The -# first column represents glucose and the second column represents hemoglobin. -# There are k number of rows representing the number of centroids and the -# classification of each centroid (i.e.: row index = classification value). -# OR you can have a third column with the classification value. 
-def randomCentroids(k): - return np.random.rand(k,2) - -# assignCentroids function takes in an array of normalized x (hemoglobin) and y -# (glucose) values from the CSV file and the randomly generated array of -# centroids from randomCentroids. Using the findDistance function from -# NearestNeighborClassifier, points are assigned the same classification as the -# nearest centroid. A 2D array of the normalized data and its classification -# are returned. -def assignToCentroids(normArr, centArr): - return kdt(centArr).query(normArr)[1] -# print(assignToCentroids(NNC.normalizeData(NNC.openCSVFile('ckd.csv')).paras, np.array([[.5, .5],[.25,.25]]))) - -# updateCentroids function inputs the 2D array of centroid locations and of -# classified and normalized CSV data. The average x (hemo) and y (gluc) -# positions of all data points for each classifications are found and an -# updated 2D array with these average cartesian points as the location for the -# new centroids is returned along with the original cartesian points. 
-#avg of all 1s will be new cent, avg of all 0s will be new cent - -def updateCentroids(centArr, classArr, normArr): - upCentArr = centArr.copy() - for i in range(len(centArr[:,0])): - upCentArr[i,0] = np.mean(normArr.gluc[classArr==i]) - upCentArr[i,1] = np.mean(normArr.hemo[classArr==i]) - return upCentArr -# centArr = np.array([[0.5, 0.5], [.25, .25]]) -# print(updateCentroids( -# centArr, assignToCentroids( -# NNC.normalizeData(NNC.openCSVFile('ckd.csv')).paras, centArr), -# NNC.normalizeData(NNC.openCSVFile('ckd.csv')) -# )) -# print(centArr) - -# iterate void function can either -# a) input information and iterate the original information until centArr ~ -# upCentArr -def iterate(normArr, centArr): - # classArr = np.zeros(len(normArr.gluc)) - classArr = assignToCentroids(normArr, centArr) - upCentArr = updateCentroids(centArr, classArr, normArr) - # print(classArr) - if (upCentArr != centArr).any(): - centArr = upCentArr - return iterate(normArr, centArr) - return centArr -print(iterate( - NNC.normalizeData(NNC.openCSVFile('ckd.csv')), np.array([[.5, .5],[.25,.25]]) - )) - -# graphClusters void function takes in a 1D and a 2D numpy array to graph. The -# 1D array of centroid locations and classifactions have distinct points on the -# graph. The 2D array graphs points of normalized CSV data and colors them the -# same color as their corresponding centroids. A legend is generated in a -# reasonable position. -# Bonus: Create lines roughly separating each centroid group -def graphClusters(): - - return - -# dataAnalysis void function takes in the original parsed CSV classifications -# and the final classifications of the data based on K-means clustering (use of -# centroids) and compares the two to find false/true positives/negatives. 
-# Note: This should only run when there are two centroids (i.e.: k = 2) -# False positive: Percentage of non-CKD were incorrectly labelled by K-Means as -# being in the CKD cluster -# True positive (sensitivity): Percentage of CKD patients were correctly -# labeled by K-Means -# False negative: Percentage of non-CKD were incorrectly labelled by K-Means as -# being in the CKD cluster -# True negative (specificity): Percentage of non-CKD patients were correctly -# labelled by K-Means -# Note: True positive (~93 %) + False positive (~7%) = 100% -# Note: True Negative (~100%) + False negative (~0%) = 100% -def dataAnalysis(): - return -# ============================================================================= -# Main Script -# ============================================================================= -# mainDriver function takes in nothing and graphs both the orginial CSV file, -# the k number of nearest neighbors, and the test case. This function returns -# 0. -def mainDriver(): - # Open the CSV file using the parsing method from - # NearestNeighborClassifier. No input, outputs 2D numpy array. - NNC.openCSVFile - - # Normalize data using method from NearestNeighborClassifier. Input and - # outputs a 2D numpy array - NNC.normalizeData() - - # Graph CSV file using method from NearestNeighborClassifier. Input 2D - # numpy array. Void function. - NNC.graphCSVFile() - +# ============================================================================= +# KMeansClustering.py +# Name: Alycia Wong and Brandon Wong +# Date: June 2020 +# Description: Process and graph a CSV file containing biomedical data that +# relates hemoglobin levels, glucose levels, and chronic kidney disease (CKD). +# Randomly generate up to 10 centroids without issue. Each centroid will have a +# classification. The nearest centroid to a point will determine the point's +# classicfication (decide what to do if the distances are equal yourself). 
+# Create random test cases until centroids stop mocing and determine whether +# each case is likely to have CKD depending on the classification of the +# nearest centroid. +# Bonus: Create lines roughly separating each centroid group +# ============================================================================= + +# ============================================================================= +# Import statements +# ============================================================================= +import matplotlib.pyplot as plt +import numpy as np +import NearestNeighborClassifier as NNC +from scipy.spatial import KDTree as kdt + +# ============================================================================= +# Functions +# ============================================================================= +# randomCentroids function takes in an integer number of clusters to be +# generated. +# OR asks for k number of integer clusters +# Outputs a 2D array filled with random values between 0-1. The +# first column represents glucose and the second column represents hemoglobin. +# There are k number of rows representing the number of centroids and the +# classification of each centroid (i.e.: row index = classification value). +# OR you can have a third column with the classification value. +def randomCentroids(k): + return np.random.rand(k,2) + +# assignCentroids function takes in an array of normalized x (hemoglobin) and y +# (glucose) values from the CSV file and the randomly generated array of +# centroids from randomCentroids. Using the findDistance function from +# NearestNeighborClassifier, points are assigned the same classification as the +# nearest centroid. A 2D array of the normalized data and its classification +# are returned. 
+def assignToCentroids(normArr, centArr): + return kdt(centArr).query(normArr)[1] +# print(assignToCentroids(NNC.normalizeData(NNC.openCSVFile('ckd.csv')).paras, np.array([[.5, .5],[.25,.25]]))) + +# updateCentroids function inputs the 2D array of centroid locations and of +# classified and normalized CSV data. The average x (hemo) and y (gluc) +# positions of all data points for each classifications are found and an +# updated 2D array with these average cartesian points as the location for the +# new centroids is returned along with the original cartesian points. +#avg of all 1s will be new cent, avg of all 0s will be new cent + +def updateCentroids(centArr, classArr, normArr): + upCentArr = centArr.copy() + for i in range(len(centArr[:,0])): + upCentArr[i,0] = np.mean(normArr.gluc[classArr==i]) + upCentArr[i,1] = np.mean(normArr.hemo[classArr==i]) + return upCentArr +# centArr = np.array([[0.5, 0.5], [.25, .25]]) +# print(updateCentroids( +# centArr, assignToCentroids( +# NNC.normalizeData(NNC.openCSVFile('ckd.csv')).paras, centArr), +# NNC.normalizeData(NNC.openCSVFile('ckd.csv')) +# )) +# print(centArr) + +# iterate void function can either +# a) input information and iterate the original information until centArr ~ +# upCentArr +def iterate(normArr, centArr): + # classArr = np.zeros(len(normArr.gluc)) + classArr = assignToCentroids(normArr, centArr) + upCentArr = updateCentroids(centArr, classArr, normArr) + # print(classArr) + if (upCentArr != centArr).any(): + centArr = upCentArr + return iterate(normArr, centArr) + return centArr +print(iterate( + NNC.normalizeData(NNC.openCSVFile('ckd.csv')), np.array([[.5, .5],[.25,.25]]) + )) + +# graphClusters void function takes in a 1D and a 2D numpy array to graph. The +# 1D array of centroid locations and classifactions have distinct points on the +# graph. The 2D array graphs points of normalized CSV data and colors them the +# same color as their corresponding centroids. 
A legend is generated in a +# reasonable position. +# Bonus: Create lines roughly separating each centroid group +def graphClusters(): + + return + +# dataAnalysis void function takes in the original parsed CSV classifications +# and the final classifications of the data based on K-means clustering (use of +# centroids) and compares the two to find false/true positives/negatives. +# Note: This should only run when there are two centroids (i.e.: k = 2) +# False positive: Percentage of non-CKD were incorrectly labelled by K-Means as +# being in the CKD cluster +# True positive (sensitivity): Percentage of CKD patients were correctly +# labeled by K-Means +# False negative: Percentage of non-CKD were incorrectly labelled by K-Means as +# being in the CKD cluster +# True negative (specificity): Percentage of non-CKD patients were correctly +# labelled by K-Means +# Note: True positive (~93 %) + False positive (~7%) = 100% +# Note: True Negative (~100%) + False negative (~0%) = 100% +def dataAnalysis(): + return +# ============================================================================= +# Main Script +# ============================================================================= +# mainDriver function takes in nothing and graphs both the orginial CSV file, +# the k number of nearest neighbors, and the test case. This function returns +# 0. +def mainDriver(): + # Open the CSV file using the parsing method from + # NearestNeighborClassifier. No input, outputs 2D numpy array. + NNC.openCSVFile + + # Normalize data using method from NearestNeighborClassifier. Input and + # outputs a 2D numpy array + NNC.normalizeData() + + # Graph CSV file using method from NearestNeighborClassifier. Input 2D + # numpy array. Void function. 
+ NNC.graphCSVFile() + return 0 \ No newline at end of file diff --git a/Summer-2020-Data-Analysis-Project-master/KNearestNeighborClassifier.py b/Summer-2020-Data-Analysis-Project-Brandon-Branch/KNearestNeighborClassifier.py similarity index 97% rename from Summer-2020-Data-Analysis-Project-master/KNearestNeighborClassifier.py rename to Summer-2020-Data-Analysis-Project-Brandon-Branch/KNearestNeighborClassifier.py index 5f53890..382bd59 100644 --- a/Summer-2020-Data-Analysis-Project-master/KNearestNeighborClassifier.py +++ b/Summer-2020-Data-Analysis-Project-Brandon-Branch/KNearestNeighborClassifier.py @@ -1,67 +1,67 @@ -# ============================================================================= -# KNearestNeighborClassifier.py -# Name: Alycia Wong and Brandon Wong -# Date: June 2020 -# Description: Process and graph a CSV file containing biomedical data that -# relates hemoglobin levels, glucose levels, and chronic kidney disease (CKD). -# Create a random test case and determine whether the case is -# likely to have CKD depending on the mode of the classifications of the -# k number of nearest points. -# ============================================================================= - -# ============================================================================= -# Import statements -# ============================================================================= -import matplotlib.pyplot as plt -import numpy as np -import NearestNeighborClassifier as NNC -from statistics import mode - -# ============================================================================= -# Functions -# ============================================================================= -# findDistanceArray inputs a numpy array, a random point, and an integer k and -# uses the findDistance function from NearestNeighborClassifier. The function -# outputs a 1D array containing the k number of nearst points to the random -# test case. 
-def findDistanceArray(normArr, testCase, k): - distArr = np.zeros(normArr.len) - for i in range(len(distArr)): - distArr[i] = NNC.findDistance(normArr.hemo[i], normArr.gluc[i], testCase[1], testCase[0]) - kindex = np.argsort(distArr)[:k] - return kindex - -# graphKNearestNeighbor void function takes in two 1D and one 2D numpy arrays -# to graph. One of the 1D arrays is a random testCase with its own distinct -# points. The other 1D array is used to circle the k number of points closest -# to the test case. The 2D array contains information parsed from the CSV -# column. The first column (hemoglobin) is graphed as the x-axis and the second -# column (glucose) as the y-axis. The third column (classification) determines -# the color of the points. A legend is generated in a reasonable position. -def graphKNearestNeighbor(testCase, normArr, k): - kindex = findDistanceArray(normArr, testCase, k) - NNC.graphCSVFile(normArr) - plt.scatter(testCase[1], testCase[0], - c = ('b' if mode(normArr.disease[kindex])==0 else 'r'), - label = 'Test Case', - marker = "x") - plt.scatter(normArr.hemo[kindex], normArr.gluc[kindex], - c='y', label = 'Nearest neighbor(s)') - print("butts") - plt.legend(fontsize="small") - plt.show() - return - -# ============================================================================= -# Main Script -# ============================================================================= -# mainDriver function takes in nothing and graphs both the orginial CSV file, -# the k number of nearest neighbors, and the test case. 
This function returns -# 0.5 -def mainDriver(): - val = int(input("How many neighbors are you looking for: ")) - test = NNC.createTestCase() - normal = NNC.normalizeData(NNC.openCSVFile('ckd.csv')) - graphKNearestNeighbor(test, normal, val) - return 0 +# ============================================================================= +# KNearestNeighborClassifier.py +# Name: Alycia Wong and Brandon Wong +# Date: June 2020 +# Description: Process and graph a CSV file containing biomedical data that +# relates hemoglobin levels, glucose levels, and chronic kidney disease (CKD). +# Create a random test case and determine whether the case is +# likely to have CKD depending on the mode of the classifications of the +# k number of nearest points. +# ============================================================================= + +# ============================================================================= +# Import statements +# ============================================================================= +import matplotlib.pyplot as plt +import numpy as np +import NearestNeighborClassifier as NNC +from statistics import mode + +# ============================================================================= +# Functions +# ============================================================================= +# findDistanceArray inputs a numpy array, a random point, and an integer k and +# uses the findDistance function from NearestNeighborClassifier. The function +# outputs a 1D array containing the k number of nearst points to the random +# test case. +def findDistanceArray(normArr, testCase, k): + distArr = np.zeros(normArr.len) + for i in range(len(distArr)): + distArr[i] = NNC.findDistance(normArr.hemo[i], normArr.gluc[i], testCase[1], testCase[0]) + kindex = np.argsort(distArr)[:k] + return kindex + +# graphKNearestNeighbor void function takes in two 1D and one 2D numpy arrays +# to graph. One of the 1D arrays is a random testCase with its own distinct +# points. 
The other 1D array is used to circle the k number of points closest +# to the test case. The 2D array contains information parsed from the CSV +# column. The first column (hemoglobin) is graphed as the x-axis and the second +# column (glucose) as the y-axis. The third column (classification) determines +# the color of the points. A legend is generated in a reasonable position. +def graphKNearestNeighbor(testCase, normArr, k): + kindex = findDistanceArray(normArr, testCase, k) + NNC.graphCSVFile(normArr) + plt.scatter(testCase[1], testCase[0], + c = ('b' if mode(normArr.disease[kindex])==0 else 'r'), + label = 'Test Case', + marker = "x") + plt.scatter(normArr.hemo[kindex], normArr.gluc[kindex], + c='y', label = 'Nearest neighbor(s)') + print("butts") + plt.legend(fontsize="small") + plt.show() + return + +# ============================================================================= +# Main Script +# ============================================================================= +# mainDriver function takes in nothing and graphs both the orginial CSV file, +# the k number of nearest neighbors, and the test case. 
This function returns +# 0.5 +def mainDriver(): + val = int(input("How many neighbors are you looking for: ")) + test = NNC.createTestCase() + normal = NNC.normalizeData(NNC.openCSVFile('ckd.csv')) + graphKNearestNeighbor(test, normal, val) + return 0 mainDriver() \ No newline at end of file diff --git a/Summer-2020-Data-Analysis-Project-master/NearestNeighborClassifier.py b/Summer-2020-Data-Analysis-Project-Brandon-Branch/NearestNeighborClassifier.py similarity index 97% rename from Summer-2020-Data-Analysis-Project-master/NearestNeighborClassifier.py rename to Summer-2020-Data-Analysis-Project-Brandon-Branch/NearestNeighborClassifier.py index cb298ce..6d0e806 100644 --- a/Summer-2020-Data-Analysis-Project-master/NearestNeighborClassifier.py +++ b/Summer-2020-Data-Analysis-Project-Brandon-Branch/NearestNeighborClassifier.py @@ -1,112 +1,112 @@ -# ============================================================================= -# NearestNeighborClassifier.py -# Name: Alycia Wong and Brandon Wong -# Date: June 2020 -# Description: Process and graph a CSV file containing biomedical data that -# relates hemoglobin levels, glucose levels, and chronic kidney disease (CKD). -# Create n number of random test cases and determine whether the case is -# likely to have CKD depending on the classification of the nearest point. 
-# ============================================================================= - -# ============================================================================= -# Import statements -# ============================================================================= -import matplotlib.pyplot as plt -import numpy as np - -# ============================================================================= -# Classes -# ============================================================================= -class Butts: - def __init__(self, data): - self.gluc = data[:,0] - self.hemo = data[:,1] - self.disease = data[:,2] - self.len = len(data) - self.all = data[:,:3] - self.paras = data[:,:2] - self.shape = np.shape(data) - self.colmax = np.amax(data, axis = 0) - self.colmin = np.amin(data, axis = 0) - -# ============================================================================= -# Functions -# ============================================================================= -# Parses in file and turns it into Butts class of data -def openCSVFile(fileName): - return Butts(np.genfromtxt(fileName, delimiter=',',skip_header=1)) - -# Takes in butts class -# Loops over data normalizing it for every row -# returns normalized butts class data -def normalizeData(dataArr): - normArr = np.zeros(dataArr.shape) - for i in range(len(normArr)): - normArr[i] = (dataArr.all[i] - dataArr.colmin) / (dataArr.colmax - dataArr.colmin) - return Butts(normArr) - -# graphCSVFile void function takes in a 2D numpy array and graphs with the -# first column (hemoglobin) as the x-axis and second column (glucose) as the -# y-axis. The third column (classification) is used to determine the color of -# the points on the graph. 
-def graphCSVFile(normArr): - plt.scatter(normArr.hemo[normArr.disease==0], normArr.gluc[normArr.disease==0], - c='b', label='No CKD' ) - plt.scatter(normArr.hemo[normArr.disease==1], normArr.gluc[normArr.disease==1], - c='r', label='CKD') - plt.title('Hemoglobin and Glucose levels') - plt.xlabel('Hemoglobin') - plt.ylabel('Glucose') - return -# findDistance function is either: -# a) takes in an array and a point and returns an array of distances or the -# minimum distance or -# B) takes in cartesian coordinates and uses a simple use of the distance -# formula to return the distance between the two points. -def findDistance(x1, y1, x2, y2): - return np.sqrt((x1-x2)**2+(y1-y2)**2) - -# createTestCase function creates two random test cases (hemoglobin and -# glucose) from 0-1 and: -# creates a new 1D array with the two points -# return the points raw -def createTestCase(): - return np.random.rand(2) - -# nearestNeighborIndex takes in the test case point and returns the index of the -# nearest point to the test case -def nearestNeighborIndex(testCase, normArr): - distArr = np.zeros(normArr.len) - for i in range(len(distArr)): - distArr[i] = findDistance(normArr.hemo[i], normArr.gluc[i], testCase[1], testCase[0]) - nni = distArr.argmin() - return nni - -# graphNearestNeighbor void function takes in a 2D numpy array (and a cartesian -# coordinate depending on createTestCase) and graphs the first column -# (hemoglobin) as the x-axis and the second column (glucose) as the y-axis -# the third column (classification) determines the color of the points. A -# randomly generated test case is graphed as a distinct point with a -# line connecting it to the nearest neighbor whose classification it takes on. -# A legend is generated in a reasonable position. 
-def graphNearestNeighbor(testCase, normArr): - nni = nearestNeighborIndex(testCase, normArr) - graphCSVFile(normArr) - plt.scatter(testCase[1], testCase[0], - c = ('b' if normArr.disease[nni]==0 else 'r'), - label = 'Test Case', - marker = "x") - plt.plot([testCase[1], normArr.hemo[nni]], [testCase[0], normArr.gluc[nni]], 'k-') - plt.legend() - plt.show() - return - -# ============================================================================= -# Main Script -# ============================================================================= -# mainDriver function takes in no inputs and graphs both the orginial CSV -# file and the test case. This function returns 0. -def mainDriver(): - graphNearestNeighbor(createTestCase(), normalizeData(openCSVFile('ckd.csv'))) - return 0 +# ============================================================================= +# NearestNeighborClassifier.py +# Name: Alycia Wong and Brandon Wong +# Date: June 2020 +# Description: Process and graph a CSV file containing biomedical data that +# relates hemoglobin levels, glucose levels, and chronic kidney disease (CKD). +# Create n number of random test cases and determine whether the case is +# likely to have CKD depending on the classification of the nearest point. 
+# ============================================================================= + +# ============================================================================= +# Import statements +# ============================================================================= +import matplotlib.pyplot as plt +import numpy as np + +# ============================================================================= +# Classes +# ============================================================================= +class Butts: + def __init__(self, data): + self.gluc = data[:,0] + self.hemo = data[:,1] + self.disease = data[:,2] + self.len = len(data) + self.all = data[:,:3] + self.paras = data[:,:2] + self.shape = np.shape(data) + self.colmax = np.amax(data, axis = 0) + self.colmin = np.amin(data, axis = 0) + +# ============================================================================= +# Functions +# ============================================================================= +# Parses in file and turns it into Butts class of data +def openCSVFile(fileName): + return Butts(np.genfromtxt(fileName, delimiter=',',skip_header=1)) + +# Takes in butts class +# Loops over data normalizing it for every row +# returns normalized butts class data +def normalizeData(dataArr): + normArr = np.zeros(dataArr.shape) + for i in range(len(normArr)): + normArr[i] = (dataArr.all[i] - dataArr.colmin) / (dataArr.colmax - dataArr.colmin) + return Butts(normArr) + +# graphCSVFile void function takes in a 2D numpy array and graphs with the +# first column (hemoglobin) as the x-axis and second column (glucose) as the +# y-axis. The third column (classification) is used to determine the color of +# the points on the graph. 
+def graphCSVFile(normArr): + plt.scatter(normArr.hemo[normArr.disease==0], normArr.gluc[normArr.disease==0], + c='b', label='No CKD' ) + plt.scatter(normArr.hemo[normArr.disease==1], normArr.gluc[normArr.disease==1], + c='r', label='CKD') + plt.title('Hemoglobin and Glucose levels') + plt.xlabel('Hemoglobin') + plt.ylabel('Glucose') + return +# findDistance function is either: +# a) takes in an array and a point and returns an array of distances or the +# minimum distance or +# B) takes in cartesian coordinates and uses a simple use of the distance +# formula to return the distance between the two points. +def findDistance(x1, y1, x2, y2): + return np.sqrt((x1-x2)**2+(y1-y2)**2) + +# createTestCase function creates two random test cases (hemoglobin and +# glucose) from 0-1 and: +# creates a new 1D array with the two points +# return the points raw +def createTestCase(): + return np.random.rand(2) + +# nearestNeighborIndex takes in the test case point and returns the index of the +# nearest point to the test case +def nearestNeighborIndex(testCase, normArr): + distArr = np.zeros(normArr.len) + for i in range(len(distArr)): + distArr[i] = findDistance(normArr.hemo[i], normArr.gluc[i], testCase[1], testCase[0]) + nni = distArr.argmin() + return nni + +# graphNearestNeighbor void function takes in a 2D numpy array (and a cartesian +# coordinate depending on createTestCase) and graphs the first column +# (hemoglobin) as the x-axis and the second column (glucose) as the y-axis +# the third column (classification) determines the color of the points. A +# randomly generated test case is graphed as a distinct point with a +# line connecting it to the nearest neighbor whose classification it takes on. +# A legend is generated in a reasonable position. 
+def graphNearestNeighbor(testCase, normArr): + nni = nearestNeighborIndex(testCase, normArr) + graphCSVFile(normArr) + plt.scatter(testCase[1], testCase[0], + c = ('b' if normArr.disease[nni]==0 else 'r'), + label = 'Test Case', + marker = "x") + plt.plot([testCase[1], normArr.hemo[nni]], [testCase[0], normArr.gluc[nni]], 'k-') + plt.legend() + plt.show() + return + +# ============================================================================= +# Main Script +# ============================================================================= +# mainDriver function takes in no inputs and graphs both the orginial CSV +# file and the test case. This function returns 0. +def mainDriver(): + graphNearestNeighbor(createTestCase(), normalizeData(openCSVFile('ckd.csv'))) + return 0 # mainDriver() \ No newline at end of file diff --git a/Summer-2020-Data-Analysis-Project-master/README.md b/Summer-2020-Data-Analysis-Project-Brandon-Branch/README.md similarity index 100% rename from Summer-2020-Data-Analysis-Project-master/README.md rename to Summer-2020-Data-Analysis-Project-Brandon-Branch/README.md diff --git a/Summer-2020-Data-Analysis-Project-master/__pycache__/KNearestNeighborClassifier.cpython-37.pyc b/Summer-2020-Data-Analysis-Project-Brandon-Branch/__pycache__/KNearestNeighborClassifier.cpython-37.pyc similarity index 100% rename from Summer-2020-Data-Analysis-Project-master/__pycache__/KNearestNeighborClassifier.cpython-37.pyc rename to Summer-2020-Data-Analysis-Project-Brandon-Branch/__pycache__/KNearestNeighborClassifier.cpython-37.pyc diff --git a/Summer-2020-Data-Analysis-Project-master/__pycache__/NearestNeighborClassifier.cpython-37.pyc b/Summer-2020-Data-Analysis-Project-Brandon-Branch/__pycache__/NearestNeighborClassifier.cpython-37.pyc similarity index 100% rename from Summer-2020-Data-Analysis-Project-master/__pycache__/NearestNeighborClassifier.cpython-37.pyc rename to 
Summer-2020-Data-Analysis-Project-Brandon-Branch/__pycache__/NearestNeighborClassifier.cpython-37.pyc diff --git a/Summer-2020-Data-Analysis-Project-master/__pycache__/NearestNeighborClassifier.cpython-38.pyc b/Summer-2020-Data-Analysis-Project-Brandon-Branch/__pycache__/NearestNeighborClassifier.cpython-38.pyc similarity index 100% rename from Summer-2020-Data-Analysis-Project-master/__pycache__/NearestNeighborClassifier.cpython-38.pyc rename to Summer-2020-Data-Analysis-Project-Brandon-Branch/__pycache__/NearestNeighborClassifier.cpython-38.pyc diff --git a/Summer-2020-Data-Analysis-Project-master/ckd.csv b/Summer-2020-Data-Analysis-Project-Brandon-Branch/ckd.csv similarity index 91% rename from Summer-2020-Data-Analysis-Project-master/ckd.csv rename to Summer-2020-Data-Analysis-Project-Brandon-Branch/ckd.csv index 8c30b6b..d071373 100644 --- a/Summer-2020-Data-Analysis-Project-master/ckd.csv +++ b/Summer-2020-Data-Analysis-Project-Brandon-Branch/ckd.csv @@ -1,159 +1,159 @@ -Glucose,Hemoglobin,Class -117,11.2,1 -70,9.5,1 -380,10.8,1 -157,5.6,1 -173,7.7,1 -95,9.8,1 -264,12.5,1 -70,10,1 -253,10.5,1 -163,9.8,1 -129,9.1,1 -133,10.3,1 -76,7.1,1 -280,13,1 -210,16.1,1 -219,10.4,1 -295,9.2,1 -118,11.4,1 -224,8.1,1 -128,8.2,1 -118,12,1 -105,11.1,1 -288,7.9,1 -273,8.3,1 -122,12.6,1 -303,10.4,1 -102,8.7,1 -107,8.3,1 -117,10,1 -239,9.5,1 -94,9.9,1 -129,8.1,1 -252,11.2,1 -255,7.3,1 -253,10.9,1 -214,10.9,1 -490,11.5,1 -163,7.9,1 -241,9.6,1 -214,9.4,1 -106,8.6,1 -424,12.6,1 -176,3.1,1 -140,15,0 -70,17,0 -82,15.9,0 -119,15.4,0 -99,13,0 -121,13.6,0 -131,14.5,0 -91,14,0 -98,13.9,0 -104,16.1,0 -131,14.1,0 -122,17,0 -118,15.5,0 -117,16.2,0 -132,14.4,0 -97,14.2,0 -133,13.2,0 -122,13.9,0 -121,15,0 -111,14.3,0 -96,13.8,0 -139,14.8,0 -125,16.5,0 -123,15.7,0 -112,14.5,0 -140,16.3,0 -130,15.5,0 -123,14.6,0 -100,16.9,0 -94,16,0 -81,14.7,0 -93,16.6,0 -124,14.9,0 -89,16.7,0 -125,16.8,0 -91,13.5,0 -127,15.1,0 -96,16.9,0 -128,13.1,0 -122,17.1,0 -128,15.2,0 -137,13.6,0 -81,13.9,0 -102,13.2,0 
-132,13.7,0 -104,17.3,0 -131,15.6,0 -102,15,0 -120,17.4,0 -105,15.7,0 -109,13.9,0 -130,15.9,0 -100,14,0 -109,15.8,0 -120,13.4,0 -80,14.1,0 -130,13.5,0 -99,17.7,0 -134,14.2,0 -92,14,0 -132,17.8,0 -88,13.3,0 -100,14.3,0 -130,13.4,0 -95,15,0 -111,16.2,0 -106,14.4,0 -97,13.5,0 -108,17.8,0 -99,13.6,0 -83,17.5,0 -109,15,0 -86,13.6,0 -102,14.6,0 -95,15,0 -87,17.1,0 -107,13.6,0 -117,13,0 -88,17.2,0 -105,14.7,0 -70,13.7,0 -89,15,0 -118,14.8,0 -81,15,0 -125,17.4,0 -82,14.9,0 -107,13.6,0 -83,16.2,0 -79,17.6,0 -109,15,0 -133,13.7,0 -111,16.3,0 -74,15.1,0 -88,16.4,0 -97,13.8,0 -78,16.1,0 -113,15.3,0 -75,16.8,0 -119,13.9,0 -132,15.4,0 -113,16.5,0 -100,16.4,0 -93,16.7,0 -94,15.5,0 -112,17,0 -99,15,0 -85,15.6,0 -133,14.8,0 -117,13,0 -137,14.1,0 -140,15.7,0 -75,16.5,0 -100,15.8,0 -114,14.2,0 -131,15.8,0 +Glucose,Hemoglobin,Class +117,11.2,1 +70,9.5,1 +380,10.8,1 +157,5.6,1 +173,7.7,1 +95,9.8,1 +264,12.5,1 +70,10,1 +253,10.5,1 +163,9.8,1 +129,9.1,1 +133,10.3,1 +76,7.1,1 +280,13,1 +210,16.1,1 +219,10.4,1 +295,9.2,1 +118,11.4,1 +224,8.1,1 +128,8.2,1 +118,12,1 +105,11.1,1 +288,7.9,1 +273,8.3,1 +122,12.6,1 +303,10.4,1 +102,8.7,1 +107,8.3,1 +117,10,1 +239,9.5,1 +94,9.9,1 +129,8.1,1 +252,11.2,1 +255,7.3,1 +253,10.9,1 +214,10.9,1 +490,11.5,1 +163,7.9,1 +241,9.6,1 +214,9.4,1 +106,8.6,1 +424,12.6,1 +176,3.1,1 +140,15,0 +70,17,0 +82,15.9,0 +119,15.4,0 +99,13,0 +121,13.6,0 +131,14.5,0 +91,14,0 +98,13.9,0 +104,16.1,0 +131,14.1,0 +122,17,0 +118,15.5,0 +117,16.2,0 +132,14.4,0 +97,14.2,0 +133,13.2,0 +122,13.9,0 +121,15,0 +111,14.3,0 +96,13.8,0 +139,14.8,0 +125,16.5,0 +123,15.7,0 +112,14.5,0 +140,16.3,0 +130,15.5,0 +123,14.6,0 +100,16.9,0 +94,16,0 +81,14.7,0 +93,16.6,0 +124,14.9,0 +89,16.7,0 +125,16.8,0 +91,13.5,0 +127,15.1,0 +96,16.9,0 +128,13.1,0 +122,17.1,0 +128,15.2,0 +137,13.6,0 +81,13.9,0 +102,13.2,0 +132,13.7,0 +104,17.3,0 +131,15.6,0 +102,15,0 +120,17.4,0 +105,15.7,0 +109,13.9,0 +130,15.9,0 +100,14,0 +109,15.8,0 +120,13.4,0 +80,14.1,0 +130,13.5,0 +99,17.7,0 +134,14.2,0 +92,14,0 +132,17.8,0 
+88,13.3,0 +100,14.3,0 +130,13.4,0 +95,15,0 +111,16.2,0 +106,14.4,0 +97,13.5,0 +108,17.8,0 +99,13.6,0 +83,17.5,0 +109,15,0 +86,13.6,0 +102,14.6,0 +95,15,0 +87,17.1,0 +107,13.6,0 +117,13,0 +88,17.2,0 +105,14.7,0 +70,13.7,0 +89,15,0 +118,14.8,0 +81,15,0 +125,17.4,0 +82,14.9,0 +107,13.6,0 +83,16.2,0 +79,17.6,0 +109,15,0 +133,13.7,0 +111,16.3,0 +74,15.1,0 +88,16.4,0 +97,13.8,0 +78,16.1,0 +113,15.3,0 +75,16.8,0 +119,13.9,0 +132,15.4,0 +113,16.5,0 +100,16.4,0 +93,16.7,0 +94,15.5,0 +112,17,0 +99,15,0 +85,15.6,0 +133,14.8,0 +117,13,0 +137,14.1,0 +140,15.7,0 +75,16.5,0 +100,15.8,0 +114,14.2,0 +131,15.8,0