-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathEMGMMbyK.py
More file actions
292 lines (225 loc) · 8.96 KB
/
EMGMMbyK.py
File metadata and controls
292 lines (225 loc) · 8.96 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
__author__ = 'seanhendryx'
# Sean Hendryx
# Authored in the Spring of 2016 at the University of Arizona as part of thesis work aimed at quantifying natural resources from computer vision.
# Script built to run on Python version 3.4
# References:
# scipy.cluster.vq.kmeans: Centroids used to initialize mu vectors <http://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.vq.kmeans.html#scipy.cluster.vq.kmeans>
#NOTES: Test on very small subset of image pixels. Actual test image has 1,973,905 pixels (i.e. observations (N))
import numpy
from scipy.cluster.vq import kmeans
from scipy.stats import multivariate_normal
import matplotlib.pyplot as plt
def main():
    """
    Fit Gaussian mixture models by EM for a range of class counts K and
    plot the log-likelihood trajectories.

    Terminology:
        df      := dataframe of observations (commonly denoted bold X)
        mu      := array of mean vectors, one row per cluster/class k
        covar   := stack of covariance matrices, one per cluster
        weights := mixture weights for each class; must sum to 1
        k       := number of clusters
        gamma   := responsibility of component k for data point xn
                   (N rows, one column per class)

    Side effects: reads 'mesquitesSubsetPixelData.csv' from the working
    directory and opens matplotlib figures.
    :return: None
    """
    # Largest number of classes + 1 (e.g. 11 runs K = 2..10).
    mostClasses = 11
    # (K, validation log-likelihood) pairs; rows 0 and 1 are never filled
    # because K starts at 2, so only the filled slice is plotted below.
    visLk = numpy.zeros((mostClasses, 2))
    # Number of dimensions (3 in this case: RGB).
    dimensions = 3
    df = numpy.loadtxt('mesquitesSubsetPixelData.csv', delimiter=',')
    # Hold out every nth observation (n = fold) as a test set, i.e. putting
    # 1/10 into the test dataset and 9/10 into training.
    fold = 10
    testdf = df[::fold, :]
    # Training dataset with the held-out rows removed:
    df = numpy.delete(df, list(range(0, df.shape[0], fold)), axis=0)
    # Number of EM iterations per K:
    iterations = 11
    for k in numpy.arange(start=2, stop=mostClasses):
        mu, covar, weights = initialize(df, k, dimensions)
        print("Initialized mu: ", mu)
        # (iteration, training log-likelihood) pairs for this K:
        visL = numpy.zeros((iterations, 2))
        for step in range(iterations):
            gamma = expectation(df, weights, mu, covar, k)
            mu, covar, weights = maximize(df, weights, mu, covar, gamma, k)
            newLikelihood = evalLikelihood(df, weights, mu, covar, k)
            print('Iteration #:', step)
            print('L: ', newLikelihood)
            visL[step, :] = (step, newLikelihood)
        # One training-progress figure per K.
        plt.figure(k)
        plt.plot(visL[:, 0], visL[:, 1])
        plt.title("Log Likelihood Progress Over {} Iterations and {} Classes".format(iterations - 1, k))
        plt.xlabel('Iteration')
        plt.ylabel("L")
        # Score the held-out data with the fitted parameters.
        testL = evalLikelihood(testdf, weights, mu, covar, k)
        visLk[k, :] = (k, testL)
    # BUG FIX: the validation curve was previously drawn on plt.figure(k+1)
    # inside the K loop, colliding with the next iteration's training figure.
    # Draw it once, after the loop, on a dedicated figure, and plot only the
    # rows that were actually filled (K >= 2) instead of zeros at K = 0, 1.
    plt.figure(mostClasses)
    plt.plot(visLk[2:, 0], visLk[2:, 1])
    plt.title('Log Likelihood on Validation Dataset as a Function of Number of Classes')
    plt.xlabel('K')
    plt.ylabel('L')
    plt.show()
def initialize(df, k, dimensions):
    """
    Build starting parameters for EM.

    The means are seeded with k-means cluster centers
    (scipy.cluster.vq.kmeans), every covariance matrix starts as the
    identity, and the mixture weights start uniform.

    :param df: observation array of shape (N, dimensions)
    :param k: number of clusters/classes
    :param dimensions: dimensionality of each observation (3 for RGB)
    :return: (mu, covar, weights) where mu has one row per cluster center,
             covar has shape (dimensions, dimensions, k), and weights has
             shape (k,) and sums to 1
    """
    # Initialize cluster centers with kmeans. NOTE(review): kmeans may
    # return fewer than k centroids if a cluster loses all members.
    centroids, _ = kmeans(df, k)
    mu = centroids
    # Stack of identity covariance matrices, one slice per cluster.
    covar = numpy.zeros((dimensions, dimensions, k))
    for i in range(k):
        covar[:, :, i] = numpy.identity(dimensions)
    # Uniform mixture weights (replaces the element-by-element fill loop).
    weights = numpy.full(k, 1. / k)
    return mu, covar, weights
def expectation(df, weights, mu, covar, k):
    """
    E-step: compute the responsibility of each component k for each
    observation xn in df.

    :param df: observation array of shape (N, d)
    :param weights: mixture weights, shape (k,)
    :param mu: mean array, one row per cluster
    :param covar: covariance stack of shape (d, d, k)
    :param k: number of components
    :return: gamma, an (N, k) array with rows containing observations (xns)
             and columns containing classes (ks); each entry is
             weights[j] * N(xn | mu_j, C_j) normalized over all components
    """
    N = df.shape[0]
    # Weighted component densities: column j holds weights[j] * N(df | mu_j, C_j).
    weighted = numpy.zeros((N, k))
    for jth in numpy.arange(k):
        weighted[:, jth] = numpy.multiply(
            weights[jth], multivariate_normal.pdf(df, mu[jth], covar[:, :, jth]))
    # The normalizer is identical for every component, so compute it once
    # instead of re-summing inside the per-component loop (was O(k^2)).
    denominator = numpy.nan_to_num(numpy.sum(weighted, axis=1))
    gamma = numpy.zeros((N, k))
    for kth in numpy.arange(k):
        numerator = numpy.nan_to_num(weighted[:, kth])
        # nan_to_num guards against 0/0 when a point is far from all modes.
        gamma[:, kth] = numpy.nan_to_num(numpy.divide(numerator, denominator))
    return gamma
def maximize(df, weights, mu, covar, gamma, k):
    """
    M-step: re-estimate means, covariance matrices, and mixing
    coefficients from the responsibilities.

    :param df: input dataframe (observations of d columns and n rows)
    :param weights: current mixing coefficients of k classes (sized from;
                    kept for interface compatibility)
    :param mu: input mean array, where each row represents the mean of cluster k
    :param covar: covariance array of shape (dimensions, dimensions, k)
    :param gamma: (N, k) responsibility of k for xn
    :param k: scalar number of components
    :return: (muNew, covarNew, weightsNew) — new means, new stack of
             covariance matrices of shape (dimensions, dimensions, k),
             and new weights
    """
    # Effective number of points assigned to each component.
    Nk = numpy.sum(gamma, 0)
    N = df.shape[0]

    # NEW MEANS: responsibility-weighted average of the observations.
    # Note gamma: rows contain observations (xns), columns contain classes (ks).
    muNew = numpy.zeros_like(mu)
    for kth in numpy.arange(k):
        weighted = gamma[:, kth][:, numpy.newaxis] * df
        # nan_to_num guards against an empty component (Nk[kth] == 0).
        muNew[kth] = numpy.nan_to_num(numpy.sum(weighted, axis=0) / Nk[kth])

    # NEW COVARIANCE MATRICES: responsibility-weighted outer products of
    # the deviations from the new means. Accumulate directly instead of
    # materializing a throwaway (d, d, N) stack.
    covarNew = numpy.zeros_like(covar)
    for kth in numpy.arange(k):
        distance = df - muNew[kth]
        acc = numpy.zeros((df.shape[1], df.shape[1]))
        for nth in numpy.arange(N):
            acc += gamma[nth, kth] * numpy.outer(distance[nth], distance[nth])
        covarNew[:, :, kth] = acc / Nk[kth]
    covarNew = numpy.nan_to_num(covarNew)

    # NEW MIXING COEFFICIENTS: fraction of effective points per component.
    weightsNew = Nk / N
    return muNew, covarNew, weightsNew
def evalLikelihood(df, weights, mu, covar, k):
    """
    Compute the base-10 log likelihood of the observations under the
    current mixture parameters, i.e. log10 p(X | mu, covar, weights).

    :param df: observation array (N rows)
    :param weights: mixture weights, shape (k,)
    :param mu: mean array, one row per component
    :param covar: covariance stack of shape (d, d, k)
    :param k: number of components
    :return: scalar log10 likelihood summed over all observations
    """
    # Mixture density at each observation: sum of weighted component pdfs.
    mixtureDensity = numpy.zeros(df.shape[0])
    for component in numpy.arange(k):
        density = multivariate_normal.pdf(df, mu[component], covar[:, :, component])
        mixtureDensity = mixtureDensity + numpy.multiply(weights[component], density)
    # Per-observation log densities (base 10), summed into a scalar.
    return numpy.sum(numpy.log10(mixtureDensity))
def mvNormal(x, mu, covarianceMatrix):
    """
    Evaluate the multivariate normal probability density at a point.

    :param x: input value (vector) at which to evaluate the density
    :param mu: mean vector of the distribution
    :param covarianceMatrix: covariance matrix that determines how the
        variables covary
    :return: the probability density of the multivariate normal at x
    """
    # Dimension of the space where x takes values.
    dim = x.shape[0]
    # Deviation of x from the mean.
    deviation = x - mu
    # Normalizing constant: (2*pi)^(-dim/2) * det(C)^(-1/2).
    normConst = numpy.exp(-0.5 * dim * numpy.log(2 * numpy.pi)) * numpy.power(numpy.linalg.det(covarianceMatrix), -0.5)
    # Quadratic form in the exponent: d^T C^{-1} d.
    quadForm = numpy.dot(numpy.dot(deviation.transpose(), numpy.linalg.inv(covarianceMatrix)), deviation)
    return normConst * numpy.exp(-0.5 * quadForm)
# Script entry point: run main() only when executed directly, not on import.
if __name__ == '__main__':
    main()