-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathPCAstepbystep.py
More file actions
69 lines (43 loc) · 1.61 KB
/
PCAstepbystep.py
File metadata and controls
69 lines (43 loc) · 1.61 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# ---------------------------------------------------------------
# Step-by-step PCA on a dataset loaded from an Excel file:
# center the data, eigen-decompose the covariance matrix, plot
# the explained variance per component, project onto the leading
# components, and reconstruct the data to obtain residuals.
# ---------------------------------------------------------------

# Retrieve data, in this case from an Excel file
path = '~/yourdata'
d = pd.read_excel(path, sheet_name='sheet')

# Store data in a pandas DataFrame and center it
# (PCA requires zero-mean variables)
d = pd.DataFrame(d)
mean_vec = np.mean(d, axis=0)   # mean of each variable
var_vec = np.var(d, axis=0)     # variance of each variable (kept for reference)
n, m = d.shape
d_std = d - mean_vec            # centered data

# Sample covariance matrix (n - 1 in the denominator)
cov_mat = (d_std.T.dot(d_std)) / (n - 1)

# Eigen-decomposition of the covariance matrix.
# FIX: use eigh instead of eig — the covariance matrix is symmetric,
# so eigh guarantees real eigenvalues and orthonormal eigenvectors,
# whereas eig can return spurious complex parts from round-off.
eigval, eigvec = np.linalg.eigh(cov_mat)

# Sanity check: eigenvectors are the COLUMNS of eigvec, so iterate
# over the transpose when verifying each has unit length.
# (The original iterated rows, which only passed because the matrix
# is orthogonal.)
for v in eigvec.T:
    np.testing.assert_array_almost_equal(1.0, np.linalg.norm(v))
print('Everything ok!')

# Sort eigenpairs by decreasing eigenvalue
idx = eigval.argsort()[::-1]
eigval = eigval[idx]
eigvec = eigvec[:, idx]

# Percentage of variance explained by each component.
# eigval is already sorted descending, so no re-sort is needed.
tot = eigval.sum()
var_exp = [v / tot * 100 for v in eigval]
cum_var_exp = np.cumsum(var_exp)
PC = ['PC%s' % s for s in range(1, len(eigval) + 1)]  # component names

# Plotting components and their explained variance.
# BUGFIX: the original called scatter(var_exp, PC), putting the
# variance on the x-axis while the labels claimed the opposite;
# arguments are now in (x=components, y=variance) order to match.
plt.scatter(PC, var_exp, alpha=0.5)
plt.title('Explained variance')
plt.xlabel('Principal Components')
plt.ylabel('Explained Variance (%)')
plt.show()

# Selection of components
n_components = 3  # arbitrary number, it varies case by case
loadings = eigvec[:, :n_components]   # one column per retained component

# Projection onto the new feature space (scores)
scores = d_std.dot(loadings)

# Reconstructing original data and calculating the residuals.
# BUGFIX: scores.dot(loadings.T) returns a DataFrame with integer
# column labels, so the final `d - d_hat_raw` would misalign with
# d's named columns and yield an all-NaN residual frame; rebuild
# the reconstruction with d's own index and column labels.
d_hat = pd.DataFrame(scores.values.dot(loadings.T),
                     index=d.index, columns=d.columns)
d_hat_raw = d_hat + mean_vec.values
res = d - d_hat_raw