Python Learning Notes: PCA
2017-10-27
Chelsea_Dagger
1. Overview of the PCA Algorithm
Input: an n-dimensional sample set D = (x1, x2, ..., xm) and the target dimension n'.
Output: the reduced sample set D'.
1) Center all samples, i.e. subtract from each column its column mean;
2) Compute the covariance matrix of the centered samples;
3) Eigendecompose the covariance matrix, take the eigenvectors corresponding to the n' largest eigenvalues, normalize them, and assemble them into an eigenvector matrix;
4) Project the original sample set onto this matrix to obtain the new sample set.
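Steps 1)–4) map almost line-for-line onto NumPy. The following is a minimal sketch of the whole procedure as one function (the name pca and its signature are illustrative, not from the original notes):

import numpy as np

def pca(X, k):
    """Reduce the m x n sample matrix X to k dimensions."""
    X_centered = X - X.mean(axis=0)         # step 1: subtract each column's mean
    cov = np.cov(X_centered.T)              # step 2: n x n covariance matrix
    eigvals, eigvecs = np.linalg.eig(cov)   # step 3: eigendecomposition
    order = np.argsort(eigvals)[::-1]       # eig does not sort, so order by size
    M = eigvecs[:, order[:k]]               # top-k eigenvectors as columns (n x k)
    return X_centered.dot(M)                # step 4: project onto the new basis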
2. Implementing PCA in Python
Dataset: the iris dataset bundled with sklearn (150 samples × 4 features; the array below is abbreviated):
array([[ 5.1,  3.5,  1.4,  0.2],
       [ 4.9,  3. ,  1.4,  0.2],
       [ 4.7,  3.2,  1.3,  0.2],
       ...,
       [ 6.5,  3. ,  5.2,  2. ],
       [ 6.2,  3.4,  5.4,  2.3],
       [ 5.9,  3. ,  5.1,  1.8]])
# encoding: utf-8
import numpy as np
from sklearn.decomposition import PCA
from sklearn.datasets import load_iris
from sklearn import preprocessing
# preprocessing is sklearn's data-preprocessing module (scaling, normalization);
# it and PCA are imported here but not used in the manual walkthrough below
iris = load_iris()
iris_data = iris.data
# take the array stored in the 'data' attribute
iris_scaled = iris_data - np.mean(iris_data, axis=0)
# center the data; the columns could also be handled one by one, e.g.
# iris_data[:, 0] = iris_data[:, 0] - np.mean(iris_data[:, 0])
iris_scaled_cov = np.cov(iris_scaled.T)
# compute the 4 x 4 covariance matrix
iris_eig = np.linalg.eig(iris_scaled_cov)
# eigenvalues and eigenvectors of the covariance matrix
# (the eigenvectors are the columns of the second array)
(array([ 4.22484077,  0.24224357,  0.07852391,  0.02368303]),
 array([[ 0.36158968, -0.65653988, -0.58099728,  0.31725455],
        [-0.08226889, -0.72971237,  0.59641809, -0.32409435],
        [ 0.85657211,  0.1757674 ,  0.07252408, -0.47971899],
        [ 0.35884393,  0.07470647,  0.54906091,  0.75112056]]))
Here the eigenvalues happen to come out in descending order (np.linalg.eig does not sort them, so in general they must be ordered first). The first two eigenvalues already account for more than 97% of the total. Taking the eigenvectors (columns) corresponding to the first two eigenvalues gives a 4×2 matrix M, and
A' = AM
then yields the dimension-reduced dataset.
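Before hard-coding M, the 97% claim can be verified, and M built by slicing the eigenvector columns instead of retyping them. A short continuation of the code above (variable names are mine):

eigvals, eigvecs = iris_eig
order = np.argsort(eigvals)[::-1]                # eigenvalue indices, largest first
print(eigvals[order[:2]].sum() / eigvals.sum())  # ~0.9777: the top two components
                                                 # carry about 97.8% of the variance
M = eigvecs[:, order[:2]]                        # 4 x 2 matrix of top-2 eigenvectors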
M = np.array([[ 0.36158968, -0.65653988],
              [-0.08226889, -0.72971237],
              [ 0.85657211,  0.1757674 ],
              [ 0.35884393,  0.07470647]])
# note: strictly, the centered data should be projected, i.e. np.dot(iris_scaled, M);
# projecting the raw data, as below, merely shifts every point by the constant
# offset np.mean(iris_data, axis=0).dot(M), which does not change the geometry
iris_pre = np.dot(iris_data, M)
print(iris_pre)
The final PCA-reduced iris dataset (150 rows; only the first and last few are shown):
[[ 2.82713599 -5.64133103]
 [ 2.7959525  -5.14516687]
 [ 2.62152358 -5.17737811]
 ...
 [ 7.27538908 -5.39324291]
 [ 7.41297222 -5.43060047]
 [ 6.90100928 -5.03183702]]
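As a sanity check, the same reduction can be done with sklearn's built-in PCA class. sklearn centers the data internally and fixes component signs by its own convention, so its scores differ from the manual result above by a constant offset and possibly a per-component sign flip, but the relative layout of the points is identical. A minimal sketch:

from sklearn.datasets import load_iris
from sklearn.decomposition import PCA

iris = load_iris()
pca = PCA(n_components=2)                 # keep the top two principal components
iris_sklearn = pca.fit_transform(iris.data)
print(pca.explained_variance_ratio_)      # roughly [0.92, 0.05], ~97.8% in total
print(iris_sklearn[:3])                   # centered scores; these differ from the
                                          # manual result by a constant shift (and
                                          # possibly a sign flip per component)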