主成分分析

2018-09-05 本文已影响0人 Muggle01

算法原理

在用统计分析方法研究多变量的课题时，变量个数太多就会增加课题的复杂性。人们自然希望变量个数较少而得到的信息较多。在很多情形，变量之间是有一定的相关关系的，当两个变量之间有一定相关关系时，可以解释为这两个变量反映此课题的信息有一定的重叠。主成分分析是对于原先提出的所有变量，将重复的变量（关系紧密的变量）删去多余，建立尽可能少的新变量，使得这些新变量是两两不相关的，而且这些新变量在反映课题的信息方面尽可能保持原有的信息。

设法将原来变量重新组合成一组新的互相无关的几个综合变量，同时根据实际需要从中可以取出几个较少的综合变量尽可能多地反映原来变量的信息的统计方法叫做主成分分析或称主分量分析，也是数学上用来降维的一种方法。

PCA的作用

*主成分分析能降低所研究的数据空间
*多维数据的一种图形表示方法。
*由主成分分析法构造回归模型。
*用主成分分析筛选回归变量。

原程序

底层代码

import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_lfw_people

#PCA问题不能将样本数据归一化，应保证样本的方差不变

class PCA:

    def __init__(self, n_components):
        """初始化PCA"""
        assert n_components >= 1, "n_components must be valid"
        self.n_components = n_components
        self.components_ = None

    def fit(self, X, eta=0.01, n_iters=1e4):
        """获得数据集X的前n个主成分"""
        assert self.n_components <= X.shape[1], \
            "n_components must not be greater than the feature number of X"

        def demean(X):
            return X - np.mean(X, axis=0)

        def f(w, X):
            return np.sum((X.dot(w)) ** 2) / len(X)

        def df(w, X, delta_w=1e-3):
            """利用求导后公式求梯度"""
            return X.T.dot(X.dot(w)) * 2. / len(X)

#            """利用导数定义求梯度，不适合用在有数据归一化的线性回归中，且不适合大量数据"""
#            res = np.empty(len(w))
#            for i in range(len(w)):
#                  w_1 = w.copy()
#                  w_2 = w.copy()
#                  w_1[i] += delta_w
#                  w_2[i] -= delta_w
#                  res[i] = ( f(w_1, X) - f(w_2, X) )/(2*delta_w)
#            return res

        def direction(w):
            return w / np.linalg.norm(w)

        def first_component(X, initial_w, eta=0.01, n_iters=1e4, epsilon=1e-8):

            w = direction(initial_w)
            cur_iter = 0

            while cur_iter < n_iters:
                gradient = df(w, X)
                last_w = w
                w = w + eta * gradient
                w = direction(w)
                if (abs(f(w, X) - f(last_w, X)) < epsilon):
                    break

                cur_iter += 1

            return w

        X_pca = demean(X)
        self.components_ = np.empty(shape=(self.n_components, X.shape[1]))
        for i in range(self.n_components):
            initial_w = np.random.random(X_pca.shape[1]) #初始单位向量不能为0
            w = first_component(X_pca, initial_w, eta, n_iters)
            self.components_[i,:] = w

            X_pca = X_pca - X_pca.dot(w).reshape(-1, 1) * w

        return self

    def transform(self, X):
        """将给定的X，映射到各个主成分分量中"""
        assert X.shape[1] == self.components_.shape[1]

        return X.dot(self.components_.T)

    def inverse_transform(self, X):
        """将给定的X，反向映射回原来的特征空间"""
        assert X.shape[1] == self.components_.shape[0]

        return X.dot(self.components_)

    def __repr__(self):
        return "PCA(n_components=%d)" % self.n_components

#"""PCA进行数据的降维"""
#x = np.empty((100, 2))
#x[:, 0] = np.random.uniform(0., 100., size=100)
#x[:, 1] = 0.75 * x[:, 0] + 3. + np.random.normal(0, 10, size=100)
#
#text_pca = PCA(1)
#text_pca.fit( x, eta=0.01, n_iters=1e4)
#
#x_output = text_pca.transform(x)
#x_last = text_pca.inverse_transform(x_output)
#
#plt.scatter(x[:,0], x[:, 1])
#plt.scatter(x_last[:,0], x_last[:, 1])
#plt.show()



"""PCA用于人脸识别"""
#face = fetch_lfw_people()
face = fetch_lfw_people(min_faces_per_person=60)
print(face.target_names)
face_test=face.data
print(face_test.shape)

#输出几张样例
#random_indexes = np.random.permutation(len(face.data))
#face_test = face.data[random_indexes]
#example_afces = face_test[:36, :]

def plot_face (faces):
    fig, axes = plt.subplots(6,6,figsize=(10,10),
                            subplot_kw={'xticks':[],'yticks':[]},
                            gridspec_kw=dict(hspace=0.1,wspace=0.1))
    for i, ax in enumerate(axes.flat):
        ax.imshow(faces[i].reshape(62,47), cmap='bone')
    plt.show()

#plot_face(example_faces)

pca = PCA(36)
pca.fit(face_test, eta=0.01, n_iters=1e4)
plot_face(pca.components_[:36,:])

sklearn代码

#!/usr/bin/env python 
# -*- coding: UTF-8 -*-
import numpy as np
from  sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_lfw_people


##人脸识别
#face = fetch_lfw_people()
#
#def plot_face (faces):
#    fig, axes = plt.subplots(6,6,figsize=(10,10),
#                            subplot_kw={'xticks':[],'yticks':[]},
#                            gridspec_kw=dict(hspace=0.1,wspace=0.1))
#    for i, ax in enumerate(axes.flat):
#        ax.imshow(faces[i].reshape(62,47), cmap='bone')
#    plt.show()
#
##plot_face(example_faces)
#
#pca = PCA(svd_solver='randomized')
#pca.fit(face.data)
#plot_face(pca.components_[:36,:])
        



#数据降维
x = np.empty((100, 3))
x[:, 0] = np.random.uniform(0., 100., size=100)
x[:, 1] = 0.75 * x[:, 0] + 3. + np.random.normal(0, 10, size=100)
x[:, 2] = 2 * x[:, 0] + 3. + np.random.normal(0, 10, size=100)
pca = PCA(0.98) #0.98表示主成分能解释的原数据的方差
pca.fit(x)
print(pca.components_)

python tips

采用导数原理计算梯度效率低下，应尽量避免。