Python数据分析与机器学习36-PCA实例

2022-07-29  本文已影响0人  只是甲

一. 数据简单的分析

我们使用的是鸢尾花的数据集
python

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

# Load the iris data set. The raw iris.data file has NO header row, so we
# must pass header=None (via names=): otherwise pandas consumes the first
# sample (5.1, 3.5, 1.4, 0.2) as column labels and the data set silently
# loses one row.
df = pd.read_csv('E:/file/iris.data', header=None,
                 names=['sepal_len', 'sepal_wid', 'petal_len', 'petal_wid', 'class'])
print(df.head())

# Split into a 150x4 feature matrix and a label vector.
X = df.iloc[:, 0:4].values
y = df.iloc[:, 4].values

# Human-readable axis labels, indexed by feature column.
feature_dict = {0: 'sepal length [cm]',
                1: 'sepal width [cm]',
                2: 'petal length [cm]',
                3: 'petal width [cm]'}

# One histogram subplot per feature with the three species overlaid,
# to eyeball how well each single feature separates the classes.
plt.figure(figsize=(8, 6))
for cnt in range(4):
    plt.subplot(2, 2, cnt + 1)
    for lab in ('Iris-setosa', 'Iris-versicolor', 'Iris-virginica'):
        plt.hist(X[y == lab, cnt],
                 label=lab,
                 bins=10,
                 alpha=0.3)
    plt.xlabel(feature_dict[cnt])
    plt.legend(loc='upper right', fancybox=True, fontsize=8)

plt.tight_layout()
plt.show()

测试记录:

   sepal_len  sepal_wid  petal_len  petal_wid        class
0        4.9        3.0        1.4        0.2  Iris-setosa
1        4.7        3.2        1.3        0.2  Iris-setosa
2        4.6        3.1        1.5        0.2  Iris-setosa
3        5.0        3.6        1.4        0.2  Iris-setosa
4        5.4        3.9        1.7        0.4  Iris-setosa
image.png

二. 查看特征向量的重要性

代码:

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.preprocessing import StandardScaler

# Load the iris data set. The raw file has no header row, so pass
# header=None (via names=) -- otherwise pandas consumes the first sample
# as column labels and silently drops one row.
df = pd.read_csv('E:/file/iris.data', header=None,
                 names=['sepal_len', 'sepal_wid', 'petal_len', 'petal_wid', 'class'])

# Split into a 150x4 feature matrix and a label vector.
X = df.iloc[:, 0:4].values
y = df.iloc[:, 4].values

# Standardize each feature to zero mean / unit variance so the
# covariance matrix below is the correlation matrix (diagonal == 1).
X_std = StandardScaler().fit_transform(X)

# 4x4 covariance matrix of the standardized features.
cov_mat = np.cov(X_std.T)

# Eigen-decomposition: each eigenvalue is the variance captured along
# its eigenvector (principal-component direction).
eig_vals, eig_vecs = np.linalg.eig(cov_mat)

# Pair each eigenvalue magnitude with its eigenvector, then sort
# descending so the most informative components come first.
eig_pairs = [(np.abs(eig_vals[i]), eig_vecs[:, i]) for i in range(len(eig_vals))]
eig_pairs.sort(key=lambda pair: pair[0], reverse=True)

print('Eigenvalues in descending order:')
for pair in eig_pairs:
    print(pair[0])

# Explained variance per component (as a percentage) and its running
# cumulative total via np.cumsum.
tot = sum(eig_vals)
var_exp = [(ev / tot) * 100 for ev in sorted(eig_vals, reverse=True)]
print(var_exp)
cum_var_exp = np.cumsum(var_exp)
# A bare `cum_var_exp` expression only echoes in a REPL; in a script it
# is a no-op, so print it explicitly.
print(cum_var_exp)

# Bar chart: per-component variance; step plot: cumulative variance.
plt.figure(figsize=(6, 4))
plt.bar(range(4), var_exp, alpha=0.5, align='center',
        label='individual explained variance')
plt.step(range(4), cum_var_exp, where='mid',
         label='cumulative explained variance')
plt.ylabel('Explained variance ratio')
plt.xlabel('Principal components')
plt.legend(loc='best')
plt.tight_layout()
plt.show()

测试记录:

Eigenvalues in descending order:
2.9244283691111135
0.9321523302535066
0.1494637348981336
0.02098259276427038
[72.6200333269203, 23.14740685864414, 3.7115155645845284, 0.5210442498510098]
image.png

三. PCA降维

PCA降维的步骤:

  1. 协方差矩阵
  2. 特征值 特征向量
  3. 特征值大的提取出来 (例如4维降到2维)就提取最重要的2个
  4. 原始矩阵(150*4) 乘以 投影矩阵(4*2) = 降维后的矩阵(150*2)

代码:

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.preprocessing import StandardScaler

# Load the iris data set. The raw file has no header row, so pass
# header=None (via names=) -- otherwise pandas consumes the first sample
# as column labels and silently drops one row.
df = pd.read_csv('E:/file/iris.data', header=None,
                 names=['sepal_len', 'sepal_wid', 'petal_len', 'petal_wid', 'class'])

# Split into a 150x4 feature matrix and a label vector.
X = df.iloc[:, 0:4].values
y = df.iloc[:, 4].values

# Standardize each feature to zero mean / unit variance so the
# covariance matrix below is the correlation matrix (diagonal == 1).
X_std = StandardScaler().fit_transform(X)

# 4x4 covariance matrix, then its eigenvalues / eigenvectors.
cov_mat = np.cov(X_std.T)
eig_vals, eig_vecs = np.linalg.eig(cov_mat)

# Pair each eigenvalue magnitude with its eigenvector and sort
# descending so the top principal components come first.
eig_pairs = [(np.abs(eig_vals[i]), eig_vecs[:, i]) for i in range(len(eig_vals))]
eig_pairs.sort(key=lambda pair: pair[0], reverse=True)

# Projection matrix: the two leading eigenvectors stacked as columns (4x2).
matrix_w = np.hstack((eig_pairs[0][1].reshape(4, 1),
                      eig_pairs[1][1].reshape(4, 1)))

# Project the data: [150x4] @ [4x2] = [150x2] -- the dimensionality reduction.
Y = X_std.dot(matrix_w)

# Before reduction: scatter of the first two raw features only.
plt.figure(figsize=(6, 4))
for lab, col in zip(('Iris-setosa', 'Iris-versicolor', 'Iris-virginica'),
                    ('blue', 'red', 'green')):
    plt.scatter(X[y == lab, 0],
                X[y == lab, 1],
                label=lab,
                c=col)
plt.xlabel('sepal_len')
plt.ylabel('sepal_wid')
plt.legend(loc='best')
plt.tight_layout()

# After reduction: scatter on the two principal components, where the
# classes separate much more cleanly.
plt.figure(figsize=(6, 4))
for lab, col in zip(('Iris-setosa', 'Iris-versicolor', 'Iris-virginica'),
                    ('blue', 'red', 'green')):
    plt.scatter(Y[y == lab, 0],
                Y[y == lab, 1],
                label=lab,
                c=col)
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend(loc='lower center')
plt.tight_layout()
plt.show()

测试记录:

image.png image.png

参考:

  1. https://study.163.com/course/introduction.htm?courseId=1003590004#/courseDetail?tab=1
上一篇下一篇

猜你喜欢

热点阅读