R语言可视化9: PCA图 - ggplot2/factoext

2023-04-21  本文已影响0人  小程的学习笔记

PCA分析,全称Principal Components Analysis,即主成分分析,这是降维中最常见的一种方法。其是一种无监督算法,不需要标签即可对数据进行降维;降维后,由于失去了标签,可能无法理解每个维度的含义;但至少减少了数据维度,使得计算机能更好的识别和计算。

1. 使用\color{green}{ggplot2}包绘制PCA图

1.1 逐步计算PCA分析中的参数

# 安装并加载所需的R包
# install.packages("ggplot2")
library(ggplot2)
library(tidyverse)

# 使用R语言自带的iris数据集
head(iris)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa

# 特征分解
eigen <- scale(iris[, -5]) %>% # scale()对数据进行标准化
  cor() %>% # cor()对矩阵进行相关性分析
  eigen() # eign()对数据矩阵进行光谱分解,得到特征值和特征向量
eigen
## eigen() decomposition
## $values
## [1] 2.91849782 0.91403047 0.14675688 0.02071484
## 
## $vectors
##            [,1]        [,2]       [,3]       [,4]
## [1,]  0.5210659 -0.37741762  0.7195664  0.2612863
## [2,] -0.2693474 -0.92329566 -0.2443818 -0.1235096
## [3,]  0.5804131 -0.02449161 -0.1421264 -0.8014492
## [4,]  0.5648565 -0.06694199 -0.6342727  0.5235971

# 提取特征值(如上)
eigen$values

# 特征向量提取(如上)
eigen$vectors

# 计算标准化的主成分得分
scale(iris[,-5]) %*% eigen$vectors %>% head()
##           [,1]       [,2]        [,3]         [,4]
## [1,] -2.257141 -0.4784238  0.12727962  0.024087508
## [2,] -2.074013  0.6718827  0.23382552  0.102662845
## [3,] -2.356335  0.3407664 -0.04405390  0.028282305
## [4,] -2.291707  0.5953999 -0.09098530 -0.065735340
## [5,] -2.381863 -0.6446757 -0.01568565 -0.035802870
## [6,] -2.068701 -1.4842053 -0.02687825  0.006586116

1.2 \color{orange}{prcomp}函数和\color{orange}{princomp}函数

1.2.1 prcomp函数

与常规的求取特征值和特征向量不同的是,prcomp函数是对变量矩阵(相关矩阵)采用SVD方法计算其奇异值(原理上是特征值的平方根)

# 相关矩阵分解
iris.pca <- prcomp(iris[,-5],
                   scale. = T, # scale.=T表示标准化
                   rank. = 4, # rank.指定最大秩的数字
                   retx = T) # retx一个逻辑值,指示返回已旋转的变量
iris.pca
## Standard deviations (1, .., p=4):
## [1] 1.7083611 0.9560494 0.3830886 0.1439265
## 
## Rotation (n x k) = (4 x 4):
##                     PC1         PC2        PC3        PC4
## Sepal.Length  0.5210659 -0.37741762  0.7195664  0.2612863
## Sepal.Width  -0.2693474 -0.92329566 -0.2443818 -0.1235096
## Petal.Length  0.5804131 -0.02449161 -0.1421264 -0.8014492
## Petal.Width   0.5648565 -0.06694199 -0.6342727  0.5235971

# 查看结果
summary(iris.pca) # 方差解释度(PCA分析结果)
## Importance of components:
##                           PC1    PC2     PC3     PC4
## Standard deviation     1.7084 0.9560 0.38309 0.14393
## Proportion of Variance 0.7296 0.2285 0.03669 0.00518
## Cumulative Proportion  0.7296 0.9581 0.99482 1.00000

head(iris.pca$x) # 所有样本每个轴的得分
##            PC1        PC2         PC3          PC4
## [1,] -2.257141 -0.4784238  0.12727962  0.024087508
## [2,] -2.074013  0.6718827  0.23382552  0.102662845
## [3,] -2.356335  0.3407664 -0.04405390  0.028282305
## [4,] -2.291707  0.5953999 -0.09098530 -0.065735340
## [5,] -2.381863 -0.6446757 -0.01568565 -0.035802870
## [6,] -2.068701 -1.4842053 -0.02687825  0.006586116

★ PCA分析结果:
\ \ \ \ Standard deviation: 标准差 其平方为方差=特征值
\ \ \ \ Proportion of Variance: 方差贡献率
\ \ \ \ Cumulative Proportion: 方差累计贡献率

1.2.2 princomp函数

princomp以计算相关矩阵或者协方差矩阵的特征值为主。

iris.pc <- princomp(iris[, -5], cor = T, scores = T)
iris.pc
## Call:
## princomp(x = iris[, -5], cor = T, scores = T)
## 
## Standard deviations:
##    Comp.1    Comp.2    Comp.3    Comp.4 
## 1.7083611 0.9560494 0.3830886 0.1439265 
## 
##  4  variables and  150 observations.

summary(iris.pc) # 各主成份的解释量
## Importance of components:
##                           Comp.1    Comp.2     Comp.3      Comp.4
## Standard deviation     1.7083611 0.9560494 0.38308860 0.143926497
## Proportion of Variance 0.7296245 0.2285076 0.03668922 0.005178709
## Cumulative Proportion  0.7296245 0.9581321 0.99482129 1.000000000

head(iris.pc$score) # 所有样本各轴的得分
##         Comp.1     Comp.2      Comp.3      Comp.4
## [1,] -2.264703  0.4800266  0.12770602  0.02416820
## [2,] -2.080961 -0.6741336  0.23460885  0.10300677
## [3,] -2.364229 -0.3419080 -0.04420148  0.02837705
## [4,] -2.299384 -0.5973945 -0.09129011 -0.06595556
## [5,] -2.389842  0.6468354 -0.01573820 -0.03592281
## [6,] -2.075631  1.4891775 -0.02696829  0.00660818

screeplot(iris.pc,type="lines") # 方差分布图
biplot(iris.pc,scale=F) # 双标图直接把x与rotation绘图,而非标准化
princomp-1

1.3 ggplot2可视化

1.3.1 prcomp + ggplot2
# 提取PC score; 
df1 <- iris.pca$x 

# 将iris数据集的第5列数据合并进来;
df1 <- data.frame(df1, iris$Species) 
head(df1) 
##         PC1        PC2         PC3          PC4 iris.Species
## 1 -2.257141 -0.4784238  0.12727962  0.024087508       setosa
## 2 -2.074013  0.6718827  0.23382552  0.102662845       setosa
## 3 -2.356335  0.3407664 -0.04405390  0.028282305       setosa
## 4 -2.291707  0.5953999 -0.09098530 -0.065735340       setosa
## 5 -2.381863 -0.6446757 -0.01568565 -0.035802870       setosa
## 6 -2.068701 -1.4842053 -0.02687825  0.006586116       setosa

summ <- summary(iris.pca)

xlab <- paste0("PC1(", round(summ$importance[2, 1] * 100, 2),"%)")
ylab <- paste0("PC2(", round(summ$importance[2, 2] * 100, 2),"%)")

ggplot(data = df1, aes(x = PC1, y = PC2, color = iris.Species)) +
  stat_ellipse(aes(fill = iris.Species), type = "norm", geom ="polygon", alpha = 0.2, color = NA) + 
  geom_point() + labs(x = xlab, y = ylab, color = "") + 
  guides(fill = F) +
  scale_fill_manual(values = c("purple","orange","blue")) + 
  scale_colour_manual(values = c("purple","orange","blue"))
1.3.2 princomp + ggplot2
# 查看主成分的结果
pca.scores <- as.data.frame(iris.pc$scores)
head(pca.scores)
##      Comp.1     Comp.2      Comp.3      Comp.4
## 1 -2.264703  0.4800266  0.12770602  0.02416820
## 2 -2.080961 -0.6741336  0.23460885  0.10300677
## 3 -2.364229 -0.3419080 -0.04420148  0.02837705
## 4 -2.299384 -0.5973945 -0.09129011 -0.06595556
## 5 -2.389842  0.6468354 -0.01573820 -0.03592281
## 6 -2.075631  1.4891775 -0.02696829  0.00660818

# 绘制PCA图
ggplot(pca.scores, aes(Comp.1, Comp.2, 
                       col = iris$Species,
                       shape = iris$Species)
                       ) + 
  scale_shape_manual(values = c(15, 19, 17)) + 
  scale_color_manual(values = c('#999999','#E69F00', '#56B4E9')) +
  geom_point(size = 3) + 
  # geom_text(aes(label = rownames(pca.scores)), vjust = "outward") + 
  geom_hline(yintercept = 0, lty = 2, col = "red") + 
  geom_vline(xintercept = 0, lty = 2, col = "blue", lwd = 1) +
  theme_bw() + theme(legend.position = "none") + 
  theme(plot.title = element_text(hjust = 0.5)) + 
  labs(x = "PCA_1", y  ="PCA_2", title = "PCA analysis")
ggplot2-1

2. 使用\color{green}{scatterplot3d}包绘制PCA图

# 安装并加载所需的R包
# install.packages("scatterplot3d")
library(scatterplot3d)

# 绘制三维PCA图
scatterplot3d(iris.pca$x[, 1:3],
              pch = c(rep(16, 50), rep(17, 50), rep(18, 50)),
              color = c(rep("purple", 50), rep("orange", 50), rep("blue", 50)),
              angle = 45, main= "3D PCA plot",
              cex.symbols = 1.5, mar = c(5, 4, 4, 5))

# 添加图例
legend("topright", title = "Sample",
       xpd = TRUE, inset = -0.15,
       legend = c("setosa", "versicolor", "virginica"),
       col = c("purple", "orange", "blue"),
       pch = c(16, 17, 18))
scatterplot3d-1

3. 使用\color{green}{factoextra}包绘制PCA图

3.1 空间分布图

# 安装并加载所需的R包
# install.packages("factoextra")
# install.packages("FactoMineR")
library(FactoMineR)
library(factoextra)

# PCA分析
iris.pca <- PCA(iris[,-5], graph = T)

# 获取样本的主成分分析结果
var <- get_pca_var(iris.pca)

# 绘制主成分碎石图,查看每一个主成分能在多大程度上代表原来的特征
fviz_screeplot(iris.pca, addlabels = TRUE, ylim = c(0, 80))

# PCA图
fviz_pca_ind(iris.pca,
             mean.point = F, # 去除分组的中心点,否则每个群中间会有一个比较大的点
             label = "none", # 隐藏每一个样本的标签
             habillage = iris$Species, # 根据样本类型来着色
             palette = c("purple","orange","blue"), # 三个组设置三种颜色
             addEllipses = TRUE, # 添加边界线
             ellipse.type = "convex" # 设置边界线为多边形
)
factoextra-1

3.2 特征分布图

# 查看每一个特征对每一个主成分的贡献程度
var$contrib
##                  Dim.1       Dim.2     Dim.3     Dim.4
## Sepal.Length 27.150969 14.24440565 51.777574  6.827052
## Sepal.Width   7.254804 85.24748749  5.972245  1.525463
## Petal.Length 33.687936  0.05998389  2.019990 64.232089
## Petal.Width  31.906291  0.44812296 40.230191 27.415396

fviz_pca_var(iris.pca,   
             col.var = "contrib", #根据贡献度着色
             gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07")
)

library("corrplot")
corrplot(var$contrib, is.corr = FALSE)

★ Sepal.Width这个特征对Dim.2的贡献最大


factoextra-2

3.3 同时展示样本和特征

fviz_pca_biplot(iris.pca, 
                label = "var", # 只标注变量,不标注样本
                habillage = iris$Species, # 根据样本类型着色
                addEllipses = TRUE, # 添加边界线
                ellipse.type = "convex", # 设置边界线为多边形
                ggtheme = theme_minimal() # 白色背景,浅灰色网格线,无坐标轴,无边框
                )
factoextra-3

参考:

  1. https://blog.csdn.net/qq_42830713/article/details/127545000
  2. https://zhuanlan.zhihu.com/p/443843102
  3. https://www.jianshu.com/p/51de30fd18e7
上一篇下一篇

猜你喜欢

热点阅读