StatQuest at a Glance: PCA in R
2019-11-11
周运来就是我
What is PCA?
PCA is a dimensionality-reduction technique. The goal of principal component analysis (PCA) is to construct new features out of the existing ones; the new features strip out the redundancy among the original features and are therefore more discriminative. Note that the result of PCA is a set of new features, not simply a subset picked from the original feature list.
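As a minimal sketch of that idea (the toy data frame below, with two deliberately correlated features f1 and f2, is invented purely for illustration), prcomp() replaces correlated features with new, uncorrelated components:

set.seed(1)
x <- rnorm(50)
toy <- data.frame(f1 = x, f2 = x + rnorm(50, sd = 0.1))  ## f2 is nearly a copy of f1
toy.pca <- prcomp(toy, scale = TRUE)
summary(toy.pca)  ## PC1 carries almost all of the variance; PC2 is nearly redundant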
See 数量生态学笔记||非约束排序||PCA (Numerical Ecology Notes || Unconstrained Ordination || PCA), which also introduces many PCA-related concepts.

Of course, I will put the full example R code at the end of this post.
The data
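The data appears as screenshots in the original post; for reference, here is how the simulated data is built (this is the same setup as in the full listing at the end): 100 genes measured in five wild-type (wt) and five knock-out (ko) samples, with counts drawn from Poisson distributions.

## columns are samples (wt1-wt5, ko1-ko5); rows are genes
data.matrix <- matrix(nrow=100, ncol=10)
colnames(data.matrix) <- c(
  paste("wt", 1:5, sep=""),
  paste("ko", 1:5, sep=""))
rownames(data.matrix) <- paste("gene", 1:100, sep="")
for (i in 1:100) {
  wt.values <- rpois(5, lambda=sample(x=10:1000, size=1))
  ko.values <- rpois(5, lambda=sample(x=10:1000, size=1))
  data.matrix[i,] <- c(wt.values, ko.values)
}
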
Running PCA
PCA can be run with a few of R's base functions; note that the rows and columns of the input matrix correspond to the objects and the features, respectively. Be aware that different functions may expect the input matrix in different orientations.
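Here, data.matrix stores genes in rows and samples in columns, but prcomp() expects samples in rows, so the matrix is transposed first (as in the full listing below):

pca <- prcomp(t(data.matrix), scale=TRUE)  ## t() puts samples in rows
summary(pca)                               ## variance explained by each PC
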
Plotting the PCA graph
We usually pick the first two PCs to plot. Why? Because prcomp() orders the components by how much variation they explain, so PC1 and PC2 capture the largest share of the variation in the data; the scree plot below makes this explicit.
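A scree plot shows the percent of total variation captured by each component (from the full listing below):

pca.var <- pca$sdev^2                              ## variance of each PC
pca.var.per <- round(pca.var/sum(pca.var)*100, 1)  ## as a percentage
barplot(pca.var.per, main="Scree Plot",
        xlab="Principal Component", ylab="Percent Variation")
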
Note that PCA is not a clustering algorithm, even though the PCA plot clearly separates the samples into two clusters.
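The quick base-R version of the PCA plot (from the full listing below):

plot(pca$x[,1], pca$x[,2])  ## PC1 vs PC2
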
Then dress it up with ggplot2:
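The ggplot2 version labels each sample and puts the percent variation on the axes (from the full listing below):

library(ggplot2)
pca.data <- data.frame(Sample=rownames(pca$x),
                       X=pca$x[,1],
                       Y=pca$x[,2])
ggplot(data=pca.data, aes(x=X, y=Y, label=Sample)) +
  geom_text() +
  xlab(paste("PC1 - ", pca.var.per[1], "%", sep="")) +
  ylab(paste("PC2 - ", pca.var.per[2], "%", sep="")) +
  theme_bw() +
  ggtitle("My PCA Graph")
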
Interpreting the PCA results
How do we extract the loading scores and the eigenvalues?
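The loading scores live in pca$rotation; ranking the genes by the absolute value of their PC1 loadings shows which genes drive the separation (from the full listing below):

loading_scores <- pca$rotation[,1]                ## loadings for PC1
gene_scores <- abs(loading_scores)                ## magnitudes
gene_score_ranked <- sort(gene_scores, decreasing=TRUE)
top_10_genes <- names(gene_score_ranked[1:10])
pca$rotation[top_10_genes,1]                      ## scores with their +/- signs
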
Below is the full example R code, as promised at the beginning:

## In this example, the data is in a matrix called
## data.matrix
## columns are individual samples (i.e. cells)
## rows are measurements taken for all the samples (i.e. genes)
## Just for the sake of the example, here's some made up data...
data.matrix <- matrix(nrow=100, ncol=10)
colnames(data.matrix) <- c(
  paste("wt", 1:5, sep=""),
  paste("ko", 1:5, sep=""))
rownames(data.matrix) <- paste("gene", 1:100, sep="")
for (i in 1:100) {
  wt.values <- rpois(5, lambda=sample(x=10:1000, size=1))
  ko.values <- rpois(5, lambda=sample(x=10:1000, size=1))
  data.matrix[i,] <- c(wt.values, ko.values)
}
head(data.matrix)
      wt1 wt2 wt3 wt4 wt5 ko1 ko2 ko3 ko4 ko5
gene1 267 259 279 247 281 771 838 793 749 754
gene2 410 387 387 406 404 720 748 729 673 740
gene3 858 825 875 902 859 264 242 281 258 282
gene4 829 795 767 795 760 71 74 67 75 71
gene5 317 263 300 309 270 24 21 33 21 25
gene6 497 520 474 521 517 523 496 510 484 516
dim(data.matrix)
[1] 100  10
pca <- prcomp(t(data.matrix), scale=TRUE)
summary(pca)
Importance of components:
PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8 PC9 PC10
Standard deviation 9.2854 1.94249 1.73588 1.5781 1.26574 1.05160 0.86755 0.77237 0.66860 3.628e-15
Proportion of Variance 0.8622 0.03773 0.03013 0.0249 0.01602 0.01106 0.00753 0.00597 0.00447 0.000e+00
Cumulative Proportion 0.8622 0.89992 0.93005 0.9550 0.97098 0.98204 0.98956 0.99553 1.00000 1.000e+00
str(pca)
List of 5
$ sdev : num [1:10] 9.29 1.94 1.74 1.58 1.27 ...
$ rotation: num [1:100, 1:10] 0.107 0.107 -0.107 -0.108 -0.107 ...
..- attr(*, "dimnames")=List of 2
.. ..$ : chr [1:100] "gene1" "gene2" "gene3" "gene4" ...
.. ..$ : chr [1:10] "PC1" "PC2" "PC3" "PC4" ...
$ center : Named num [1:100] 524 560 565 430 158 ...
..- attr(*, "names")= chr [1:100] "gene1" "gene2" "gene3" "gene4" ...
$ scale : Named num [1:100] 272 172 316 379 142 ...
..- attr(*, "names")= chr [1:100] "gene1" "gene2" "gene3" "gene4" ...
$ x : num [1:10, 1:10] -9.09 -8.56 -8.74 -8.99 -8.65 ...
..- attr(*, "dimnames")=List of 2
.. ..$ : chr [1:10] "wt1" "wt2" "wt3" "wt4" ...
.. ..$ : chr [1:10] "PC1" "PC2" "PC3" "PC4" ...
- attr(*, "class")= chr "prcomp"
plot(pca$x[,1], pca$x[,2])

## make a scree plot
pca.var <- pca$sdev^2
pca.var.per <- round(pca.var/sum(pca.var)*100, 1)
barplot(pca.var.per, main="Scree Plot", xlab="Principal Component", ylab="Percent Variation")

# now make a fancy looking plot that shows the PCs and the variation:
library(ggplot2)
pca.data <- data.frame(Sample=rownames(pca$x),
                       X=pca$x[,1],
                       Y=pca$x[,2])
pca.data
ggplot(data=pca.data, aes(x=X, y=Y, label=Sample)) +
  geom_text() +
  xlab(paste("PC1 - ", pca.var.per[1], "%", sep="")) +
  ylab(paste("PC2 - ", pca.var.per[2], "%", sep="")) +
  theme_bw() +
  ggtitle("My PCA Graph")

Many readers like to add an ellipse around each group:
## now make a fancy looking plot that shows the PCs and the variation:
library(ggplot2)
pca.data <- data.frame(Sample=rownames(pca$x),
                       X=pca$x[,1],
                       Y=pca$x[,2])
pca.data
library(tidyverse)
## derive the group (wt vs ko) by stripping the replicate number from the sample name
pca.data <- pca.data %>% mutate(group=gsub("1|2|3|4|5","",Sample))
pca.data
Sample X Y group
1 wt1 -9.093575 2.0606312 wt
2 wt2 -8.560679 -1.4518972 wt
3 wt3 -8.742473 3.1096518 wt
4 wt4 -8.985748 -2.4651641 wt
5 wt5 -8.652610 -1.2794233 wt
6 ko1 8.940648 -2.3306414 ko
7 ko2 8.769244 -0.5174678 ko
8 ko3 8.530437 0.1198165 ko
9 ko4 8.997000 1.9745708 ko
10 ko5 8.797756 0.7799236 ko
ggplot(data=pca.data, aes(x=X, y=Y, label=Sample, group=group, color=group)) +
  geom_text() +
  stat_ellipse(level = 0.99, show.legend = F) +
  #geom_polygon(data = pca.data, alpha = 0.3, show.legend = F) +
  xlab(paste("PC1 - ", pca.var.per[1], "%", sep="")) +
  ylab(paste("PC2 - ", pca.var.per[2], "%", sep="")) +
  theme_bw() +
  ggtitle("My PCA Graph")

Then, you can add a polygon instead:
## pca.data (with the group column) is built exactly as in the ellipse example above
ggplot(data=pca.data, aes(x=X, y=Y, label=Sample, group=group, color=group)) +
  geom_text() +
  #stat_ellipse(level = 0.99, show.legend = F) +
  geom_polygon(data = pca.data, alpha = 0.3, show.legend = F) +
  xlab(paste("PC1 - ", pca.var.per[1], "%", sep="")) +
  ylab(paste("PC2 - ", pca.var.per[2], "%", sep="")) +
  theme_bw() +
  ggtitle("My PCA Graph")

## get the name of the top 10 measurements (genes) that contribute
## most to pc1.
loading_scores <- pca$rotation[,1]
gene_scores <- abs(loading_scores) ## get the magnitudes
gene_score_ranked <- sort(gene_scores, decreasing=TRUE)
top_10_genes <- names(gene_score_ranked[1:10])
top_10_genes ## show the names of the top 10 genes
[1] "gene89" "gene96" "gene77" "gene55" "gene4" "gene31" "gene37" "gene8" "gene38" "gene32"
pca$rotation[top_10_genes,1] ## show the scores (and +/- sign)
gene89 gene96 gene77 gene55 gene4 gene31 gene37 gene8 gene38 gene32
0.1076586 -0.1076536 0.1076510 -0.1076323 -0.1075976 0.1075971 -0.1075662 -0.1075508 -0.1075468 -0.1075338
#######
##
## NOTE: Everything that follows is just bonus stuff.
## It simply demonstrates how to get the same
## results using "svd()" (Singular Value Decomposition) or using "eigen()"
## (Eigen Decomposition).
##
#######
############################################
##
## Now let's do the same thing with svd()
##
## svd() returns three things
## v = the "rotation" that prcomp() returns, this is a matrix of eigenvectors
## in other words, a matrix of loading scores
## u = this is similar to the "x" that prcomp() returns. In other words,
## sum(the rotation * the original data), but compressed to the unit vector
## You can spread it out by multiplying by "d"
## d = this is similar to the "sdev" values that prcomp() returns (and thus
##     related to the eigenvalues), except that d is not
##     scaled by the sample size in an unbiased way (i.e. by 1/(n-1)).
## For prcomp(), sdev = sqrt(var) = sqrt(ss(fit)/(n-1))
## For svd(), d = sqrt(ss(fit))
##
############################################
svd.stuff <- svd(scale(t(data.matrix), center=TRUE))
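## Sanity check (added here, not in the StatQuest original): prcomp()'s sdev
## should equal svd()'s d divided by sqrt(n - 1), where n is the number of samples.
n.samples <- nrow(pca$x)   ## 'n.samples' is a helper name introduced for this check
all.equal(pca$sdev, svd.stuff$d / sqrt(n.samples - 1))   ## should be TRUE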
## calculate the PCs
svd.data <- data.frame(Sample=colnames(data.matrix),
                       X=(svd.stuff$u[,1] * svd.stuff$d[1]),
                       Y=(svd.stuff$u[,2] * svd.stuff$d[2]))
svd.data
Sample X Y
1 wt1 -9.093575 2.0606312
2 wt2 -8.560679 -1.4518972
3 wt3 -8.742473 3.1096518
4 wt4 -8.985748 -2.4651641
5 wt5 -8.652610 -1.2794233
6 ko1 8.940648 -2.3306414
7 ko2 8.769244 -0.5174678
8 ko3 8.530437 0.1198165
9 ko4 8.997000 1.9745708
10 ko5 8.797756 0.7799236
## alternatively, we could compute the PCs with the eigen vectors and the
## original data
svd.pcs <- t(t(svd.stuff$v) %*% t(scale(t(data.matrix), center=TRUE)))
svd.pcs[,1:2] ## the first two principal components
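## Sanity check (added here, not in the StatQuest original): these columns
## should match the X and Y values computed above from u and d.
all.equal(unname(svd.pcs[,1]), svd.data$X)   ## should be TRUE
all.equal(unname(svd.pcs[,2]), svd.data$Y)   ## should be TRUE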
svd.df <- ncol(data.matrix) - 1
svd.var <- svd.stuff$d^2 / svd.df
svd.var.per <- round(svd.var/sum(svd.var)*100, 1)
ggplot(data=svd.data, aes(x=X, y=Y, label=Sample)) +
  geom_text() +
  xlab(paste("PC1 - ", svd.var.per[1], "%", sep="")) +
  ylab(paste("PC2 - ", svd.var.per[2], "%", sep="")) +
  theme_bw() +
  ggtitle("svd(scale(t(data.matrix), center=TRUE))")

Another approach: eigen()
###########################################
##
## Now let's do the same thing with eigen()
##
## eigen() returns two things...
## vectors = eigen vectors (vectors of loading scores)
## NOTE: pcs = sum(loading scores * values for sample)
## values = eigen values
##
############################################
cov.mat <- cov(scale(t(data.matrix), center=TRUE))
dim(cov.mat)
## since the covariance matrix is symmetric, we can tell eigen() to just
## work on the lower triangle with "symmetric=TRUE"
eigen.stuff <- eigen(cov.mat, symmetric=TRUE)
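## Sanity check (added here, not in the StatQuest original): the eigenvalues of
## the covariance matrix are the variances that prcomp() reports (pca$sdev^2),
## up to floating-point noise (the smallest eigenvalue is numerically zero).
all.equal(eigen.stuff$values, pca$sdev^2)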
dim(eigen.stuff$vectors)
head(eigen.stuff$vectors[,1:2])
[,1] [,2]
[1,] -1.071453e-01 -0.0005183740
[2,] -1.067197e-01 0.0176876608
[3,] 1.074979e-01 0.0008309942
[4,] 1.075976e-01 -0.0011406675
[5,] 1.071658e-01 -0.0197890483
[6,] -1.888509e-05 0.4457487042
eigen.pcs <- t(t(eigen.stuff$vectors) %*% t(scale(t(data.matrix), center=TRUE)))
eigen.pcs[,1:2]
[,1] [,2]
wt1 9.093575 -2.0606312
wt2 8.560679 1.4518972
wt3 8.742473 -3.1096518
wt4 8.985748 2.4651641
wt5 8.652610 1.2794233
ko1 -8.940648 2.3306414
ko2 -8.769244 0.5174678
ko3 -8.530437 -0.1198165
ko4 -8.997000 -1.9745708
ko5 -8.797756 -0.7799236
eigen.data <- data.frame(Sample=rownames(eigen.pcs),
                         X=(-1 * eigen.pcs[,1]), ## eigen() flips the X-axis in this case, so we flip it back
                         Y=eigen.pcs[,2])        ## X axis will be PC1, Y axis will be PC2
eigen.data
Sample X Y
wt1 wt1 -9.093575 -2.0606312
wt2 wt2 -8.560679 1.4518972
wt3 wt3 -8.742473 -3.1096518
wt4 wt4 -8.985748 2.4651641
wt5 wt5 -8.652610 1.2794233
ko1 ko1 8.940648 2.3306414
ko2 ko2 8.769244 0.5174678
ko3 ko3 8.530437 -0.1198165
ko4 ko4 8.997000 -1.9745708
ko5 ko5 8.797756 -0.7799236
eigen.var.per <- round(eigen.stuff$values/sum(eigen.stuff$values)*100, 1)
ggplot(data=eigen.data, aes(x=X, y=Y, label=Sample)) +
  geom_text() +
  xlab(paste("PC1 - ", eigen.var.per[1], "%", sep="")) +
  ylab(paste("PC2 - ", eigen.var.per[2], "%", sep="")) +
  theme_bw() +
  ggtitle("eigen on cov(t(data.matrix))")
