StatQuest at a Glance: PCA in R
2019-11-11
周运来就是我
What is PCA?
PCA is a dimensionality-reduction technique. The goal of principal component analysis (PCA) is to construct new features out of the existing ones; the new features strip out the redundancy among the original features and are therefore more discriminative. Note that the result of PCA is a set of new features, not simply a subset picked from the original feature list.
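As a minimal sketch of that idea (the toy data frame below, with two deliberately correlated features f1 and f2, is invented purely for illustration), prcomp() replaces correlated features with new, uncorrelated components:

set.seed(1)
x <- rnorm(50)
toy <- data.frame(f1 = x, f2 = x + rnorm(50, sd = 0.1))  ## f2 is nearly a copy of f1
toy.pca <- prcomp(toy, scale = TRUE)
summary(toy.pca)  ## PC1 carries almost all of the variance; PC2 is nearly redundant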
See 数量生态学笔记||非约束排序||PCA (Numerical Ecology Notes || Unconstrained Ordination || PCA), which also introduces many PCA-related concepts.

Of course, I will put the full example R code at the end of this post.
The data
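The data appears as screenshots in the original post; for reference, here is how the simulated data is built (this is the same setup as in the full listing at the end): 100 genes measured in five wild-type (wt) and five knock-out (ko) samples, with counts drawn from Poisson distributions.

## columns are samples (wt1-wt5, ko1-ko5); rows are genes
data.matrix <- matrix(nrow=100, ncol=10)
colnames(data.matrix) <- c(
  paste("wt", 1:5, sep=""),
  paste("ko", 1:5, sep=""))
rownames(data.matrix) <- paste("gene", 1:100, sep="")
for (i in 1:100) {
  wt.values <- rpois(5, lambda=sample(x=10:1000, size=1))
  ko.values <- rpois(5, lambda=sample(x=10:1000, size=1))
  data.matrix[i,] <- c(wt.values, ko.values)
}
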
Running PCA
PCA can be run with a few of R's base functions; note that the rows and columns of the input matrix correspond to the objects and the features, respectively. Be aware that different functions may expect the input matrix in different orientations.
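Here, data.matrix stores genes in rows and samples in columns, but prcomp() expects samples in rows, so the matrix is transposed first (as in the full listing below):

pca <- prcomp(t(data.matrix), scale=TRUE)  ## t() puts samples in rows
summary(pca)                               ## variance explained by each PC
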
Plotting the PCA graph
We usually pick the first two PCs to plot. Why? Because prcomp() orders the components by how much variation they explain, so PC1 and PC2 capture the largest share of the variation in the data; the scree plot below makes this explicit.
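A scree plot shows the percent of total variation captured by each component (from the full listing below):

pca.var <- pca$sdev^2                              ## variance of each PC
pca.var.per <- round(pca.var/sum(pca.var)*100, 1)  ## as a percentage
barplot(pca.var.per, main="Scree Plot",
        xlab="Principal Component", ylab="Percent Variation")
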
Note that PCA is not a clustering algorithm, even though the PCA plot clearly separates the samples into two clusters.
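The quick base-R version of the PCA plot (from the full listing below):

plot(pca$x[,1], pca$x[,2])  ## PC1 vs PC2
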
Then dress it up with ggplot2:
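The ggplot2 version labels each sample and puts the percent variation on the axes (from the full listing below):

library(ggplot2)
pca.data <- data.frame(Sample=rownames(pca$x),
                       X=pca$x[,1],
                       Y=pca$x[,2])
ggplot(data=pca.data, aes(x=X, y=Y, label=Sample)) +
  geom_text() +
  xlab(paste("PC1 - ", pca.var.per[1], "%", sep="")) +
  ylab(paste("PC2 - ", pca.var.per[2], "%", sep="")) +
  theme_bw() +
  ggtitle("My PCA Graph")
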
Interpreting the PCA results
How do we extract the loading scores and the eigenvalues?
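The loading scores live in pca$rotation; ranking the genes by the absolute value of their PC1 loadings shows which genes drive the separation (from the full listing below):

loading_scores <- pca$rotation[,1]                ## loadings for PC1
gene_scores <- abs(loading_scores)                ## magnitudes
gene_score_ranked <- sort(gene_scores, decreasing=TRUE)
top_10_genes <- names(gene_score_ranked[1:10])
pca$rotation[top_10_genes,1]                      ## scores with their +/- signs
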
Below is the full example R code, as promised at the beginning:

## In this example, the data is in a matrix called
## data.matrix
## columns are individual samples (i.e. cells)
## rows are measurements taken for all the samples (i.e. genes)
## Just for the sake of the example, here's some made up data...
data.matrix <- matrix(nrow=100, ncol=10)
colnames(data.matrix) <- c(
  paste("wt", 1:5, sep=""),
  paste("ko", 1:5, sep=""))
rownames(data.matrix) <- paste("gene", 1:100, sep="")
for (i in 1:100) {
  wt.values <- rpois(5, lambda=sample(x=10:1000, size=1))
  ko.values <- rpois(5, lambda=sample(x=10:1000, size=1))
  data.matrix[i,] <- c(wt.values, ko.values)
}
head(data.matrix)
      wt1 wt2 wt3 wt4 wt5 ko1 ko2 ko3 ko4 ko5
gene1 267 259 279 247 281 771 838 793 749 754
gene2 410 387 387 406 404 720 748 729 673 740
gene3 858 825 875 902 859 264 242 281 258 282
gene4 829 795 767 795 760 71 74 67 75 71
gene5 317 263 300 309 270 24 21 33 21 25
gene6 497 520 474 521 517 523 496 510 484 516
dim(data.matrix)
[1] 100  10
pca <- prcomp(t(data.matrix), scale=TRUE)
summary(pca)
Importance of components:
PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8 PC9 PC10
Standard deviation 9.2854 1.94249 1.73588 1.5781 1.26574 1.05160 0.86755 0.77237 0.66860 3.628e-15
Proportion of Variance 0.8622 0.03773 0.03013 0.0249 0.01602 0.01106 0.00753 0.00597 0.00447 0.000e+00
Cumulative Proportion 0.8622 0.89992 0.93005 0.9550 0.97098 0.98204 0.98956 0.99553 1.00000 1.000e+00
str(pca)
List of 5
$ sdev : num [1:10] 9.29 1.94 1.74 1.58 1.27 ...
$ rotation: num [1:100, 1:10] 0.107 0.107 -0.107 -0.108 -0.107 ...
..- attr(*, "dimnames")=List of 2
.. ..$ : chr [1:100] "gene1" "gene2" "gene3" "gene4" ...
.. ..$ : chr [1:10] "PC1" "PC2" "PC3" "PC4" ...
$ center : Named num [1:100] 524 560 565 430 158 ...
..- attr(*, "names")= chr [1:100] "gene1" "gene2" "gene3" "gene4" ...
$ scale : Named num [1:100] 272 172 316 379 142 ...
..- attr(*, "names")= chr [1:100] "gene1" "gene2" "gene3" "gene4" ...
$ x : num [1:10, 1:10] -9.09 -8.56 -8.74 -8.99 -8.65 ...
..- attr(*, "dimnames")=List of 2
.. ..$ : chr [1:10] "wt1" "wt2" "wt3" "wt4" ...
.. ..$ : chr [1:10] "PC1" "PC2" "PC3" "PC4" ...
- attr(*, "class")= chr "prcomp"
plot(pca$x[,1], pca$x[,2])

## make a scree plot
pca.var <- pca$sdev^2
pca.var.per <- round(pca.var/sum(pca.var)*100, 1)
barplot(pca.var.per, main="Scree Plot", xlab="Principal Component", ylab="Percent Variation")

# now make a fancy looking plot that shows the PCs and the variation:
library(ggplot2)
pca.data <- data.frame(Sample=rownames(pca$x),
                       X=pca$x[,1],
                       Y=pca$x[,2])
pca.data
ggplot(data=pca.data, aes(x=X, y=Y, label=Sample)) +
  geom_text() +
  xlab(paste("PC1 - ", pca.var.per[1], "%", sep="")) +
  ylab(paste("PC2 - ", pca.var.per[2], "%", sep="")) +
  theme_bw() +
  ggtitle("My PCA Graph")

Many readers like to add an ellipse around each group:
## now make a fancy looking plot that shows the PCs and the variation:
library(ggplot2)
pca.data <- data.frame(Sample=rownames(pca$x),
                       X=pca$x[,1],
                       Y=pca$x[,2])
pca.data
library(tidyverse)
## derive the group (wt vs ko) by stripping the replicate number from the sample name
pca.data <- pca.data %>% mutate(group=gsub("1|2|3|4|5","",Sample))
pca.data
Sample X Y group
1 wt1 -9.093575 2.0606312 wt
2 wt2 -8.560679 -1.4518972 wt
3 wt3 -8.742473 3.1096518 wt
4 wt4 -8.985748 -2.4651641 wt
5 wt5 -8.652610 -1.2794233 wt
6 ko1 8.940648 -2.3306414 ko
7 ko2 8.769244 -0.5174678 ko
8 ko3 8.530437 0.1198165 ko
9 ko4 8.997000 1.9745708 ko
10 ko5 8.797756 0.7799236 ko
ggplot(data=pca.data, aes(x=X, y=Y, label=Sample, group=group, color=group)) +
  geom_text() +
  stat_ellipse(level = 0.99, show.legend = F) +
  #geom_polygon(data = pca.data, alpha = 0.3, show.legend = F) +
  xlab(paste("PC1 - ", pca.var.per[1], "%", sep="")) +
  ylab(paste("PC2 - ", pca.var.per[2], "%", sep="")) +
  theme_bw() +
  ggtitle("My PCA Graph")

Then, you can add a polygon instead:
## pca.data (with the group column) is built exactly as in the ellipse example above
ggplot(data=pca.data, aes(x=X, y=Y, label=Sample, group=group, color=group)) +
  geom_text() +
  #stat_ellipse(level = 0.99, show.legend = F) +
  geom_polygon(data = pca.data, alpha = 0.3, show.legend = F) +
  xlab(paste("PC1 - ", pca.var.per[1], "%", sep="")) +
  ylab(paste("PC2 - ", pca.var.per[2], "%", sep="")) +
  theme_bw() +
  ggtitle("My PCA Graph")

## get the name of the top 10 measurements (genes) that contribute
## most to pc1.
loading_scores <- pca$rotation[,1]
gene_scores <- abs(loading_scores) ## get the magnitudes
gene_score_ranked <- sort(gene_scores, decreasing=TRUE)
top_10_genes <- names(gene_score_ranked[1:10])
top_10_genes ## show the names of the top 10 genes
[1] "gene89" "gene96" "gene77" "gene55" "gene4" "gene31" "gene37" "gene8" "gene38" "gene32"
pca$rotation[top_10_genes,1] ## show the scores (and +/- sign)
gene89 gene96 gene77 gene55 gene4 gene31 gene37 gene8 gene38 gene32
0.1076586 -0.1076536 0.1076510 -0.1076323 -0.1075976 0.1075971 -0.1075662 -0.1075508 -0.1075468 -0.1075338
#######
##
## NOTE: Everything that follows is just bonus stuff.
## It simply demonstrates how to get the same
## results using "svd()" (Singular Value Decomposition) or using "eigen()"
## (Eigen Decomposition).
##
#######
############################################
##
## Now let's do the same thing with svd()
##
## svd() returns three things
## v = the "rotation" that prcomp() returns, this is a matrix of eigenvectors
## in other words, a matrix of loading scores
## u = this is similar to the "x" that prcomp() returns. In other words,
## sum(the rotation * the original data), but compressed to the unit vector
## You can spread it out by multiplying by "d"
## d = this is similar to the "sdev" values that prcomp() returns (and thus
##     related to the eigenvalues), except that d is not
##     scaled by the sample size in an unbiased way (i.e. by 1/(n-1)).
## For prcomp(), sdev = sqrt(var) = sqrt(ss(fit)/(n-1))
## For svd(), d = sqrt(ss(fit))
##
############################################
svd.stuff <- svd(scale(t(data.matrix), center=TRUE))
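## Sanity check (added here, not in the StatQuest original): prcomp()'s sdev
## should equal svd()'s d divided by sqrt(n - 1), where n is the number of samples.
n.samples <- nrow(pca$x)   ## 'n.samples' is a helper name introduced for this check
all.equal(pca$sdev, svd.stuff$d / sqrt(n.samples - 1))   ## should be TRUE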
## calculate the PCs
svd.data <- data.frame(Sample=colnames(data.matrix),
                       X=(svd.stuff$u[,1] * svd.stuff$d[1]),
                       Y=(svd.stuff$u[,2] * svd.stuff$d[2]))
svd.data
Sample X Y
1 wt1 -9.093575 2.0606312
2 wt2 -8.560679 -1.4518972
3 wt3 -8.742473 3.1096518
4 wt4 -8.985748 -2.4651641
5 wt5 -8.652610 -1.2794233
6 ko1 8.940648 -2.3306414
7 ko2 8.769244 -0.5174678
8 ko3 8.530437 0.1198165
9 ko4 8.997000 1.9745708
10 ko5 8.797756 0.7799236
## alternatively, we could compute the PCs with the eigen vectors and the
## original data
svd.pcs <- t(t(svd.stuff$v) %*% t(scale(t(data.matrix), center=TRUE)))
svd.pcs[,1:2] ## the first two principal components
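## Sanity check (added here, not in the StatQuest original): these columns
## should match the X and Y values computed above from u and d.
all.equal(unname(svd.pcs[,1]), svd.data$X)   ## should be TRUE
all.equal(unname(svd.pcs[,2]), svd.data$Y)   ## should be TRUE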
svd.df <- ncol(data.matrix) - 1
svd.var <- svd.stuff$d^2 / svd.df
svd.var.per <- round(svd.var/sum(svd.var)*100, 1)
ggplot(data=svd.data, aes(x=X, y=Y, label=Sample)) +
  geom_text() +
  xlab(paste("PC1 - ", svd.var.per[1], "%", sep="")) +
  ylab(paste("PC2 - ", svd.var.per[2], "%", sep="")) +
  theme_bw() +
  ggtitle("svd(scale(t(data.matrix), center=TRUE))")

Another approach: eigen()
###########################################
##
## Now let's do the same thing with eigen()
##
## eigen() returns two things...
## vectors = eigen vectors (vectors of loading scores)
## NOTE: pcs = sum(loading scores * values for sample)
## values = eigen values
##
############################################
cov.mat <- cov(scale(t(data.matrix), center=TRUE))
dim(cov.mat)
## since the covariance matrix is symmetric, we can tell eigen() to just
## work on the lower triangle with "symmetric=TRUE"
eigen.stuff <- eigen(cov.mat, symmetric=TRUE)
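## Sanity check (added here, not in the StatQuest original): the eigenvalues of
## the covariance matrix are the variances that prcomp() reports (pca$sdev^2),
## up to floating-point noise (the smallest eigenvalue is numerically zero).
all.equal(eigen.stuff$values, pca$sdev^2)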
dim(eigen.stuff$vectors)
head(eigen.stuff$vectors[,1:2])
[,1] [,2]
[1,] -1.071453e-01 -0.0005183740
[2,] -1.067197e-01 0.0176876608
[3,] 1.074979e-01 0.0008309942
[4,] 1.075976e-01 -0.0011406675
[5,] 1.071658e-01 -0.0197890483
[6,] -1.888509e-05 0.4457487042
eigen.pcs <- t(t(eigen.stuff$vectors) %*% t(scale(t(data.matrix), center=TRUE)))
eigen.pcs[,1:2]
[,1] [,2]
wt1 9.093575 -2.0606312
wt2 8.560679 1.4518972
wt3 8.742473 -3.1096518
wt4 8.985748 2.4651641
wt5 8.652610 1.2794233
ko1 -8.940648 2.3306414
ko2 -8.769244 0.5174678
ko3 -8.530437 -0.1198165
ko4 -8.997000 -1.9745708
ko5 -8.797756 -0.7799236
eigen.data <- data.frame(Sample=rownames(eigen.pcs),
                         X=(-1 * eigen.pcs[,1]), ## eigen() flips the X-axis in this case, so we flip it back
                         Y=eigen.pcs[,2])        ## X axis will be PC1, Y axis will be PC2
eigen.data
Sample X Y
wt1 wt1 -9.093575 -2.0606312
wt2 wt2 -8.560679 1.4518972
wt3 wt3 -8.742473 -3.1096518
wt4 wt4 -8.985748 2.4651641
wt5 wt5 -8.652610 1.2794233
ko1 ko1 8.940648 2.3306414
ko2 ko2 8.769244 0.5174678
ko3 ko3 8.530437 -0.1198165
ko4 ko4 8.997000 -1.9745708
ko5 ko5 8.797756 -0.7799236
eigen.var.per <- round(eigen.stuff$values/sum(eigen.stuff$values)*100, 1)
ggplot(data=eigen.data, aes(x=X, y=Y, label=Sample)) +
  geom_text() +
  xlab(paste("PC1 - ", eigen.var.per[1], "%", sep="")) +
  ylab(paste("PC2 - ", eigen.var.per[2], "%", sep="")) +
  theme_bw() +
  ggtitle("eigen on cov(t(data.matrix))")
