R语言生物信息生信软件

ggplot2绘制PCA图(1)

2019-05-28  本文已影响15人  Zhigang_Han

12个组,每组3个重复,一共36个样品的主成分分析图

1、导入包和数据处理

library(tidyr)
library(dplyr)
library(ggplot2)
data_o <- read.table("./PCA.txt",
                   header = T)
> data_o
               S1           S2           S3           S4           S5           S6           S7
M1   7.395760e+04 7.078706e+04 8.417177e+04  137621.1106  147372.6292  164831.9100 7.309625e+04
M2   3.943851e+03 5.747331e+03 5.227088e+03    6523.3657    6013.1556   12186.0886 5.114227e+03
M3   6.510884e+03 5.606016e+03 6.351800e+03   12000.8553   13808.4395   14835.7556 6.354940e+03
M4   1.384136e+05 1.317561e+05 1.553619e+05  266360.3396  289288.2152  339461.9371 1.301302e+05
M5   3.801214e+03 4.022403e+03 5.158690e+03    6174.7307    6762.0510    8130.7640 4.632161e+03
M6   1.201156e+03 1.123829e+03 1.782138e+03    2042.1169    1610.9445    2399.4726 9.799112e+02
M7   5.773008e+04 3.799364e+04 6.130102e+04   30516.9603   63046.2774   24196.7880 4.144398e+04
#翻转数据
data <- t(data_o)
> data
           M1        M2        M3        M4       M5        M6        M7         M8        M9      M10
S1   73957.60  3943.851  6510.884 138413.57 3801.214 1201.1558  57730.08   416.6729 1316.5815 118533.6
S2   70787.06  5747.331  5606.016 131756.14 4022.403 1123.8288  37993.64  1213.8793 1540.8043 134324.5
S3   84171.77  5227.088  6351.800 155361.89 5158.690 1782.1378  61301.02   534.9507 1244.3340 124778.7
S4  137621.11  6523.366 12000.855 266360.34 6174.731 2042.1169  30516.96  2634.5004 2461.8294 122279.8
S5  147372.63  6013.156 13808.440 289288.22 6762.051 1610.9445  63046.28   401.3044 2603.8165 119305.2
S6  164831.91 12186.089 14835.756 339461.94 8130.764 2399.4726  24196.79   473.9180 3278.3469 100409.4
#读取数据分组情况
group <- read.table("./group.txt",
                    header = T)
> group
   Group samplenames
1     G1          S1
2     G1          S2
3     G1          S3
4     G2          S4
5     G2          S5
6     G2          S6
7     G3          S7
8     G3          S8
9     G3          S9
10    G4         S10
11    G4         S11
12    G4         S12
###上述数据格式只列出一部分

2、主成分分析

#主成分计算
pca_data <- prcomp(data, scale = TRUE)
#查看合适主成分个数
screeplot(pca_data, type = "lines")
summary(pca_data)
#查看行名,确认是否为36个样品的名称
rownames(pca_data$x)
> rownames(pca_data$x)
 [1] "S1"  "S2"  "S3"  "S4"  "S5"  "S6"  "S7"  "S8"  "S9"  "S10" "S11" "S12" "S13" "S14" "S15" "S16" "S17"
[18] "S18" "S19" "S20" "S21" "S22" "S23" "S24" "S25" "S26" "S27" "S28" "S29" "S30" "S31" "S32" "S33" "S34"
[35] "S35" "S36"
#提取PC1的百分比
x_per <- round(summary(pca_data)$importance[2, 1]*100, 1)
#提取PC2的百分比
y_per <- round(summary(pca_data)$importance[2, 2]*100, 1)
#按照样品名称添加组并且合并
df_sample <- data.frame(samplenames=rownames(pca_data$x), pca_data$x) %>%
  left_join(group, by = "samplenames")
#数据尾部添加变量Group,只截取部分数据展示
> df_sample
          PC31         PC32        PC33        PC34         PC35          PC36 Group
1   0.306133837  0.161442452  0.03766617  0.30708158 -0.186722792 -1.422473e-15    G1
2   0.237143749 -0.599197140 -0.52253461 -0.36597316  0.205268460 -1.526557e-15    G1
3  -0.073255941 -0.069312851  0.02810155 -0.15750184  0.194224495 -1.741662e-15    G1
4   0.593003778 -0.003056265 -0.31652774 -0.35467364 -0.153817055 -1.013079e-15    G2
5  -0.368112005  0.085168850  0.47080323  0.50937052  0.114099679 -1.249001e-15    G2
6  -0.160988819  0.140851886 -0.28354428 -0.05074411 -0.068941571 -1.082467e-15    G2
7   0.323091269  0.126995166  0.44969245 -0.65800755 -0.352720233 -6.522560e-16    G3
8  -0.801287671 -0.206218914 -0.16401072  0.19778263  0.516041135 -1.085070e-15    G3
9  -0.082727209  0.303859473 -0.04526318  0.51639150 -0.266017284 -1.762479e-15    G3
10  0.023928552  0.526276376 -0.37558251 -0.52455436 -0.378126147 -7.771561e-16    G4
11  0.324106409 -0.183442282  0.36741010 -0.20372081  0.423367465 -1.332268e-15    G4
12 -0.339002487 -0.271889208  0.36575640  0.66261644  0.001102365 -1.235123e-15    G4

3、ggplot绘图

#设置适合科研的背景色
theme_set(theme_bw())
#绘图
plot_1 <- ggplot(df_sample, aes(x = PC1, y = PC2)) +
  geom_point(aes(color=Group),size=5) +
  xlab(paste("PC1","(", x_per,"%)",sep=" ")) +
  ylab(paste("PC2","(", y_per,"%)",sep=" ")) +
#t图例按照G1,G2,G3...排列
  scale_color_discrete(limits = c(paste("G", seq(1:12),sep = "")))
image.png
上一篇下一篇

猜你喜欢

热点阅读