R语言~高级PCA分析
2022-07-23 本文已影响0人
Oodelay
主成分分析,不只需要找到几个虚拟但不知具体为何物的主成分因子,还需要知道我们提供的变量在各个主成分因子上的载荷(有正有负)和贡献度,以及具体保留多少个主成分因子才能既尽量保留数据的原始变异又能避免过度冗余。这是一个值得思考的问题。甚至能发nature,science。
加载程序包
packages <- c("ggplot2", "reshape2", 'ggpmisc', 'stringr',
'vegan', "FactoMineR", "factoextra", "ade4", "ggpubr",'scales')
# Install packages not yet installed
installed_packages <- packages %in% rownames(installed.packages())
if (any(installed_packages == FALSE)) {
install.packages(packages[!installed_packages])
}
invisible(lapply(packages, library, character.only = TRUE))
library('dplyr')
source('utils.R')
读入并整理变量数据
dat <- read.table("../Data/GlobalAtlasv2_scores_v5.txt", header = T, sep = "\t")
head(dat)
dat$AridityIndex_wc2 = NULL
names(dat) = gsub('_18s_richness','', names(dat))
names(dat) = gsub('Richness_bacteria','Bacteria', names(dat))
names(dat) = gsub('_wc2','',names(dat))
names(dat) = gsub('WaterHoldingCapacity','WHC',names(dat))
names(dat) = gsub('Proportion_decomposers','Decomposers',names(dat))
names(dat) = gsub('Proportion_symbiotic_fungi','Sym_fungi', names(dat))
names(dat) = gsub('Pathogen_control','Pathogen_ctrl',names(dat))
names(dat) = gsub('Soil_respiration','Soil_resp',names(dat))
names(dat) = gsub('Soil_salinity','Salinity',names(dat))
names(dat) = gsub('Soil_pH','pH', names(dat))
names(dat) = gsub('Clay_silt_c','Clay_silt',names(dat))
names(dat) = gsub('Soil_ORC_g_kg','SOC', names(dat))
names(dat) = gsub("PO4_c", "PO4", names(dat))
names(dat) = gsub('Aridity_Index_v3','AridityIndex', names(dat))
dat$Salinity = log(dat$Salinity)
构建向量存储变量组合,以便后文直接引用
div_var = c("Oomycota","Bacteria", "Fungi", "Nematoda","Ciliophora",
"Amoebozoa","Excavata", "Apicomplexa","Platyhelminthes", "Annelida",
"Rotifera","Cercozoa", "Ochrophyta", "Dinoflagellata", "Arachnida",
"Collembola", "Neoptera","Myriapoda", "Tardigrada", "Chlorophyta")
func_var = c("Porosity","WHC","NH4","NO3","PO4","NAG","Soil_resp",
"NPP","Pathogen_ctrl" ,"Sym_fungi","Decomposers")
env_var = c("pH", "Salinity", "Clay_silt", "SOC", "Plant_cover", "MAT", "PSEA", "MDR", "TSEA",'AridityIndex')
执行PCA分析
pca = PCA(scale(subset(dat, select = func_var)), graph = F, ncp = 5)
ind = get_pca_ind(pca)
dat$PC1 <- ind$coord[,1]
dat$PC2 <- ind$coord[,2]
dat$PC3 <- ind$coord[,3]
dat$PC4 <- ind$coord[,4]
dat$PC5 <- ind$coord[,5]
绘制PCA排序图
Biplot <-fviz_pca_biplot(pca, fill.ind = dat$Eco_corrected, palette = 'lancet',
col.ind = "white", geom.ind = "point",
label ="none", labelsize = 2,
col.var = "black", arrowsize = 0.3,
repel=TRUE, pointshape = 21, pointsize = 5, alpha.ci = 0.5) +
# scale_fill_manual(breaks = c('Forest', 'Grassland'), values = c('bisque4','chartreuse4',))+
geom_hline(yintercept = 0, lty = 2, lwd = 0.1)+
geom_vline(xintercept = 0, lty = 2, lwd = 0.1) +
labs(fill = "Vegetation",
title = NULL,
x = sprintf("PC1 (%.2f %s)", round(pca$eig[1,2],2), '%'),
y = sprintf("PC2 (%.2f %s)", round(pca$eig[2,2],2), '%')) +
theme_classic()+
theme(axis.title = element_text(size = 6,color = 'black'),
axis.text = element_text(size = 6,color = 'black'),
panel.grid = element_blank(),
legend.background = element_blank(),
legend.box.background = element_blank(),
legend.position = c(0.15,0.2),
legend.key.height = unit(0.3,'cm'),
legend.title = element_text(size = 6,color = 'black'),
legend.text = element_text(size = 6, color = 'black',margin = margin(l = -8)))
判定最优的主成分因子个数
pca1 <- dudi.pca(subset(dat, select = func_var), center = TRUE, scale = TRUE, scannf=FALSE, nf = 5)
test1 <- testdim(pca1, nrepet = 999)
test1
print(paste("Number of statistical significant components according to Dray et al., 2006:", test1$nb.cor))
提取各变量在主成分因子上的载荷并进行显著性判定
var <- pca$var
# (1) bootstrapped egigenvector, p value < 0.05
boot5 <- netoboot(subset(dat, select = func_var), scannf = F, nf = 5, permutations = 1000)
pvalmatrix <- computePval_peres(pca1$c1, boot5)
# (2) fixed threshold
coordPCthr_val <- rbind(abs(var$coord[,1]), abs(var$coord[,2]), abs(var$coord[,3]),
abs(var$coord[,4]), abs(var$coord[,5]))#Richman 1988
coordPCthr <- rbind(ifelse(abs(var$coord[,1])>0.3, TRUE, FALSE),
ifelse(abs(var$coord[,2])>0.3, TRUE, FALSE),
ifelse(abs(var$coord[,3])>0.3, TRUE, FALSE),
ifelse(abs(var$coord[,4])>0.3, TRUE, FALSE),
ifelse(abs(var$coord[,5])>0.3, TRUE, FALSE))#Richman 1988
# (3) threshold method, their contribution is above 100/ndim
contrPCthr <- rbind(ifelse(var$contrib[,1]>(100/length(func_var)), TRUE, FALSE),
ifelse(var$contrib[,2]>(100/length(func_var)), TRUE, FALSE),
ifelse(var$contrib[,3]>(100/length(func_var)), TRUE, FALSE),
ifelse(var$contrib[,4]>(100/length(func_var)), TRUE, FALSE),
ifelse(var$contrib[,5]>(100/length(func_var)), TRUE, FALSE))
contrPCthr_val <- rbind(var$contrib[,1], var$contrib[,2], var$contrib[,3],
var$contrib[,4], var$contrib[,5])
df.PCA.sign <- data.frame(rbind(pvalmatrix, contrPCthr_val, coordPCthr_val))
df.PCA.sign$PC <- rep(c("PC1", "PC2", "PC3", "PC4", "PC5"), 3)
df.PCA.sign$Method <- rep(c("Peres-Neto et al., 2003", "Threshold", "Threshold Richmann 1988"),each = 5)
write.table(df.PCA.sign, file = 'PCA_signtest.csv', row.names = F)
## -- define the relevant loadings for the final plotting
pvalmatrix_TF <- ifelse(pvalmatrix < 0.05, TRUE, FALSE)
三种检验方法均指示显著差才是最终显著
pc1Relevance<-coordPCthr[1,]*pvalmatrix_TF[1,]*contrPCthr[1,]
pc2Relevance<-coordPCthr[2,]*pvalmatrix_TF[2,]*contrPCthr[2,]
pc3Relevance<-coordPCthr[3,]*pvalmatrix_TF[3,]*contrPCthr[3,]
pc4Relevance<-coordPCthr[4,]*pvalmatrix_TF[4,]*contrPCthr[4,]
pc5Relevance<-coordPCthr[5,]*pvalmatrix_TF[5,]*contrPCthr[5,]
整理结果并绘图
df.loadings <- data.frame(EFP = rep(row.names(var$coord),5),
val = c(var$coord[,1],var$coord[,2], var$coord[,3],
var$coord[,4], var$coord[,5]),
contr = c(ifelse(pc1Relevance == 1, 'High', 'Low'),
ifelse(pc2Relevance == 1, 'High', 'Low'),
ifelse(pc3Relevance == 1, 'High', 'Low'),
ifelse(pc4Relevance == 1, 'High', 'Low'),
ifelse(pc5Relevance == 1, 'High', 'Low')),
PC = c(rep("PC1",length(row.names(var$coord))),
rep("PC2",length(row.names(var$coord))),
rep("PC3",length(row.names(var$coord))),
rep("PC4",length(row.names(var$coord))),
rep("PC5",length(row.names(var$coord)))))
df.loadings$EFP = factor(df.loadings$EFP,
levels = c('Porosity','WHC','Soil_resp','NPP','NO3','NH4','PO4',
'NAG','Pathogen_ctrl','Decomposers','Sym_fungi'),
labels = c('Porosity','WHC',bquote('Soil'~'Resp'),'NPP',
bquote("NO"[3]),bquote('NH'[4]),bquote('PO'[4]),
'NAG',bquote('Pathogen'~ 'Ctrl'), bquote('Decomposers'~'(%)'),
bquote('Sym-fungi'~'(%)')))
loadings_p <- ggplot(data= subset(df.loadings, !PC %in% c("PC4",'PC5')),
aes(x=EFP, y=val, fill = contr)) +
facet_grid(. ~ PC, scales = "free_y") +
scale_y_continuous(breaks = c(-0.4,0,0.4,0.8), labels = dropLeadingZero)+
scale_x_discrete(labels = parse(text = levels(df.contrib$EFP)))+
geom_bar(stat="identity", position=position_dodge()) + coord_flip() +
scale_fill_manual(values=c("#E69F00", "#999999")) + theme_classic() +
labs(y="Loadings", x = NULL) +
theme(legend.position='none',
strip.text = element_text(size = 6,color = 'black'),
strip.background = element_blank(),
axis.title = element_text(size = 6,color = 'black'),
axis.text = element_text(size = 6,color = 'black'))
df.contrib <- data.frame(EFP = rep(row.names(var$contrib),5),
val = c(var$contrib[,1],var$contrib[,2],var$contrib[,3],
var$contrib[,4],var$contrib[,5]),
contr = c(ifelse(pc1Relevance == 1, 'High', 'Low'),
ifelse(pc2Relevance == 1, 'High', 'Low'),
ifelse(pc3Relevance == 1, 'High', 'Low'),
ifelse(pc4Relevance == 1, 'High', 'Low'),
ifelse(pc5Relevance == 1, 'High', 'Low')),
PC = c(rep("PC1",length(row.names(var$contrib))),
rep("PC2",length(row.names(var$contrib))),
rep("PC3",length(row.names(var$contrib))),
rep("PC4",length(row.names(var$contrib))),
rep("PC5",length(row.names(var$contrib)))))
df.contrib$EFP = factor(df.contrib$EFP,
levels = c('Porosity','WHC','Soil_resp','NPP','NO3','NH4','PO4',
'NAG','Pathogen_ctrl','Decomposers','Sym_fungi'),
labels = c('Porosity','WHC',bquote('Soil'~'Resp'),'NPP',bquote("NO"[3]),
bquote('NH'[4]),bquote('PO'[4]),'NAG',
bquote('Pathogen'~ 'Ctrl'), bquote('Decomposers'~'(%)'),
bquote('Sym-fungi'~'(%)')))
contribs_p <- ggplot(data=subset(df.contrib, !PC %in% c("PC4",'PC5')),
aes(x=EFP, y=val, fill = contr)) +
facet_grid(. ~ PC, scales = "free_y") +
geom_bar(stat="identity", position=position_dodge()) + coord_flip() +
theme_classic() +
scale_fill_manual(values=c("#E69F00", "#999999")) +
scale_x_discrete(labels = parse(text = levels(df.contrib$EFP)))+
labs(y="Contribution [%]", x = NULL) +
theme(legend.key.size = unit(0.3, 'cm'),
legend.title = element_text(size=6,color = 'black'),
legend.text = element_text(size=6,color = 'black'),
legend.background = element_blank(),
legend.position = c(0.9, 0.5),
strip.text = element_text(size = 6,color = 'black'),
strip.background = element_blank(),
axis.title = element_text(size = 6,color = 'black'),
axis.text = element_text(size = 6,color = 'black'))
exp_var_p <- fviz_eig(pca, addlabels=F,
barfill="white", barcolor ="darkblue",
ncp = 5, labelsize = .1,
linecolor ="red") +
theme_classic() +
scale_y_continuous(expand = c(0,0))+
geom_label(aes(x=1, y=33, label="Stretch it"), vjust= -1)+
labs(title = NULL, x = "Principal Components (PC)", y = "Explained variance (%)") +
theme(axis.title = element_text(size = 6,color = 'black'),
axis.text = element_text(size = 6, color = 'black'),
plot.margin = margin(l = 20, b = 6, t = 6, r = 6))
figure2 <- ggarrange(Biplot, exp_var_p, contribs_p, loadings_p,
labels = c("a", "b","c","d"),
font.label=list(face = "bold", color="black",size= 10,
axis.text.y = element_text(face="bold")),
common.legend = FALSE,
nrow = 2, ncol = 2)
Figure2.jpg