跟着Nature Plants学数据分析:R语言WGCNA分析完
2022-08-16 本文已影响0人
小明的数据分析笔记本
论文
Plant flavones enrich rhizosphere Oxalobacteraceae to improve maize performance under nitrogen deprivation
https://www.nature.com/articles/s41477-021-00897-y#data-availability
https://github.com/PengYuMaize/Yu2021NaturePlants
https://www.youtube.com/watch?v=dwWhm78j8YU 作者还录了视频介绍
论文中提供的数据有100多个样本,这里我减少数据量,只用19个样本
表达量的count文件
image.png最后一列是基因长度,用来计算fpkm
样本的分组文件
image.png样本的表型
image.png样本的表型论文中提供的数据也有很多,这里我只选取其中的一部分
代码
代码里有很多细节我也不明白,但是能够从前到后运行完,这里记录一下,细节有空再来研究吧
加载需要的R包
library(WGCNA)
enableWGCNAThreads(nThreads = 6)
#BiocManager::install("DESeq2")
library(DESeq2)
这里用到deseq2主要是用来计算fpkm的
读取三个数据
data0<-read.table("my_counts.csv",
header=T,
row.names=1,
sep=",")
head(data0)
sample_metadata <- read.csv(file = "sample_info.csv",
header = TRUE)
head(sample_metadata)
bac_traits<-read.csv("input_data/traits.csv",row.names = 1)
head(bac_traits)
利用count值计算fpkm并计算log2然后转置
dataExpr_deseq <- DESeqDataSetFromMatrix(countData = data0[,-20],
colData = sample_metadata,
design = ~ Zone)
mcols(dataExpr_deseq)$basepairs = data0$geneLength
fpkm_matrix = fpkm(dataExpr_deseq)
datExpr = t(log2(fpkm_matrix+1))
dim(datExpr)
对数据进行过滤
nSamples = nrow(datExpr) # 统计样品数目
variancedatExpr=as.vector(apply(as.matrix(datExpr),2,var, na.rm=T)) #按列(基因)取方差
no.missingdatExpr=as.vector(apply(is.na(as.matrix(datExpr)),2, sum) )#按列(基因)统计缺失数目
KeepGenes= variancedatExpr>0 & no.missingdatExpr<0.1*nSamples # 保留方差不等于0,且缺失低于10%的基因
table(KeepGenes)# 过滤统计
datExpr=datExpr[, KeepGenes]
这一步原文的代码里没有,可能是我减少了样本的原因,如果没有这一步后面后有报错
第一步是样本聚类树
sampleTree = hclust(dist(datExpr), method = "average")
pdf(file = "1-n-sampleClustering.pdf", width = 12, height = 9);
par(cex = 0.5);
par(mar = c(0,4,2,0))
plot(sampleTree, main = "Sample clustering to detect outliers", sub="", xlab="",
cex.lab = 1.5,cex.axis = 1.5, cex.main = 2)
dev.off()
image.png
第二步是筛选阈值
powers = c(c(1:20), seq(from = 22, to=30, by=2))
sft = pickSoftThreshold(datExpr, powerVector = powers, verbose = 5)
# Scale-free topology fit index as a function of the soft-thresholding power
pdf(file = "2-n-sft.pdf", width = 9, height = 5);
par(mfrow = c(1,2));
cex1 = 0.9;
plot(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
xlab="Soft Threshold (power)",ylab="Scale Free Topology Model Fit,signed R^2",type="n",
main = paste("Scale independence"));
text(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
labels=powers,cex=cex1,col="red");
# this line corresponds to using an R^2 cut-off of h
abline(h=0.90,col="red")
# Mean connectivity as a function of the soft-thresholding power
plot(sft$fitIndices[,1], sft$fitIndices[,5],
xlab="Soft Threshold (power)",ylab="Mean Connectivity", type="n",
main = paste("Mean connectivity"))
text(sft$fitIndices[,1], sft$fitIndices[,5], labels=powers, cex=cex1,col="red")
dev.off()
image.png
第三步是基因聚类树
power=sft$powerEstimate
power
#TOM = TOMsimilarityFromExpr(datExpr, power = power) 这行代码要好长时间,需要比较大的内存
## long time big mem
## multiple cpu
## 这一步用linux系统的电脑设置多核心还挺快的 enableWGCNAThreads(nThreads = 4),
## 我设置12核心很快就得到了结果
## 但是我在自己的win系统上设置多核好像没有作用
## 我这边在linux系统运行完,把结果保存下来
load("../TOM.rdata")
dissTOM = 1-TOM
# Plot gene tree
geneTree = hclust(as.dist(dissTOM), method = "average");
pdf(file = "3-gene_cluster.pdf", width = 12, height = 9);
plot(geneTree,
xlab="",
sub="",
main = "Gene clustering on TOM-based dissimilarity",
labels = FALSE,
hang = 0.04);
dev.off()
image.png
第四步是划分模块
dynamicMods = cutreeDynamic(dendro = geneTree,
distM = dissTOM,
deepSplit = 2,
pamRespectsDendro = FALSE,
minClusterSize = 30)
dynamicColors = labels2colors(dynamicMods)
pdf(file = "4-module_tree.pdf", width = 8, height = 6);
plotDendroAndColors(geneTree,
dynamicColors,
"Dynamic Tree Cut",
dendroLabels = FALSE,
hang = 0.03,
addGuide = TRUE,
guideHang = 0.05,
main = "Gene dendrogram and module colors")
dev.off()
MEDissThres=0.25
merge = mergeCloseModules(datExpr,
dynamicColors,
cutHeight = MEDissThres,
verbose = 3)
mergedColors = merge$colors
mergedMEs = merge$newMEs
# Plot merged module tree
pdf(file = "5-merged_Module_Tree.pdf", width = 12, height = 9)
plotDendroAndColors(geneTree,
cbind(dynamicColors, mergedColors),
c("Dynamic Tree Cut", "Merged dynamic"),
dendroLabels = FALSE,
hang = 0.03,
addGuide = TRUE,
guideHang = 0.05)
dev.off()
write.table(merge$oldMEs,file="oldMEs.txt");
write.table(merge$newMEs,file="newMEs.txt");
image.png
第五步是表型和模块的相关性
nGenes = ncol(datExpr);
nSamples = nrow(datExpr);
# Recalculate MEs with color labels
MEs0 = moduleEigengenes(datExpr, mergedColors)$eigengenes
MEs = orderMEs(MEs0)
table(rownames(MEs) == rownames(bac_traits))
moduleTraitCor = cor(MEs, bac_traits, use = "p");
moduleTraitPvalue = corPvalueStudent(moduleTraitCor, nSamples);
write.table(moduleTraitCor,file="moduleTrait_correlation.txt");
write.table(moduleTraitPvalue,file="moduleTrait_pValue.txt");
textMatrix = paste(signif(moduleTraitCor, 2), "\n(",
signif(moduleTraitPvalue, 1), ")", sep = "");
dim(textMatrix) = dim(moduleTraitCor)
pdf("module-traits-bacteria-order.pdf", width = 10, height = 10)
par(mar = c(15, 12, 5, 5));
# Display the correlation values within a heatmap plot
labeledHeatmap(Matrix = moduleTraitCor,
xLabels = names(bac_traits),
yLabels = names(MEs),
ySymbols = names(MEs),
colorLabels = FALSE,
colors = greenWhiteRed(50),
textMatrix = textMatrix,
setStdMargins = FALSE,
cex.text = 0.5,
zlim = c(-1,1),
main = paste("Module-trait relationships"))
dev.off()
image.png
image.png
欢迎大家关注我的公众号
小明的数据分析笔记本
小明的数据分析笔记本 公众号 主要分享:1、R语言和python做数据分析和数据可视化的简单小例子;2、园艺植物相关转录组学、基因组学、群体遗传学文献阅读笔记;3、生物信息学入门学习资料及自己的学习笔记!