【WGCNA】WGCNA学习(三)
前面一个WGCNA的学习我们主要学习了如何进行相关性的分析和module的鉴定,以及网络的输出,主要的代码如下:
femData = read.csv("LiverFemale3600.csv")
dim(femData)
names(femData)
datExpr0 = as.data.frame(t(femData[, -c(1:8)]))
names(datExpr0) = femData$substanceBXH
rownames(datExpr0) = names(femData)[-c(1:8)]
gsg = goodSamplesGenes(datExpr0, verbose = 3)
gsg$allOK
if (!gsg$allOK)
{
if (sum(!gsg$goodGenes)>0)
printFlush(paste("Removing genes:", paste(names(datExpr0)[!gsg$goodGenes], collapse = ", ")));
if (sum(!gsg$goodSamples)>0)
printFlush(paste("Removing samples:", paste(rownames(datExpr0)[!gsg$goodSamples], collapse = ", ")));
datExpr0 = datExpr0[gsg$goodSamples, gsg$goodGenes]
}
sampleTree = hclust(dist(datExpr0), method = "average")
sizeGrWindow(12,9)
par(cex = 0.6)
par(mar = c(0,4,2,0))
plot(sampleTree, main = "Sample clustering to detect outliers", sub="", xlab="",cex.lab = 1.5,cex.axis = 1.5, cex.main = 2)
abline(h = 15, col = "red")
clust = cutreeStatic(sampleTree, cutHeight = 15, minSize = 10)
table(clust)
keepSamples = (clust==1)
datExpr = datExpr0[keepSamples, ]
nGenes = ncol(datExpr)
nSamples = nrow(datExpr)
powers = c(c(1:10), seq(from = 12, to=20, by=2))
sft = pickSoftThreshold(datExpr, powerVector = powers, verbose = 5)
sizeGrWindow(9, 5)
par(mfrow = c(1,2))
cex1 = 0.9
power = sft$powerEstimate
plot(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
xlab="Soft Threshold (power)",
ylab="Scale Free Topology Model Fit,signed R^2",type="n",
main = paste("Scale independence"))
text(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
labels=powers,cex=cex1,col="red")
abline(h=0.85,col="red")
plot(sft$fitIndices[,1], sft$fitIndices[,5],
xlab="Soft Threshold (power)",ylab="Mean Connectivity", type="n",
main = paste("Mean connectivity"))
text(sft$fitIndices[,1], sft$fitIndices[,5], labels=powers,
cex=cex1, col="red")
net = blockwiseModules(datExpr, power = 6,TOMType = "unsigned", minModuleSize = 30,reassignThreshold = 0, mergeCutHeight = 0.25,
numericLabels = TRUE, pamRespectsDendro = FALSE,saveTOMFileBase = "femaleMouseTOM",saveTOMs = TRUE,verbose = 3)
table(net$colors)
sizeGrWindow(12, 9)
mergedColors = labels2colors(net$colors)
plotDendroAndColors(net$dendrograms[[1]], mergedColors[net$blockGenes[[1]]],"Module colors",dendroLabels = FALSE, hang = 0.03,addGuide = TRUE, guideHang = 0.05)
moduleLabels = net$colors
moduleColors = labels2colors(moduleLabels)
dynamicColors <- labels2colors(net$unmergedColors)
plotDendroAndColors(net$dendrograms[[1]], cbind(dynamicColors,moduleColors),
c("Dynamic Tree Cut", "Module colors"),
dendroLabels = FALSE, hang = 0.5,
addGuide = TRUE, guideHang = 0.05)
gene_module <- data.frame(ID=colnames(datExpr), module=moduleColors)
gene_module = gene_module[order(gene_module$module),]
write.table(gene_module,file=paste0(exprMat,".gene_module.V3.txt"),
sep="\t",quote=F,row.names=F)
MEs = net$MEs
MEs_col = MEs
colnames(MEs_col) = paste0("ME", labels2colors(as.numeric(str_replace_all(colnames(MEs),"ME",""))))
MEs_col = orderMEs(MEs_col)
MEs_colt = as.data.frame(t(MEs_col))
colnames(MEs_colt) = rownames(datExpr)
write.table(MEs_colt,file=paste0(exprMat,".module_eipgengene.V3.txt"),sep="\t",quote=F)
plotEigengeneNetworks(MEs_col, "Eigengene adjacency heatmap",
marDendro = c(3,3,2,4),
marHeatmap = c(3,4,2,2), plotDendrograms = T,
xLabelsAngle = 90)
hubs = chooseTopHubInEachModule(datExpr, colorh=moduleColors, power=power, type=type)
hubs
con <- nearestNeighborConnectivity(datExpr, nNeighbors=50, power=power, type=type, corFnc = corType)
load(net$TOMFiles[1], verbose=T)
TOM <- as.matrix(TOM)
dissTOM = 1-TOM
plotTOM = dissTOM^power
diag(plotTOM) = NA
probes = colnames(datExpr)
dimnames(TOM) <- list(probes, probes)
cyt = exportNetworkToCytoscape(TOM, edgeFile = paste(exprMat, ".edges.v3.txt", sep=""),nodeFile = paste(exprMat, ".nodes.v3.txt", sep=""),weighted = TRUE, threshold = 0.1, nodeNames = probes, nodeAttr = moduleColors)
write.csv(dissTOM,"dissTOM.txt")
下面主要学习可视化的TOM矩阵,特定module的选择和表型的关联分析。
==分别输出单独的module===
module_colors=unique(mergedColors)
for (color in module_colors){
gene=names(datExpr)[moduleColors==color]
color_gene=as.matrix( TOM[gene,gene])
cyt = exportNetworkToCytoscape(color_gene,
edgeFile = paste('module/CytoscapeInput-edges-', color,'.txt', sep=''),
nodeFile = paste('module/CytoscapeInput-node-', color,'.txt', sep=''),
weighted = TRUE, threshold = 0, nodeNames = gene, altNodeNames = gene)
}
====可视化TOM矩阵=====
geneTree=net$dendrograms[[1]]
pdf("TOM.pdf")
TOMplot(plotTOM, geneTree, moduleColors, main="Network heatmap plot, all genes")
dev.off()
//全部基因生成热图可能需要大量的时间。可以限制基因的数量来加快绘图速度。然而,一个基因子集的基因树状图通常看起来不同于所有基因的基因树状图。在下面的例子中,将绘制的基因数量限制在1000个。
nSelect=1000
set.seed(10)
select=sample(nGenes, size=nSelect)
selectTOM=dissTOM[select, select]
selectTree=hclust(as.dist(selectTOM), method="average")
selectColors=moduleColors[select]
plotDiss=selectTOM^7
diag(plotDiss)=NA
pdf("TOM.select.pdf")
TOMplot(plotDiss, selectTree, selectColors, main="Network heatmap plot, selected genes")
dev.off()
//部分基因可视化TOM矩阵。热图描述了分析中基因的拓扑重叠矩阵(TOM)。浅色代表低重叠,逐渐变深的红色代表高重叠。沿着对角线的深色块是模块。基因树状图和模块分配也显示在左侧和顶部。
===表型关联======
//读入表型数据 去掉一些不需要的列
traitData = read.csv("ClinicalTraits.csv");
dim(traitData)
names(traitData)
allTraits = traitData[, -c(31, 16)]
allTraits = allTraits[, c(2, 11:36)]
dim(allTraits)
names(allTraits)
femaleSamples = rownames(datExpr)
traitRows = match(femaleSamples, allTraits$Mice)
datTraits = allTraits[traitRows, -1]
rownames(datTraits) = allTraits[traitRows, 1]
sampleTree2 = hclust(dist(datExpr), method = "average")
traitColors = numbers2colors(datTraits, signed = FALSE)
plotDendroAndColors(sampleTree2, traitColors, groupLabels = names(datTraits), main = "Sample dendrogram and trait heatmap")
nGenes = ncol(datExpr)
nSamples = nrow(datExpr)
MEs0 = moduleEigengenes(datExpr, moduleColors)$eigengenes
MEs = orderMEs(MEs0)
moduleTraitCor = cor(MEs, datTraits, use = "p")
moduleTraitPvalue = corPvalueStudent(moduleTraitCor, nSamples)
textMatrix = paste(signif(moduleTraitCor, 2), "\n(",signif(moduleTraitPvalue, 1), ")", sep = "")
dim(textMatrix) = dim(moduleTraitCor)
par(mar = c(6, 8, 1, 1))
//显示module和trait的correlation
labeledHeatmap(Matrix = moduleTraitCor,
xLabels = names(datTraits),
yLabels = names(MEs),
ySymbols = names(MEs),
colorLabels = FALSE,
colors = blueWhiteRed(50),
textMatrix = textMatrix,
setStdMargins = FALSE,
cex.text = 0.5,
zlim = c(-1,1),
main = paste("Module-trait relationships"))
注:图中,y轴是模块名,x轴是性状,每个模块都跟每个性状有一个空格,空格的上部分是模块与性状的相关系数,空格的下部分是一个p值,代表相关系数的显著性
//也可以不显示具体的数值
labeledHeatmap(Matrix = moduleTraitCor,
xLabels = names(datTraits),
yLabels = names(MEs),
ySymbols = names(MEs),
colorLabels = FALSE,
colors = blueWhiteRed(50),
setStdMargins = FALSE,
cex.text = 0.5,
zlim = c(-1,1),
main = paste("Module-trait relationships"))
====基因与性状和重要模块的关系==
Module Membership简称MM, 将该基因的表达量与module的第一主成分,即moduleeigengene进行相关性分析就可以得到MM值,所以MM值本质上是一个相关系数,如果基因和某个module的MM值为0,说明二者根本不相关,该基因不属于这个module; 如果MM的绝对值接近1,说明基因与该module相关性很高。
Gene Significance简称GS, 将该基因的表达量与对应的表型数值进行相关性分析,最终的相关系数的值就是GS, GS反映出基因表达量与表型数据的相关性。
# 以特征变量weight 为例
weight =as.data.frame(datTraits$weight_g)
names(weight) ="weight"
# 命名模块
modNames =substring(names(MEs), 3)
geneModuleMembership =as.data.frame(cor(datExpr, MEs, use ="p"))
MMPvalue =as.data.frame(corPvalueStudent(as.matrix(geneModuleMembership), nSamples))
names(geneModuleMembership) =paste("MM", modNames, sep="")
names(MMPvalue) =paste("p.MM", modNames, sep="")
geneTraitSignificance =as.data.frame(cor(datExpr, weight, use ="p"))
GSPvalue =as.data.frame(corPvalueStudent(as.matrix(geneTraitSignificance), nSamples))
names(geneTraitSignificance) =paste("GS.",names(weight), sep="")
names(GSPvalue) =paste("p.GS.",names(weight), sep="")
模块内分析:鉴定高GS和MM基因
在感兴趣的基因模块中,筛选出高GS和MM的基因。在性状关联图中可以看到,与性状weight_g相关性最高的基因模块是MEbrown,因此选择MEbrown进行后续分析,绘制brown模块中GS和MM的散点图。
module ="brown"
column =match(module, modNames)
moduleGenes = moduleColors==module
sizeGrWindow(7, 7)
par(mfrow =c(1,1))
verboseScatterplot(abs(geneModuleMembership[moduleGenes, column]),
abs(geneTraitSignificance[moduleGenes, 1]),
xlab =paste("ModuleMembershipin", module,"module"),
ylab ="Genesignificanceforbodyweight",
main =paste("Modulemembershipvs.genesignificance\n"),
cex.main = 1.2, cex.lab = 1.2, cex.axis= 1.2,col= module)
注:brown模块中weight与模块成员(MM)的基因显著性(GS)散点图。在这个模块中,GS和MM之间存在极显著的相关性。如图所示,GS和MM是高度相关的,这说明与一个性状高度相关的基因往往也是与该性状相关的模块中最重要的(中心)元素。
===结果汇总输出===
我们已经找到了与我们的兴趣特征高度相关的模块,并通过MM确定了它们的核心参与者。现在,我们将这些统计信息与基因注释合并并导出。
names(datExpr)
names(datExpr)[moduleColors=="brown"]
probes = names(datExpr)
geneInfo0 = data.frame(substanceBXH = probes,
moduleColor = moduleColors,
geneTraitSignificance, GSPvalue)
# 通过显著性对模块进行排序
modOrder = order(-abs(cor(MEs, weight, use = "p")))
# 添加模块成员
for (mod in 1:ncol(geneModuleMembership))
{
oldNames = names(geneInfo0)
geneInfo0 = data.frame(geneInfo0, geneModuleMembership[, modOrder[mod]],MMPvalue[, modOrder[mod]])
names(geneInfo0) = c(oldNames, paste("MM.", modNames[modOrder[mod]], sep=""),paste("p.MM.", modNames[modOrder[mod]], sep=""))
}
# 对基因进行排序
geneOrder = order(geneInfo0$moduleColor, abs(geneInfo0$GS.weight))
geneInfo = geneInfo0[geneOrder, ]
# 导出
write.csv(geneInfo, file = "geneInfo.csv")
本文使用 文章同步助手 同步