WGCNA-2实战

2020-02-12 本文已影响0人白云梦_7

#设置工作目录
setwd("D:/Desktop/WGCNA/FemaleLiver-Data/")
#加载包
library(WGCNA)
library(data.table)
library(stringr)
library(openxlsx)
allowWGCNAThreads()
options(stringsAsFactors = FALSE)

1.数据准备

#读取表达数据
femData <- read.csv("LiverFemale3600.csv")
datExpr0 = as.data.frame(t(femData[, -c(1:8)]));
names(datExpr0) = femData$substanceBXH;
rownames(datExpr0) = names(femData)[-c(1:8)];
#检查数据
gsg = goodSamplesGenes(datExpr0, verbose = 3);
if (!gsg$allOK)
{
  # Optionally, print the gene and sample names that were removed:
  if (sum(!gsg$goodGenes)>0)
    printFlush(paste("Removing genes:", paste(names(datExpr0)[!gsg$goodGenes], collapse = ", ")));
  if (sum(!gsg$goodSamples)>0)
    printFlush(paste("Removing samples:", paste(rownames(datExpr0)[!gsg$goodSamples], collapse = ", ")));
  # Remove the offending genes and samples from the data:
  datExpr0 = datExpr0[gsg$goodSamples, gsg$goodGenes]
}
#构建样本距离树
sampleTree = hclust(dist(datExpr0), method = "average");
#样本聚类图，查看是否有离群点
pdf(file = "sampleClustering.pdf", width = 12, height = 9);
par(cex = 0.6);
par(mar = c(0,4,2,0))
plot(sampleTree, main = "Sample clustering to detect outliers", sub="", xlab="", cex.lab = 1.5,
     cex.axis = 1.5, cex.main = 2)
abline(h = 15, col = "red");
dev.off();

Sample clustering to detect outliers

# Determine cluster under the line
clust = cutreeStatic(sampleTree, cutHeight = 15, minSize = 10)
table(clust)
# clust 1 contains the samples we want to keep.
keepSamples = (clust==1)
datExpr = datExpr0[keepSamples, ]
nGenes = ncol(datExpr)
nSamples = nrow(datExpr)

#读取性状数据
traitData = read.csv("ClinicalTraits.csv");
dim(traitData)
names(traitData)
# remove columns that hold information we do not need.
allTraits = traitData[, -c(31, 16)];
allTraits = allTraits[, c(2, 11:36) ];
dim(allTraits)
names(allTraits)
# Form a data frame analogous to expression data that will hold the clinical traits.
femaleSamples = rownames(datExpr);
traitRows = match(femaleSamples, allTraits$Mice);
datTraits = allTraits[traitRows, -1];
rownames(datTraits) = allTraits[traitRows, 1];
collectGarbage();
#重新构建样本距离树
sampleTree2 = hclust(dist(datExpr), method = "average")
# Convert traits to a color representation: white means low, red means high, grey means missing entry
traitColors = numbers2colors(datTraits, signed=FALSE);
# Plot the sample dendrogram and the colors underneath.
pdf("Sample dendrogram and trait heatmap.pdf")
plotDendroAndColors(sampleTree2, traitColors,
                    groupLabels = names(datTraits),
                    main = "Sample dendrogram and trait heatmap")
dev.off()

Sample dendrogram and trait heatmap

2.选择合适的power值

# Choose a set of soft-thresholding powers
powers = c(c(1:10), seq(from = 12, to=20, by=2))
# Call the network topology analysis function
sft = pickSoftThreshold(datExpr, powerVector = powers, verbose = 5)
power=sft$powerEstimate
# Plot the results:
pdf("soft-thresholding.pdf")
par(mfrow = c(1,2));
cex1 = 0.9;
# Scale-free topology fit index as a function of the soft-thresholding power
plot(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
     xlab="Soft Threshold (power)",ylab="Scale Free Topology Model Fit,signed R^2",type="n",
     main = paste("Scale independence"));
text(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
     labels=powers,cex=cex1,col="red");
# this line corresponds to using an R^2 cut-off of h
abline(h=0.90,col="red")
# Mean connectivity as a function of the soft-thresholding power
plot(sft$fitIndices[,1], sft$fitIndices[,5],
     xlab="Soft Threshold (power)",ylab="Mean Connectivity", type="n",
     main = paste("Mean connectivity"))
text(sft$fitIndices[,1], sft$fitIndices[,5], labels=powers, cex=cex1,col="red")
dev.off()

soft-thresholding

#查看是否符合无尺度分布
pdf("k.pdf")
k <- softConnectivity(datExpr,power=power)
par(mfrow=c(1,2))
hist(k)
scaleFreePlot(k,main="Check Scale free topology\n")
dev.off()

3.构建模块

#one-step
net = blockwiseModules(datExpr, power = power,
                       TOMType = "unsigned", minModuleSize = 30,
                       reassignThreshold = 0, mergeCutHeight = 0.25,
                       numericLabels = TRUE, pamRespectsDendro = FALSE,
                       saveTOMs = TRUE,
                       saveTOMFileBase = "femaleMouseTOM",
                       verbose = 3)
table(net$colors)
# open a graphics window
# Convert labels to colors for plotting
moduleLabels = net$colors
moduleColors = labels2colors(net$colors)# Plot the dendrogram and the module colors underneath
MEs = orderMEs(net$MEs);
geneTree = net$dendrograms[[1]];
pdf("Module colors.pdf")
plotDendroAndColors(geneTree, moduleColors[net$blockGenes[[1]]],
                    "Module colors",
                    dendroLabels = FALSE, hang = 0.03,
                    addGuide = TRUE, guideHang = 0.05)
dev.off()

Module colors

4.TOM结构

TOM=TOMsimilarityFromExpr(datExpr, power = power);  
dissTOM = 1-TOM
plotTOM = dissTOM^power; 
diag(plotTOM) = NA; 
#TOMplot(plotTOM, geneTree, moduleColors, main = "Network heatmap plot, all genes")
#随机选取部分基因作图
nSelect = 400
# For reproducibility, we set the random seed
set.seed(10);
select = sample(nGenes, size = nSelect);
selectTOM = dissTOM[select, select];
# There’s no simple way of restricting a clustering tree to a subset of genes, so we must re-cluster.
selectTree = hclust(as.dist(selectTOM), method = "average")
selectColors = moduleColors[select];
# Open a graphical window
# Taking the dissimilarity to a power, say 10, makes the plot more informative by effectively changing
# the color palette; setting the diagonal to NA also improves the clarity of the plot
plotDiss = selectTOM^power;
diag(plotDiss) = NA;
pdf("Network heatmap plot, selected genes.pdf")
TOMplot(plotDiss, selectTree, selectColors, main = "Network heatmap plot, selected genes")
dev.off()

Network heatmap plot, selected genes

？？这个颜色似乎是反过来的……求指点

#模块间相似矩阵
pdf("Eigengene adjacency heatmap.pdf")
plotEigengeneNetworks(MEs, 
                      "Eigengene adjacency heatmap", 
                      marHeatmap = c(3,4,2,2), 
                      plotDendrograms = FALSE, 
                      xLabelsAngle = 90)
dev.off()

Eigengene adjacency heatmap

5.模块-性状关系图寻找显著性状和模块

moduleTraitCor<-cor(MEs,datTraits,use="p")
moduleTraitPvalue<-corPvalueStudent(moduleTraitCor,nSamples)
# Will display correlations and their p-values
textMatrix = paste(signif(moduleTraitCor, 2), "\n(",
signif(moduleTraitPvalue, 1), ")", sep = "");
dim(textMatrix) = dim(moduleTraitCor)
# Display the correlation values within a heatmap plot
pdf("Module-trait relationships.pdf")
par(mar = c(6, 8.5, 3, 3));
labeledHeatmap(Matrix = moduleTraitCor,
                xLabels = names(datTraits),
                yLabels = names(MEs),
                ySymbols = names(MEs),
                colorLabels = FALSE,
                colors = blueWhiteRed(50),
                textMatrix = textMatrix,
                setStdMargins = FALSE,
                cex.text = 0.5,
                zlim = c(-1,1),
                main = paste("Module-trait relationships"))
dev.off()

Module-trait relationships

#可看出weight_g和brown相关性高，下面是不同方面进一步验证
#（1）根据性状与模块特征向量基因的相关性及pvalue来挖掘与性状相关的模块
cor_ADR <- signif(WGCNA::cor(datTraits,MEs,use="p",method="pearson"),5)
p.values <- corPvalueStudent(cor_ADR,nSamples=nrow(datTraits))
Freq_MS_max_cor <- which.max(abs(cor_ADR["weight_g",-which(colnames(cor_ADR) == "MEgrey")]))
Freq_MS_max_p <- which.min(p.values["weight_g",-which(colnames(p.values) == "MEgrey")])

#（2）根据基因网络显著性，也就是性状与每个基因表达量相关性在各个模块的均值作为该性状在该模块的显著性，显著性最大的那个模块与该性状最相关
GS1 <- as.numeric(WGCNA::cor(datTraits[,'weight_g'],datExpr,use="p",method="pearson"))
GeneSignificance <- abs(GS1)
ModuleSignificance <- tapply(GeneSignificance,moduleColors,mean,na.rm=T)
which.max(ModuleSignificance[names(ModuleSignificance)!="MEgrey"])

# Isolate weight from the clinical traits
weight = as.data.frame(datTraits$weight_g);
names(weight) = "weight"
# Add the weight to existing module eigengenes
MET = orderMEs(cbind(MEs, weight))
# Plot the relationships among the eigengenes and the trait
pdf("weight.pdf");
par(cex = 0.9)
plotEigengeneNetworks(MET, "", marDendro = c(0,4,1,2), marHeatmap = c(3,4,1,2), cex.lab = 0.8, xLabelsAngle
= 90)
dev.off()

weight

# 从上图可以看到MEbrown与weight相关
pdf("module heatmap and the eigengene.pdf")
which.module="brown"
ME=MEs[, paste("ME",which.module, sep="")]
par(mfrow=c(2,1), mar=c(0.3, 5.5, 3, 2))
plotMat(t(scale(datExpr[,moduleColors==which.module ]) ),
        nrgcols=30,rlabels=F,rcols=which.module,
        main=which.module, cex.main=2)
par(mar=c(5, 4.2, 0, 0.7))
barplot(ME, col=which.module, main="", cex.main=2,
        ylab="eigengene expression",xlab="MPP")
dev.off()

module heatmap and the eigengene

## 模块内基因与表型数据关联
# 性状跟模块虽然求出了相关性，可以挑选最相关的那些模块来分析，
# 但是模块本身仍然包含非常多的基因，还需进一步的寻找最重要的基因。
# 所有的模块都可以跟基因算出相关系数，所有的连续型性状也可以跟基因的表达
# 值算出相关系数。
# 如果跟性状显著相关基因也跟某个模块显著相关，那么这些基因可能就非常重要
# 。
### 计算模块与基因的相关性矩阵

corType="pearson"

if (corType=="pearsoon") {
  geneModuleMembership = as.data.frame(cor(datExpr, MEs, use = "p"))
  MMPvalue = as.data.frame(corPvalueStudent(
             as.matrix(geneModuleMembership), nSamples))
} else {
  geneModuleMembershipA = bicorAndPvalue(datExpr, MEs, robustY=ifelse(corType=="pearson",T,F))
  geneModuleMembership = geneModuleMembershipA$bicor
  MMPvalue   = geneModuleMembershipA$p
}

# 计算性状与基因的相关性矩阵

## 只有连续型性状才能进行计算，如果是离散变量，在构建样品表时就转为0-1矩阵。

if (corType=="pearsoon") {
  geneTraitCor = as.data.frame(cor(datExpr, datTraits, use = "p"))
  geneTraitP = as.data.frame(corPvalueStudent(
             as.matrix(geneTraitCor), nSamples))
} else {
  geneTraitCorA = bicorAndPvalue(datExpr, datTraits, robustY=ifelse(corType=="pearson",T,F))
  geneTraitCor = as.data.frame(geneTraitCorA$bicor)
  geneTraitP   = as.data.frame(geneTraitCorA$p)
}



# 最后把两个相关性矩阵联合起来,指定感兴趣模块进行分析
module = "brown"
pheno = "weight_g"
modNames = substring(colnames(MEs), 3)
# 获取关注的列
module_column = match(module, modNames)
pheno_column = match(pheno,colnames(datTraits))
# 获取模块内的基因
moduleGenes = moduleColors == module
pdf("MMvsGS_brown.pdf")
par(mfrow = c(1,1))
# 与性状高度相关的基因，也是与性状相关的模型的关键基因
verboseScatterplot(abs(geneModuleMembership[moduleGenes, module_column]),
                   abs(geneTraitCor[moduleGenes, pheno_column]),
                   xlab = paste("Module Membership in", module, "module"),
                   ylab = paste("Gene significance for", pheno),
                   main = paste("Module membership vs. gene significance\n"),
                   cex.main = 1.2, cex.lab = 1.2, cex.axis = 1.2, col = module)
dev.off()

MMvsGS_brown

ADJ1=abs(cor(datExpr,use="p"))^power
Alldegrees1=intramodularConnectivity(ADJ1, moduleColors)
colorlevels=unique(moduleColors)
pdf("MMvsGS_total.pdf")
par(mfrow=c(2,as.integer(0.5+length(colorlevels)/2)))
par(mar = c(4,5,3,1))
for (i in c(1:length(colorlevels)))
{
  whichmodule=colorlevels[[i]];
  restrict1 = (moduleColors==whichmodule);
  verboseScatterplot(Alldegrees1$kWithin[restrict1],
                     GeneSignificance[restrict1], col=moduleColors[restrict1],
                     main=whichmodule,
                     xlab = "Connectivity", ylab = "Gene Significance", abline = TRUE)
}
dev.off()

MMvsGS_total

#筛选hub基因
datKME=signedKME(datExpr, MEs, outputColumnName="MM.")
FilterGenes= abs(GS1)> .2 & abs(datKME$MM.brown)>.8
dimnames(data.frame(datExpr))[[2]][FilterGenes]
trait_hubGenes<-colnames(datExpr)[FilterGenes]
pdf("hub_unsigned correlations.pdf")
plotNetworkHeatmap(datExpr,plotGenes = trait_hubGenes,networkType = "unsigned",useTOM = TRUE,power=power,main="unsigned correlations")
dev.off()

hub_unsigned correlations

6.模块的导出

# Select module gene
genes = colnames(datExpr) ## 
inModule = (moduleColors==module);
modgenes = genes[inModule]; 
## 也是提取指定模块的基因名
# Select the corresponding Topological Overlap
modTOM = TOM[inModule, inModule];
dimnames(modTOM) = list(modgenes, modgenes)
cyt = exportNetworkToCytoscape(
                                 modTOM,
                                 edgeFile = paste("CytoscapeInput-edges-", paste(module, collapse="-"), ".txt", sep=""),
                                 nodeFile = paste("CytoscapeInput-nodes-", paste(module, collapse="-"), ".txt", sep=""),
                                 weighted = TRUE,
                                 threshold = 0.02,
                                 nodeNames = modgenes,
                                 nodeAttr = moduleColors[inModule]
);


## 导出枢纽基因到 Cytoscape 
hubGene_TOM <- TOM[FilterGenes,FilterGenes]
dimnames(hubGene_TOM) = list(colnames(datExpr)[FilterGenes],colnames(datExpr)[FilterGenes])
cyt = exportNetworkToCytoscape(hubGene_TOM, 
                                 edgeFile = paste("CytoscapeInput-hub-edges-", paste(module, collapse="-"), ".txt", sep=""), 
                                 nodeFile = paste("CytoscapeInput-hub-nodes-", paste(module, collapse="-"), ".txt", sep=""), 
                                 weighted = TRUE, 
                                 threshold = 0.02, 
                                 nodeNames = trait_hubGenes, 
                                 altNodeNames =trait_hubGenes, 
                                 nodeAttr = moduleColors[FilterGenes]
                                 )