Tutorial for the WGCNA package f

2019-04-06 本文已影响32人五谷吃不完了

I. Network analysis of liver expression data from female mice: finding modules related to body weight

参考1：Tutorials for the WGCNA package

参考2：一文学会WGCNA分析

参考3:STEP6:WGCNA相关性分析

第一步：Data input and cleaning

1.1 数据的导入

rm(list = ls())
setwd('E:/gsj/RWD/WGCNA/')
library(WGCNA)
options(stringsAsFactors = FALSE)# 如果需要保存变量的话这一步不能省
#读入文件
femData = read.csv("LiverFemale3600.csv")
#查看一下文件格式，如果不符合，就需要修改
name(femData)
head(femData)

#进行WGCNA分析的时候，要求输入的表达矩阵，行名是样本，列名是基因
datExpr0 = as.data.frame(t(femData[, -c(1:8)]))
names(datExpr0) = femData$substanceBXH
rownames(datExpr0) = names(femData)[-c(1:8)]
dim(datExpr0)

1.2 检查是否有离群值

#检查是否有缺失值，没问题就会返回TRUE
gsg = goodSamplesGenes(datExpr0, verbose = 3)
gsg$allOK

#如果不是TREU，那么需要进行下面操作
if (!gsg$allOK)
{
# Optionally, print the gene and sample names that were removed:
if (sum(!gsg$goodGenes)>0)
    printFlush(paste("Removing genes:", paste(names(datExpr0[!gsg$goodGenes], collapse = ", ")));
if (sum(!gsg$goodSamples)>0)
    printFlush(paste("Removing samples:",paste(rownames(datExpr0)[!gsg$goodSamples], collapse = ", ")));
# Remove the offending genes and samples from the data:
datExpr0 = datExpr0[gsg$goodSamples, gsg$goodGenes]
}

#第二步，对样本进行聚类，判断有无outlier的样本
sampleTree = hclust(dist(datExpr0), method = "average");
sizeGrWindow(12,9)
par(cex = 0.6);
par(mar = c(0,4,2,0))
plot(sampleTree, main = "Sample clustering to detect outliers", sub="", xlab="", cex.lab = 1.5,cex.axis = 1.5, cex.main = 2)
#从下图可以看出sampleF2_221是一个离群值，要么手动删掉，要么设置一个阈值，剔除掉
abline(h = 15, col = "red")
#按照设定的高度
clust = cutreeStatic(sampleTree, cutHeight = 15, minSize = 10)
table(clust)
#可以看出cluster1包含了我们最后所需要的样本
keepSamples = (clust==1)
datExpr = datExpr0[keepSamples, ]
nGenes = ncol(datExpr)
nSamples = nrow(datExpr)
#通过以上的处理，最后得到的datExpr对象就可以用来进行后续的分析

缺失值检查

离群样本检查

cluster选择

1.3 导入临床信息

#导入临床信息
traitData = read.csv("ClinicalTraits.csv");
dim(traitData)
names(traitData)
#除去不相关的信息
allTraits = traitData[, -c(31, 16)];
allTraits = allTraits[, c(2, 11:36) ];
dim(allTraits)
names(allTraits)
#每一个样本都会有对应的临床信息，包括体重，长度等
view(allTraits)

#将临床信息和表样本名结合起来
femaleSamples = rownames(datExpr);
traitRows = match(femaleSamples, allTraits$Mice);
datTraits = allTraits[traitRows, -1];
rownames(datTraits) = allTraits[traitRows, 1]
view(datTraits)

#最后在进行后续的网络构建和模块选择之前，我们可以看一下样本和临床信息之间的匹配度
sampleTree2 = hclust(dist(datExpr), method = "average")
#将特性与颜色相关联，白色表示低；红色表示高；灰色表示缺失
traitColors = numbers2colors(datTraits, signed = FALSE);
#出图，红色代表高的，白色代表低的，灰色代表缺失
plotDendroAndColors(sampleTree2, traitColors,groupLabels = names(datTraits),main = "Sample dendrogram and trait heatmap")

#没有问题的话就保存环境变量
save(datExpr, datTraits, file = "FemaleLiver-01-dataInput.RData")

第二步：Automatic network construction and module detection

2.0 构建R环境

######################################
#如果你关掉了R，那么就需要运行下面几步
#rm(list = ls())
#setwd('E:/gsj/RWD/WGCNA/')
#library(WGCNA)
#options(stringsAsFactors = FALSE)#这一步不能省
#enableWGCNAThreads()#这一条语句指的是允许WGCNA使用多线程，但是如果是在本机上使用的话，这一步过程可以跳过
#lnames = load(file = "FemaleLiver-01-dataInput.RData")
#以上，成功导入第一步生成的datExpr和datTraits

2.1 确定合适的阈值

#由不同的方式构建基因网络，这里选择自动一步构建基因网络的方法
#确定合适的阈值范围
powers = c(c(1:10), seq(from = 12, to=20, by=2))
#调用pickSoftThreshold函数分析出合适的阈值
sft = pickSoftThreshold(datExpr, powerVector = powers, verbose = 5)
#画图，结果展示
sizeGrWindow(9, 5)
par(mfrow = c(1,2));
cex1 = 0.9;

plot(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],xlab="Soft Threshold (power)",
     ylab="Scale Free Topology Model Fit,signed R^2",type="n",main = paste("Scale independence"))
text(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],labels=powers,cex=cex1,col="red")
abline(h=0.90,col="green")

plot(sft$fitIndices[,1], sft$fitIndices[,5],xlab="Soft Threshold (power)",ylab="Mean Connectivity", type="n",main = paste("Mean connectivity"))
text(sft$fitIndices[,1], sft$fitIndices[,5], labels=powers, cex=cex1,col="red")
#以上会出来两张图，由此确定选取阈值6

阈值选择

2.2 一步构建网络图和模块选择

#一步构建网络和模块选择
net = blockwiseModules(datExpr, power = 6,TOMType = "unsigned", minModuleSize = 30,reassignThreshold = 0, mergeCutHeight = 0.25,
                       numericLabels = TRUE, pamRespectsDendro = FALSE,saveTOMs = TRUE,saveTOMFileBase = "femaleMouseTOM",verbose = 3)
#参数mergeCutHeight为合并模块的一个阈值
#上述的参数设置均为下限值，不同的数据类型有不同的参数设置
#如果电脑的运行跟不上，建议参考大样本的那种网络构建方法

#看一下有多少个模块以及模块当中所包含的基因
#如下图展示的结果，一共有18个模块，从1到18模块，按照基因数递减排列，模块0表示没有分类的基因数
table(net$colors)
#层次聚类图的结果包含在net$dendrograms[[1]]对象中
#用以下代码可以将树图和颜色分布整合，树图是对基因进行的聚类，下面不同颜色代表这个基因处于哪个模块
#设置窗口的大小
sizeGrWindow(12, 9)
mergedColors = labels2colors(net$colors)
#出图
plotDendroAndColors(net$dendrograms[[1]], mergedColors[net$blockGenes[[1]]],
                    "Module colors",
                    dendroLabels = FALSE, hang = 0.03,
                    addGuide = TRUE, guideHang = 0.05)
 
 #此外，如果用户想要修改一些参数，可以使用recutBlockwiseTrees这个函数
 
 #保存数据
moduleLabels = net$colors
moduleColors = labels2colors(net$colors)
MEs = net$MEs;
geneTree = net$dendrograms[[1]];
save(MEs, moduleLabels, moduleColors, geneTree,
     file = "FemaleLiver-02-networkConstruction-auto.RData")

第三步：relating modules to external information and identifying important genes

3.0 构建R环境

#rm(list = ls())
#setwd('E:/gsj/RWD/WGCNA/')
#library(WGCNA)
#options(stringsAsFactors = FALSE)#这一步不能省
#enableWGCNAThreads()#这一条语句指的是允许WGCNA使用多线程，但是如果是在本机上使用的话，这一步过程可以跳过
#lnames = load(file = "FemaleLiver-01-dataInput.RData")
#以上，成功导入第一步生成的datExpr和datTraits
#lnames = load(file = "FemaleLiver-02-networkConstruction-auto.RData");
#lnames
#以上，成功导入第二步生成的参数

3.1 计算模块和性状之间的相关性

nGenes = ncol(datExpr);
nSamples = nrow(datExpr);
MEs0 = moduleEigengenes(datExpr, moduleColors)$eigengenes
MEs = orderMEs(MEs0)##不同颜色的模块的ME值矩阵(样本vs模块)
moduleTraitCor = cor(MEs, datTraits, use = "p");
moduleTraitPvalue = corPvalueStudent(moduleTraitCor, nSamples);

#这一步将会展示相关性和P值
sizeGrWindow(10,6)
textMatrix = paste(signif(moduleTraitCor, 2), "\n(",
                   signif(moduleTraitPvalue, 1), ")", sep = "");
dim(textMatrix) = dim(moduleTraitCor)
par(mar = c(6, 8.5, 3, 3))
#这一步将会在热图上展示相关系数
labeledHeatmap(Matrix = moduleTraitCor,
               xLabels = names(datTraits),
               yLabels = names(MEs),
               ySymbols = names(MEs),
               colorLabels = FALSE,
               colors = greenWhiteRed(50),
               textMatrix = textMatrix,
               setStdMargins = FALSE,
               cex.text = 0.5,
               zlim = c(-1,1),
               main = paste("Module-trait relationships"))
#以上可以通过这个热图发现和性状相关的基因，在这之后我们主要关注weight这一性状相关的基因

3.2 确定相关模块中的显著相关基因

#性状跟模块虽然求出了相关性，可以挑选最相关的那些模块来分析，但是模块本身仍然包含非常多的基因，还需进一步的寻找最重要的基因
#首先计算模块与基因的相关性矩阵
weight = as.data.frame(datTraits$weight_g);
names(weight) = "weight" #单独把weight这一列提出来，做一个data.frame
modNames = substring(names(MEs), 3) #提出每一模块的颜色
geneModuleMembership = as.data.frame(cor(datExpr, MEs, use = "p")); #这一步，模块与基因的相关性矩阵
## 算出每个模块跟基因的皮尔森相关系数矩阵
## MEs是每个模块在每个样本里面的值
## datExpr是每个基因在每个样本的表达量

MMPvalue = as.data.frame(corPvalueStudent(as.matrix(geneModuleMembership), nSamples))
names(geneModuleMembership) = paste("MM", modNames, sep="");
names(MMPvalue) = paste("p.MM", modNames, sep="");
#再计算性状与基因的相关性矩阵
# 只有连续型性状才能只有计算
geneTraitSignificance = as.data.frame(cor(datExpr, weight, use = "p")); #这一步，性状与基因的相关性矩阵
GSPvalue = as.data.frame(corPvalueStudent(as.matrix(geneTraitSignificance), nSamples));
names(geneTraitSignificance) = paste("GS.", names(weight), sep="");
names(GSPvalue) = paste("p.GS.", names(weight), sep="");

3.3 筛选出和性状以及模块相关性都很高的基因

#这里可以从上图中看出weight这一个性状中，棕色的模块最相关
module = "brown"
column = match(module, modNames);
moduleGenes = moduleColors==module;
sizeGrWindow(7, 7);
par(mfrow = c(1,1));
verboseScatterplot(abs(geneModuleMembership[moduleGenes, column]),
                   abs(geneTraitSignificance[moduleGenes, 1]),
                   xlab = paste("Module Membership in", module, "module"),
                   ylab = "Gene significance for body weight",
                   main = paste("Module membership vs. gene significance\n"),
                   cex.main = 1.2, cex.lab = 1.2, cex.axis = 1.2, col = module)
 #这一张图显示的是在棕色模块中，基因先属性和模块的关系（类似于MEs）
 #可以看出，和性状高度相关的基因往往在和性状高度相关的模块中
#经过上述操作，我们已经找到和感兴趣的性状最相关的模型，同时还得到了相关的基因

3.4 将网络分析结果输出

#返回所有的probe ID
names(datExpr)
#返回brown模块对应的基因
names(datExpr)[moduleColors=="brown"]
#可以提供一个转换ID的文件
annot = read.csv(file = "GeneAnnotation.csv");
dim(annot)
names(annot)
probes = names(datExpr)
probes2annot = match(probes, annot$substanceBXH)
#计算没有注释到的probe数量
sum(is.na(probes2annot))

#随后构建一个data.frame，描述probe ID、gene symbol、模块颜色、基因与weight模块的显著性、p值
geneInfo0 = data.frame(substanceBXH = probes,
                       geneSymbol = annot$gene_symbol[probes2annot],
                       LocusLinkID = annot$LocusLinkID[probes2annot],
                       moduleColor = moduleColors,
                       geneTraitSignificance,
                       GSPvalue)

modOrder = order(-abs(cor(MEs, weight, use = "p")));
for (mod in 1:ncol(geneModuleMembership))
{
  oldNames = names(geneInfo0)
  geneInfo0 = data.frame(geneInfo0, geneModuleMembership[, modOrder[mod]],
                         MMPvalue[, modOrder[mod]]);
  names(geneInfo0) = c(oldNames, paste("MM.", modNames[modOrder[mod]], sep=""),
                       paste("p.MM.", modNames[modOrder[mod]], sep=""))
}
geneOrder = order(geneInfo0$moduleColor, -abs(geneInfo0$GS.weight));
geneInfo = geneInfo0[geneOrder, ]
write.csv(geneInfo, file = "geneInfo.csv")

第四步：Interfacing network analysis with other data such as functional annotation and gene ontology

4.1 选取感兴趣的基因进行功能注释或者其他分析

# Read in the probe annotation
annot = read.csv(file = "GeneAnnotation.csv");
# Match probes in the data set to the probe IDs in the annotation file
probes = names(datExpr)
probes2annot = match(probes, annot$substanceBXH)
# Get the corresponding Locuis Link IDs
allLLIDs = annot$LocusLinkID[probes2annot];
# $ Choose interesting modules
intModules = c("brown", "red", "salmon")
for (module in intModules)
{
    # Select module probes
    modGenes = (moduleColors==module)
    # Get their entrez ID codes
    modLLIDs = allLLIDs[modGenes];
    # Write them into a file
    fileName = paste("LocusLinkIDs-", module, ".txt", sep="");
    write.table(as.data.frame(modLLIDs), file = fileName,row.names = FALSE, col.names = FALSE)
}
# As background in the enrichment analysis, we will use all probes in the analysis.
fileName = paste("LocusLinkIDs-all.txt", sep="");
write.table(as.data.frame(allLLIDs), file = fileName,row.names = FALSE, col.names = FALSE)
#经过上述步骤，将会生成下面的文件，后续可以对感兴趣的模块进行富集分析等

第五步：Network visualization using WGCNA functions

5.0 构建R环境

#rm(list = ls())
#setwd('E:/gsj/RWD/WGCNA/')
#library(WGCNA)
#options(stringsAsFactors = FALSE)#这一步不能省
#enableWGCNAThreads()#这一条语句指的是允许WGCNA使用多线程，但是如果是在本机上使用的话，这一步过程可以跳过
#lnames = load(file = "FemaleLiver-01-dataInput.RData")
#以上，成功导入第一步生成的datExpr和datTraits
#lnames = load(file = "FemaleLiver-02-networkConstruction-auto.RData");
#lnames
#以上，成功导入第二步生成的参数

5.1 可视化基因互作网络

#网络可视化过程
#对所有基因画热图
#这一步速度很慢，不建议做
dissTOM = 1-TOMsimilarityFromExpr(datExpr, power = 6);
plotTOM = dissTOM^7;
diag(plotTOM) = NA;
sizeGrWindow(9,9)
TOMplot(plotTOM, geneTree, moduleColors, main = "Network heatmap plot, all genes")

#随机选取400个基因
nSelect = 400# For reproducibility, we set the random seedset.seed(10);select = sample(nGenes, size = nSelect);selectTOM = dissTOM[select, select];# There’s no simple way of restricting a clustering tree to a subset of genes, so we must re-cluster.selectTree = hclust(as.dist(selectTOM), method = "average")
selectColors = moduleColors[select];# Open a graphical windowsizeGrWindow(9,9)
# Taking the dissimilarity to a power, say 10, makes the plot more informative by effectively changing# the color palette; setting the diagonal to NA also improves the clarity of the plotplotDiss = selectTOM^7;diag(plotDiss) = NA;TOMplot(plotDiss, selectTree, selectColors, main = "Network heatmap plot, selected genes")

5.2 可视化网络图中的eigenegenes

#根据模块里面的eigenegenes（类似于主要的基因），对模块进行聚类以及热图，标注出想要看的模块
# Recalculate module eigengenes
MEs = moduleEigengenes(datExpr, moduleColors)$eigengenes
## 只有连续型性状才能只有计算

MEs = moduleEigengenes(datExpr, moduleColors)$eigengenes
weight = as.data.frame(datTraits$weight_g);
names(weight) = "weight"
MET = orderMEs(cbind(MEs, weight))
# 出图，模块与性状之间的关系

#下面是将两张图分别画出的代码
sizeGrWindow(5,7.5);
par(cex = 0.9)
plotEigengeneNetworks(MET, "", marDendro = c(0,4,1,2), marHeatmap = c(3,4,1,2), cex.lab = 0.8, xLabelsAngle= 90)
# Plot the relationships among the eigengenes and the trait
sizeGrWindow(5,7.5);
par(cex = 0.9)
plotEigengeneNetworks(MET, "", marDendro = c(0,4,1,2), marHeatmap = c(3,4,1,2), cex.lab = 0.8, xLabelsAngle= 90)
# Plot the dendrogram
sizeGrWindow(6,6);
par(cex = 1.0)
## 模块的聚类图
plotEigengeneNetworks(MET, "Eigengene dendrogram", marDendro = c(0,4,2,0),plotHeatmaps = FALSE)
# Plot the heatmap matrix (note: this plot will overwrite the dendrogram plot)
par(cex = 1.0)
## 性状与模块热图
plotEigengeneNetworks(MET, "Eigengene adjacency heatmap", marHeatmap = c(3,4,2,2),plotDendrograms = FALSE, xLabelsAngle = 90)

第六步：Exporting a gene network to external visualization software

6.0 构建R环境

#rm(list = ls())
#setwd('E:/gsj/RWD/WGCNA/')
#library(WGCNA)
#options(stringsAsFactors = FALSE)#这一步不能省
#enableWGCNAThreads()#这一条语句指的是允许WGCNA使用多线程，但是如果是在本机上使用的话，这一步过程可以跳过
#lnames = load(file = "FemaleLiver-01-dataInput.RData")
#以上，成功导入第一步生成的datExpr和datTraits
#lnames = load(file = "FemaleLiver-02-networkConstruction-auto.RData");
#lnames
#以上，成功导入第二步生成的参数

6.1 将数据导出VisANT

TOM = TOMsimilarityFromExpr(datExpr, power = 6);
annot = read.csv(file = "GeneAnnotation.csv");
#选择模块
module = "brown"
#选择模块的ID
probes = names(datExpr)
inModule = (moduleColors==module);
modProbes = probes[inModule];
modTOM = TOM[inModule, inModule];
dimnames(modTOM) = list(modProbes, modProbes)
vis = exportNetworkToVisANT(modTOM,
                            file = paste("VisANTInput-", module, ".txt", sep=""),
                            weighted = TRUE,
                            threshold = 0,
                            probeToGene = data.frame(annot$substanceBXH, annot$gene_symbol) )

#因为brown模块很大，筛选出top30的基因
nTop = 30;
IMConn = softConnectivity(datExpr[, modProbes]);
top = (rank(-IMConn) <= nTop)
vis = exportNetworkToVisANT(modTOM[top, top],
                            file = paste("VisANTInput-", module, "-top30.txt", sep=""),
                            weighted = TRUE,
                            threshold = 0,
                            probeToGene = data.frame(annot$substanceBXH, annot$gene_symbol) )

6.2 导出Cytoscape

# Recalculate topological overlap if needed
TOM = TOMsimilarityFromExpr(datExpr, power = 6);
# Read in the annotation file
annot = read.csv(file = "GeneAnnotation.csv");
# Select modules
modules = c("brown", "red");
# Select module probes
probes = names(datExpr)
inModule = is.finite(match(moduleColors, modules));
modProbes = probes[inModule];
modGenes = annot$gene_symbol[match(modProbes, annot$substanceBXH)];
# Select the corresponding Topological Overlap
modTOM = TOM[inModule, inModule];
dimnames(modTOM) = list(modProbes, modProbes)
# Export the network into edge and node list files Cytoscape can read
cyt = exportNetworkToCytoscape(modTOM,
                               edgeFile = paste("CytoscapeInput-edges-", paste(modules, collapse="-"), ".txt", sep=""),
                               nodeFile = paste("CytoscapeInput-nodes-", paste(modules, collapse="-"), ".txt", sep=""),
                               weighted = TRUE,
                               threshold = 0.02,
                               nodeNames = modProbes,
                               altNodeNames = modGenes,
                               nodeAttr = moduleColors[inModule]);

Tutorial for the WGCNA package f

I. Network analysis of liver expression data from female mice: finding modules related to body weight

参考1：Tutorials for the WGCNA package

参考2：一文学会WGCNA分析

参考3:STEP6:WGCNA相关性分析

第一步：Data input and cleaning

1.1 数据的导入

1.2 检查是否有离群值

1.3 导入临床信息

第二步：Automatic network construction and module detection

2.0 构建R环境

2.1 确定合适的阈值

2.2 一步构建网络图和模块选择

第三步：relating modules to external information and identifying important genes

3.0 构建R环境

3.1 计算模块和性状之间的相关性

3.2 确定相关模块中的显著相关基因

3.3 筛选出和性状以及模块相关性都很高的基因

3.4 将网络分析结果输出

第四步：Interfacing network analysis with other data such as functional annotation and gene ontology

4.1 选取感兴趣的基因进行功能注释或者其他分析

第五步：Network visualization using WGCNA functions

5.0 构建R环境

5.1 可视化基因互作网络

5.2 可视化网络图中的eigenegenes

第六步：Exporting a gene network to external visualization software

6.0 构建R环境

6.1 将数据导出VisANT

6.2 导出Cytoscape

猜你喜欢

热点阅读