转录组专题:WGCNA
2020-02-26 本文已影响0人
挽山
要注意的就一个是软阈值那块几个参数。
#source("http://bioconductor.org/biocLite.R")
#biocLite(c("AnnotationDbi","impute","GO.db","preprocessCore"))
#install.packages("WGCNA")
library(WGCNA)
# 数据准备
fpkm<-read.table(file.choose(), sep = '\t', header = T, stringsAsFactors = F) #file=归一化后的表达矩阵,行为基因,列为样本
# 查看数据信息
head(fpkm)
dim(fpkm)
names(fpkm)
# 设置行名(第一列作为行名)
row.names(fpkm)<-fpkm[,1]
fpkm<-fpkm[,-1]
# WGCNA要求转置数据格式
fpkm_t<-t(fpkm)
gsg = goodSamplesGenes(fpkm_t, verbose = 3)
gsg$allOK
if (!gsg$allOK)
{
# Optionally, print the gene and sample names that were removed:
if (sum(!gsg$goodGenes)>0)
printFlush(paste("Removed genes:", paste(names(fpkm_t)[!gsg$goodGenes], collapse = ", ")));
if (sum(!gsg$goodSamples)>0)
printFlush(paste("Removed samples:", paste(rownames(fpkm_t)[!gsg$goodSamples], collapse = ", ")));
# Remove the offending genes and samples from the data:
fpkm_t = fpkm_t[gsg$goodSamples, gsg$goodGenes]
}
gsg$goodSamples
sampleTree = hclust(dist(fpkm_t), method = "average");
# Plot the sample tree: Open a graphic output window of size 12 by 9 inches
# The user should change the dimensions if the window is too large or too small.
sizeGrWindow(12,9)
#pdf(file = "Plots/sampleClustering.pdf", width = 12, height = 9);
par(cex = 0.6);
par(mar = c(0,4,2,0))
plot(sampleTree, main = "Sample clustering to detect outliers", sub="", xlab="", cex.lab = 1.5, cex.axis = 1.5, cex.main = 2)
# Plot a line to show the cut
abline(h = 8000, col = "red");
# Determine cluster under the line
# 保存图片 Sample clustering to detect outliers
clust = cutreeStatic(sampleTree, cutHeight = 8000, minSize = 10) ## 把高于8000的切除
table(clust) # 0代表切除的,1代表保留的
keepSamples = (clust==1)
datExpr = fpkm_t[keepSamples, ]
nGenes = ncol(datExpr)
nSamples = nrow(datExpr)
# Choose a set of soft-thresholding powers 决定模块内基因数量
powers = c(c(1:10), seq(from = 12, to=20, by=2))
sft = pickSoftThreshold(datExpr, powerVector = powers, verbose = 5)
sizeGrWindow(9, 5)
par(mfrow = c(1,2));
cex1 = 0.9;
plot(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
xlab="Soft Threshold (power)",ylab="Scale Free Topology Model Fit,signed R^2",type="n",
main = paste("Scale independence"));
text(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
labels=powers,cex=cex1,col="red");
abline(h=0.85,col="red")
abline(h=0.9,col="red")
plot(sft$fitIndices[,1], sft$fitIndices[,5],
xlab="Soft Threshold (power)",ylab="Mean Connectivity", type="n",
main = paste("Mean connectivity"))
text(sft$fitIndices[,1], sft$fitIndices[,5], labels=powers, cex=cex1,col="red")
# 截图,命名为power.png
# 保存图片Scale independence & Mean connectivity
sft
sft$powerEstimate # 最低要选sft > 0.85的,红线划到0.9
### 挖模块(一步法)-----------------------------------------------------------------
net =blockwiseModules(datExpr, power = 12, #选择软阈值
maxBlockSize = 5000,
TOMType = "unsigned",
minModuleSize = 20,
reassignThreshold = 0,
mergeCutHeight = 0.15, # 计算完所有modules后将特征量高度相似的modules进行和并,标准:所有<mergeCutHeight的值,默认0.15,决定模块数量
numericLabels = TRUE,
pamRespectsDendro = FALSE,
saveTOMs = TRUE,
saveTOMFileBase = "PFC_PPI_network",# 改
verbose = 3)
MEs = net$MEs # 模块特征值:一个关于modules的特征量矩阵,行数等于筛选的modules数,列数等于样本数
table(net$colors)
length(net$colors)
length(table(net$colors))
table(labels2colors(net$colors))
#---------------------------------------------------------------------------------------------
#导出模块表格
#source("https://bioconductor.org/biocLite.R")
#biocLite("marray")
library(marray) # write.list要用
write.list(net,"7.3-CC-10-0.15_net.txt")
write.table(datExpr,"7.3-CC-10-0.15_res.txt",quote=FALSE,sep="\t")
#处理去重后得到模块号和颜色对应的文件,用于匹配result和net的结果
write.table(net$colors, "7.1-CC-net$colors-10-0.15.txt",row.names = F,quote = F,sep="\t")
write.table(labels2colors(net$colors), "7.1-CC-labels2colors-10-0.15.txt",row.names = F,quote = F,sep="\t")
#画聚类树(判断模块质量)---------------------------------------------------------------
geneTree = net$dendrograms[[1]]
# open a graphics window
sizeGrWindow(12, 9)
# Convert labels to colors for plotting
mergedColors = labels2colors(net$colors)
# Plot the dendrogram and the module colors underneath
tree_52<-plotDendroAndColors(geneTree, mergedColors[net$blockGenes[[1]]], # geneTree = net$dendrograms[[1]]
"Module colors",
dendroLabels = FALSE, hang = 0.03,
addGuide = TRUE, guideHang = 0.05,
main = 'Cluster Dendrogram For CC set')
# 保存图片 Cluster Dendrogram
#画热图----------------------------------------------------------------------------------
moduleColors = labels2colors(net$colors)
TOM = TOMsimilarityFromExpr(datExpr, power = 10)#####
TOM0 = 1-TOM; # 生成全基因不相似TOM矩阵,比如1-(),把得到的不相关矩阵加幂,这样绘制的TOM图色彩差异会比较明显。
plotTOM = TOM0^7;
TOMplot<-TOMplot(plotTOM, geneTree, moduleColors, main = "Network heatmap plot")
# 保存图片 Network heatmap plot
#cex.lab可以更改X轴Y轴label字体的大小,cex.text可以更改热图中字体的大小,colors可以改变颜色。
#导出到Cytoscape ---------------------------------------------------------
# Select modules需要修改,选择需要导出的模块颜色
modules = unique(mergedColors)
# modules =c("blue","yellow","red")
mergedColors = labels2colors(net$colors)
# Select module probes选择模块探测
probes = colnames(datExpr)
inModule = is.finite(match(mergedColors, modules));
modProbes = probes[inModule];
#modGenes = annot$gene_symbol[match(modProbes, annot$substanceBXH)];
# Select the corresponding Topological Overlap
modTOM = TOM[inModule, inModule];
dimnames(modTOM) = list(modProbes, modProbes)
# Export the network into edge and node list files Cytoscape can read
cyt = exportNetworkToCytoscape(modTOM,
edgeFile = paste("7.2_CC_10_0.15_edges_cytoscape",".txt", sep=""),
nodeFile = paste("7.2_CC_10_0.15_nodes_cytoscape",".txt", sep=""),
weighted = TRUE,
threshold = 0.02,#默认
nodeNames = modProbes,
altNodeNames = modProbes,
nodeAttr = mergedColors[inModule])
#hub基因的识别 方法一------------------------------------------------------------
#hub gene(使用3个标准来筛选枢纽基因:基因与指定模块显著性 > 0.2, greenyellow Module membership value > 0.8, q.weighted(性状关联) < 0.01)
#1. connectivity
ADJ1=abs(cor(datExpr,use="p"))^6
Alldegrees1=intramodularConnectivity(ADJ1, net$colors=='blue')# colorh1代表module的颜色
head(Alldegrees1) #KTotal代表该基因的所有边的连接度,KWithin代表和该基因位于同一个module下的边的连接度,KOut代表和该基因位于不同module下的边的连接度,所以KTotal是KWithin和KOut之和,KDiff代表KWithin和KOut的差值。在module中,会存在hub gene的概念,所谓的hub gene, 就是该module下连接度最大的基因,注意此时只考虑位于该module下的边,就是上文的KWithin。
#2. module member-ship(MM)
datKME = signedKME(datExpr,MEs,outputColumnName="MM.")
head(datKME)
#3. gene signigicancer(GS) 与表型数据相关性
GS1=as.numeric(cor(y,datExpr, use="p"))
#方法二------------------------------------------------------------------------------------
##直接筛选某个模块的hub gene
#FilterGenes<- abs(GS1)> .2 & abs(datKME$MM.brown)>.8
datKME = signedKME(datExpr,MEs,outputColumnName="MM.")
head(datKME)
FilterGenes<- abs(datKME$MM.2)>.9 #根据需要选择模块号
FilterGenes_res<-colnames(fpkm_t)[FilterGenes];length(FilterGenes_res)
write(paste('Module', 'Gene'),'10-FilterGenes_hub_0.8.txt',sep = '\t',append=T)
write(paste('CC_M2(blue)', FilterGenes_res),'10-FilterGenes_hub_0.9.txt',sep = '\t',append=T)
# hub 基因热图 -----------------------------------------------------------------------------
plotNetworkHeatmap(fpkm_t,
plotGenes = paste(FilterGenes_res,sep = ""),
networkType = "unsigned",
useTOM = TRUE,
power=10,
main="hub gene |KME| > 0.95 "
)
# 导出hub基因到Cytoscape ---------------------------------------------------------------
TOM = TOMsimilarityFromExpr(fpkm_t, power = 10)#####
hubGene_TOM <- TOM[FilterGenes,FilterGenes]
dimnames(hubGene_TOM) = list(colnames(fpkm_t)[FilterGenes],
colnames(fpkm_t)[FilterGenes])
cyt = exportNetworkToCytoscape(hubGene_TOM,
edgeFile = paste("Cytoscape_hub_edges", ".txt", sep=""),
#nodeFile = paste("Cytoscape_hub_nodes", ".txt", sep=""),
weighted = TRUE,
threshold = 0.02,
nodeNames = FilterGenes_res,
altNodeNames = F,
nodeAttr = F )
#其他方法---------------------------------------------------------------------------------
#hub gene,只给一个
moduleColors = unique(mergedColors)
HubGenes <- chooseTopHubInEachModule(datExpr,moduleColors)
write.table (HubGenes,file = "HubGenes_of_each_module.txt",quote=F,sep='\t')