转录组专题：WGCNA

2020-02-26 本文已影响0人挽山
要注意的就一个是软阈值那块几个参数。
#source("http://bioconductor.org/biocLite.R")
#biocLite(c("AnnotationDbi","impute","GO.db","preprocessCore"))
#install.packages("WGCNA")
library(WGCNA)

#  数据准备
fpkm<-read.table(file.choose(), sep = '\t', header = T, stringsAsFactors = F) #file=归一化后的表达矩阵，行为基因，列为样本

#  查看数据信息
head(fpkm)
dim(fpkm)
names(fpkm)

#  设置行名（第一列作为行名）
row.names(fpkm)<-fpkm[,1]
fpkm<-fpkm[,-1]

#  WGCNA要求转置数据格式
fpkm_t<-t(fpkm)

gsg = goodSamplesGenes(fpkm_t, verbose = 3)
gsg$allOK
if (!gsg$allOK)
{
  # Optionally, print the gene and sample names that were removed:
  if (sum(!gsg$goodGenes)>0)
    printFlush(paste("Removed genes:", paste(names(fpkm_t)[!gsg$goodGenes], collapse = ", ")));
  if (sum(!gsg$goodSamples)>0)
    printFlush(paste("Removed samples:", paste(rownames(fpkm_t)[!gsg$goodSamples], collapse = ", ")));
  # Remove the offending genes and samples from the data:
  fpkm_t = fpkm_t[gsg$goodSamples, gsg$goodGenes]
}
gsg$goodSamples

sampleTree = hclust(dist(fpkm_t), method = "average");
# Plot the sample tree: Open a graphic output window of size 12 by 9 inches
# The user should change the dimensions if the window is too large or too small.
sizeGrWindow(12,9)
#pdf(file = "Plots/sampleClustering.pdf", width = 12, height = 9);
par(cex = 0.6);
par(mar = c(0,4,2,0))
plot(sampleTree, main = "Sample clustering to detect outliers", sub="", xlab="", cex.lab = 1.5, cex.axis = 1.5, cex.main = 2)
# Plot a line to show the cut
abline(h = 8000, col = "red");
# Determine cluster under the line
# 保存图片 Sample clustering to detect outliers

clust = cutreeStatic(sampleTree, cutHeight = 8000, minSize = 10) ## 把高于8000的切除
table(clust) # 0代表切除的，1代表保留的
keepSamples = (clust==1)
datExpr = fpkm_t[keepSamples, ]
nGenes = ncol(datExpr)
nSamples = nrow(datExpr)

# Choose a set of soft-thresholding powers 决定模块内基因数量
powers = c(c(1:10), seq(from = 12, to=20, by=2))
sft = pickSoftThreshold(datExpr, powerVector = powers, verbose = 5)
sizeGrWindow(9, 5)
par(mfrow = c(1,2));
cex1 = 0.9;
plot(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
     xlab="Soft Threshold (power)",ylab="Scale Free Topology Model Fit,signed R^2",type="n",
     main = paste("Scale independence"));
text(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
     labels=powers,cex=cex1,col="red");
abline(h=0.85,col="red")
abline(h=0.9,col="red")
plot(sft$fitIndices[,1], sft$fitIndices[,5],
     xlab="Soft Threshold (power)",ylab="Mean Connectivity", type="n",
     main = paste("Mean connectivity"))
text(sft$fitIndices[,1], sft$fitIndices[,5], labels=powers, cex=cex1,col="red")
# 截图，命名为power.png
# 保存图片Scale independence & Mean connectivity
sft
sft$powerEstimate # 最低要选sft > 0.85的，红线划到0.9

### 挖模块（一步法）-----------------------------------------------------------------
net =blockwiseModules(datExpr, power = 12, #选择软阈值
                      maxBlockSize = 5000,
                      TOMType = "unsigned", 
                      minModuleSize = 20, 
                      reassignThreshold = 0, 
                      mergeCutHeight = 0.15, # 计算完所有modules后将特征量高度相似的modules进行和并，标准：所有<mergeCutHeight的值，默认0.15，决定模块数量 
                      numericLabels = TRUE, 
                      pamRespectsDendro = FALSE,
                      saveTOMs = TRUE,
                      saveTOMFileBase = "PFC_PPI_network",# 改
                      verbose = 3)
MEs = net$MEs # 模块特征值：一个关于modules的特征量矩阵，行数等于筛选的modules数，列数等于样本数
table(net$colors)
length(net$colors) 
length(table(net$colors)) 
table(labels2colors(net$colors))
#---------------------------------------------------------------------------------------------
#导出模块表格
#source("https://bioconductor.org/biocLite.R")
#biocLite("marray")
library(marray) # write.list要用
write.list(net,"7.3-CC-10-0.15_net.txt")
write.table(datExpr,"7.3-CC-10-0.15_res.txt",quote=FALSE,sep="\t")

#处理去重后得到模块号和颜色对应的文件，用于匹配result和net的结果
write.table(net$colors, "7.1-CC-net$colors-10-0.15.txt",row.names = F,quote = F,sep="\t")
write.table(labels2colors(net$colors), "7.1-CC-labels2colors-10-0.15.txt",row.names = F,quote = F,sep="\t")

#画聚类树（判断模块质量）---------------------------------------------------------------
geneTree = net$dendrograms[[1]]
# open a graphics window
sizeGrWindow(12, 9)
# Convert labels to colors for plotting
mergedColors = labels2colors(net$colors)
# Plot the dendrogram and the module colors underneath
tree_52<-plotDendroAndColors(geneTree, mergedColors[net$blockGenes[[1]]], # geneTree = net$dendrograms[[1]]
                    "Module colors",
                    dendroLabels = FALSE, hang = 0.03,
                    addGuide = TRUE, guideHang = 0.05,
                    main = 'Cluster Dendrogram For CC set')
# 保存图片 Cluster Dendrogram

#画热图----------------------------------------------------------------------------------
moduleColors = labels2colors(net$colors) 

TOM = TOMsimilarityFromExpr(datExpr, power = 10)#####
TOM0 = 1-TOM; # 生成全基因不相似TOM矩阵，比如1-()，把得到的不相关矩阵加幂，这样绘制的TOM图色彩差异会比较明显。
plotTOM = TOM0^7;

TOMplot<-TOMplot(plotTOM, geneTree, moduleColors, main = "Network heatmap plot")
# 保存图片 Network heatmap plot
#cex.lab可以更改X轴Y轴label字体的大小，cex.text可以更改热图中字体的大小，colors可以改变颜色。

#导出到Cytoscape ---------------------------------------------------------
# Select modules需要修改，选择需要导出的模块颜色
modules = unique(mergedColors)
# modules =c("blue","yellow","red")

mergedColors = labels2colors(net$colors)
# Select module probes选择模块探测
probes = colnames(datExpr)
inModule = is.finite(match(mergedColors, modules));
modProbes = probes[inModule];
#modGenes = annot$gene_symbol[match(modProbes, annot$substanceBXH)];
# Select the corresponding Topological Overlap
modTOM = TOM[inModule, inModule];
dimnames(modTOM) = list(modProbes, modProbes)

# Export the network into edge and node list files Cytoscape can read
cyt = exportNetworkToCytoscape(modTOM,
                               edgeFile = paste("7.2_CC_10_0.15_edges_cytoscape",".txt", sep=""),
                               nodeFile = paste("7.2_CC_10_0.15_nodes_cytoscape",".txt", sep=""),
                               weighted = TRUE,
                               threshold = 0.02,#默认
                               nodeNames = modProbes,
                               altNodeNames = modProbes,
                               nodeAttr = mergedColors[inModule])

#hub基因的识别 方法一------------------------------------------------------------
#hub gene(使用3个标准来筛选枢纽基因：基因与指定模块显著性 > 0.2, greenyellow Module membership value > 0.8， q.weighted（性状关联） < 0.01)
#1. connectivity
ADJ1=abs(cor(datExpr,use="p"))^6
Alldegrees1=intramodularConnectivity(ADJ1, net$colors=='blue')# colorh1代表module的颜色
head(Alldegrees1) #KTotal代表该基因的所有边的连接度，KWithin代表和该基因位于同一个module下的边的连接度，KOut代表和该基因位于不同module下的边的连接度，所以KTotal是KWithin和KOut之和，KDiff代表KWithin和KOut的差值。在module中，会存在hub gene的概念，所谓的hub gene, 就是该module下连接度最大的基因，注意此时只考虑位于该module下的边，就是上文的KWithin。
#2. module member-ship(MM)
datKME = signedKME(datExpr,MEs,outputColumnName="MM.")
head(datKME)
#3. gene signigicancer(GS) 与表型数据相关性
GS1=as.numeric(cor(y,datExpr, use="p"))

#方法二------------------------------------------------------------------------------------
##直接筛选某个模块的hub gene
#FilterGenes<- abs(GS1)> .2 & abs(datKME$MM.brown)>.8
datKME = signedKME(datExpr,MEs,outputColumnName="MM.")
head(datKME)

FilterGenes<- abs(datKME$MM.2)>.9 #根据需要选择模块号

FilterGenes_res<-colnames(fpkm_t)[FilterGenes];length(FilterGenes_res)
write(paste('Module', 'Gene'),'10-FilterGenes_hub_0.8.txt',sep = '\t',append=T)
write(paste('CC_M2(blue)', FilterGenes_res),'10-FilterGenes_hub_0.9.txt',sep = '\t',append=T)

# hub 基因热图 -----------------------------------------------------------------------------
plotNetworkHeatmap(fpkm_t, 
                   plotGenes = paste(FilterGenes_res,sep = ""), 
                   networkType = "unsigned", 
                   useTOM = TRUE, 
                   power=10,
                   main="hub gene |KME| > 0.95 "
                  )

# 导出hub基因到Cytoscape ---------------------------------------------------------------
TOM = TOMsimilarityFromExpr(fpkm_t, power = 10)#####
hubGene_TOM <- TOM[FilterGenes,FilterGenes] 
dimnames(hubGene_TOM) = list(colnames(fpkm_t)[FilterGenes], 
                             colnames(fpkm_t)[FilterGenes]) 
cyt = exportNetworkToCytoscape(hubGene_TOM, 
                               edgeFile = paste("Cytoscape_hub_edges", ".txt", sep=""), 
                               #nodeFile = paste("Cytoscape_hub_nodes", ".txt", sep=""), 
                               weighted = TRUE, 
                               threshold = 0.02, 
                               nodeNames = FilterGenes_res, 
                               altNodeNames = F, 
                               nodeAttr = F )

#其他方法---------------------------------------------------------------------------------
#hub gene,只给一个
moduleColors = unique(mergedColors)
HubGenes <- chooseTopHubInEachModule(datExpr,moduleColors)
write.table (HubGenes,file = "HubGenes_of_each_module.txt",quote=F,sep='\t')
转录组专题：WGCNA

猜你喜欢

热点阅读