WGCNA分析

WGCNA-尝试复现

2022-04-15  本文已影响0人  monkey_study

对2016年的WGCNA文章尝试复现,效果不太好,但是过了一遍流程还不错!
文章名称:伴 HBV 感染性肝癌调控枢纽基因筛选与初步鉴定

# title:伴 HBV 感染性肝癌调控枢纽基因筛选与初步鉴定
# https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=gse47197
#加载包
rm(list = ls())
options(stringsAsFactors = F)
if (F) {
library(GEOmirror)
library(GEOquery)
library(stringr)
  suppressMessages(library(WGCNA))
  suppressMessages(library(limma))
  
}
f <- 'gse47197_eSet.Rdata'
#数据导入
if(!file.exists(f)){
gse <- geoChina("GSE47197")
save(gse,file=f)
}
load(f)
gset<- gse[[1]]
#GPL16699
#表达矩阵
dat<-exprs(gset)  #表达矩阵
dim(dat)  #28782   124
dat[1:4,1:4]
boxplot(dat,las=2)  #数据已经过标准化处理
#样本信息
pdat <- pData(gset)
#构建分组信息
group_list=tolower(str_split(pdat$title,' ',simplify = T)[,1])
table(group_list)

#下载注释文件#GPL16699
#Download GPL file, put it in the current directory, and load it:
gpl <- getGEO('GPL16699', destdir=".")
colnames(Table(gpl))  
head(Table(gpl)[,c(1,10)]) ## you need to check this , which column do you need
probe2gene=Table(gpl)[,c(1,10)]   #提取探针基因对应关系
head(probe2gene)  #查看数据
library(stringr)  
save(probe2gene,file='probe2gene.Rdata')
load('probe2gene.Rdata')
##检查有多少个空白symbol
nrow(probe2gene[probe2gene$GENE_SYMBOL=='',])  #[1] 8681
ids<- probe2gene
colnames(ids)=c('probe_id','symbol') 
ids=ids[ids$symbol != '',]   #去除空白的基因
#nrow(ids[ids$symbol == '',])  ,再次检查,0个
dat <- dat[rownames(dat)%in%ids$probe_id,]  
ids <- ids[ids$probe_id%in%rownames(dat),]

anno <- function(dat,ids){
tmp <- by(dat,ids$symbol,function(x)rownames(x)[which.max(rowMeans(x))])
probes = as.character(tmp)
dat=dat[rownames(dat) %in% probes ,]
rownames(dat)=ids[ids$probe_id%in%rownames(dat),2]
return(dat)
}
exprSet<- anno(dat = dat,ids = ids)
save(exprSet,group_list,file='gse47197_new_exprSet.Rdata')
###到此为止已经完成了数据输入以及基因注释。

load('gse47197_new_exprSet.Rdata')  #加载数据
dim(exprSet)  #[1] 16390   124
exprSet[1:5,1:5]
group_list[1:6]
colnames(exprSet)=paste(group_list,1:ncol(exprSet),sep='_')
#构建癌旁以及癌表达矩阵
non <-exprSet[,group_list=="non"]
tumor <- exprSet[,group_list=="tumoral"]


###limma进行差异分析
# 表达矩阵 exprSet
# design矩阵
design <- model.matrix(~0+factor(group_list,levels = c("non","tumoral"),ordered = T))
colnames(design)=levels(factor(group_list))
rownames(design)=colnames(exprSet)

# 比较矩阵
contrast.matrix<-makeContrasts("tumoral-non",
                               levels = c("non","tumoral"))
#差异分析
fit <- lmFit(exprSet,design)
fit1 <- contrasts.fit(fit,contrast.matrix)
fit2 <- eBayes(fit1)
de <- topTable(fit2,coef = 1,n=Inf,p.value = 0.05)
dim(de)
nrDEG <- na.omit(de)
head(nrDEG)
DEG <- nrDEG[abs(nrDEG$logFC)>1,]   #此处筛出583个DEG,与原文不一致,求解? 原文5986个
a<- rownames(DEG)
non <-exprSet[a,group_list=="non"]
tumor <- exprSet[a,group_list=="tumoral"]
save(exprSet,group_list,nrDEG,DEG, non,tumor,
     file='gse47197_DEG.Rdata')
load('gse47197_DEG.Rdata')

###WGCNA
tnon <- t(non)
ttumor <- t(tumor)
sampleTree = function(datExpr) {
  nGenes = ncol(datExpr)   #583
  nSamples = nrow(datExpr)   #63个样本
  datExpr_tree <- hclust(dist(datExpr), method = "average") #聚类分析
  par(mar = c(0, 5, 2, 0))
  plot(
    datExpr_tree,
    main = "Sample clustering",
    sub = "",
    xlab = "",   
    cex.axis = 1,  #坐标轴字体大小
    cex.main = 1,   #title 字体大小 main = "Sample clustering"
    cex.lab = 1  #height字体大小
  )}
  sampleTree(tnon)
  sampleTree(ttumor)
  nrow(ttumor)
table(group_list)  

if (F) {
  
  
  #筛选一个cutoff把图中的离群样本删除
  datExpr_tree <- hclust(dist(tnon), method = "average")
  height_cut =50 #筛选离群样本的阈值需要结合图以及分析的结果定
  clust = cutreeStatic(dendro = datExpr_tree , cutHeight = height_cut,minSize = 10) 
  table(clust)
  keepSamples <-  (clust=1)
  dim(tnon)
  tnon = tnon[keepSamples, ]
  dim(tnon)
  # 观察结果
  sampleTree(tnon)
  }

datExpr_tree <- hclust(dist(ttumor), method = "average")
height_cut =50 #筛选离群样本的阈值需要结合图以及分析的结果定
clust = cutreeStatic(dendro = datExpr_tree , cutHeight = height_cut,minSize = 10) 
table(clust)
keepSamples <-  (clust!=0)
dim(ttumor)
ttumor = ttumor[keepSamples, ]
dim(ttumor)
# 观察结果
sampleTree(ttumor)

###选择软阈值
datExpr<- tnon
datExpr<- ttumor
datExpr[1:4,1:4]   #行 样本,列 基因
powers = c(c(1:10), seq(from = 12, to = 30, by = 2))  #构建软阈值权重范围
sft = pickSoftThreshold(datExpr, powerVector = powers, verbose = 5)
#设置网络构建参数选择范围,计算无尺度分布拓扑矩阵(TOM)
png("step2-beta-value1.png",width = 800,height = 600)
par(mfrow = c(1, 2))  #
cex1 = 0.9   #字体大小
# 无尺度拓扑拟合与软阈值图,纵坐标(无尺度拓扑拟合)越接近于1,代表拟合越接近于无尺度分布
plot(
  sft$fitIndices[, 1],
  -sign(sft$fitIndices[, 3]) * sft$fitIndices[, 2],
  xlab = "Soft Threshold (power)",
  ylab = "Scale Free Topology Model Fit,signed R^2",
  type = "n",
  main = paste("Scale independence")
)

text(
  sft$fitIndices[, 1],
  -sign(sft$fitIndices[, 3]) * sft$fitIndices[, 2],
  labels = powers,
  cex = cex1,
  col = "red"
)
##画红线截取height
abline(h = 0.90, col = "red")
#平均连接度与软阈值图
plot(
  sft$fitIndices[, 1],
  sft$fitIndices[, 5],
  xlab = "Soft Threshold (power)",
  ylab = "Mean Connectivity",
  type = "n",
  main = paste("Mean connectivity")
)

text(
  sft$fitIndices[, 1],
  sft$fitIndices[, 5],
  labels = powers,
  cex = cex1,
  col = "red"
)
dev.off()   #根据原文,非肿瘤power=10,肿瘤9,话说,我画的貌似不太一样,可能跟前面进行了差异基因有关

datExpr <- ttumor;powert=9
#4. 构建共表达网络
net = blockwiseModules(
  datExpr,   #输入数据为转置后的表达矩阵
  power = powert,  #估计的权重系数
  maxBlockSize = 6000,
  TOMType = "unsigned",  #拓朴重叠矩阵类型为无向型
  minModuleSize = 50,   #最少模块数量 50
  detectCutHeight = 0.995,  #设定高度上限0.995
  reassignThreshold = 0,
  mergeCutHeight = 0.25,   #75%相关度为融合阈值
  numericLabels = TRUE,
  pamRespectsDendro = FALSE,
  saveTOMs = T,saveTOMFileBase = "HCCtumor",
  verbose = 3
)
table(net$colors)   #看一下module数量以及对应基因数量。0属于没有聚类到的基因数
##纯纯大怨种,只聚了3类,越来越疑惑作者为什么前面会做个差异分析?而我的结果也跟作者差异那么大
mergedColors = labels2colors(net$colors) # 数字标签与颜色相对应
png("step4-genes-modules.png",width = 800,height = 600)
plotDendroAndColors(
  net$dendrograms[[1]],  #拓扑相似矩阵聚类结果
  mergedColors,  #颜色对应关系
  c("Module colors","morged colors"),
  dendroLabels = FALSE,
  hang = 0.03,
  addGuide = TRUE,
  guideHang = 0.05,
)
# plotDendroAndColors接受一个聚类的对象,以及该对象里面包含的所有个体所对应的颜色。
dev.off()


##聚类结果相似模块的融合,Merging of modules whose expression profiles are very similar
#在聚类树中每一leaf是一个短线,代表一个基因,
#不同分之间靠的越近表示有高的共表达基因,将共表达极其相似的modules进行融合
# Calculate eigengenes
if(T){
  MEList = moduleEigengenes(datExpr, colors = mergedColors)
  MEs = MEList$eigengenes
  # Calculate dissimilarity of module eigengenes
  MEDiss = 1-cor(MEs);#计算模块和模块之间的相关性和相异性
  # Cluster module eigengenes
  METree = hclust(as.dist(MEDiss), method = "average");
  # Plot the result
  #sizeGrWindow(7, 6)
  plot(METree, main = "Clustering of module eigengenes",
       xlab = "", sub = "")
  
  
  #建立基因模块后,可以将模块用颜色来区分,有些模块相似性高,就需要将模块合并。将模块特征基因进行聚类,在完成聚类后合并,0.4高度对应的相似度阈值就是0.6。具体的相似性阈值可以自行设置,进行聚类剪切后,就可以区分哪些模块相似性高,哪些模块相似性低,如下图。
  
  #选择有60%相关性的进行融合
  MEDissThres = 0.4#0.4剪切高度可修改  ####可以完成相似模块的合并,剪切高度是0.4,也就是将相似性高于0.6的模块进行了合并
  # Plot the cut line into the dendrogram
  abline(h=MEDissThres, col = "red")
  
  # Call an automatic merging function
  merge = mergeCloseModules(datExpr, mergedColors, cutHeight = MEDissThres, verbose = 3)
  # The merged module colors
  mergedColors1 = merge$colors;
  # Eigengenes of the new merged modules:
  mergedMEs = merge$newMEs
  
  png("dynamicColors_mergedColors.png",width = 800,height = 600)
  plotDendroAndColors(net$dendrograms[[1]], cbind(mergedColors, mergedColors1),
                      c("Dynamic Tree Cut", "Merged dynamic"),
                      dendroLabels = FALSE, hang = 0.03,
                      addGuide = TRUE, guideHang = 0.05)
  dev.off()
  
}

#模块模块相关性
nGenes = ncol(datExpr)
nSamples = nrow(datExpr)
#模块颜色信息(模块标签转化为颜色信息)
moduleColors <- labels2colors(net$colors)  
# Recalculate KMEs with color labels
MEs0 = moduleEigengenes(datExpr, moduleColors)$eigengenes
MEs = orderMEs(MEs0); ##不同颜色的模块的ME值矩 (样本vs模块)
moduleTraitCor = cor(MEs, MEs , use = "p");
moduleTraitPvalue = corPvalueStudent(moduleTraitCor, nSamples)

sizeGrWindow(10,6)
# Will display correlations and their p-values
textMatrix = paste(signif(moduleTraitCor, 2), "\n(",
                   signif(moduleTraitPvalue, 1), ")", sep = "");
dim(textMatrix) = dim(moduleTraitCor)
png("step5-Module-trait-relationships.png",width = 800,height = 1200,res = 120)
par(mar = c(6, 8.5, 3, 3));
# Display the correlation values within a heatmap plot
labeledHeatmap(Matrix = moduleTraitCor,
               xLabels = colnames(MEs),
               yLabels = names(MEs),
               ySymbols = names(MEs),
               colorLabels = FALSE,
               colors = blueWhiteRed(50),
               textMatrix = textMatrix,
               setStdMargins = FALSE,
               cex.text = 0.5,
               zlim = c(-1,1),
               main = paste("Module-Module relationships"))
dev.off()
#TOM图 基因相似性
nGenes = ncol(datExpr)
nSamples = nrow(datExpr)
geneTree = net$dendrograms[[1]]; 
dissTOM = 1-TOMsimilarityFromExpr(datExpr, power = 6); 
plotTOM = dissTOM^7; 
diag(plotTOM) = NA; 
TOMplot(plotTOM, geneTree, moduleColors, main = "Network heatmap plot, all genes")


# GO注释
table(moduleColors)
group_g=data.frame(gene=colnames(datExpr),
                   group=moduleColors)
save(group_g,file='wgcna_group_g.Rdata')



rm(list = ls())
load(file='wgcna_group_g.Rdata')
library(clusterProfiler)
# Convert gene ID into entrez genes
head(group_g)
tmp <- bitr(group_g$gene, fromType="SYMBOL", 
            toType="ENTREZID", 
            OrgDb="org.Hs.eg.db")
head(tmp)
de_gene_clusters=merge(tmp,group_g,by.x='SYMBOL',by.y='gene')
table(de_gene_clusters$group)
head(de_gene_clusters)

list_de_gene_clusters <- split(de_gene_clusters$ENTREZID, 
                               de_gene_clusters$group) #按照颜色group拆分entrezid

# library(ggplot2)
# gcSample= list_de_gene_clusters
# source('function.R') # 这个代码非常复杂,就不给大家了# 就是包装了一个com_kegg_go函数,里面会对分组好的基因集进行批量注释
# 
# # 下一步非常耗时,保守估计半个小时
# 主要是对我们的模块进行功能注释,就是GO/KEGG数据库
# com_kegg_go(gcSample,'top5000')

# Run full GO enrichment test
formula_res <- compareCluster(
  ENTREZID~group, 
  data=de_gene_clusters, 
  fun="enrichGO", 
  OrgDb="org.Hs.eg.db",
  ont = "BP",
  pAdjustMethod = "BH",
  pvalueCutoff  = 0.01,
  qvalueCutoff  = 0.05
)
# Run GO enrichment test and merge terms 
# that are close to each other to remove result redundancy
lineage1_ego <- simplify(
  formula_res, 
  cutoff=0.5, 
  by="p.adjust", 
  select_fun=min
)

# Plot both analysis results
#pdf('female_compared_GO_term_DE_cluster.pdf',width = 11,height = 6)
#dotplot(formula_res, showCategory=5)
#dev.off()
pdf('Go.pdf',width = 13,height = 8)
dotplot(lineage1_ego, showCategory=5)
dev.off()


# Save results
#write.csv(formula_res@compareClusterResult, 
#          file="Microglia_compared_GO_term_DE_cluster.csv")
write.csv(lineage1_ego@compareClusterResult, 
          file="Microglia_compared_GO_term_DE_cluster_simplified.csv")

figure:


hclust.png
step2-beta-value.png
step2-beta-value1.png
step4-genes-modules.png dynamicColors_mergedColors.png
step5-Module-trait-relationships.png
image.png
step7-TOM.png

参考:学徒复现WGCNA文章图表 - 知乎 (zhihu.com)
WGCNA学习笔记 - 简书 (jianshu.com)
r - Cut dendrogram / cluster: Error in function 'cutree': tree incorrect (composante 'merge') - Stack Overflow
这个WGCNA作业终于有学徒完成了! - 云+社区 - 腾讯云 (tencent.com)
limma: Linear Models for Microarray Data (bioconductor.org)
(6条消息) 用limma包进行多组差异表达分析_今天也是个妖精头子呀的博客-CSDN博客_limma 差异表达分析

上一篇 下一篇

猜你喜欢

热点阅读