WGCNA-尝试复现
2022-04-15 本文已影响0人
monkey_study
对2016年的WGCNA文章尝试复现,效果不太好,但是过了一遍流程还不错!
文章名称:伴 HBV 感染性肝癌调控枢纽基因筛选与初步鉴定
# title:伴 HBV 感染性肝癌调控枢纽基因筛选与初步鉴定
# https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=gse47197
#加载包
rm(list = ls())
options(stringsAsFactors = F)
if (F) {
library(GEOmirror)
library(GEOquery)
library(stringr)
suppressMessages(library(WGCNA))
suppressMessages(library(limma))
}
f <- 'gse47197_eSet.Rdata'
#数据导入
if(!file.exists(f)){
gse <- geoChina("GSE47197")
save(gse,file=f)
}
load(f)
gset<- gse[[1]]
#GPL16699
#表达矩阵
dat<-exprs(gset) #表达矩阵
dim(dat) #28782 124
dat[1:4,1:4]
boxplot(dat,las=2) #数据已经过标准化处理
#样本信息
pdat <- pData(gset)
#构建分组信息
group_list=tolower(str_split(pdat$title,' ',simplify = T)[,1])
table(group_list)
#下载注释文件#GPL16699
#Download GPL file, put it in the current directory, and load it:
gpl <- getGEO('GPL16699', destdir=".")
colnames(Table(gpl))
head(Table(gpl)[,c(1,10)]) ## you need to check this , which column do you need
probe2gene=Table(gpl)[,c(1,10)] #提取探针基因对应关系
head(probe2gene) #查看数据
library(stringr)
save(probe2gene,file='probe2gene.Rdata')
load('probe2gene.Rdata')
##检查有多少个空白symbol
nrow(probe2gene[probe2gene$GENE_SYMBOL=='',]) #[1] 8681
ids<- probe2gene
colnames(ids)=c('probe_id','symbol')
ids=ids[ids$symbol != '',] #去除空白的基因
#nrow(ids[ids$symbol == '',]) ,再次检查,0个
dat <- dat[rownames(dat)%in%ids$probe_id,]
ids <- ids[ids$probe_id%in%rownames(dat),]
anno <- function(dat,ids){
tmp <- by(dat,ids$symbol,function(x)rownames(x)[which.max(rowMeans(x))])
probes = as.character(tmp)
dat=dat[rownames(dat) %in% probes ,]
rownames(dat)=ids[ids$probe_id%in%rownames(dat),2]
return(dat)
}
exprSet<- anno(dat = dat,ids = ids)
save(exprSet,group_list,file='gse47197_new_exprSet.Rdata')
###到此为止已经完成了数据输入以及基因注释。
load('gse47197_new_exprSet.Rdata') #加载数据
dim(exprSet) #[1] 16390 124
exprSet[1:5,1:5]
group_list[1:6]
colnames(exprSet)=paste(group_list,1:ncol(exprSet),sep='_')
#构建癌旁以及癌表达矩阵
non <-exprSet[,group_list=="non"]
tumor <- exprSet[,group_list=="tumoral"]
###limma进行差异分析
# 表达矩阵 exprSet
# design矩阵
design <- model.matrix(~0+factor(group_list,levels = c("non","tumoral"),ordered = T))
colnames(design)=levels(factor(group_list))
rownames(design)=colnames(exprSet)
# 比较矩阵
contrast.matrix<-makeContrasts("tumoral-non",
levels = c("non","tumoral"))
#差异分析
fit <- lmFit(exprSet,design)
fit1 <- contrasts.fit(fit,contrast.matrix)
fit2 <- eBayes(fit1)
de <- topTable(fit2,coef = 1,n=Inf,p.value = 0.05)
dim(de)
nrDEG <- na.omit(de)
head(nrDEG)
DEG <- nrDEG[abs(nrDEG$logFC)>1,] #此处筛出583个DEG,与原文不一致,求解? 原文5986个
a<- rownames(DEG)
non <-exprSet[a,group_list=="non"]
tumor <- exprSet[a,group_list=="tumoral"]
save(exprSet,group_list,nrDEG,DEG, non,tumor,
file='gse47197_DEG.Rdata')
load('gse47197_DEG.Rdata')
###WGCNA
tnon <- t(non)
ttumor <- t(tumor)
sampleTree = function(datExpr) {
nGenes = ncol(datExpr) #583
nSamples = nrow(datExpr) #63个样本
datExpr_tree <- hclust(dist(datExpr), method = "average") #聚类分析
par(mar = c(0, 5, 2, 0))
plot(
datExpr_tree,
main = "Sample clustering",
sub = "",
xlab = "",
cex.axis = 1, #坐标轴字体大小
cex.main = 1, #title 字体大小 main = "Sample clustering"
cex.lab = 1 #height字体大小
)}
sampleTree(tnon)
sampleTree(ttumor)
nrow(ttumor)
table(group_list)
if (F) {
#筛选一个cutoff把图中的离群样本删除
datExpr_tree <- hclust(dist(tnon), method = "average")
height_cut =50 #筛选离群样本的阈值需要结合图以及分析的结果定
clust = cutreeStatic(dendro = datExpr_tree , cutHeight = height_cut,minSize = 10)
table(clust)
keepSamples <- (clust=1)
dim(tnon)
tnon = tnon[keepSamples, ]
dim(tnon)
# 观察结果
sampleTree(tnon)
}
datExpr_tree <- hclust(dist(ttumor), method = "average")
height_cut =50 #筛选离群样本的阈值需要结合图以及分析的结果定
clust = cutreeStatic(dendro = datExpr_tree , cutHeight = height_cut,minSize = 10)
table(clust)
keepSamples <- (clust!=0)
dim(ttumor)
ttumor = ttumor[keepSamples, ]
dim(ttumor)
# 观察结果
sampleTree(ttumor)
###选择软阈值
datExpr<- tnon
datExpr<- ttumor
datExpr[1:4,1:4] #行 样本,列 基因
powers = c(c(1:10), seq(from = 12, to = 30, by = 2)) #构建软阈值权重范围
sft = pickSoftThreshold(datExpr, powerVector = powers, verbose = 5)
#设置网络构建参数选择范围,计算无尺度分布拓扑矩阵(TOM)
png("step2-beta-value1.png",width = 800,height = 600)
par(mfrow = c(1, 2)) #
cex1 = 0.9 #字体大小
# 无尺度拓扑拟合与软阈值图,纵坐标(无尺度拓扑拟合)越接近于1,代表拟合越接近于无尺度分布
plot(
sft$fitIndices[, 1],
-sign(sft$fitIndices[, 3]) * sft$fitIndices[, 2],
xlab = "Soft Threshold (power)",
ylab = "Scale Free Topology Model Fit,signed R^2",
type = "n",
main = paste("Scale independence")
)
text(
sft$fitIndices[, 1],
-sign(sft$fitIndices[, 3]) * sft$fitIndices[, 2],
labels = powers,
cex = cex1,
col = "red"
)
##画红线截取height
abline(h = 0.90, col = "red")
#平均连接度与软阈值图
plot(
sft$fitIndices[, 1],
sft$fitIndices[, 5],
xlab = "Soft Threshold (power)",
ylab = "Mean Connectivity",
type = "n",
main = paste("Mean connectivity")
)
text(
sft$fitIndices[, 1],
sft$fitIndices[, 5],
labels = powers,
cex = cex1,
col = "red"
)
dev.off() #根据原文,非肿瘤power=10,肿瘤9,话说,我画的貌似不太一样,可能跟前面进行了差异基因有关
datExpr <- ttumor;powert=9
#4. 构建共表达网络
net = blockwiseModules(
datExpr, #输入数据为转置后的表达矩阵
power = powert, #估计的权重系数
maxBlockSize = 6000,
TOMType = "unsigned", #拓朴重叠矩阵类型为无向型
minModuleSize = 50, #最少模块数量 50
detectCutHeight = 0.995, #设定高度上限0.995
reassignThreshold = 0,
mergeCutHeight = 0.25, #75%相关度为融合阈值
numericLabels = TRUE,
pamRespectsDendro = FALSE,
saveTOMs = T,saveTOMFileBase = "HCCtumor",
verbose = 3
)
table(net$colors) #看一下module数量以及对应基因数量。0属于没有聚类到的基因数
##纯纯大怨种,只聚了3类,越来越疑惑作者为什么前面会做个差异分析?而我的结果也跟作者差异那么大
mergedColors = labels2colors(net$colors) # 数字标签与颜色相对应
png("step4-genes-modules.png",width = 800,height = 600)
plotDendroAndColors(
net$dendrograms[[1]], #拓扑相似矩阵聚类结果
mergedColors, #颜色对应关系
c("Module colors","morged colors"),
dendroLabels = FALSE,
hang = 0.03,
addGuide = TRUE,
guideHang = 0.05,
)
# plotDendroAndColors接受一个聚类的对象,以及该对象里面包含的所有个体所对应的颜色。
dev.off()
##聚类结果相似模块的融合,Merging of modules whose expression profiles are very similar
#在聚类树中每一leaf是一个短线,代表一个基因,
#不同分之间靠的越近表示有高的共表达基因,将共表达极其相似的modules进行融合
# Calculate eigengenes
if(T){
MEList = moduleEigengenes(datExpr, colors = mergedColors)
MEs = MEList$eigengenes
# Calculate dissimilarity of module eigengenes
MEDiss = 1-cor(MEs);#计算模块和模块之间的相关性和相异性
# Cluster module eigengenes
METree = hclust(as.dist(MEDiss), method = "average");
# Plot the result
#sizeGrWindow(7, 6)
plot(METree, main = "Clustering of module eigengenes",
xlab = "", sub = "")
#建立基因模块后,可以将模块用颜色来区分,有些模块相似性高,就需要将模块合并。将模块特征基因进行聚类,在完成聚类后合并,0.4高度对应的相似度阈值就是0.6。具体的相似性阈值可以自行设置,进行聚类剪切后,就可以区分哪些模块相似性高,哪些模块相似性低,如下图。
#选择有60%相关性的进行融合
MEDissThres = 0.4#0.4剪切高度可修改 ####可以完成相似模块的合并,剪切高度是0.4,也就是将相似性高于0.6的模块进行了合并
# Plot the cut line into the dendrogram
abline(h=MEDissThres, col = "red")
# Call an automatic merging function
merge = mergeCloseModules(datExpr, mergedColors, cutHeight = MEDissThres, verbose = 3)
# The merged module colors
mergedColors1 = merge$colors;
# Eigengenes of the new merged modules:
mergedMEs = merge$newMEs
png("dynamicColors_mergedColors.png",width = 800,height = 600)
plotDendroAndColors(net$dendrograms[[1]], cbind(mergedColors, mergedColors1),
c("Dynamic Tree Cut", "Merged dynamic"),
dendroLabels = FALSE, hang = 0.03,
addGuide = TRUE, guideHang = 0.05)
dev.off()
}
#模块模块相关性
nGenes = ncol(datExpr)
nSamples = nrow(datExpr)
#模块颜色信息(模块标签转化为颜色信息)
moduleColors <- labels2colors(net$colors)
# Recalculate KMEs with color labels
MEs0 = moduleEigengenes(datExpr, moduleColors)$eigengenes
MEs = orderMEs(MEs0); ##不同颜色的模块的ME值矩 (样本vs模块)
moduleTraitCor = cor(MEs, MEs , use = "p");
moduleTraitPvalue = corPvalueStudent(moduleTraitCor, nSamples)
sizeGrWindow(10,6)
# Will display correlations and their p-values
textMatrix = paste(signif(moduleTraitCor, 2), "\n(",
signif(moduleTraitPvalue, 1), ")", sep = "");
dim(textMatrix) = dim(moduleTraitCor)
png("step5-Module-trait-relationships.png",width = 800,height = 1200,res = 120)
par(mar = c(6, 8.5, 3, 3));
# Display the correlation values within a heatmap plot
labeledHeatmap(Matrix = moduleTraitCor,
xLabels = colnames(MEs),
yLabels = names(MEs),
ySymbols = names(MEs),
colorLabels = FALSE,
colors = blueWhiteRed(50),
textMatrix = textMatrix,
setStdMargins = FALSE,
cex.text = 0.5,
zlim = c(-1,1),
main = paste("Module-Module relationships"))
dev.off()
#TOM图 基因相似性
nGenes = ncol(datExpr)
nSamples = nrow(datExpr)
geneTree = net$dendrograms[[1]];
dissTOM = 1-TOMsimilarityFromExpr(datExpr, power = 6);
plotTOM = dissTOM^7;
diag(plotTOM) = NA;
TOMplot(plotTOM, geneTree, moduleColors, main = "Network heatmap plot, all genes")
# GO注释
table(moduleColors)
group_g=data.frame(gene=colnames(datExpr),
group=moduleColors)
save(group_g,file='wgcna_group_g.Rdata')
rm(list = ls())
load(file='wgcna_group_g.Rdata')
library(clusterProfiler)
# Convert gene ID into entrez genes
head(group_g)
tmp <- bitr(group_g$gene, fromType="SYMBOL",
toType="ENTREZID",
OrgDb="org.Hs.eg.db")
head(tmp)
de_gene_clusters=merge(tmp,group_g,by.x='SYMBOL',by.y='gene')
table(de_gene_clusters$group)
head(de_gene_clusters)
list_de_gene_clusters <- split(de_gene_clusters$ENTREZID,
de_gene_clusters$group) #按照颜色group拆分entrezid
# library(ggplot2)
# gcSample= list_de_gene_clusters
# source('function.R') # 这个代码非常复杂,就不给大家了# 就是包装了一个com_kegg_go函数,里面会对分组好的基因集进行批量注释
#
# # 下一步非常耗时,保守估计半个小时
# 主要是对我们的模块进行功能注释,就是GO/KEGG数据库
# com_kegg_go(gcSample,'top5000')
# Run full GO enrichment test
formula_res <- compareCluster(
ENTREZID~group,
data=de_gene_clusters,
fun="enrichGO",
OrgDb="org.Hs.eg.db",
ont = "BP",
pAdjustMethod = "BH",
pvalueCutoff = 0.01,
qvalueCutoff = 0.05
)
# Run GO enrichment test and merge terms
# that are close to each other to remove result redundancy
lineage1_ego <- simplify(
formula_res,
cutoff=0.5,
by="p.adjust",
select_fun=min
)
# Plot both analysis results
#pdf('female_compared_GO_term_DE_cluster.pdf',width = 11,height = 6)
#dotplot(formula_res, showCategory=5)
#dev.off()
pdf('Go.pdf',width = 13,height = 8)
dotplot(lineage1_ego, showCategory=5)
dev.off()
# Save results
#write.csv(formula_res@compareClusterResult,
# file="Microglia_compared_GO_term_DE_cluster.csv")
write.csv(lineage1_ego@compareClusterResult,
file="Microglia_compared_GO_term_DE_cluster_simplified.csv")
figure:
data:image/s3,"s3://crabby-images/9f0e7/9f0e7d3535b7dc2b5db7cb931775f43eea245788" alt=""
data:image/s3,"s3://crabby-images/f6421/f6421199a463b243f4008fc26ce5eaf4039cc2ec" alt=""
data:image/s3,"s3://crabby-images/d83da/d83da875224d1dd06e706404b26f390b4a21ed1c" alt=""
data:image/s3,"s3://crabby-images/03ea9/03ea96491884aa196b280d31657b16cdb163787e" alt=""
data:image/s3,"s3://crabby-images/03fb0/03fb0ca5f9b853967a50d281671f17503b0aeea0" alt=""
data:image/s3,"s3://crabby-images/a126b/a126b3c3654f409af71b9489bf94874e10b0aca5" alt=""
data:image/s3,"s3://crabby-images/1c95e/1c95e42a183ecdfd79329d6a71d04d8c98886df0" alt=""
data:image/s3,"s3://crabby-images/cea9e/cea9e3ed8cd1ab7ae2492e8b3317a86f8361a841" alt=""
参考:学徒复现WGCNA文章图表 - 知乎 (zhihu.com)
WGCNA学习笔记 - 简书 (jianshu.com)
r - Cut dendrogram / cluster: Error in function 'cutree': tree incorrect (composante 'merge') - Stack Overflow
这个WGCNA作业终于有学徒完成了! - 云+社区 - 腾讯云 (tencent.com)
limma: Linear Models for Microarray Data (bioconductor.org)
(6条消息) 用limma包进行多组差异表达分析_今天也是个妖精头子呀的博客-CSDN博客_limma 差异表达分析