R绘图 | Cell-Type Enrichment分析探索
出发点
"Identification of evolutionarily conserved gene networks mediating neurodegenerative dementia"这篇文章中中的Cell type enrichment结果图,看是否可以利用自身数据重现该图?
image初步思考:尝试借用富集分析的思路!
从文章上来看,该分析主要目的是展示不同基因模块与5种细胞之间的相关性。按常规思路,首先有样本-基因模块的表达矩阵(可以得到),然后有样本-细胞之间的关系,最后通过相关性计算基因模块-细胞之间的关系。但是,该数据的样本是组织样本,不可能有样本-细胞之间的关系。此外,从图注上来看,颜色映射的也不是相关性数值,而是Fold enrichment(富集度),这时候考虑如何结果富集手段完成类似的分析图?
- 基于文献汇总整理出各个细胞的特征基因,构建起细胞特征基因数据库,就像常见通路富集分析采用的KEGG数据库
- 基于表达值,进行WGCNA分析,构建其不同的基因模块
- 分别选取每个模块中的基因,基于构建的细胞特征基因数据库,分别进行富集分析(基于clusterProfiler包中enricher函数)
- 得到每个模块在不同细胞集上的Enrichment(富集度)和FDR值,将不同模块在不同细胞中的富集度和FDR值进行汇总整理
- 最后,基于汇总好的数据进行热图展示,富集度用颜色梯度表示,FDR值用***表示。
具体如何操作?
清除环境、安装加载R包
rm(list = ls())
#install.packages("clusterProfiler")
library(readxl) # 用于读取excel文件
library(clusterProfiler) #用于富集分析
library(dplyr) #用于数据处理
载入基因数据和细胞特征基因数据库
用于下游富集分析的基因来源于WGCNA结果,首先通过WGCNA得到基因模块,并将每个模块中的基因提取出来,如下图所示:
data<-read_excel("input.xlsx",1) #读入第1个模块的基因信息
gene=as.vector(data$nodeName) #把矩阵变为vector
head(gene,10)
#[1] "CAPZA1" "ARF3" "UBE2L3" "OXSR1" "APEH" "AKR1B1" "IMPA1" "CAST" "VDP" "PRKAA2"
载入自定义数据集(细胞类型-基因对应关系)
该数据源自文献整理,将每个细胞的特征基因进行汇总。并以下图的形式整理成文档,特征基因与细胞各占一列,一一对应。
Database=read.table("Database.txt",sep="\t",quote="",header = T) #读取细胞-基因对应关系数据
TERM2GENE=data.frame(TERM=Database$Type,GENE=Database$Gene_Name) #构建数据库ID-基因对应关系表达矩阵
TERM2NAME=data.frame(TERM=Database$Type,NAME=Database$Type) # #构建数据库ID-数据库名称对应关系表达矩阵
#由于是自定义数据库,没有官方ID,这里直接用名称代替
对首个基因模块进行富集分析
tonumeric <- function(x){
x <- sub(" ", "+", x, fixed = TRUE)
return(unlist(lapply(x, function(x) eval(parse(text=x)))))
} #将分数字符转为数值
g=enricher(gene=gene,pvalueCutoff = 1,pAdjustMethod = "BH",qvalueCutoff = 1,TERM2GENE=TERM2GENE,TERM2NAME=TERM2NAME)
g1=g@result
g1$enrichment=tonumeric(g1$GeneRatio)/tonumeric(g1$BgRatio)
# ID Description GeneRatio BgRatio pvalue p.adjust qvalue geneID Count enrichment
# CM CM CM 4/7 75/645 0.004511199 0.0180448 0.01424589 ALDOA/GAPDH/LDHB/TPI1 4 4.9142857
# SMC SMC SMC 1/7 57/645 0.478393672 0.6656081 0.52548005 DSTN 1 1.6165414
# B B B 1/7 61/645 0.502853404 0.6656081 0.52548005 EZR 1 1.5105386
# Mono Mono Mono 1/7 93/645 0.665608066 0.6656081 0.52548005 PKM 1 0.9907834
write.csv(as.data.frame( g@result), file="enrichment.csv") #输出csv格式结果
对所有基因模块进行批量分析
rm(list = ls())
#利用循环批量富集
data_names <- c("black","blue","brown","green","pink","red","turquoise","yellow") #基因模块的名称
ID <- c("CF","SMC","DC","CM","Mono","Mac","neu","T","EC","B") #细胞数据库中的细胞条目名称
all_data<-data.frame(ID=ID)
for(i in 1:8){
#载入基因list数据
data<-read_excel("input.xlsx",i)
gene=as.vector(data$nodeName)
#载入自定义数据集(细胞类型-基因对应关系)
Database=read.table("Database.txt",sep="\t",quote="",header = T) #读取数据
TERM2GENE=data.frame(TERM=Database$Type,GENE=Database$Gene_Name)
TERM2NAME=data.frame(TERM=Database$Type,NAME=Database$Type)
##开始富集分析
g=enricher(gene=gene,pvalueCutoff = 1,pAdjustMethod = "BH",qvalueCutoff = 1,TERM2GENE=TERM2GENE,TERM2NAME=TERM2NAME)
g_tmp=g@result
g_tmp$enrichment=tonumeric(g_tmp$GeneRatio)/tonumeric(g_tmp$BgRatio)
#write.csv(as.data.frame( g_tmp@result), file=paste0(datanames[i],"_","Celltype_","enrichment.csv")) #输出csv格式结果
g_new=g_tmp[,c(1,6,10)]
all_data <- dplyr::full_join(x=all_data,g_new,by="ID")
}
##对所有模块的富集结果进行整体
rownames(all_data)= all_data[,1]
all_data=all_data[,-1]
all_data[1:4,1:9]
# p.adjust.x enrichment.x p.adjust.y enrichment.y p.adjust.x.x enrichment.x.x p.adjust.y.y enrichment.y.y p.adjust.x.x.x
# CF NA NA 3.642127e-08 3.0171518 0.99608414 1.2755438 NA NA NA
# SMC 0.6656081 1.616541 2.339383e-02 2.0310391 0.99608414 1.1039795 NA NA 0.3099551
# DC NA NA 9.973694e-01 0.9541420 0.99608414 0.8067542 NA NA NA
# CM 0.0180448 4.914286 9.973694e-01 0.8820513 0.01252016 2.5170732 0.0180448 4.914286 0.1028140
write.csv(all_data, file="all_enrichment_resluts.csv") #输出csv格式结果
提取所有模块富集结果中的enrichment值
enrichment_data <- all_data[,seq(2,16,2)] #选取细胞名称和enrichment值
enrichment_data[is.na(enrichment_data)] <- 0 #将enrichmengt值为NA的替换为0
colnames(enrichment_data) = data_names
enrichment_data[1:4,1:8]
# black blue brown green pink red turquoise yellow
# CF 0.000000 3.0171518 1.2755438 0.000000 0.000000 0.0000000 0.2954650 0.7923833
# SMC 1.616541 2.0310391 1.1039795 0.000000 2.828947 0.0000000 0.5753791 2.5717703
# DC 0.000000 0.9541420 0.8067542 0.000000 0.000000 3.0069930 0.8409387 0.3758741
# CM 4.914286 0.8820513 2.5170732 4.914286 4.300000 0.7818182 3.2067797 4.3000000
提取所有模块富集结果中的FDR值
FDR_data <- all_data[,seq(1,16,2)] #选取细胞名称和enrichment值
FDR_data[is.na(FDR_data )] <- 1 #将FDR值为NA的替换为1
colnames(FDR_data ) = data_names
FDR_data[1:4,1:8]
# black blue brown green pink red turquoise yellow
# CF 1.0000000 3.642127e-08 0.99608414 1.0000000 1.0000000 1.000000000 9.978829e-01 9.681490e-01
# SMC 0.6656081 2.339383e-02 0.99608414 1.0000000 0.3099551 1.000000000 9.978829e-01 1.291767e-01
# DC 1.0000000 9.973694e-01 0.99608414 1.0000000 1.0000000 0.005004306 9.978829e-01 9.681490e-01
# CM 0.0180448 9.973694e-01 0.01252016 0.0180448 0.1028140 0.748451087 4.610400e-07 4.479867e-05
save(enrichment_data,FDR_data,file = "heatmap_data.Rdata")#将数据保存为rdata,以便调用
pheatmap绘制热图
rm(list = ls())
load(file = "heatmap_data.Rdata")
library(pheatmap)
tmp1=as.data.frame(ifelse(t(FDR_data) < 0.05, "***", ""))
# CF SMC DC CM Mono Mac neu T EC B
# black ***
# blue *** ***
# brown ***
# green ***
# pink
# red *** ***
# turquoise ***
# yellow ***
tmp2=as.data.frame(ifelse(tmp1 != "", round(t(enrichment_data),2), ""))
# CF SMC DC CM Mono Mac neu T EC B
# black 4.91
# blue 3.02 2.03
# brown 2.52
# green 4.91
# pink
# red 3.01 5.42
# turquoise 3.21
# yellow 4.3
tmp3=data.frame(CF=paste0(tmp1$CF,"\n",tmp2$CF),
SMC=paste0(tmp1$SMC,"\n",tmp2$SMC),
DC=paste0(tmp1$DC,"\n",tmp2$DC),
CM=paste0(tmp1$CM,"\n",tmp2$CM),
Mono=paste0(tmp1$Mono,"\n",tmp2$Mono),
Mac=paste0(tmp1$Mac,"\n",tmp2$Mac),
neu=paste0(tmp1$neu,"\n",tmp2$neu),
T=paste0(tmp1$T,"\n",tmp2$T),
EC=paste0(tmp1$EC,"\n",tmp2$EC),
B=paste0(tmp1$B,"\n",tmp2$B))
rownames(tmp3)=rownames(tmp2)
# CF SMC DC CM Mono Mac neu T EC B
# black \n \n \n ***\n4.91 \n \n \n \n \n \n
# blue ***\n3.02 ***\n2.03 \n \n \n \n \n \n \n \n
# brown \n \n \n ***\n2.52 \n \n \n \n \n \n
# green \n \n \n ***\n4.91 \n \n \n \n \n \n
# pink \n \n \n \n \n \n \n \n \n \n
# red \n \n ***\n3.01 \n \n \n \n ***\n5.42 \n \n
# turquoise \n \n \n ***\n3.21 \n \n \n \n \n \n
# yellow \n \n \n ***\n4.3 \n \n \n \n \n \n
p=pheatmap(t(enrichment_data),fontsize_col=10,fontsize_row=11,
fontsize = 10,
angle_col=45,
show_rownames=T,show_colnames = T,
cluster_row=T,cluster_col=T,scale="column",
treeheight_row=25,treeheight_col=25,
border_color=NA,main="Cell-Type Enrichment",
color = colorRampPalette(c("blue", "white", "red"))(50),
display_numbers = as.matrix(tmp3),
cellwidth = 40, cellheight = 40,
fontsize_number=10,
legend_breaks=c(-2,-1,0,1,2),
legend_labels=c("-2","-1","0 Enrichment(***FDR<0.05)","1","2"),
filename = "Cell-Type Enrichment.pdf") # 画图
print(p)
image
本期旨在探索文献中Cell Type Enrichment分析,并尝试用自己的数据进行重现,具体的方法和文章中所述还是有差别的,因为文中关于该分析的描述(如下图)很多没有看懂!!只是在自身已有的知识储备上进行套用。很多大神应该是可以完整复现的,小编这里采用的是投机取巧的办法~~~
image.png image image往期回顾
R绘图|韦恩图的常见绘制方法
R绘图|ggplot2火山图的绘制
R绘图|ggplot2散点图的绘制
R绘图|pheatmap热图绘制——基础篇
R绘图|pheatmap热图绘制——中阶篇
R绘图|pheatmap热图绘制——高阶篇