单组火山图和多组火山图展示差异表达基因(DEGs) 2023-1
2023-10-11 本文已影响0人
黄甫一
前言
在各种RNA测序数据中,我们几乎都避不开差异表达基因(DEGs)的分析。我们可以用气泡图,提琴图和热图等展示不同细胞亚群或样本的差异基因表达情况,而火山图则是最直观的表现形式之一,因此本文将介绍画火山图的作图。
单组火山图
单组火山图这是比较常见的火山图,但只能展示两组的差异基因。
函数示例:
deg_volcano_plot(all_markers,label='out',fc.filter=1.5,p.val=0.01, wt='avg_log2FC',gene.col='gene',lb.filter=2)
deg_volcano_plot函数的参数:
all_markers 包含显著性,差异倍数,基因名的数据框,可由Seurat的FindAllmarkers或FindMarkers得到
label='out' ,输出文件的前缀。
fc.filter=1.5 ,展示的基因点FC阈值
p.val=0.01,显著性阈值
wt='avg_log2FC' ,差异倍数的列名
gene.col='gene',基因名所在列名
lb.filter=2,标记基因名的FC阈值
library(dplyr)
library(ggplot2)
library(stringr)
library(data.table)
deg_volcano_plot <- function(all_markers,label='out',fc.filter=1.5,p.val=0.01, wt='avg_log2FC',gene.col='gene',lb.filter=2){
try({
a <- all_markers
a$gene <- a[,gene.col]
a$logfc <- a[,wt]
if (wt=='avg_log2FC'){
a$FC<-2^a$logfc
}else{
a$FC<-exp(a$logfc)
}
a$log2FC <- log2(a$FC)
a_h<-a[a$FC>=fc.filter & a$p_val_adj<=p.val,]
a_l<-a[a$FC<1/fc.filter & a$p_val_adj<p.val,]
gene_list <- c(a_h$gene, a_l$gene)
a_all<-rbind(a_h,a_l)
pos<- a$gene %in% a_all$gene
a_no<-a[!pos,]
try(a_h$type<-"Up")
try(a_l$type<-"Down")
try(a_no$type<-"None")
a_all<-rbind(a_h,a_no,a_l)
mat<-a_all
ran<-runif(nrow(a_all),min = 25,max =50)
ran<-100^-ran
mat$p_val_adj<-mat$p_val_adj+ran
cdn <- which(mat$gene %in% gene_list)
mat1 <- mat[cdn,]
pdf(paste0(label,"_vocalno_plot_",".pdf"),12,12)
p1<-ggplot(mat,aes(log2(FC),-1*log10(p_val_adj)))+
geom_point(aes(color =type))+
geom_text_repel(data=subset(mat1, abs(mat1$log2FC) >=log2(lb.filter)),aes(x=log2(FC),y=-log10(p_val_adj),label=gene),size=5,point.padding = 0.5)+
scale_color_manual(values=c('#377EB8',"lightgrey",'#E41A1C'))+
theme(panel.background =element_blank(),axis.line=element_line(colour="black"))
p1<-p1+theme(legend.title=element_blank(),legend.background = element_blank(),legend.key = element_blank(), legend.text = element_text(size=20), legend.position="NA")
p1<-p1+theme_bw()+theme(axis.text = element_text(size = 20))
p1<-p1+geom_vline(xintercept=c(-1,1),lty=3,col="black",lwd=0.8)+geom_hline(yintercept = -log10(0.01),lty=3,col="black",lwd=0.8)
p1<-p1+ggtitle(label)+theme(plot.title = element_text(hjust = 0.5,size=30,face="bold"))+theme(axis.title =element_text(size=20))
print(p1)
dev.off()
})
}
多组火山图
多组火山图相比于前者可以展示多组的差异基因,plot_mutideg函数理论上也能画单组火山图。
函数使用示例:
plot_mutideg(df,incluster='cluster',infc='avg_log2FC',ingene='gene',pos=TRUE,prefix='out',mycol <- c("#E64B357F","#4DBBD57F","#00A0877F","#3C54887F","#F39B7F7F","#8491B47F","#91D1C27F","#DC00007F","#7E61487F"))
plot_mutideg函数的参数:
df 包含显著性,差异倍数,基因名的数据框,可由Seurat的FindAllmarkers得到
incluster,默认为cluster,分组的列名
infc,默认为avg_log2FC,差异倍数所在列名
ingene,默认为gene,基因名所在列名,
pos,默认为TRUE,是否只展示上调基因,
prefix='out',输出文件的前缀
mycol,默认为NULL,调色板,颜色数目需多于分组的数目。
library(ggplot2)
library(tidyverse)
library(ggrepel)
plot_mutideg <- function(df,incluster='cluster',infc='avg_log2FC',ingene='gene',pos=TRUE,prefix='out',
mycol=NULL){
df$cluster=unlist(df[incluster])
df$avg_log2FC=unlist(df[infc])
df$gene=unlist(df[ingene])
df$label <- ifelse(df$p_val_adj<0.01,"adjust P-val<0.01","adjust P-val>=0.01")
top10sig <- df %>% group_by(cluster) %>% top_n(10,wt=avg_log2FC)
df$size <- case_when(!(df$gene %in% top10sig$gene)~ 1,
df$gene %in% top10sig$gene ~ 2)
dt <- filter(df,size==1)
b1 <- top10sig %>% group_by(cluster) %>% summarize(bar = max(avg_log2FC,na.rm = T))
b2 <- top10sig %>% group_by(cluster) %>% summarize(bar = min(avg_log2FC,na.rm = T))
lsed= c(1:length(unique(df$cluster)))-1
dfbar<-data.frame(x=lsed,
y=b1$bar)
dfbar1<-data.frame(x=lsed,
y=b2$bar)
iny=0
if (pos){
p1 <- ggplot()+
geom_col(data = dfbar,
mapping = aes(x = x,y = y),
fill = "#dcdcdc",alpha = 0.6)
iny=-0.5
}else{
p1 <- ggplot()+
geom_col(data = dfbar,
mapping = aes(x = x,y = y),
fill = "#dcdcdc",alpha = 0.6)+
geom_col(data = dfbar1,
mapping = aes(x = x,y = y),
fill = "#dcdcdc",alpha = 0.6)
}
p2 <- p1+
geom_jitter(data = dt,
aes(x = cluster, y = avg_log2FC, color = label),
size = 0.85,
width =0.4)+
geom_jitter(data = data.frame(top10sig),
aes(x = cluster, y = avg_log2FC, color = label),
size = 1,
width =0.4,
shape=17)
dfcol<-data.frame(x=lsed,
y=iny,
label=unique(df$cluster))
p3 <- p2 + geom_tile(data = dfcol,
aes(x=x,y=y),
height=0.4,
color = "black",
fill = mycol,
alpha = 0.6,
show.legend = F)
p4 <- p3+
geom_text_repel(
data=top10sig,
aes(x=cluster,y=avg_log2FC, label=gene),
force = 1.2,
arrow = arrow(length = unit(0.008, "npc"),
type = "open", ends = "last")
)
p5 <- p4 +
scale_color_manual(name=NULL,
values = c("red","black"))
p6 <- p5+
labs(x="Cluster",y="average logFC")+
geom_text(data=dfcol,
aes(x=x,y=y,label=label),
size =6,
color ="white")
p7 <- p6+
theme_minimal()+
theme(
axis.title = element_text(size = 13,
color = "black",
face = "bold"),
axis.line.y = element_line(color = "black",
size = 1.2),
axis.line.x = element_blank(),
axis.text.x = element_blank(),
panel.grid = element_blank(),
legend.position = "top",
legend.direction = "vertical",
legend.justification = c(1,0),
legend.text = element_text(size = 15)
)
pdf(paste(prefix,'_multideg.pdf'),18,10)
print(p7)
dev.off()
}