rna-seq

单组火山图和多组火山图展示差异表达基因(DEGs) 2023-1

2023-10-11  本文已影响0人  黄甫一

前言

在各种RNA测序数据中,我们几乎都避不开差异表达基因(DEGs)的分析。我们可以用气泡图,提琴图和热图等展示不同细胞亚群或样本的差异基因表达情况,而火山图则是最直观的表现形式之一,因此本文将介绍画火山图的作图。

单组火山图
单组火山图

这是比较常见的火山图,但只能展示两组的差异基因。
函数示例:

deg_volcano_plot(all_markers,label='out',fc.filter=1.5,p.val=0.01, wt='avg_log2FC',gene.col='gene',lb.filter=2)

deg_volcano_plot函数的参数:
all_markers 包含显著性,差异倍数,基因名的数据框,可由Seurat的FindAllmarkers或FindMarkers得到
label='out' ,输出文件的前缀。
fc.filter=1.5 ,展示的基因点FC阈值
p.val=0.01,显著性阈值
wt='avg_log2FC' ,差异倍数的列名
gene.col='gene',基因名所在列名
lb.filter=2,标记基因名的FC阈值

library(dplyr)
library(ggplot2)
library(stringr)
library(data.table)
deg_volcano_plot <- function(all_markers,label='out',fc.filter=1.5,p.val=0.01, wt='avg_log2FC',gene.col='gene',lb.filter=2){
try({
  a <- all_markers
  a$gene <- a[,gene.col]
  a$logfc <- a[,wt]
  if (wt=='avg_log2FC'){
  a$FC<-2^a$logfc
  }else{
  a$FC<-exp(a$logfc)
  }
  a$log2FC <- log2(a$FC)
  a_h<-a[a$FC>=fc.filter & a$p_val_adj<=p.val,]
  a_l<-a[a$FC<1/fc.filter & a$p_val_adj<p.val,]

  gene_list <- c(a_h$gene, a_l$gene)
  a_all<-rbind(a_h,a_l)
  pos<- a$gene %in% a_all$gene
  a_no<-a[!pos,]
  try(a_h$type<-"Up")
  try(a_l$type<-"Down")
  try(a_no$type<-"None")
  a_all<-rbind(a_h,a_no,a_l)
  mat<-a_all
  ran<-runif(nrow(a_all),min = 25,max =50)
  ran<-100^-ran
  mat$p_val_adj<-mat$p_val_adj+ran
  cdn <- which(mat$gene %in% gene_list)
  mat1 <- mat[cdn,]
pdf(paste0(label,"_vocalno_plot_",".pdf"),12,12)
  p1<-ggplot(mat,aes(log2(FC),-1*log10(p_val_adj)))+
            geom_point(aes(color =type))+
            geom_text_repel(data=subset(mat1, abs(mat1$log2FC) >=log2(lb.filter)),aes(x=log2(FC),y=-log10(p_val_adj),label=gene),size=5,point.padding = 0.5)+
            scale_color_manual(values=c('#377EB8',"lightgrey",'#E41A1C'))+
            theme(panel.background =element_blank(),axis.line=element_line(colour="black"))

  p1<-p1+theme(legend.title=element_blank(),legend.background = element_blank(),legend.key = element_blank(), legend.text = element_text(size=20), legend.position="NA")
  p1<-p1+theme_bw()+theme(axis.text = element_text(size = 20))
  p1<-p1+geom_vline(xintercept=c(-1,1),lty=3,col="black",lwd=0.8)+geom_hline(yintercept = -log10(0.01),lty=3,col="black",lwd=0.8)
  p1<-p1+ggtitle(label)+theme(plot.title = element_text(hjust = 0.5,size=30,face="bold"))+theme(axis.title =element_text(size=20))
  print(p1)
dev.off()
})
}

多组火山图

多组火山图

相比于前者可以展示多组的差异基因,plot_mutideg函数理论上也能画单组火山图。
函数使用示例:

plot_mutideg(df,incluster='cluster',infc='avg_log2FC',ingene='gene',pos=TRUE,prefix='out',mycol <- c("#E64B357F","#4DBBD57F","#00A0877F","#3C54887F","#F39B7F7F","#8491B47F","#91D1C27F","#DC00007F","#7E61487F"))

plot_mutideg函数的参数:
df 包含显著性,差异倍数,基因名的数据框,可由Seurat的FindAllmarkers得到
incluster,默认为cluster,分组的列名
infc,默认为avg_log2FC,差异倍数所在列名
ingene,默认为gene,基因名所在列名,
pos,默认为TRUE,是否只展示上调基因,
prefix='out',输出文件的前缀
mycol,默认为NULL,调色板,颜色数目需多于分组的数目。

library(ggplot2)
library(tidyverse)
library(ggrepel)
plot_mutideg <- function(df,incluster='cluster',infc='avg_log2FC',ingene='gene',pos=TRUE,prefix='out',
                         mycol=NULL){
df$cluster=unlist(df[incluster])
df$avg_log2FC=unlist(df[infc])
df$gene=unlist(df[ingene])
df$label <- ifelse(df$p_val_adj<0.01,"adjust P-val<0.01","adjust P-val>=0.01")
top10sig <- df %>% group_by(cluster) %>% top_n(10,wt=avg_log2FC)
df$size <- case_when(!(df$gene %in% top10sig$gene)~ 1,
                     df$gene %in% top10sig$gene ~ 2)
dt <- filter(df,size==1)

b1 <- top10sig %>% group_by(cluster) %>% summarize(bar = max(avg_log2FC,na.rm = T))
b2 <- top10sig %>% group_by(cluster) %>% summarize(bar = min(avg_log2FC,na.rm = T))

lsed= c(1:length(unique(df$cluster)))-1
dfbar<-data.frame(x=lsed,
                  y=b1$bar)
dfbar1<-data.frame(x=lsed,
                   y=b2$bar)
iny=0
if (pos){
p1 <- ggplot()+
geom_col(data = dfbar,
         mapping = aes(x = x,y = y),
         fill = "#dcdcdc",alpha = 0.6)
iny=-0.5
}else{
p1 <- ggplot()+
geom_col(data = dfbar,
         mapping = aes(x = x,y = y),
         fill = "#dcdcdc",alpha = 0.6)+
geom_col(data = dfbar1,
         mapping = aes(x = x,y = y),
         fill = "#dcdcdc",alpha = 0.6)
}

p2 <- p1+
geom_jitter(data = dt,
            aes(x = cluster, y = avg_log2FC, color = label),
            size = 0.85,
            width =0.4)+
geom_jitter(data = data.frame(top10sig),
            aes(x = cluster, y = avg_log2FC, color = label),
            size = 1,
            width =0.4,
            shape=17)

dfcol<-data.frame(x=lsed,
                  y=iny,
                  label=unique(df$cluster))
p3 <- p2 + geom_tile(data = dfcol,
                     aes(x=x,y=y),
                     height=0.4,
                     color = "black",
                     fill = mycol,
                     alpha = 0.6,
                     show.legend = F)

p4 <- p3+
geom_text_repel(
                data=top10sig,
                aes(x=cluster,y=avg_log2FC, label=gene),
                force = 1.2,
                arrow = arrow(length = unit(0.008, "npc"),
                              type = "open", ends = "last")
                )

p5 <- p4 +
scale_color_manual(name=NULL,
                   values = c("red","black"))

p6 <- p5+
labs(x="Cluster",y="average logFC")+
geom_text(data=dfcol,
          aes(x=x,y=y,label=label),
          size =6,
          color ="white")
p7 <- p6+
theme_minimal()+
theme(
      axis.title = element_text(size = 13,
                                color = "black",
                                face = "bold"),
      axis.line.y = element_line(color = "black",
                                 size = 1.2),
      axis.line.x = element_blank(),
      axis.text.x = element_blank(),
      panel.grid = element_blank(),
      legend.position = "top",
      legend.direction = "vertical",
      legend.justification = c(1,0),
      legend.text = element_text(size = 15)
      )

pdf(paste(prefix,'_multideg.pdf'),18,10)
print(p7)
dev.off()
}
上一篇下一篇

猜你喜欢

热点阅读