R可视化：组间差异结果的可视化

2021-06-29 本文已影响0人生信学习者2

前言

对质谱数据做单变量的组间差异检验后，我们获得了检验的结果。常用展示结果的方法有图和表的方式，我们在文献中常用图来展示，而表可作为附表补充图。更多知识分享请到 https://zouhua.top/。

本文讲使用火山图，韦恩图和扩展火山图展示组间差异结果，最后再结合热图展示所有的组间差异蛋白的丰度分布情况。

Importing data

library(dplyr)
library(tibble)
library(convert)
library(ggplot2)
library(ggrepel)
library(data.table)

phen <- read.csv("phenotype_20210625.csv")
NC_PC_DEP <- fread("NC_PC_limma_Mass_LOG2Impute.csv")
NC_PT_DEP <- fread("NC_PT_limma_Mass_LOG2Impute.csv")
ExprSet <- readRDS("Mass_Proteins_filtered_Normal_LOG2Impute.RDS")

subgrp <- c("NC", "PC", "PT")
grp.col <- c("#568875", "#73FAFC", "#EE853D")

Volcano Function

# function
volcanofun <- function(datset=NC_PC_DEP,
                       genelist=c("FIBB","FIBA","FA5","EGF","OIT3","IBP2","GANAB","RELN","LYSC","CADH1"),
                       group_name=subgrp[1:2],
                       group_col=grp.col[1:2],
                       pval=0.05, 
                       fc=1){
  
  # datset=NC_PC_DEP
  # genelist=c("FIBB","FIBA","FA5","EGF","OIT3","IBP2","GANAB","RELN","LYSC","CADH1")
  # group_name=subgrp[1:2]
  # group_col=grp.col[1:2]
  # pval=0.05
  # fc=1
  
  dat <- datset %>% 
    mutate(color=factor(Enrichment, 
                        levels = c(group_name, "Nonsignif")))  
  # print(table(dat$color))
  dat_status <- table(dat$color)
  dat_status_number <- as.numeric(dat_status)
  dat_status_name <- names(dat_status)
  legend_label <- c(paste0(dat_status_name[1], " (", dat_status_number[1], ")"),
                    paste0(dat_status_name[2], " (", dat_status_number[2], ")"),
                    paste0("Nonsignif", " (", dat_status_number[3], ")"))
  
  dat.signif <- subset(dat, adj.P.Val < pval & abs(logFC) > fc) %>%
    filter(GeneID%in%genelist)
  print(table(dat.signif$color))
  
  group_col_new <- c(group_col, "#979797")
  group_name_new <- levels(dat$color)
  
  xlabel <- paste0("log2(", paste(group_name, collapse="/"), ")")
  
  # Make a basic ggplot2 object with x-y values
  pl <- ggplot(dat, aes(x = -logFC, y = -log10(adj.P.Val), color = color))+ 
          geom_point(size = 2, alpha = 1, stroke = 1)+
          scale_color_manual(name = NULL,
                             values = group_col_new,
                             labels = c(legend_label, "Nonsignif"))+
          xlab(xlabel) + 
          ylab(expression(-log[10]("adjusted p-value")))+ 
          geom_hline(yintercept=-log10(pval), alpha=.8, linetype=2, size=.7)+
          geom_vline(xintercept=fc, alpha=.8, linetype=2, size=.7)+
          geom_vline(xintercept=-fc, alpha=.8, linetype=2, size=.7)+ 
          geom_text_repel(data = dat.signif,
                          aes(label = GeneID),
                          size = 4,
                          max.overlaps = getOption("ggrepel.max.overlaps", default = 80),
                          segment.linetype = 1,
                          segment.curvature = -1e-20,
                          box.padding = unit(0.35, "lines"),
                          point.padding = unit(0.3, "lines"),
                          arrow = arrow(length = unit(0.005, "npc")),
                          color = "black",     # text color
                          bg.color = "white", # shadow color
                          bg.r = 0.15)+
          annotate("text", x=min(dat$logFC), y=-log10(pval), label=pval, size=6, color="red")+
          annotate("text", x=fc, y=0, label=fc, size=6, color="red")+
          annotate("text", x=-fc, y=0, label=-fc, size=6, color="red")+
          scale_y_continuous(trans = "log1p")+
          guides(color=guide_legend(override.aes = list(size = 3)))+
          theme_bw()+ 
          theme(axis.title = element_text(color = "black", size = 12),
                axis.text = element_text(color = "black", size = 10),
                text = element_text(size = 8, color = "black", family="serif"),
                panel.grid = element_blank(),
                #legend.position = "right",
                legend.position = c(.15, .1),
                legend.key.height = unit(0.6,"cm"),
                legend.text = element_text(face = "bold", color = "black", size = 8),
                strip.text = element_text(face = "bold", size = 14))
  return(pl)
}

Gene_boxplot <- function(datset=ExprSet,
        genelist=c("FIBB","FIBA","FA5","EGF","OIT3","IBP2","GANAB","RELN","LYSC","CADH1"),
        group_name=subgrp[1:2],
        group_col=grp.col[1:2]){

  # datset=ExprSet
  # genelist=c("FIBB","FIBA","FA5","EGF","OIT3","IBP2","GANAB","RELN","LYSC","CADH1")
  # group_name=subgrp[1:2]
  # group_col=grp.col[1:2]
  
  pheno <- pData(datset) %>%
    filter(SubGroup%in%group_name)
  pheno$SubGroup <- factor(as.character(pheno$SubGroup), levels = group_name)

  
  edata <- data.frame(exprs(datset)) %>%
    dplyr::select(rownames(pheno)) %>%
    rownames_to_column("GeneID") %>%
    filter(GeneID%in%genelist) %>%
    column_to_rownames("GeneID")
  
  mdat <- pheno %>% dplyr::select(SubGroup) %>%
    rownames_to_column("SampleID") %>%
    inner_join(t(edata) %>% data.frame() %>% rownames_to_column("SampleID"), by = "SampleID") %>%
    column_to_rownames("SampleID")
  
  plotdata <- mdat %>% tidyr::gather(key="geneID", value="value", -SubGroup) %>%
    mutate(SubGroup=factor(SubGroup, levels = group_name))
  
  plotdata$geneID <- factor(plotdata$geneID, levels = genelist)  
  
  pl <- ggplot(plotdata, aes(x = SubGroup, y = value, fill= SubGroup))+
    stat_boxplot(geom = "errorbar", width = 0.15,
                 position = position_dodge(0.4)) +    
    geom_boxplot(width = 0.4, 
                 outlier.colour = "black", 
                 outlier.shape = 21, 
                 outlier.size = .5)+
    scale_fill_manual(values = group_col)+
    #scale_y_continuous(labels = scales::scientific)+
    facet_wrap(facets = "geneID", scales = "free_y", nrow = 2)+    
    labs(x="", y="Relative Abundance (Mass Spectrometry)")+
    guides(fill=F)+
    theme_classic()+
    theme(axis.title = element_text(color = "black", size = 12),
          axis.text.x = element_text(color = "black", size = 10, hjust = .5, vjust = .5),
          text = element_text(size = 8, color = "black", family="serif"),
          panel.grid = element_blank(),
          strip.text = element_text(face = "bold", size = 12))    
    
  return(pl)
}

NC_PC_volcano <- volcanofun(datset=NC_PC_DEP,
           genelist=c("FIBB","FIBA","FA5","OIT3","IBP2","GANAB","RELN","LYSC","CADH1","EGF"),
           group_name=subgrp[1:2],
           group_col=grp.col[1:2])

NC_PC_volcano
ggsave("NC_PC_Mass_volcano.pdf", NC_PC_volcano, width = 8, height = 6)
NC_PC_boxplot <- Gene_boxplot(datset=ExprSet,
        genelist=c("FIBB","FIBA","FA5","OIT3","IBP2","GANAB","RELN","LYSC","CADH1","EGF"),
        group_name=subgrp[1:2],
        group_col=grp.col[1:2])
NC_PC_boxplot
ggsave("NC_PC_Mass_SpecificDEP_boxplot.pdf", NC_PC_boxplot, width = 8, height = 6)

Venn plot for Overlap DEGs

get_DEG_List <- function(datset1=NC_PC_DEP,
                         datset2=NC_PT_DEP){
  # datset1=NC_PC_DEP
  # datset2=NC_PT_DEP
  
  diff_gene1 <- datset1 %>% filter(Enrichment != "Nonsignif")
  diff_gene2 <- datset2 %>% filter(Enrichment != "Nonsignif")
  res <- list(diff1=diff_gene1$GeneID,
                         diff2=diff_gene2$GeneID)
  return(res)
}

diff_gene_list <- get_DEG_List(datset1=NC_PC_DEP, datset2=NC_PT_DEP)

# eulerr
require(eulerr)
diff_venn <- euler(diff_gene_list, shape = "ellipse")

# pdf(file = "Mass_DEPs_Venn.pdf", width = 5, height = 4)
plot(diff_venn, 
     fills = alpha(grp.col[2:3], 0.5),
     labels = c("NC vs PC", "NC vs PT"),
     quantities = TRUE,
     col = "black")
# dev.off()

quadrant plot

A quadrant plot shows the common or different Proteins between two comparisons.

1st quadrant: the Proteins up-regulate in both PC and PT
2nd quadrant: the Proteins up-regulate in PC but down-regulate in PT
3rd quadrant: the Proteins down-regulate in both PC and PT
4th quadrant: the Proteins down-regulate in PC but up-regulate in PT

quadrant_plot <- function(datset1=NC_PC_DEP, 
                          datset2=NC_PT_DEP,
                          thresholdFC=3,
                          group_name=subgrp,
                          axis_len=12){
  
  # datset1=NC_PC_DEP
  # datset2=NC_PT_DEP
  # thresholdFC=1
  # group_name=subgrp
  # axis_len=5
  
  overlap_gene <- union(datset1$GeneID, datset2$GeneID)
  dat_x <- datset1 %>% filter(GeneID%in%overlap_gene)
  dat_y <- datset2 %>% filter(GeneID%in%overlap_gene)
  
  # the Differential Genes 
  dat_x_signif <- subset(dat_x, Enrichment != "Nonsignif")
  dat_y_signif <- subset(dat_y, Enrichment != "Nonsignif")
  
  # enriched in NC and Disease in both  
  common_signif <- intersect(dat_x_signif$GeneID, dat_y_signif$GeneID)
  
  # enriched in NC and Disease in each
  dat_x_sig_only <- setdiff(dat_x_signif$GeneID, common_signif)
  dat_y_sig_only <- setdiff(dat_y_signif$GeneID, common_signif)
  non_signif <- setdiff(overlap_gene, c(common_signif, dat_x_sig_only, dat_y_sig_only))
  
  
  gene_type_name <- c(paste(group_name[2:3], collapse = "/"),
                      paste(group_name[2], "only"),
                      paste(group_name[3], "only"),
                      "Nonsignif")
  
  gene_type_df <- rbind(data.frame(GeneID=common_signif, group=gene_type_name[1]),
                        data.frame(GeneID=dat_x_sig_only, group=gene_type_name[2]),
                        data.frame(GeneID=dat_y_sig_only, group=gene_type_name[3]),
                        data.frame(GeneID=non_signif, group=gene_type_name[4]))
  mdat <- inner_join(dat_x %>% dplyr::select(GeneID, logFC) %>% dplyr::rename(xvalue=logFC),
                     dat_y %>% dplyr::select(GeneID, logFC) %>% dplyr::rename(yvalue=logFC),
                     by = "GeneID") %>%
    inner_join(gene_type_df, by="GeneID") %>%
    mutate(group=factor(group, levels = rev(gene_type_name)))
  
  print(table(mdat$group))
  
  common_signif_gene <- mdat %>% filter(GeneID%in%common_signif)
  
  common_signif_gene <- mdat %>% filter(GeneID%in%common_signif) %>%
    mutate(GeneID_v2=ifelse(abs(xvalue) > thresholdFC | abs(yvalue) > thresholdFC, GeneID, NA))
  
  common_signif_gene_v2 <- na.omit(common_signif_gene)
  print(table(common_signif_gene_v2$group))

  require(magrittr)
  # constants
  axis_begin  <- -axis_len
  axis_end    <- axis_len
  total_ticks <- 8
  
  # point to plot
  my_point <- data.frame(x=1, y=1)
  # chart junk data
  tick_frame <- data.frame(ticks = seq(axis_begin, axis_end, by=2), zero=0) %>%
    subset(ticks != 0)
  tick_frame <- data.frame(ticks = seq(axis_begin, axis_end, by=2), zero=0) %>%
    subset(ticks != 0)
  lab_frame <- data.frame(lab = seq(axis_begin, axis_end, 2), zero = 0) %>%
    subset(lab != 0)
  tick_sz <- (tail(lab_frame$lab, 1) -  lab_frame$lab[1]) / 128
  
  x_title <- paste(group_name[1], "vs", group_name[2])
  y_title <- paste(group_name[1], "vs", group_name[3])
  
  pl <- ggplot(mdat)+
    geom_segment(x = 0, xend = 0, 
                 y = lab_frame$lab[1], yend = tail(lab_frame$lab, 1),
                 size = 0.5) +
    geom_segment(y = 0, yend = 0, 
                 x = lab_frame$lab[1], xend = tail(lab_frame$lab, 1),
                 size = 0.5) +
    geom_segment(data = tick_frame, 
                 aes(x = ticks, xend = ticks, 
                     y = zero, yend = zero + tick_sz)) +
    geom_segment(data = tick_frame, 
                 aes(x = zero, xend = zero + tick_sz, 
                     y = ticks, yend = ticks)) + 
    geom_text(data=lab_frame, aes(x=lab, y=zero, label=lab),
              family = 'Times', vjust=1.5) +
    geom_text(data=lab_frame, aes(x=zero, y=lab, label=lab),
              family = 'Times', hjust=1.5) +
    annotate("text", x = axis_len-1, y = -.7, color = "black", size=3, 
             label = paste0("log2(", x_title, ")"))+
    annotate("text", x = -.7, y = axis_len-1, color = "black", size=3, angle=90, 
             label = paste0("log2(", y_title, ")"))+
    geom_point(aes(x = xvalue, y = yvalue, color=group), size = 1.5)+
    geom_text_repel(data = common_signif_gene,  
                    aes(x=xvalue, y=yvalue, label = GeneID_v2),
                    size = 5,
                    max.overlaps = getOption("ggrepel.max.overlaps", default = 80),
                    segment.linetype = 1,
                    segment.curvature = -1e-20,
                    box.padding = unit(0.35, "lines"),
                    point.padding = unit(0.3, "lines"),
                    arrow = arrow(length = unit(0.005, "npc")),
                    # color = "white",     # text color
                    # bg.color = "grey30", # shadow color
                    bg.r = 0.15)+
    scale_color_manual(values = c("#A6A6A6", "#7BBDE0", "#B67FD0", "#FDC361"))+
    guides(color=guide_legend(title = NULL, keywidth=.9, keyheight=.9, linetype=2))+
    theme_void()+
    theme(panel.grid = element_blank(),
          text = element_text(size = 8, color = "black", family="serif"),      
          legend.text=element_text(size=11, color = "black"),
          legend.position = c(.7, .2),
          legend.justification = c(0, 0),
          legend.background = element_rect(linetype=2, color = "black", fill="white"))
  
  return(pl)
}

quadrantpl <- quadrant_plot(datset1=NC_PC_DEP, 
                            datset2=NC_PT_DEP,
                            thresholdFC=3,
                            group_name=subgrp,
                            axis_len=8)
quadrantpl
ggsave("Mass_DEPs_quadrant.pdf", quadrantpl, width = 6, height = 6)

Heatmap Of DEGs

heatFun <- function(datset1=NC_PC_DEP, 
                    datset2=NC_PT_DEP,
                    thresholdFC=1,
                    ExprSet=ExprSet,
                    group_name=subgrp){
  
  # datset1=NC_PC_DEP
  # datset2=NC_PT_DEP
  # thresholdFC=1
  # ExprSet=ExprSet
  # group_name=subgrp
  
  
  diff_gene1 <- datset1 %>% filter(Enrichment != "Nonsignif") %>%
    filter(abs(logFC) > thresholdFC)
  diff_gene2 <- datset2 %>% filter(Enrichment != "Nonsignif") %>%
    filter(abs(logFC) > thresholdFC)
  
  union_gene <- Reduce(union, list(diff_gene1$GeneID, diff_gene2$GeneID))
  
  pheno <- pData(ExprSet) %>% data.frame() %>%
    rownames_to_column("SampleID") %>%
    filter(SubGroup%in%group_name) %>%
    mutate(SubGroup=factor(SubGroup, levels = group_name)) %>%
    arrange(SubGroup) %>%
    column_to_rownames("SampleID")
  
  edata <- exprs(ExprSet) %>% data.frame() %>%
    rownames_to_column("geneid") %>%
    filter(geneid%in%union_gene) %>%
    dplyr::select(c("geneid", rownames(pheno))) %>%
    column_to_rownames("geneid")
  
  # scale data: z-score
  scale_rows <- function (x) {
      m = apply(x, 1, mean, na.rm = T)
      s = apply(x, 1, sd, na.rm = T)
      return((x - m)/s)
  }  
  edata_scaled <- t(scale_rows(edata))
  require(circlize)
  col_fun <- colorRamp2(c(round(range(edata_scaled)[1]), 0, 
                          round(range(edata_scaled)[2])),
                        c("blue", "white", "red")) 
  # row split 
  dat_status <- table(pheno$SubGroup)
  dat_status_number <- as.numeric(dat_status)
  dat_status_name <- names(dat_status)
  row_split <- c()
  for (i in 1:length(dat_status_number)) {
    row_split <- c(row_split, rep(i, dat_status_number[i]))
  }
  require(ComplexHeatmap)
  pl <- Heatmap(
          edata_scaled, 
          #col = col_fun,
          cluster_rows = FALSE,
          row_order = rownames(pheno),
          show_column_names = FALSE,
          show_row_names = FALSE,
          row_names_gp = gpar(fontsize = 12),
          row_names_side = "right",
          row_dend_side = "left",
          column_title = NULL, 
          heatmap_legend_param = list(
            title = "Relative Abundance\nZscore",
            title_position = "topcenter",
            border = "black",
            legend_height = unit(10, "cm"),
            direction = "horizontal"),
         row_split = row_split,
        left_annotation = rowAnnotation(foo = anno_block(gp = gpar(fill = 2:4),
            labels = group_name, 
            labels_gp = gpar(col = "black", fontsize = 12))),         
         column_km = 3
    )
  return(pl)
}

heatFun(datset1=NC_PC_DEP, 
        datset2=NC_PT_DEP,
        thresholdFC=1,
        ExprSet=ExprSet,
        group_name=subgrp)

结果效果图类似如下截图

systemic information

sessionInfo()

R version 4.0.2 (2020-06-22)
Platform: x86_64-conda_cos6-linux-gnu (64-bit)
Running under: CentOS Linux 8 (Core)

Matrix products: default
BLAS/LAPACK: /disk/share/anaconda3/lib/libopenblasp-r0.3.10.so

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C               LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8    LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C             LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] stats4    parallel  stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] ggrepel_0.9.1.9999          SummarizedExperiment_1.20.0 GenomicRanges_1.42.0        GenomeInfoDb_1.26.4        
 [5] IRanges_2.24.1              S4Vectors_0.28.1            MatrixGenerics_1.2.1        matrixStats_0.58.0         
 [9] data.table_1.14.0           umap_0.2.7.0                vegan_2.5-6                 lattice_0.20-41            
[13] permute_0.9-5               Rtsne_0.15                  convert_1.64.0              marray_1.68.0              
[17] limma_3.46.0                Biobase_2.50.0              BiocGenerics_0.36.0         ggplot2_3.3.3              
[21] tibble_3.1.0                dplyr_1.0.5                

loaded via a namespace (and not attached):
 [1] nlme_3.1-150           bitops_1.0-6           tools_4.0.2            backports_1.2.1        utf8_1.2.1            
 [6] R6_2.5.0               DBI_1.1.1              mgcv_1.8-34            colorspace_2.0-0       withr_2.4.1           
[11] tidyselect_1.1.0       curl_4.3               compiler_4.0.2         DelayedArray_0.16.3    scales_1.1.1          
[16] askpass_1.1            digest_0.6.27          foreign_0.8-81         rmarkdown_2.7          rio_0.5.16            
[21] XVector_0.30.0         pkgconfig_2.0.3        htmltools_0.5.1.1      rlang_0.4.10           readxl_1.3.1          
[26] generics_0.1.0         jsonlite_1.7.2         zip_2.1.1              car_3.0-10             RCurl_1.98-1.3        
[31] magrittr_2.0.1         GenomeInfoDbData_1.2.4 Matrix_1.3-2           Rcpp_1.0.6             munsell_0.5.0         
[36] fansi_0.4.2            abind_1.4-5            reticulate_1.18        lifecycle_1.0.0        stringi_1.4.6         
[41] yaml_2.2.1             edgeR_3.32.1           carData_3.0-4          MASS_7.3-53.1          zlibbioc_1.36.0       
[46] grid_4.0.2             forcats_0.5.0          crayon_1.4.1           haven_2.3.1            splines_4.0.2         
[51] hms_1.0.0              locfit_1.5-9.4         knitr_1.31             pillar_1.6.0           ggpubr_0.4.0          
[56] ggsignif_0.6.0         glue_1.4.2             evaluate_0.14          vctrs_0.3.7            cellranger_1.1.0      
[61] gtable_0.3.0           openssl_1.4.3          purrr_0.3.4            tidyr_1.1.3            assertthat_0.2.1      
[66] xfun_0.20              openxlsx_4.2.3         broom_0.7.6            RSpectra_0.16-0        rstatix_0.7.0

Reference

Core functional nodes and sex-specific pathways in human ischaemic and dilated cardiomyopathy