RNAseq数据分析生信绘图R作图

R可视化:时序连线点图

2020-12-07  本文已影响0人  生信学习者2

最近处理一批小鼠衰老数据,如何展示不同时间点的基因表达数据比较好玩,因此记录一下。更多知识分享请到 https://zouhua.top/

加载R包

library(dplyr)
library(tibble)
library(ggplot2)
library(data.table)
library(convert)
library(ggpubr)

age <- c("1", "3", "6", "9", "12", "15", "18", "21", "24", "27")
age.col <- c("#283891", "#EED3AC", "#C1272D", "#9DCEDC", 
             "#ED1C24", "#9DCEDC", "#F89C31", "#B6B6BA", "#6DC06A", "#2D6BB4")

导入数据

phen <- read.csv("phenotype.csv")
prof <- fread("TPM.tsv") %>%
  column_to_rownames("V1")
mouse2human <- read.table("gene_mouse2human.tsv", 
                          sep = "\t", header = T)
genelist <- read.table("gene_list.tsv", header = T)
#mouse2human %>% filter(HGNC_symbol%in%genelist$gene2)

处理数据

get_expr_Set <- function(x, y, gene_list=gene_list,
                         ncount=10, occurrence=0.2){
  # x <- phen
  # y <- prof
  # ncount=10
  # occurrence=0.2
  
  prf <- y[rowSums(y) > ncount, ]
  prf <- prf[, colSums(prf) > 0]
  sid <- intersect(x$SampleID, colnames(y))
  phe <- x %>% filter(SampleID%in%sid) %>% 
    column_to_rownames("SampleID_v2") 
  prf.cln <- prf %>% dplyr::select(as.character(phe$SampleID)) %>% 
    rownames_to_column("tmp") %>% 
    filter(apply(dplyr::select(., -one_of("tmp")), 1, function(x) {
            sum(x != 0)/length(x)}) > occurrence) %>%
    column_to_rownames("tmp")
  
  # determine the right order between profile and phenotype 
  for(i in 1:ncol(prf.cln)){ 
    if (!(colnames(prf.cln)[i] == phe$SampleID[i])) {
      stop(paste0(i, " Wrong"))
    }
  }
  
  # change ensemble id into HGNC symbol
  mmu_hsa_gene <- inner_join(
    prf.cln %>% rownames_to_column("geneid"), 
    mouse2human %>% dplyr::select(ensembl_id_mouse, HGNC_symbol), 
    by = c("geneid" = "ensembl_id_mouse")) %>%
    dplyr::select(-geneid)
  
  mmu_hsa_gene$median <- apply(mmu_hsa_gene[,-ncol(mmu_hsa_gene)], 1, median)
  mmu_hsa_gene <- with(mmu_hsa_gene, 
                       mmu_hsa_gene[order(HGNC_symbol, median, decreasing = T), ])
  mmu_hsa_gene.new <- mmu_hsa_gene[!duplicated(mmu_hsa_gene$HGNC_symbol), ] %>%
    dplyr::select(-median)
  rownames(mmu_hsa_gene.new) <- NULL
  mmu_hsa_gene.new <- mmu_hsa_gene.new  %>% column_to_rownames("HGNC_symbol")
  
  mmu_hsa_gene.new.cln <- mmu_hsa_gene.new %>% rownames_to_column("gene") %>%
    filter(gene%in%gene_list$gene2) %>%
    column_to_rownames("gene")
  exprs <- as.matrix(mmu_hsa_gene.new.cln)
  metadata <-  new("AnnotatedDataFrame", data=phe)
  experimentData <- new("MIAME",
        name="Mouse aging", lab="Lab",
        contact="hua zou@outlook.com",
        title="Mouse Aging Experiment",
        abstract="The gene ExpressionSet",
        url="www.zouhua.top",
        other=list(notes="Created from text files"))
  expressionSet <- new("ExpressionSet", exprs=exprs,
                       phenoData=metadata, 
                       experimentData=experimentData)
  
  return(expressionSet)
}
gene_expr_set <- get_expr_Set(phen, prof, gene_list=genelist)

获取画图数据

dat_expr <- exprs(gene_expr_set) %>% data.frame()
dat_expr.cln <- dat_expr[, colSums(dat_expr) > 0]
dat_phen <- data.frame(SampleID=gene_expr_set$SampleID, Age=gene_expr_set$Age)
mdat <- inner_join(dat_phen, 
                   dat_expr.cln %>% t() %>% data.frame() %>% rownames_to_column("SampleID"),
                   by = "SampleID") %>% 
        tidyr::gather(key="Gene", value="TMP_Value", -c("SampleID", "Age")) %>%
        mutate(Age=factor(Age, levels = age))

画图

pl <- ggplot(mdat, aes(x=Age, y=TMP_Value))+
  geom_dotplot(aes(fill=Age), binaxis='y', stackdir='center', dotsize=0.9,
               bins = 30)+ 
  # stat_summary(fun.data="mean_sdl", fun.args = list(mult=1), 
  #                geom="crossbar", width=0.5)+
  stat_summary(fun.data=mean_sdl, fun.args = list(mult=1), 
        geom="errorbar", color="red", width=0.2)+
  stat_summary(fun.y=mean, geom="point", color="black", fill="white", size=2)+
  stat_summary(fun.y=mean, geom="line", group=1, linetype=1, size=0.7)+
  stat_compare_means(comparisons = list(c("1", "3")),
                     method = "wilcox.test")+
  scale_fill_manual(values = age.col)+
  scale_x_discrete(breaks=age,
        labels=paste0(age, "\nMonth"))+
  labs(x="", y="Gene expression value (TPM)")+
  facet_wrap(facets = "Gene", scales = "free")+
  theme_bw()+ 
  theme(axis.ticks.length = unit(0.2, "cm"),
          axis.title = element_text(face = "bold", size = 12),
          axis.text.x = element_text(face = "bold", size = 10),
          axis.text.y = element_text(size = 10, face = "bold"),
          strip.text = element_text(color = 'red', face = 'bold', size = rel(1.5)),
          strip.background = element_rect(colour = 'black', size = rel(2)))
pl

保存图形

ggsave("test.pdf", width = 14, height = 8, dpi = 300)
#ggsave("test.png", width = 14, height = 8, dpi = 300)

参考

参考文章如引起任何侵权问题,可以与我联系,谢谢。

上一篇下一篇

猜你喜欢

热点阅读