生物信息可视化R语言在生物信息学的应用

一键完成单因素方差分析及可视化的R脚本

2021-04-14  本文已影响0人  邱俊辉
library(optparse)
library(tidyverse)
library(agricolae)
library(reshape2)
library(ggplot2)
library(ggpubr)
# Shared ggplot2 theme added to every plot produced by this script:
# no grid lines, bold black axis titles, black 14pt tick labels, no legend.
mytheme <- local({
  # Bold black text at the requested size (used for both axis titles).
  bold_black <- function(size) {
    element_text(size = size, face = "bold", colour = "black")
  }
  # Black 14pt text, applied to the x and y tick labels individually.
  black_ticks <- function() {
    element_text(colour = "black", size = 14)
  }

  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    plot.title = element_text(vjust = -8.5, hjust = 0.1),
    axis.title.y = bold_black(20),
    axis.title.x = bold_black(24),
    axis.text = element_text(size = 20, face = "bold"),
    axis.text.x = black_ticks(),
    axis.text.y = black_ticks(),
    legend.text = element_text(size = 15, face = "bold"),
    # "none" removes the legend from the plot
    legend.position = "none"
  )
})
# Command-line interface. All five options are declared as character options
# with `default = FALSE`; only the flags and the help text differ, so the
# list is generated from a small table of (flags, help) pairs.
option_list <- lapply(
  list(
    list(flags = c("-f", "--file"),
         help = "The input file"),
    list(flags = c("-d", "--depvar"),
         help = "The column name of the dependent variable"),
    list(flags = c("-i", "--indepvar"),
         help = "The column name of the independent variable"),
    list(flags = c("-t", "--type"),
         help = "Type of drawing: boxplot or barplot"),
    list(flags = c("-o", "--out"),
         help = "the out put file name")
  ),
  function(spec) {
    make_option(spec$flags, type = "character", default = FALSE,
                help = spec$help)
  }
)

# Parsed arguments; later code reads opt$file, opt$depvar, opt$indepvar,
# opt$type and opt$out.
opt <- parse_args(
  OptionParser(
    usage = "This Script is use for Analysis of variance and plotting",
    option_list = option_list
  )
)
### Argument validation
# Only "boxplot" and "barplot" are accepted as the drawing type; anything
# else is reported on stdout and then aborts the script.
if (!(opt$type %in% c("boxplot", "barplot"))) {
  bad_type_message <- "Please input the right type of drawing:boxplot or barplot!"
  print(bad_type_message)
  stop(bad_type_message)
}

# Output file name: the -o prefix with a ".pdf" extension.
out_name <- paste(opt$out, "pdf", sep = ".")

# Input layout (per the script's usage notes): tab-separated with a header
# row; column 1 is the sample ID, column 2 the grouping information and the
# remaining columns hold the data.
df <- read.table(opt$file, sep = "\t", header = TRUE)
colname_list <- colnames(df)

# Both requested columns must exist; otherwise list the available names on
# stdout and stop.
if (!(opt$depvar %in% colname_list)) {
  print("Please input the correct column name of the dependent variable")
  print(colname_list)
  stop("Please input the correct column name of the dependent variable")
}
if (!(opt$indepvar %in% colname_list)) {
  print("Please input the correct column name of the independent variable")
  print(colname_list)
  stop("Please input the correct column name of the independent variable")
}

depvar_index <- which(colname_list == opt$depvar)
indepvar_index <- which(colname_list == opt$indepvar)

# Working table used by every test below: `count` is the dependent variable,
# `group` the independent (grouping) variable.
ss <- df[depvar_index]
colnames(ss) <- c("count")
# Coerce to a factor so that numerically coded groups are still treated as
# categories by aov() instead of being fitted as a numeric covariate.
ss$group <- as.factor(df[, indepvar_index])

# Shapiro-Wilk normality test; keep the p-value.
# NOTE(review): this tests the pooled response rather than the model
# residuals - confirm that is the intended check.
normality <- shapiro.test(ss$count)
p1 <- normality$p.value

# Bartlett test of homogeneity of variances; keep the p-value.
homo <- bartlett.test(count ~ group, data = ss)
p2 <- homo$p.value

# One-way ANOVA model, reused later for the F test and LSD comparisons.
model <- aov(count ~ group, data = ss)
### Plotting ----
# Choose the omnibus test from the two assumption checks, then draw the
# requested chart and save it to `out_name`:
#   * p1 > 0.05 and p2 > 0.05 -> one-way ANOVA (`model`); LSD grouping
#     letters are drawn when the ANOVA is significant;
#   * otherwise               -> Kruskal-Wallis via ggpubr::compare_means().
# Uses `ss` (columns `count`, `group`), `model`, `p1`, `p2`, `opt`,
# `out_name` and `mytheme` defined earlier in the script.

# Plot title: the two assumption-test p-values rounded to 3 digits, followed
# by any extra fields (test name, test p-value), all joined with ":".
make_title <- function(p_norm, p_homo, ...) {
  paste("Normality test", round(p_norm, 3),
        "Homogeneity of variance", round(p_homo, 3), ..., sep = ":")
}

# Tilt the x tick labels by 45 degrees when there are more than three groups.
tilt_crowded_labels <- function(plot, groups) {
  if (length(unique(groups)) > 3) {
    plot <- plot +
      theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1))
  }
  plot
}

# Bar chart of the group means with mean +/- SD error bars.
# When `show_letters` is TRUE the LSD grouping letters (unadjusted p-values)
# are written above each error bar.
build_barplot <- function(ss, model, x_label, title, show_letters) {
  group_mean <- tapply(ss$count, ss$group, mean, na.rm = TRUE)
  group_sd <- tapply(ss$count, ss$group, sd, na.rm = TRUE)
  bars <- data.frame(
    group = names(group_mean),
    mean = as.vector(group_mean),
    SD = as.vector(group_sd),
    stringsAsFactors = FALSE
  )
  bars$ymin <- bars$mean - bars$SD
  bars$ymax <- bars$mean + bars$SD

  if (show_letters) {
    lsd <- LSD.test(model, "group", p.adj = "none")
    # lsd$groups has one row per group (row names) and a `groups` letter column
    bars$groups <- lsd$groups[bars$group, "groups"]
  }

  # The limits must enclose the whole error bar: values outside the scale
  # limits are dropped by ggplot2, which would remove the bar/letter.
  y_limits <- c(
    min(0, bars$ymin, na.rm = TRUE),
    max(0, bars$ymax * 1.2, na.rm = TRUE)
  )

  p <- ggplot(bars, aes(x = group, y = mean, colour = group)) +
    geom_bar(aes(colour = group, fill = group), stat = "identity",
             width = 0.4, position = "dodge")
  if (show_letters) {
    # size/vjust are constants, so they are set outside aes()
    p <- p + geom_text(aes(label = groups, y = ymax), vjust = -0.3, size = 6)
  }
  p <- p +
    geom_errorbar(aes(ymin = ymin, ymax = ymax), colour = "black",
                  width = 0.1, size = 1) +
    scale_y_continuous(expand = c(0, 0), limits = y_limits) +
    labs(x = x_label, y = "group", title = title) +
    mytheme

  tilt_crowded_labels(p, bars$group)
}

# Box plot of the raw observations with jittered points.
# `show_letters`  adds the LSD grouping letters above each box.
# `show_pairwise` adds the global test label plus pairwise significance
#                 brackets from ggpubr::stat_compare_means().
build_boxplot <- function(ss, model, x_label, title,
                          show_letters, show_pairwise) {
  box <- data.frame(group = ss$group, dd = ss$count)

  if (show_letters) {
    lsd <- LSD.test(model, "group", p.adj = "none")
    box$stat <- lsd$groups[as.character(box$group), "groups"]
    # Letter position: the group's maximum plus 5% of the overall data range
    group_max <- tapply(box$dd, box$group, max)
    box$y <- as.vector(group_max[as.character(box$group)]) +
      (max(box$dd) - min(box$dd)) * 0.05
  }

  p <- ggplot(box, aes(x = group, y = dd, color = group)) +
    geom_boxplot(alpha = 1, outlier.size = 0, size = 0.7, width = 0.5,
                 fill = "transparent") +
    labs(x = x_label, y = "group", title = title)
  if (show_letters) {
    p <- p + geom_text(data = box,
                       aes(x = group, y = y, color = group, label = stat))
  }
  p <- p +
    geom_jitter(position = position_jitter(0.17), size = 1, alpha = 0.7) +
    theme(legend.position = "none")
  if (show_pairwise) {
    # Every pair of group levels, as the list stat_compare_means() expects
    pairs <- combn(levels(as.factor(box$group)), 2, simplify = FALSE)
    p <- p +
      stat_compare_means() +
      stat_compare_means(comparisons = pairs, label = "p.signif",
                         hide.ns = FALSE)
  }
  p <- p + mytheme

  tilt_crowded_labels(p, box$group)
}

name_i <- opt$depvar

# ANOVA only when both assumption checks pass; every other case (including a
# p-value of exactly 0.05) falls through to the non-parametric test.
use_anova <- p1 > 0.05 && p2 > 0.05
if (use_anova) {
  test_label <- "aov"
  test_p <- summary(model)[[1]][["Pr(>F)"]][1]
} else {
  test_label <- "kruskal.test"
  krusk <- compare_means(count ~ group, data = ss, method = "kruskal.test")
  sumkrusk <- as.data.frame(krusk)
  # Third column of the compare_means() result, as used by the original
  # script (presumably the adjusted p-value - verify against ggpubr docs)
  test_p <- sumkrusk[[3]][1]
}
significant <- test_p < 0.05

if (opt$type == "barplot") {
  if (!use_anova) {
    bar_title <- make_title(p1, p2, "kruskal.test", test_p)
  } else if (significant) {
    bar_title <- make_title(p1, p2)
  } else {
    bar_title <- make_title(p1, p2, "aov", round(test_p, 3))
  }
  p <- build_barplot(
    ss, model,
    x_label = paste(name_i, "of all group", sep = "_"),
    title = bar_title,
    show_letters = use_anova && significant
  )
  ggsave(out_name, p, width = 8.3, height = 5.8)
} else if (opt$type == "boxplot") {
  if (significant) {
    box_title <- make_title(p1, p2)
    box_x_label <- if (use_anova) {
      paste(name_i, " group", sep = "_")
    } else {
      paste(name_i, "of all group", sep = "_")
    }
  } else {
    # Label the p-value with the test that actually produced it
    box_title <- make_title(p1, p2, test_label, round(test_p, 3))
    box_x_label <- paste(name_i, "box", sep = "_")
  }
  p <- build_boxplot(
    ss, model,
    x_label = box_x_label,
    title = box_title,
    show_letters = use_anova && significant,
    show_pairwise = !use_anova && significant
  )
  ggsave(out_name, p, width = 8.3, height = 5.8)
}
    

脚本有五个参数

-f:输入的数据,第一列是样本名,第二列是自变量也就是分组信息,第三列及以后就是因变量,就是分组效应

-d:因变量的列名

-i:自变量的列名

-t:选择可视化的类型,箱线图或者条形图

-o:输出文件名称的前缀

使用示例:

Rscript aov.R -f input2.txt -d response -i trt -t boxplot -o 123
image.png
上一篇下一篇

猜你喜欢

热点阅读