一键完成单因素方差分析及可视化的R脚本
2021-04-14 本文已影响0人
邱俊辉
library(optparse)
library(tidyverse)
library(agricolae)
library(reshape2)
library(ggplot2)
library(ggpubr)
# Shared ggplot2 theme added to every figure drawn by this script:
# grid lines are blanked, axis titles/text are bold, and the legend is hidden.
mytheme <- theme(
  # Hide the legend.
  legend.position = "none",
  legend.text = element_text(size = 15, face = "bold"),
  # Blank out both the major and the minor panel grid.
  panel.grid.major = element_blank(),
  panel.grid.minor = element_blank(),
  plot.title = element_text(vjust = -8.5, hjust = 0.1),
  axis.title.x = element_text(size = 24, face = "bold", colour = "black"),
  axis.title.y = element_text(size = 20, face = "bold", colour = "black"),
  # Generic axis text setting; the per-axis entries below set size 14.
  axis.text = element_text(size = 20, face = "bold"),
  axis.text.x = element_text(colour = "black", size = 14),
  axis.text.y = element_text(colour = "black", size = 14)
)
# Command-line interface. Every option is declared as a character option
# with default FALSE; the parsed values end up in `opt`.
option_list <- list(
  make_option(
    c("-f", "--file"),
    type = "character", default = FALSE,
    help = "The input file"
  ),
  make_option(
    c("-d", "--depvar"),
    type = "character", default = FALSE,
    help = "The column name of the dependent variable"
  ),
  make_option(
    c("-i", "--indepvar"),
    type = "character", default = FALSE,
    help = "The column name of the independent variable"
  ),
  make_option(
    c("-t", "--type"),
    type = "character", default = FALSE,
    help = "Type of drawing: boxplot or barplot"
  ),
  make_option(
    c("-o", "--out"),
    type = "character", default = FALSE,
    help = "the out put file name"
  )
)
cli_parser <- OptionParser(
  option_list = option_list,
  usage = "This Script is use for Analysis of variance and plotting"
)
opt <- parse_args(cli_parser)
### Argument validation, data loading and assumption tests

# TRUE when an option was not supplied. The options are declared with
# default FALSE, so an omitted option may arrive here as FALSE or as the
# string "FALSE"; both (and NULL) are treated as "missing".
option_missing <- function(value) {
  is.null(value) || identical(as.character(value), "FALSE")
}

# Only the two supported figure types are accepted.
if (!isTRUE(opt$type %in% c("boxplot", "barplot"))) {
  stop("Please input the right type of drawing:boxplot or barplot!",
       call. = FALSE)
}
if (option_missing(opt$file)) {
  stop("Please provide the input file with -f/--file", call. = FALSE)
}

# The figure is written to "<out>.pdf".
out_name <- paste0(opt$out, ".pdf")

# Input layout: first column is the sample ID, second column is the group
# information, and the remaining columns are data columns.
df <- read.table(opt$file, sep = "\t", header = TRUE)
colname_list <- colnames(df)

# Both column names must exist in the table; list the available names in
# the error so the user can correct the call.
if (!isTRUE(opt$depvar %in% colname_list)) {
  stop(
    "Please input the correct column name of the dependent variable. ",
    "Available columns: ", paste(colname_list, collapse = ", "),
    call. = FALSE
  )
}
if (!isTRUE(opt$indepvar %in% colname_list)) {
  stop(
    "Please input the correct column name of the independent variable. ",
    "Available columns: ", paste(colname_list, collapse = ", "),
    call. = FALSE
  )
}
depvar_index <- match(opt$depvar, colname_list)
indepvar_index <- match(opt$indepvar, colname_list)

if (!is.numeric(df[[depvar_index]])) {
  stop("The dependent variable column must be numeric", call. = FALSE)
}

# Working table: `count` is the response, `group` the grouping factor.
# The group is coerced to a factor so that numeric group codes are treated
# as categories by aov() rather than as a continuous predictor.
ss <- data.frame(
  count = df[[depvar_index]],
  group = factor(df[[indepvar_index]])
)

# Shapiro-Wilk normality test on the response; keep the p-value.
normality <- shapiro.test(ss$count)
p1 <- normality$p.value

# Bartlett test of homogeneity of variances across groups; keep the p-value.
homo <- bartlett.test(count ~ group, data = ss)
p2 <- homo$p.value

# One-way ANOVA model, used later for the F test and LSD comparisons.
model <- aov(count ~ group, data = ss)
### Omnibus test selection and figure drawing
#
# Flow:
#   * If both assumption tests pass (p1 > 0.05 and p2 > 0.05) the omnibus
#     test is the one-way ANOVA F test; otherwise it is Kruskal-Wallis.
#   * barplot: group means with mean +/- SD error bars; LSD letters are
#     drawn only when the ANOVA is significant.
#   * boxplot: boxes plus jittered points; LSD letters when the ANOVA is
#     significant, ggpubr pairwise comparisons when Kruskal-Wallis is
#     significant.
#   * The title always reports the two assumption-test p-values; the omnibus
#     p-value is appended when the figure carries no significance marks.
#
# Fixes relative to the previous version: the bar plot no longer refers to
# an undefined `data_box`; pairwise comparisons use the real grouping column
# instead of `df$group`; the Kruskal-Wallis p-value is no longer labelled
# "aov"; a p-value of exactly 0.05 no longer skips plotting; the deprecated
# summarise_() is gone; the axis labels now name the plotted columns.

# Join the assumption-test p-values (rounded to 3 digits) and, optionally,
# the omnibus test name and p-value into the ":"-separated plot title.
build_title <- function(p_norm, p_homo, test_name = NULL, test_p = NULL) {
  fields <- c(
    "Normality test", round(p_norm, 3),
    "Homogeneity of variance", round(p_homo, 3)
  )
  if (!is.null(test_name)) {
    fields <- c(fields, test_name, round(test_p, 3))
  }
  paste(fields, collapse = ":")
}

# Per-group mean and SD of `values`, plus the mean -/+ SD error-bar limits.
group_mean_sd <- function(values, groups) {
  groups <- as.factor(groups)
  means <- tapply(values, groups, mean, na.rm = TRUE)
  sds <- tapply(values, groups, sd, na.rm = TRUE)
  data.frame(
    group = factor(names(means), levels = names(means)),
    mean = as.vector(means),
    SD = as.vector(sds),
    ymin = as.vector(means - sds),
    ymax = as.vector(means + sds)
  )
}

# Tilt the x tick labels when there are more than three groups.
rotate_crowded_labels <- function(fig, n_groups) {
  if (n_groups > 3) {
    fig <- fig +
      theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1))
  }
  fig
}

name_i <- opt$depvar
plot_groups <- as.factor(ss$group)
n_groups <- nlevels(plot_groups)

# isTRUE() sends NA p-values down the non-parametric route instead of
# failing inside if().
meets_assumptions <- isTRUE(p1 > 0.05 && p2 > 0.05)
if (meets_assumptions) {
  test_name <- "aov"
  test_p <- summary(model)[[1]][["Pr(>F)"]][1]
} else {
  test_name <- "kruskal.test"
  test_p <- kruskal.test(ss$count, plot_groups)$p.value
}
significant <- isTRUE(test_p < 0.05)

# LSD multiple comparison (no p-value adjustment); only needed when the
# ANOVA found a difference. `$groups` holds the letter for each group,
# indexed by group name through its row names.
lsd_letters <- NULL
if (meets_assumptions && significant) {
  lsd_letters <- LSD.test(model, "group", p.adj = "none")$groups
}

# The omnibus p-value is left out of the title when the figure itself shows
# significance marks (LSD letters, or ggpubr labels on the box plot).
self_annotated <- significant && (meets_assumptions || opt$type == "boxplot")
if (self_annotated) {
  plot_title <- build_title(p1, p2)
} else {
  plot_title <- build_title(p1, p2, test_name, test_p)
}

if (opt$type == "barplot") {
  bar_data <- group_mean_sd(ss$count, plot_groups)
  if (!is.null(lsd_letters)) {
    bar_data$groups <- lsd_letters[as.character(bar_data$group), "groups"]
  }
  # Leave 20% head-room above the tallest error bar (not just the tallest
  # mean) so bars and letters are not cut off by the axis limit.
  y_top <- max(c(bar_data$mean, bar_data$ymax), na.rm = TRUE) * 1.2

  p <- ggplot(bar_data, aes(x = group, y = mean, colour = group)) +
    geom_bar(aes(fill = group), stat = "identity", width = 0.4,
             position = "dodge")
  if (!is.null(lsd_letters)) {
    p <- p +
      geom_text(aes(label = groups, y = ymax), vjust = -0.3, size = 6)
  }
  p <- p +
    geom_errorbar(aes(ymin = ymin, ymax = ymax), colour = "black",
                  width = 0.1, size = 1) +
    scale_y_continuous(expand = c(0, 0), limits = c(0, y_top)) +
    labs(x = opt$indepvar, y = name_i, title = plot_title) +
    mytheme
} else if (opt$type == "boxplot") {
  box_data <- data.frame(group = plot_groups, dd = ss$count)

  p <- ggplot(box_data, aes(x = group, y = dd, color = group)) +
    geom_boxplot(alpha = 1, outlier.size = 0, size = 0.7, width = 0.5,
                 fill = "transparent") +
    labs(x = opt$indepvar, y = name_i, title = plot_title)

  if (!is.null(lsd_letters)) {
    # One letter per group, placed 5% of the overall data range above that
    # group's maximum.
    group_top <- tapply(box_data$dd, box_data$group, max, na.rm = TRUE)
    pad <- diff(range(box_data$dd, na.rm = TRUE)) * 0.05
    letter_data <- data.frame(
      group = factor(names(group_top), levels = names(group_top)),
      y = as.vector(group_top) + pad,
      stat = lsd_letters[names(group_top), "groups"]
    )
    p <- p +
      geom_text(data = letter_data,
                aes(x = group, y = y, color = group, label = stat))
  }

  p <- p +
    geom_jitter(position = position_jitter(0.17), size = 1, alpha = 0.7) +
    theme(legend.position = "none")

  if (!meets_assumptions && significant) {
    # Overall test label plus significance marks for every pair of groups.
    pairwise <- combn(levels(box_data$group), 2, simplify = FALSE)
    p <- p +
      stat_compare_means() +
      stat_compare_means(comparisons = pairwise, label = "p.signif",
                         hide.ns = FALSE)
  }
  p <- p + mytheme
}

p <- rotate_crowded_labels(p, n_groups)
ggsave(out_name, p, width = 8.3, height = 5.8)
脚本有五个参数
-f:输入的数据,第一列是样本名,第二列是自变量也就是分组信息,第三列至以后就是因变量,就是分组效应
-d:因变量的列名
-i:自变量的列名
-t:选择可视化的类型,箱线图或者条形图
-o:输出文件名称的前缀
使用示例:
Rscript aov.R -f input2.txt -d response -i trt -t boxplot -o 123
image.png