R语言:DESeq计算组间Fold Change
2020-08-17 本文已影响0人
胡童远
导读
利用R语言DESeq2函数包deseq函数分析两组间细菌丰度的差异,寻找组间abs(log2FoldChange)显著大于1的细菌。
一、下载安装
也有其他的下载安装方法,成功即使王道
BiocManager::install("DESeq2")
# C:\Users\win10\AppData\Local\Temp\RtmpoNelOf\downloaded_packages
install.packages("XML", type = "binary")
# C:\Users\win10\AppData\Local\Temp\RtmpoNelOf\downloaded_packages
# 指定路径的方法
# .libPaths()
# .libPaths("C:/Users/Administrator/Desktop/test")
# install.packages("zoo", lib="C:/software/Rpackages")
# library("zoo", lib.loc="C:/software/Rpackages")
二、数据格式
data = read.table(paste(project, "txt", sep="."), header=T, sep="\t", row.names=1)
head(data)
三、提取菌门、菌科、菌属、样品序号
# 提取菌门、菌科、菌属
new_name=c()
phylum=c()
for(i in 1:length(rownames(data)))
{
new_name = c(new_name, paste(unlist(strsplit(rownames(data)[i], "__|;"))[c(10, 12)], collapse="_"))
# 提取菌科菌属
phylum = c(phylum, unlist(strsplit(rownames(data)[i], "__|;"))[4])
# 提取菌门
}
color = data.frame(phylum, new_name) # 菌门染色信息
rownames(data) = new_name # 修改行名
write.csv(data.frame(rownames(data), data$taxonomy), file=paste(project, "mapping.csv", sep="_"), row.names=F) # 制作菌名mapping文件
data = data[, -length(data[1,])] # 剔除最后一列
head(data)
CON1 CON10 CON11 CON12 CON13 CON14 CON15 CON16
Enterococcaceae_Enterococcus 0 0 2 0 0 83 0 0
Actinomycetaceae_Actinomyces 0 0 0 0 0 0 0 0
Micrococcaceae_Rothia 33 0 0 0 0 0 0 6
Clostridiaceae_Clostridium 619 1748 137 52 5 3053 57 404
Clostridiaceae_SMB53 15 73 0 0 0 138 30 8
Lachnospiraceae_Lachnospira 170 0 6 219 283 147 92 25
# 获取样品序号
prefix=c()
for(i in 1:length(colnames(data)))
{
prefix = c(prefix, unlist(strsplit(colnames(data)[i], "[0-9]+")))
# 从样品名中用正则拆出数字,作为序号
}
prefix
[1] "CON" "CON" "CON" "CON" "CON" "CON" "CON" "CON" "CON"
[10] "CON" "CON" "CON" "CON" "CON" "CON" "CON" "CON" "CON"
[19] "CON" "CON" "CON" "CON" "CON" "CON" "CON" "CON" "CON"
[28] "CON" "CON" "CON" "CON" "CON" "CON" "CON" "CON" "CON"
[37] "CON" "CON" "CON" "CON" "CON" "CON" "CON" "CON" "CON"
[46] "CON" "CON" "CON" "CON" "CON" "CON" "CON" "NPM_F" "NPM_F"
[55] "NPM_F" "NPM_F" "NPM_F" "NPM_F" "NPM_F" "NPM_F" "NPM_F" "NPM_F" "NPM_F"
[64] "NPM_F" "NPM_F" "NPM_F" "NPM_F" "NPM_F" "NPM_F" "NPM_F" "NPM_F" "NPM_F"
[73] "NPM_F" "NPM_F" "NPM_F" "NPM_F" "NPM_F" "NPM_F" "NPM_F" "NPM_F" "NPM_F"
[82] "NPM_F" "NPM_F" "NPM_F" "NPM_F" "NPM_F" "NPM_F" "NPM_F" "NPM_F" "NPM_F"
[91] "NPM_F" "NPM_F" "NPM_F" "NPM_F" "NPM_F" "NPM_F" "NPM_F" "NPM_F" "NPM_F"
[100] "NPM_F" "NPM_F" "NPM_F"
condition = factor(prefix, levels = unique(prefix))
group = data.frame(row.names = colnames(data), condition)
# 分组设因子
head(group)
condition
CON1 CON
CON10 CON
CON11 CON
CON12 CON
CON13 CON
CON14 CON
四、丰度热图
# pheatmap
library("pheatmap")
colors = list(Group = c(Control="#1B9E77", NPM_F="#D95F02"))
# colors = list(Group = c(Control="#1B9E77", NPM="#D95F02"))
pheatmap(data, filename = paste(project, "abundance_heatmap.pdf", sep="_"),
scale="column",
cellwidth=20,
cellheight=20,
cluster_cols = F,
annotation_col = group,
annotation_colors = colors,
annotation_names_col = F)
五、DESeq分析
new_data = data
for(i in 1:length(data[,1]))
{
for(j in 1:length(data[1,]))
{
new_data[i, j] = data[i, j] + 1 # 每个数加1
}
}
# deseq分析:
# 格式化
format = DESeqDataSetFromMatrix(new_data, group, design = ~condition)
# deseq分析
analyze = DESeq(format)
# 提取结果
result = results(analyze, contrast = c("condition", unique(prefix))) # 分组变量名、分组名
result = result[order(result$pvalue),] # p值排序
# summary(result)
write.csv(result, file = paste(project, "deseq_result.csv", sep="_"))
# 提取差异细菌,这里我用的方法是倍差法,获取padj(p值经过多重校验校正后的值)小于0.05,表达倍数取以2为对数后大于1或者小于-1的差异表达基因
diff_taxon = subset(result, padj < 0.05 & abs(log2FoldChange) > 1)
write.csv(diff_taxon, file= paste(project, "diff_taxon_padj_0.05_fc_1.csv", sep="_"))
六、DESeq可视化
# 整理绘图文件
input = data.frame(name=rownames(diff_taxon), log2fc=diff_taxon$log2FoldChange) # 提取log2FoldChange
input = merge(input, color, by.x="name", by.y="new_name", all.x=T) # 添加门信息
input = input[order(input$log2fc, decreasing=F),] # 排序
head(input)
name log2fc phylum
6 Enterococcaceae_Enterococcus -4.828971 Firmicutes
1 Actinomycetaceae_Actinomyces -3.046460 Actinobacteria
5 Enterobacteriaceae_Escherichia -2.648138 Proteobacteria
12 Streptococcaceae_Streptococcus -1.975614 Firmicutes
11 Micrococcaceae_Rothia -1.897897 Actinobacteria
7 Erysipelotrichaceae_[Eubacterium] -1.744487 Firmicutes
result = ggplot(input, aes(x=log2fc, y=name, fill = phylum)) +
geom_point(pch = 21, size = 4, color = "black") +
labs(x="log2(Fold Change)", y="Taxa", fill="Phylum") +
scale_y_discrete(limits = factor(input$name)) +
geom_vline(xintercept = 0, size = 1, linetype = "solid") +
theme(legend.background = element_rect(color = "black", linetype = "solid", size = 1))
ggsave(result, filename = paste(project, "diff_taxon.pdf", sep = "_"))
ggsave(result, filename = paste(project, "diff_taxon.png", sep = "_"))