生物信息学

R语言:DESeq计算组间Fold Change

2020-08-17  本文已影响0人  胡童远

导读

利用R语言DESeq2函数包deseq函数分析两组间细菌丰度的差异,寻找组间abs(log2FoldChange)显著大于1的细菌。

一、下载安装

也有其他的下载安装方法,成功即使王道

BiocManager::install("DESeq2")
# C:\Users\win10\AppData\Local\Temp\RtmpoNelOf\downloaded_packages
install.packages("XML", type = "binary")
# C:\Users\win10\AppData\Local\Temp\RtmpoNelOf\downloaded_packages

# 指定路径的方法
# .libPaths()
# .libPaths("C:/Users/Administrator/Desktop/test")
# install.packages("zoo", lib="C:/software/Rpackages")
# library("zoo", lib.loc="C:/software/Rpackages")

二、数据格式

data = read.table(paste(project, "txt", sep="."), header=T, sep="\t", row.names=1)
head(data)

三、提取菌门、菌科、菌属、样品序号

# 提取菌门、菌科、菌属
new_name=c()
phylum=c()
for(i in 1:length(rownames(data)))
{
    new_name = c(new_name, paste(unlist(strsplit(rownames(data)[i], "__|;"))[c(10, 12)], collapse="_"))
    # 提取菌科菌属
    phylum = c(phylum, unlist(strsplit(rownames(data)[i], "__|;"))[4])
    # 提取菌门
}
color = data.frame(phylum, new_name)  # 菌门染色信息
rownames(data) = new_name  # 修改行名
write.csv(data.frame(rownames(data), data$taxonomy), file=paste(project, "mapping.csv", sep="_"), row.names=F)  # 制作菌名mapping文件
data = data[, -length(data[1,])]  # 剔除最后一列
head(data)
                             CON1 CON10 CON11 CON12 CON13 CON14 CON15 CON16
Enterococcaceae_Enterococcus    0     0     2     0     0    83     0     0
Actinomycetaceae_Actinomyces    0     0     0     0     0     0     0     0
Micrococcaceae_Rothia          33     0     0     0     0     0     0     6
Clostridiaceae_Clostridium    619  1748   137    52     5  3053    57   404
Clostridiaceae_SMB53           15    73     0     0     0   138    30     8
Lachnospiraceae_Lachnospira   170     0     6   219   283   147    92    25

# 获取样品序号
prefix=c()
for(i in 1:length(colnames(data)))
{
    prefix = c(prefix, unlist(strsplit(colnames(data)[i], "[0-9]+")))
    # 从样品名中用正则拆出数字,作为序号
}
prefix
  [1] "CON"   "CON"   "CON"   "CON"   "CON"   "CON"   "CON"   "CON"   "CON"  
 [10] "CON"   "CON"   "CON"   "CON"   "CON"   "CON"   "CON"   "CON"   "CON"  
 [19] "CON"   "CON"   "CON"   "CON"   "CON"   "CON"   "CON"   "CON"   "CON"  
 [28] "CON"   "CON"   "CON"   "CON"   "CON"   "CON"   "CON"   "CON"   "CON"  
 [37] "CON"   "CON"   "CON"   "CON"   "CON"   "CON"   "CON"   "CON"   "CON"  
 [46] "CON"   "CON"   "CON"   "CON"   "CON"   "CON"   "CON"   "NPM_F" "NPM_F"
 [55] "NPM_F" "NPM_F" "NPM_F" "NPM_F" "NPM_F" "NPM_F" "NPM_F" "NPM_F" "NPM_F"
 [64] "NPM_F" "NPM_F" "NPM_F" "NPM_F" "NPM_F" "NPM_F" "NPM_F" "NPM_F" "NPM_F"
 [73] "NPM_F" "NPM_F" "NPM_F" "NPM_F" "NPM_F" "NPM_F" "NPM_F" "NPM_F" "NPM_F"
 [82] "NPM_F" "NPM_F" "NPM_F" "NPM_F" "NPM_F" "NPM_F" "NPM_F" "NPM_F" "NPM_F"
 [91] "NPM_F" "NPM_F" "NPM_F" "NPM_F" "NPM_F" "NPM_F" "NPM_F" "NPM_F" "NPM_F"
[100] "NPM_F" "NPM_F" "NPM_F"

condition = factor(prefix, levels = unique(prefix))
group = data.frame(row.names = colnames(data), condition)
# 分组设因子
head(group)
      condition
CON1        CON
CON10       CON
CON11       CON
CON12       CON
CON13       CON
CON14       CON

四、丰度热图

# pheatmap
library("pheatmap")

colors = list(Group = c(Control="#1B9E77", NPM_F="#D95F02"))
# colors = list(Group = c(Control="#1B9E77", NPM="#D95F02"))

pheatmap(data, filename = paste(project, "abundance_heatmap.pdf", sep="_"), 
         scale="column", 
         cellwidth=20, 
         cellheight=20, 
         cluster_cols = F,
         annotation_col = group,
         annotation_colors = colors,
         annotation_names_col = F)

五、DESeq分析

new_data = data
for(i in 1:length(data[,1]))
{
    for(j in 1:length(data[1,]))
    {
        new_data[i, j] = data[i, j] + 1  # 每个数加1
    }
}

# deseq分析:
# 格式化
format = DESeqDataSetFromMatrix(new_data, group, design = ~condition) 

# deseq分析
analyze = DESeq(format)

# 提取结果
result = results(analyze, contrast = c("condition", unique(prefix)))  # 分组变量名、分组名
result = result[order(result$pvalue),]  # p值排序

# summary(result)
write.csv(result, file = paste(project, "deseq_result.csv", sep="_"))

# 提取差异细菌,这里我用的方法是倍差法,获取padj(p值经过多重校验校正后的值)小于0.05,表达倍数取以2为对数后大于1或者小于-1的差异表达基因
diff_taxon = subset(result, padj < 0.05 & abs(log2FoldChange) > 1)
write.csv(diff_taxon, file= paste(project, "diff_taxon_padj_0.05_fc_1.csv", sep="_"))

六、DESeq可视化

# 整理绘图文件
input = data.frame(name=rownames(diff_taxon), log2fc=diff_taxon$log2FoldChange)  # 提取log2FoldChange
input = merge(input, color, by.x="name", by.y="new_name", all.x=T)  # 添加门信息
input = input[order(input$log2fc, decreasing=F),]  # 排序
head(input)
                                name    log2fc         phylum
6       Enterococcaceae_Enterococcus -4.828971     Firmicutes
1       Actinomycetaceae_Actinomyces -3.046460 Actinobacteria
5     Enterobacteriaceae_Escherichia -2.648138 Proteobacteria
12    Streptococcaceae_Streptococcus -1.975614     Firmicutes
11             Micrococcaceae_Rothia -1.897897 Actinobacteria
7  Erysipelotrichaceae_[Eubacterium] -1.744487     Firmicutes


result = ggplot(input, aes(x=log2fc, y=name, fill = phylum)) +
  geom_point(pch = 21, size = 4, color = "black") +
  labs(x="log2(Fold Change)", y="Taxa", fill="Phylum") +
  scale_y_discrete(limits = factor(input$name)) +
  geom_vline(xintercept = 0, size = 1, linetype = "solid") +
  theme(legend.background = element_rect(color = "black", linetype = "solid", size = 1))

ggsave(result, filename = paste(project, "diff_taxon.pdf", sep = "_"))
ggsave(result, filename = paste(project, "diff_taxon.png", sep = "_"))

参考:https://cloud.tencent.com/developer/article/1378845

上一篇下一篇

猜你喜欢

热点阅读