物种稀释曲线

2022-11-29 本文已影响0人 jjjscuedu

在微生物测序分析及相关的文章中，我们经常可以看到这样的图，称为稀释曲线（Rarefaction curves）。

稀释曲线是从样品中随机抽取一定测序量的数据（序列条数），统计它们所对应的OTUs种类（代表物种），并以抽取的测序数据量与对应的代表OTUs来构建曲线。横坐标代表随机抽取的序列数量，纵坐标代表观测到的OTUs种类数量，样本曲线的延伸终点的横坐标位置为对应样本的测序数量。稀释曲线可直接反映测序数据量的合理性，并间接反映样品中物种的丰富程度，当曲线趋向平坦时，说明测序数据量渐进合理，更多的数据量只会产生少量新OTUs（物种）；反之表明不饱和，增加数据量可以发现更多OTUs。

其实做过基因组工作者的话，kmer的前期思想和这个也是差不多的。

如果还记得我们前面学习过的alpha指数的话，这种稀释曲线实质上反映了Alpha多样性中的物种丰富度指数（Richness）信息。当然，现在我们还可以看到以Shannon指数、Chao1指数等绘制的稀释曲线。无论这些稀释曲线代表了物种丰富度（累计数量）、Shannon指数、Chao1指数还是其它，总之均代表了Alpha多样性指数的信息，因此它们均可以统称为Alpha多样性曲线。

下面，我们就来测试稀释曲线怎么画。测试数据还是我们上一个帖子的测试数据。

library(vegan) #用于计算 Shannon 熵指数、Simpson 指数、Chao1 指数、ACE 指数等，同时用于抽样

library(picante) #用于计算 PD_whole_tree，若不计算它就无需加载。

library(ggplot2) #用于 ggplot2 作图

library(doBy) #用于分组统计

library(ggalt) #用于绘制拟合曲线

#读取 OTU 丰度表

otu <- read.delim("otu_table.tsv", row.names = 1, sep = '\t', stringsAsFactors = FALSE, check.names = FALSE)

otu <- t(otu)

第一种方法：vegan包rarecurve()可以直接绘制稀释曲线

rarecurve(otu, step = 1000,col=rainbow(12))

第二种方法：自己来取样实现（复杂，好处是可以对最终的图进行细调）

#计算多种 Alpha 多样性指数，结果返回至向量，和前面的alpha指数的帖子类似，只不过只返回一个指数而已

alpha_index <- function(x, method = 'richness', tree = NULL, base = exp(1)) {

if (method == 'richness') result <- rowSums(x > 0) #丰富度指数

else if (method == 'chao1') result <- estimateR(x)[2, ] #Chao1 指数

else if (method == 'ace') result <- estimateR(x)[4, ] #ACE 指数

else if (method == 'shannon') result <- diversity(x, index = 'shannon', base = base) #Shannon 指数

else if (method == 'simpson') result <- diversity(x, index = 'simpson') #Gini-Simpson 指数

else if (method == 'pielou') result <- diversity(x, index = 'shannon', base = base) / log(estimateR(x)[1, ], base) #Pielou 均匀度

else if (method == 'gc') result <- 1 - rowSums(x == 1) / rowSums(x) #goods_coverage

else if (method == 'pd' & !is.null(tree)) { #PD_whole_tree

pd <- pd(x, tree, include.root = FALSE)

result <- pd[ ,1]

names(result) <- rownames(pd)

}

result

}

#运用R语言vegan包的rrarefy函数将OTU丰度矩阵稀释一次使每个样本中的序列数量一致(细菌20000真菌500)以减少测序序列变化引起的误差。

#根据抽样步长（step），统计每个稀释梯度下的 Alpha 多样性指数，结果返回至列表

alpha_curves <- function(x, step, method = 'richness', rare = NULL, tree = NULL, base = exp(1)) {

x_nrow <- nrow(x)

if (is.null(rare)) rare <- rowSums(x) else rare <- rep(rare, x_nrow)

alpha_rare <- list()

for (i in 1:x_nrow) {

step_num <- seq(0, rare[i], step)

if (max(step_num) < rare[i]) step_num <- c(step_num, rare[i])

alpha_rare_i <- NULL

for (step_num_n in step_num) alpha_rare_i <- c(alpha_rare_i, alpha_index(x = rrarefy(x[i, ], step_num_n), method = method, tree = tree, base = base))

names(alpha_rare_i) <- step_num

alpha_rare <- c(alpha_rare, list(alpha_rare_i))

}

names(alpha_rare) <- rownames(x)

alpha_rare

}

其实，第二个函数调用了第一个函数，首先对丰度表进行稀释抽样，并使用alpha_index()计算各抽样深度下的Alpha多样性指数。其中，x为输入的物种丰度表；step为抽样步长，例如当设置step=1000时，则以1000为一个梯度，抽样1000、2000、3000……条序列，直到最大深度或者达到设定值；rare，设定一个值，抽样到此深度下停止，否则将默认抽样至各样本的最大深度；其余三个参数method、tree、base和alpha_index()一致。

##以下以物种丰富度指数为例绘制 Alpha 多样性曲线，以 2000 步长（step=2000）为例统计

richness_curves <- alpha_curves(otu, step = 2000, method = 'richness')

#获得 ggplot2 作图文件

plot_richness <- data.frame()

for (i in names(richness_curves)) {

richness_curves_i <- (richness_curves[[i]])

richness_curves_i <- data.frame(rare = names(richness_curves_i), alpha = richness_curves_i, sample = i, stringsAsFactors = FALSE)

plot_richness <- rbind(plot_richness, richness_curves_i)

}

rownames(plot_richness) <- NULL

plot_richness$rare <- as.numeric(plot_richness$rare)

plot_richness$alpha <- as.numeric(plot_richness$alpha)

ggplot(plot_richness, aes(rare, alpha, color = sample)) +

geom_line(size=2) +

labs(x = 'Number of sequences', y = 'Richness', color = NULL) +

theme(panel.grid = element_blank(), panel.background = element_rect(fill = 'transparent', color = 'black'), legend.key = element_rect(fill = 'transparent')) +

geom_vline(xintercept = min(rowSums(otu)), linetype = 2) +

scale_x_continuous(breaks = seq(0, 30000, 5000), labels = as.character(seq(0, 30000, 5000)))

物种稀释曲线

猜你喜欢

热点阅读