Pan permutation
2021-09-08 本文已影响0人
胡童远
rarefaction对PAV数据进行permutation
rarefaction: Rarefaction curves for a pan-genome
heaps: Heaps law estimate
library(micropan)
library(ggplot2)
library(tidyr)
输入
data(xmpl.panmat)
# gene clsuter/family - genome pav table
df = xmpl.panmat
nrow(df) # 5 基因组
ncol(df) # 642 基因cluster

排列数
factorial(5) * choose(5, 5)
heaps law model
# open or closed: If alpha < 1 it indicates an open pan-genome
heaps(df, n.perm = 120)

排列和计算pan
# permutation
rare <- rarefaction(df, n.perm = 120)

整理数据
# rare[1:5, 1:5] # 检查
rare = data.frame(rare)
rare = rare[-1,-1]
rownames(rare) = 1:nrow(rare)
rare = t(rare)
rare 数据分布
table(rare[,1])
table(rare[,2])
table(rare[,3])
table(rare[,4])
table(rare[,5])
len_uni = c()
for(i in 1:ncol(rare))
{
len_uni = c(len_uni, length(unique(rare[, i])))
}
plot(len_uni)


箱型图
## ggplot
library(stringr)
data = data.frame(rare)
data$id = rownames(data)
data = melt(data, id='id')
gid = c()
for(i in data$variable)
{
gid = c(gid, unlist(strsplit(i, "X"))[2])
}
data$GID = gid
ggplot(data, aes(x = GID, y = value)) +
geom_boxplot(width = 0.5, color = "deepskyblue3",
size = 1) +
geom_point(size = 2, color = "deepskyblue3",
position = position_dodge(0.2)) +
theme_classic() +
labs(x="", y="Number of Gene Families") +
theme(axis.title = element_text(size = 25),
axis.text.y = element_text(size = 18),
axis.text.x = element_text(size = 20),
axis.line = element_line(size = 1),
axis.ticks = element_line(size = 1))

拟合曲线 -案例
PEPPAN pangenome数据整理
setwd("xudong/peppan/")
library(micropan)
library(ggplot2)
library(tidyr)
df = read.table("peppan.PEPPAN.gene_content.Rtab", head=T, sep="\t", row.names=1)
# gene clsuter/family - genome pav table
df = data.frame(t(df))
# permutation number 组合数
factorial(74) * choose(74, 74)
# open or closed: If alpha < 1 it indicates an open pan-genome
heaps(df, n.perm = 1000)
# permutation
rare <- rarefaction(df, n.perm = 10000)
# 整理数据
# rare[1:5, 1:5] # 检查
rare = data.frame(rare)
rare = rare[-1,-1]
rownames(rare) = 1:nrow(rare)
rare = t(rare)
# rare unique 数据分布
len_uni = c()
for(i in 1:ncol(rare))
{
len_uni = c(len_uni, length(unique(rare[, i])))
}
plot(len_uni)
nls拟合曲线,利用BPGA的pan curve估计值
# nls 拟合曲线 a*x^b
data$ID = as.numeric(data$GID)
f = function(x, a, b, c){a*x^b+c}
result = nls(data$value ~ f(data$ID, a, b, c),
data = data,
start = list(a = 1824, b = 0.05, c = 1))
summary(result)

绘制曲线 -案例
# 箱图 + 曲线
a = 12.78; b = 0.8383; c = 1854
fun <- function(x){a*x^b+c}
add = data.frame(GID=factor(1:74), value=fun(1:74))
res =
ggplot(NULL, aes(x, y)) +
geom_boxplot(data, mapping = aes(x = GID, y = value),
width = 0.5, color = "deepskyblue3",
size = 0.5, outlier.size = 0.2,
outlier.color = NA,
outlier.shape = NA) +
theme_classic() +
labs(x="Number of Genomes",
y="Number of Gene Families",
title = "y=12.78x^0.8383+1854, heap's alpha < 1") +
theme(axis.title = element_text(size = 25),
axis.text.y = element_text(size = 18),
axis.text.x = element_text(size = 8, angle=60, hjust=1),
axis.line = element_line(size = 1),
axis.ticks = element_line(size = 1)) +
scale_y_continuous(limits = c(1800, 2400),
breaks = c(1800,1900,2000,2100,2200,2300,2400)) +
geom_point(add, mapping = aes(x=GID, y=value),
size = 2, color = "red")
ggsave(res, file="rarefaction.pdf", width = 10)
ggsave(res, file="rarefaction.png", width = 10)

置信区间
## 置信区间
library(Rmisc)
upper = c()
lower = c()
for(i in 1:74)
{
tmp = data[data$ID==i,]
tmp_data = tmp$value
tmp_ci = CI(tmp_data, ci=0.95)
upper = c(upper, tmp_ci[1])
lower = c(lower, tmp_ci[3])
print(i)
}
add$upper = upper
add$lower = lower
# 绘图
input = melt(add, id='GID')
ggplot(input, mapping = aes(x=GID, y=value, color = variable)) +
theme_classic() +
labs(x="Number of Genomes",
y="Number of Gene Families",
title = "y=12.78x^0.8383+1854, heap's alpha < 1") +
theme(axis.title = element_text(size = 25),
axis.text.y = element_text(size = 18),
axis.text.x = element_text(size = 8, angle=60, hjust=1),
axis.line = element_line(size = 1),
axis.ticks = element_line(size = 1)) +
scale_y_continuous(limits = c(1800, 2400),
breaks = c(1800,1900,2000,2100,2200,2300,2400)) +
geom_line(aes(group = variable))

perm = 1的随机监督曲线
####################### NO permutation
## peppan curve data #############################################
#######################
setwd("xudong/peppan/")
library(micropan)
library(ggplot2)
library(tidyr)
tb = read.table("peppan_add.txt", head=T, sep="\t", row.names=1)

group = data.frame(id = rownames(tb), group = tb$Source)
tb = tb[, -1]
# gene clsuter/family - genome pav table
#####################
## 自编 pangenome 计数
#####################
pan = c()
gene = ncol(tb)
for(i in 1:nrow(tb))
{
tmp = tb[1:i,]
num = apply(tmp, 2, FUN=sum)
count = 0
for(j in 1:length(num)){if(num[j]==0){count=count+1}}
pan = c(pan, gene - count)
}
group$pan = pan
# permutation
# tb2 <- rarefaction(tb, n.perm = 1)
# 方法类似,但是需要自己整理数据
## ggplot
group$group = factor(group$group, levels = unique(group$group))
group$id = factor(group$id, levels = group$id)
group$x = factor(rownames(group), levels = rownames(group))
col_list = read.table("C:/Users/hutongyuan/Desktop/group_color.list",sep="\t", check.names=F, na.string="", stringsAsFactors=F, quote="", comment.char="")
colors = col_list$V1[1:7]
names(colors) <- unique(group$group)
p =
ggplot(group, aes(x = id, y=pan, color = group)) +
geom_point() +
geom_line(aes(group = group)) +
theme_classic() +
labs(x="Number of Genomes",
y="Number of Gene Families",
color = "Source") +
theme(axis.title = element_text(size = 25),
axis.text.y = element_text(size = 18),
axis.text.x = element_text(size = 8, angle=60, hjust=1),
axis.line = element_line(size = 1),
axis.ticks = element_line(size = 1),
legend.title=element_text(face="bold")) +
scale_y_continuous() +
scale_color_manual(values = colors) +
scale_y_continuous(limits = c(1800, 2400),
breaks = c(1800,1900,2000,2100,2200,2300,2400))
ggsave(p, file="rarefaction_p1.pdf", width = 10)
ggsave(p, file="rarefaction_p1.png", width = 10)

rarecurve绘制reads-otus稀释曲线
rarefy: Rarefaction Species Richness
library(vegan)
data(BCI)
raremax <- min(rowSums(BCI))
rarefy(BCI, raremax)
rarecurve(BCI, step = 20, sample = raremax, col = "blue", cex = 0.6)


rarecurve基于如此丰度表绘制rarefaction curve
文献案例
标题:Pyrosequencing enumerates and contrasts soil microbial diversity
杂志:ISME
时间:2007
文中提到DOTUR采用Michaelis–Menten (MM) equation拟合reads-OTUs稀释曲线
更多:
How can I fit rarefaction curves?
幂律分布的参数估计方法及R实现
R语言曲线拟合函数(绘图)
R: fitting power law curve to data