Bracken merge samples
2022-07-24 本文已影响0人
胡童远
用了类似构造泛基因组表格的方法
1 kraken2bracken分析
kraken2数据处理
# 帮助文档
helpdoc(){
cat <<EOF
Description:
This is a help document
- Plot circos
Usage:
$0 -i <input file> -o <output file>
Option:
-i this is a input file/fold
-o this is a output file/fold
EOF
}
# 参数传递
while getopts ":i:o:" opt
do
case $opt in
i)
infile=`echo $OPTARG`
;;
o)
outfile=`echo $OPTARG`
;;
?)
echo "未知参数"
exit 1;;
esac
done
# 若无指定任何参数则输出帮助文档
if [ $# = 0 ]
then
helpdoc
exit 1
fi
# 核心代码
source /public/home/zzumgg03/huty/softwares/miniconda3/etc/profile.d/conda.sh
conda activate metawrap-env2
db_kraken2="/public/home/zzumgg03/huty/databases/kraken2_soil501/kraken_database"
for i in `ls split_conta/$infile/`; do
kraken2 --db $db_kraken2 --threads 30 --report 04_bin_501/kraken_out/${i}_kraken.txt --use-names --report-zero-counts --paired split_conta/$infile/$i/${i}_1.fastq split_conta/$infile/$i/${i}_2.fastq
done
bracken数据处理
## bracken
mkdir kraken_bracken
mkdir kraken_bracken/kraken_quant
for i in `ls kraken`; do
mv kraken/$i/* kraken_bracken/kraken_quant/
echo -e "$i done..."
done
mkdir bracken_quant
mkdir bracken_report
nohup bash sc_kraken_bracken.sh &
sc_kraken_bracken.sh
# code
source /public/home/zzumgg03/huty/softwares/miniconda3/etc/profile.d/conda.sh
conda activate bracken
for i in `ls kraken_quant/`; do
base=${i%_kraken.txt}
bracken \
-d /public/home/zzumgg03/huty/databases/kraken2_soil501/kraken_database \
-i kraken_quant/$i \
-t 10 \
-o bracken_quant/${base}_bracken.txt \
-w bracken_report/${base}_bracken.report
done
unclassified reads统计
for i in `ls kraken_quant/ | grep "_kraken.txt"`; do
base=${i%_kraken.txt}
num=`cat kraken_quant/$i | head -n 1 | awk '{print $2}'`
echo -e "$base\t$num" >> kraken_unclassified.txt
done
nohup bash sc_extract_unclassified.sh &
用合并kraken结果表的方法已经不可以了,bracken对物种进行了选择,不再是全部物种。
total reads=unclassified reads + root/bacteria reads统计
bracken重新分配reads后会丢掉一些物种,导致bracken reads减少,total reads也减少。这里在kraken的数据中计算total reads保证其稳定不变。
for i in `ls kraken_quant/ | grep "_kraken.txt"`; do
base=${i%_kraken.txt}
num=`cat kraken_quant/$i | head -n 2 | awk '{SUM+=$2}END{print SUM}'`
echo -e "$base\t$num" >> kraken_unclassified_root.txt
echo -e "$i done..."
done
2 合并准备
## merge
mkdir bracken_merge
mkdir bracken_merge/bracken_tmp
for i in `ls bracken_quant`; do
echo -e "$i" >> bracken_merge/sample.list
cat bracken_quant/$i | sed "1d" | awk -F"\t" 'BEGIN{OFS="\t"}{print $1,$6}' \
> bracken_merge/bracken_tmp/$i
cat bracken_quant/$i | sed "1d" | awk -F"\t" 'BEGIN{OFS="\t"}{print $1}' \
>> bracken_merge/species_raw.list
echo -e "$i done..."
done
cd bracken_merge/
nohup cat species_raw.list | sort | uniq > species.list &
3 合并
(file) sample.list
DP8400029164BR_L01_61_bracken.txt
DP8400029164BR_L01_64_bracken.txt
DP8400029164BR_L01_68_bracken.txt
DP8400029166BR_L01_63_bracken.txt
DP8400029166BR_L01_65_bracken.txt
(file) species.list
Abiotrophia defectiva
Acanthamoeba polyphaga mimivirus
Acaryochloris marina
Acetilactobacillus jinshanensis
Acetoanaerobium sticklandii
(fold) bracken_tmp/
python脚本 sc_merge.py
#!/usr/bin/env python
import re,sys,os
import pandas as pd
import numpy as np
# 读取文件,去除换行符给新列表
# 读取需要判断PAV的gene list
with open("species.list", 'r') as list_genes:
list_genes = list_genes.readlines()
list_genes_enter = []
for each in list_genes:
list_genes_enter.append(each.strip())
# 读取每个基因组的gene list
with open("sample.list", 'r') as list_genomes:
list_genomes = list_genomes.readlines()
list_genomes_enter = []
for each in list_genomes:
list_genomes_enter.append(each.strip())
# 构造数据框
num_row = len(list_genes_enter)
num_col = len(list_genomes_enter)
num_total = num_row * num_col
df = pd.DataFrame(np.zeros(num_total).reshape((num_row, num_col)),
columns = list_genomes_enter,
index = list_genes_enter)
# 遍历所有基因集,遍历所有行名(基因)是否存在于各基因集(CGR2),重新给表格赋值
route="./bracken_tmp"
for each_genome in list_genomes_enter:
target_file = "{}/{}".format(route, each_genome)
# 读取基因集
with open(target_file, 'r') as target:
target = target.readlines()
for each_target in target:
# 遍历基因集,基因,数字
each_target = each_target.strip()
each_target_gene = re.split(r'\t', each_target)[0]
each_target_num = re.split(r'\t', each_target)[1]
# 判断行名基因是否在基因集,并给表格元素赋值
# loc: 字符定位表格元素
# iloc: 数字定位
df.loc[each_target_gene, each_genome] = each_target_num
print("\033[32m {} DONE!\033[0m".format(each_genome))
# 表格保存
df.to_csv('table_merge.txt', sep='\t', index = True)
运行
# zhengzhou super computer
source /public/home/zzumgg03/huty/softwares/miniconda3/etc/profile.d/conda.sh
conda activate py37
nohup python3 sc_merge.py &
4 分类注释
# taxonomy
conda activate py37
python3 ../script/kraken_anno_k.py \
kraken_quant/sample_1/DP8400029164BR_L01_61_kraken.report \
kraken_anno.txt
cat kraken_anno.txt | awk -F"\t" '{if($4=="S") print $6}' \
> kraken_anno_s.txt
kraken_anno_k.py 脚本
#!/usr/bin/env python3
import os, sys, re
ms, infile_name, outfile_name = sys.argv
with open(infile_name, 'r') as infile:
with open(outfile_name, 'w') as o:
Domain = ""
Family = ""
for line in infile:
line = line.strip()
lines = re.split(r'\t', line)
lines[5] = re.split(r'^[ ]*', lines[5])[1]
if lines[3] == "D":
Domain = lines[5]
o.write("\t".join(lines))
o.write("\n")
Kingdom = "NG" ## Kingdom非必须,Domain必须,在这里赋空值
elif lines[3] == "K":
Kingdom = lines[5]
lines[5] = Domain+";"+Kingdom
o.write("\t".join(lines))
o.write("\n")
elif lines[3] == "P":
Phylum = lines[5]
lines[5] = Domain+";"+Kingdom+";"+Phylum
o.write("\t".join(lines))
o.write("\n")
elif lines[3] == "C":
Class = lines[5]
lines[5] = Domain+";"+Kingdom+";"+Phylum+";"+Class
o.write("\t".join(lines))
o.write("\n")
elif lines[3] == "O":
Order = lines[5]
lines[5] = Domain+";"+Kingdom+";"+Phylum+";"+Class+";"+Order
o.write("\t".join(lines))
o.write("\n")
elif lines[3] == "F":
Family = lines[5]
lines[5] = Domain+";"+Kingdom+";"+Phylum+";"+Class+";"+Order+";"+Family
o.write("\t".join(lines))
o.write("\n")
elif lines[3] == "G":
Genus = lines[5]
lines[5] = Domain+";"+Kingdom+";"+Phylum+";"+Class+";"+Order+";"+Family+";"+Genus
o.write("\t".join(lines))
o.write("\n")
elif lines[3] == "S":
Species = lines[5]
lines[5] = Domain+";"+Kingdom+";"+Phylum+";"+Class+";"+Order+";"+Family+";"+Genus+";"+Species
o.write("\t".join(lines))
o.write("\n")