人体菌群研究

Bracken merge samples

2022-07-24  本文已影响0人  胡童远

用了类似构造泛基因组表格的方法

1 kraken2bracken分析

kraken2数据处理

# 帮助文档
helpdoc(){
    cat <<EOF
Description:

    This is a help document
    - Plot circos

Usage:

    $0 -i <input file> -o <output file>

Option:

    -i    this is a input file/fold
    -o    this is a output file/fold
EOF
}

# 参数传递
while getopts ":i:o:" opt
do
    case $opt in
        i)
        infile=`echo $OPTARG`
        ;;
        o)
        outfile=`echo $OPTARG`
        ;;
        ?)
        echo "未知参数"
        exit 1;;
    esac
done

# 若无指定任何参数则输出帮助文档
if [ $# = 0 ]
then
    helpdoc
    exit 1
fi

# 核心代码
source /public/home/zzumgg03/huty/softwares/miniconda3/etc/profile.d/conda.sh
conda activate metawrap-env2
db_kraken2="/public/home/zzumgg03/huty/databases/kraken2_soil501/kraken_database"

for i in `ls split_conta/$infile/`; do
    kraken2 --db $db_kraken2 --threads 30 --report 04_bin_501/kraken_out/${i}_kraken.txt --use-names --report-zero-counts --paired split_conta/$infile/$i/${i}_1.fastq split_conta/$infile/$i/${i}_2.fastq
done

bracken数据处理

## bracken
mkdir kraken_bracken
mkdir kraken_bracken/kraken_quant
for i in `ls kraken`; do
    mv kraken/$i/* kraken_bracken/kraken_quant/
    echo -e "$i done..."
done

mkdir bracken_quant
mkdir bracken_report
nohup bash sc_kraken_bracken.sh &

sc_kraken_bracken.sh

# code
source /public/home/zzumgg03/huty/softwares/miniconda3/etc/profile.d/conda.sh
conda activate bracken

for i in `ls kraken_quant/`; do
    base=${i%_kraken.txt}
    bracken \
    -d /public/home/zzumgg03/huty/databases/kraken2_soil501/kraken_database \
    -i kraken_quant/$i \
    -t 10 \
    -o bracken_quant/${base}_bracken.txt \
    -w bracken_report/${base}_bracken.report
done

unclassified reads统计

for i in `ls kraken_quant/ | grep "_kraken.txt"`; do
    base=${i%_kraken.txt}
    num=`cat kraken_quant/$i | head -n 1 | awk '{print $2}'`
    echo -e "$base\t$num" >> kraken_unclassified.txt
done
nohup bash sc_extract_unclassified.sh &

用合并kraken结果表的方法已经不可以了,bracken对物种进行了选择,不再是全部物种。

total reads=unclassified reads + root/bacteria reads统计
bracken重新分配reads后会丢掉一些物种,导致bracken reads减少,total reads也减少。这里在kraken的数据中计算total reads保证其稳定不变。

for i in `ls kraken_quant/ | grep "_kraken.txt"`; do
    base=${i%_kraken.txt}
    num=`cat kraken_quant/$i | head -n 2 | awk '{SUM+=$2}END{print SUM}'`
    echo -e "$base\t$num" >> kraken_unclassified_root.txt
    echo -e "$i done..."
done

2 合并准备

## merge
mkdir bracken_merge
mkdir bracken_merge/bracken_tmp
for i in `ls bracken_quant`; do
    echo -e "$i" >> bracken_merge/sample.list
    cat bracken_quant/$i | sed "1d" | awk -F"\t" 'BEGIN{OFS="\t"}{print $1,$6}' \
    > bracken_merge/bracken_tmp/$i
    cat bracken_quant/$i | sed "1d" | awk -F"\t" 'BEGIN{OFS="\t"}{print $1}' \
    >> bracken_merge/species_raw.list
    echo -e "$i done..."
done

cd bracken_merge/
nohup cat species_raw.list | sort | uniq > species.list &

3 合并

(file) sample.list

DP8400029164BR_L01_61_bracken.txt
DP8400029164BR_L01_64_bracken.txt
DP8400029164BR_L01_68_bracken.txt
DP8400029166BR_L01_63_bracken.txt
DP8400029166BR_L01_65_bracken.txt

(file) species.list

Abiotrophia defectiva
Acanthamoeba polyphaga mimivirus
Acaryochloris marina
Acetilactobacillus jinshanensis
Acetoanaerobium sticklandii

(fold) bracken_tmp/

python脚本 sc_merge.py

#!/usr/bin/env python
import re,sys,os
import pandas as pd
import numpy as np
# 读取文件,去除换行符给新列表
# 读取需要判断PAV的gene list
with open("species.list", 'r') as list_genes:
    list_genes = list_genes.readlines()
    list_genes_enter = []
    for each in list_genes:
        list_genes_enter.append(each.strip())

# 读取每个基因组的gene list
with open("sample.list", 'r') as list_genomes:
    list_genomes = list_genomes.readlines()
    list_genomes_enter = []
    for each in list_genomes:
        list_genomes_enter.append(each.strip())

# 构造数据框
num_row = len(list_genes_enter)
num_col = len(list_genomes_enter)
num_total = num_row * num_col
df = pd.DataFrame(np.zeros(num_total).reshape((num_row, num_col)),
                  columns = list_genomes_enter,
                  index = list_genes_enter)

# 遍历所有基因集,遍历所有行名(基因)是否存在于各基因集(CGR2),重新给表格赋值
route="./bracken_tmp"
for each_genome in list_genomes_enter:
    target_file = "{}/{}".format(route, each_genome)
    # 读取基因集
    with open(target_file, 'r') as target:
        target = target.readlines()
        for each_target in target:
        # 遍历基因集,基因,数字
            each_target = each_target.strip()
            each_target_gene = re.split(r'\t', each_target)[0]
            each_target_num = re.split(r'\t', each_target)[1]
            # 判断行名基因是否在基因集,并给表格元素赋值
            # loc: 字符定位表格元素
            # iloc: 数字定位
            df.loc[each_target_gene, each_genome] = each_target_num
    print("\033[32m {} DONE!\033[0m".format(each_genome))

# 表格保存
df.to_csv('table_merge.txt', sep='\t', index = True)

运行

# zhengzhou super computer
source /public/home/zzumgg03/huty/softwares/miniconda3/etc/profile.d/conda.sh
conda activate py37
nohup python3 sc_merge.py &

4 分类注释

# taxonomy
conda activate py37

python3 ../script/kraken_anno_k.py \
kraken_quant/sample_1/DP8400029164BR_L01_61_kraken.report \
kraken_anno.txt

cat kraken_anno.txt | awk -F"\t" '{if($4=="S") print $6}' \
> kraken_anno_s.txt

kraken_anno_k.py 脚本

#!/usr/bin/env python3
import os, sys, re
ms, infile_name, outfile_name = sys.argv

with open(infile_name, 'r') as infile:
    with open(outfile_name, 'w') as o:
        Domain = ""
        Family = ""
        for line in infile:
            line = line.strip()
            lines = re.split(r'\t', line)
            lines[5] = re.split(r'^[ ]*', lines[5])[1]
            if lines[3] == "D":
                Domain = lines[5]
                o.write("\t".join(lines))
                o.write("\n")
                Kingdom = "NG"  ## Kingdom非必须,Domain必须,在这里赋空值
            elif lines[3] == "K":
                Kingdom = lines[5]
                lines[5] = Domain+";"+Kingdom
                o.write("\t".join(lines))
                o.write("\n")
            elif lines[3] == "P":
                Phylum = lines[5]
                lines[5] = Domain+";"+Kingdom+";"+Phylum
                o.write("\t".join(lines))
                o.write("\n")
            elif lines[3] == "C":
                Class = lines[5]
                lines[5] = Domain+";"+Kingdom+";"+Phylum+";"+Class
                o.write("\t".join(lines))
                o.write("\n")
            elif lines[3] == "O":
                Order = lines[5]
                lines[5] = Domain+";"+Kingdom+";"+Phylum+";"+Class+";"+Order
                o.write("\t".join(lines))
                o.write("\n")
            elif lines[3] == "F":
                Family = lines[5]
                lines[5] = Domain+";"+Kingdom+";"+Phylum+";"+Class+";"+Order+";"+Family
                o.write("\t".join(lines))
                o.write("\n")
            elif lines[3] == "G":
                Genus = lines[5]
                lines[5] = Domain+";"+Kingdom+";"+Phylum+";"+Class+";"+Order+";"+Family+";"+Genus
                o.write("\t".join(lines))
                o.write("\n")
            elif lines[3] == "S":
                Species = lines[5]
                lines[5] = Domain+";"+Kingdom+";"+Phylum+";"+Class+";"+Order+";"+Family+";"+Genus+";"+Species
                o.write("\t".join(lines))
                o.write("\n")

上一篇 下一篇

猜你喜欢

热点阅读