泛基因组分析-基本流程代码

2024-01-24 本文已影响0人花生学生信

以下是自己使用map-to-pan分析泛基因组的部分代码，仅供参考:
统计fasta序列的N50、N90以及最短、最长序列

##tj_N50.py 
import sys
from Bio import SeqIO
import os

def calculate_Nx(sequence_lengths, x):
    total_length = sum(sequence_lengths)
    target_length = total_length * x / 100
    sorted_lengths = sorted(sequence_lengths, reverse=True)
    cumulative_length = 0
    for length in sorted_lengths:
        cumulative_length += length
        if cumulative_length >= target_length:
            return length

def calculate_stats(fasta_file):
    stats = []
    for record in SeqIO.parse(fasta_file, "fasta"):
        sequence_length = len(record.seq)
        stats.append((record.id, sequence_length))
    stats.sort(key=lambda x: x[1])

    sequence_lengths = [length for _, length in stats]
    n50 = calculate_Nx(sequence_lengths, 50)
    n90 = calculate_Nx(sequence_lengths, 90)
    min_length = sequence_lengths[0]
    max_length = sequence_lengths[-1]
    return stats, n50, n90, min_length, max_length

fasta_files = sys.argv[1:]  # 通过命令行参数传入FASTA文件路径列表

print("文件名\t最短序列长度\t最长序列长度\tN50\tN90")
for fasta_file in fasta_files:
    sample_stats, n50, n90, min_length, max_length = calculate_stats(fasta_file)
    file_name = os.path.basename(fasta_file)
    print(f"{file_name}\t{min_length}\t{max_length}\t{n50}\t{n90}")

过滤前

现在需要筛选大于500bp的序列做后续分析:

##tiqu_500.py
import sys

def extract_long_sequences(fasta_file):
    long_sequences = []
    current_sequence = ""
    current_sequence_name = ""
    with open(fasta_file, "r") as file:
        for line in file:
            line = line.strip()
            if line.startswith(">"):
                if current_sequence and len(current_sequence) > 500:
                    long_sequences.append((current_sequence_name, current_sequence))
                current_sequence_name = line[1:]
                current_sequence = ""
            else:
                current_sequence += line
        if current_sequence and len(current_sequence) > 500:
            long_sequences.append((current_sequence_name, current_sequence))
    return long_sequences

fasta_file = sys.argv[1]  # 通过命令行参数传入FASTA文件路径
long_sequences = extract_long_sequences(fasta_file)

# 打印提取出的长序列
for sequence_name, sequence in long_sequences:
    print(f">{sequence_name}")
    print(sequence)

过滤后

再写个简单的批处理shell：

cat ../fqid|while read id
do
python tiqu_500.py ../contigs/$id.fa >01_500bp/$id.500.fa
python tj_N50.py 01_500bp/$id_500.fa >>N50_inf.txt
done

quast评估，嫌慢可以写成parallel并行

cat ../fqid|while read samples
do
#mkdir 02_quast/${samples}
ref=/public/home/fengting/work/zyyy/DK/jelly/ref_IR64/IR64.genome
th=4
refgff=/public/home/fengting/work/cax/NIP/new_add/gff/IR64.IGDBv1.Allset.gff
/public/home/fengting/demo/pan111/EUPAN/tools/quast-5.1.0rc1/quast.py \
 -t ${th} \
    --large 01_500bp/${samples}.500.fa \
    -r ${ref} \
    -g ${refgff} \
   -o ./02_quast/${samples} \
    --no-icarus \
    --no-snps \
    --min-identity 90.0
done

使用blastx将未比对上的核苷酸数据与unirice蛋白数据库比对

cat id|while read samples
do 
###为了并行处理
echo "blastx -query 01_500bp/${samples}.500.fa -out bl/${samples}.bl -db os_nr/unirice -outfmt 6 -evalue 1e-6 -num_threads 8" >>blxwo.sh
done

题外话
昨晚挂了一个blastn的任务，跑了一天，导致其它任务进行的极其缓慢，cpu占用也极其低，关掉这个任务就好了。

###去掉文件名的后缀
import os

# 获取当前文件夹路径
folder_path = os.getcwd()

# 遍历当前文件夹中的所有文件
for filename in os.listdir(folder_path):
    # 检查文件是否以.bl为后缀
    if filename.endswith(".bl"):
        # 去掉.bl后缀
        new_filename = filename[:-3]
        
        # 重命名文件
        os.rename(os.path.join(folder_path, filename), os.path.join(folder_path, new_filename))

提取blast结果

cat sxid|while read id
do
cut -f  1 ../bl/$id.bl |uniq|sort -V > blid/$id
perl ~/scripts/perl/per.pl blid/$id ../01_500bp/$id.500.fa >blfa/$id.fa
done

去冗余

###提取核苷酸序列
perl ~/scripts/perl/per.pl DL001 DL001-1.500.fa >DL001.fa
####cd-hit 去冗余，24-1-25更新去冗余命令

cdhit -i in.fasta -o out.fasta -d 30 -G 1 -M 16000 -T 20  -uL 0.05 -S 2 -U 2 -g 1 -aL 0.95 -aS 1


###euapn去冗余
##blastCluster
eupan rmRedundant blastCluster DL001.fa 1 ~/miniconda3/bin/

##cdhit
eupan rmRedundant cdhitCluster DL001.fa 2 ~/miniconda3/bin/

为了防止合并后有序列的名字是重复的，我们将每个文件的sample名字添加到序列后

import os

# 获取当前文件夹路径
folder_path = os.getcwd()

# 获取当前文件夹中以DL开头的文件列表
file_list = [file for file in os.listdir(folder_path) if file.startswith('DL')]

# 遍历文件列表
for file_name in file_list:
    # 构建原始文件路径和新文件路径
    file_path = os.path.join(folder_path, file_name)
    new_file_path = os.path.join(folder_path, f"new_{file_name}")

    # 打开原始文件和新文件
    with open(file_path, 'r') as file, open(new_file_path, 'w') as new_file:
        # 逐行读取原始文件内容
        for line in file:
            # 如果当前行以>开头，则在该行后面添加文件名
            if line.startswith('>'):
                line = line.strip() + f" {file_name}\n"
            
            # 将修改后的行写入新文件
            new_file.write(line)
    
    print(f"文件 {file_name} 处理完成，生成新文件 {new_file_path}")

每条序列都含有样本名

整合所有文件

import os

# 获取当前文件夹路径
folder_path = os.getcwd()

# 获取当前文件夹中以new开头的文件列表
file_list = [file for file in os.listdir(folder_path) if file.startswith('new')]

# 创建一个新的文件用于整合所有内容
new_file_path = os.path.join(folder_path, "combined_file.txt")

# 遍历文件列表
with open(new_file_path, 'w') as new_file:
    for file_name in file_list:
        # 构建原始文件路径
        file_path = os.path.join(folder_path, file_name)
        
        # 打开原始文件
        with open(file_path, 'r') as file:
            # 逐行读取原始文件内容，并写入新文件
            for line in file:
                new_file.write(line)

print(f"所有以new开头的文件已经整合到新文件 {new_file_path} 中")

泛基因组分析-基本流程代码

quast评估，嫌慢可以写成parallel并行

使用blastx将未比对上的核苷酸数据与unirice蛋白数据库比对

提取blast结果

去冗余

为了防止合并后有序列的名字是重复的，我们将每个文件的sample名字添加到序列后

未完待续

猜你喜欢

热点阅读