泛基因组分析-基本流程代码
2024-01-24 本文已影响0人
花生学生信
以下是自己使用map-to-pan分析泛基因组的部分代码,仅供参考:
统计fasta序列的N50、N90以及最短、最长序列
##tj_N50.py
import sys
from Bio import SeqIO
import os
def calculate_Nx(sequence_lengths, x):
total_length = sum(sequence_lengths)
target_length = total_length * x / 100
sorted_lengths = sorted(sequence_lengths, reverse=True)
cumulative_length = 0
for length in sorted_lengths:
cumulative_length += length
if cumulative_length >= target_length:
return length
def calculate_stats(fasta_file):
stats = []
for record in SeqIO.parse(fasta_file, "fasta"):
sequence_length = len(record.seq)
stats.append((record.id, sequence_length))
stats.sort(key=lambda x: x[1])
sequence_lengths = [length for _, length in stats]
n50 = calculate_Nx(sequence_lengths, 50)
n90 = calculate_Nx(sequence_lengths, 90)
min_length = sequence_lengths[0]
max_length = sequence_lengths[-1]
return stats, n50, n90, min_length, max_length
fasta_files = sys.argv[1:] # 通过命令行参数传入FASTA文件路径列表
print("文件名\t最短序列长度\t最长序列长度\tN50\tN90")
for fasta_file in fasta_files:
sample_stats, n50, n90, min_length, max_length = calculate_stats(fasta_file)
file_name = os.path.basename(fasta_file)
print(f"{file_name}\t{min_length}\t{max_length}\t{n50}\t{n90}")
过滤前
现在需要筛选大于500bp的序列做后续分析:
##tiqu_500.py
import sys
def extract_long_sequences(fasta_file):
long_sequences = []
current_sequence = ""
current_sequence_name = ""
with open(fasta_file, "r") as file:
for line in file:
line = line.strip()
if line.startswith(">"):
if current_sequence and len(current_sequence) > 500:
long_sequences.append((current_sequence_name, current_sequence))
current_sequence_name = line[1:]
current_sequence = ""
else:
current_sequence += line
if current_sequence and len(current_sequence) > 500:
long_sequences.append((current_sequence_name, current_sequence))
return long_sequences
fasta_file = sys.argv[1] # 通过命令行参数传入FASTA文件路径
long_sequences = extract_long_sequences(fasta_file)
# 打印提取出的长序列
for sequence_name, sequence in long_sequences:
print(f">{sequence_name}")
print(sequence)
过滤后
再写个简单的批处理shell:
cat ../fqid|while read id
do
python tiqu_500.py ../contigs/$id.fa >01_500bp/$id.500.fa
python tj_N50.py 01_500bp/$id_500.fa >>N50_inf.txt
done
quast评估,嫌慢可以写成parallel并行
cat ../fqid|while read samples
do
#mkdir 02_quast/${samples}
ref=/public/home/fengting/work/zyyy/DK/jelly/ref_IR64/IR64.genome
th=4
refgff=/public/home/fengting/work/cax/NIP/new_add/gff/IR64.IGDBv1.Allset.gff
/public/home/fengting/demo/pan111/EUPAN/tools/quast-5.1.0rc1/quast.py \
-t ${th} \
--large 01_500bp/${samples}.500.fa \
-r ${ref} \
-g ${refgff} \
-o ./02_quast/${samples} \
--no-icarus \
--no-snps \
--min-identity 90.0
done
使用blastx将未比对上的核苷酸数据与unirice蛋白数据库比对
cat id|while read samples
do
###为了并行处理
echo "blastx -query 01_500bp/${samples}.500.fa -out bl/${samples}.bl -db os_nr/unirice -outfmt 6 -evalue 1e-6 -num_threads 8" >>blxwo.sh
done
题外话
昨晚挂了一个blastn的任务,跑了一天,导致其它任务进行的极其缓慢,cpu占用也极其低,关掉这个任务就好了。
###去掉文件名的后缀
import os
# 获取当前文件夹路径
folder_path = os.getcwd()
# 遍历当前文件夹中的所有文件
for filename in os.listdir(folder_path):
# 检查文件是否以.bl为后缀
if filename.endswith(".bl"):
# 去掉.bl后缀
new_filename = filename[:-3]
# 重命名文件
os.rename(os.path.join(folder_path, filename), os.path.join(folder_path, new_filename))
提取blast结果
cat sxid|while read id
do
cut -f 1 ../bl/$id.bl |uniq|sort -V > blid/$id
perl ~/scripts/perl/per.pl blid/$id ../01_500bp/$id.500.fa >blfa/$id.fa
done
去冗余
###提取核苷酸序列
perl ~/scripts/perl/per.pl DL001 DL001-1.500.fa >DL001.fa
####cd-hit 去冗余,24-1-25更新去冗余命令
cdhit -i in.fasta -o out.fasta -d 30 -G 1 -M 16000 -T 20 -uL 0.05 -S 2 -U 2 -g 1 -aL 0.95 -aS 1
###euapn去冗余
##blastCluster
eupan rmRedundant blastCluster DL001.fa 1 ~/miniconda3/bin/
##cdhit
eupan rmRedundant cdhitCluster DL001.fa 2 ~/miniconda3/bin/
为了防止合并后有序列的名字是重复的,我们将每个文件的sample名字添加到序列后
import os
# 获取当前文件夹路径
folder_path = os.getcwd()
# 获取当前文件夹中以DL开头的文件列表
file_list = [file for file in os.listdir(folder_path) if file.startswith('DL')]
# 遍历文件列表
for file_name in file_list:
# 构建原始文件路径和新文件路径
file_path = os.path.join(folder_path, file_name)
new_file_path = os.path.join(folder_path, f"new_{file_name}")
# 打开原始文件和新文件
with open(file_path, 'r') as file, open(new_file_path, 'w') as new_file:
# 逐行读取原始文件内容
for line in file:
# 如果当前行以>开头,则在该行后面添加文件名
if line.startswith('>'):
line = line.strip() + f" {file_name}\n"
# 将修改后的行写入新文件
new_file.write(line)
print(f"文件 {file_name} 处理完成,生成新文件 {new_file_path}")
每条序列都含有样本名
整合所有文件
import os
# 获取当前文件夹路径
folder_path = os.getcwd()
# 获取当前文件夹中以new开头的文件列表
file_list = [file for file in os.listdir(folder_path) if file.startswith('new')]
# 创建一个新的文件用于整合所有内容
new_file_path = os.path.join(folder_path, "combined_file.txt")
# 遍历文件列表
with open(new_file_path, 'w') as new_file:
for file_name in file_list:
# 构建原始文件路径
file_path = os.path.join(folder_path, file_name)
# 打开原始文件
with open(file_path, 'r') as file:
# 逐行读取原始文件内容,并写入新文件
for line in file:
new_file.write(line)
print(f"所有以new开头的文件已经整合到新文件 {new_file_path} 中")