python--处理 fasta 和 fastq 文件
2022-04-22 本文已影响0人
Z_bioinfo
fasta 文件格式:每条记录由一行以 ">" 开头的 ID 和其后的序列(可跨多行)组成,序列可以是转录本序列、蛋白序列等。例如:
'''
>V350033524L1C001R00100020323/1
AAGCTGTCCCATCAATAGCTGCCGCTGAAGGGTGGGGCTGGATGGCGTAAGCTACAGCTGAAGGAAGAACGTGAGCACGAGGCACTGAGGTGATTGGCTG
>V350033524L1C001R00100069491/1
AAGCTGTCCAATCAATAGCTGCCGCTGAAGGGTGGGGCTGGATGGCGTAAGCTACAGCTGAAGGAAGAACGTGAGCACGAGGCACTGAGGTGATTGGCTG
'''
读取 fasta 文件保存为字典,对应的 ID 为 key,序列为 value:
# Parse the FASTA file into ``fa_dict``: each key is a record ID (the header
# line without its leading '>') and each value is that record's full sequence
# with all of its lines joined together.
with open(r'E:/python练习文件/test.fa', 'r') as fa:
    # Collect the sequence lines per record first and join once at the end,
    # so a sequence wrapped over many lines is concatenated rather than
    # overwritten by its last line.
    seq_parts = {}
    seq_name = None
    for line in fa:
        # Strip the line terminator (handles both LF and CRLF files).
        line = line.rstrip('\r\n')
        if not line:
            # Blank lines carry no sequence data.
            continue
        if line.startswith('>'):
            # Header line: drop the '>' marker and start a new record.
            seq_name = line[1:]
            seq_parts[seq_name] = []
        elif seq_name is None:
            # Sequence data with no preceding header is not valid FASTA.
            raise ValueError('sequence data found before the first ">" header line')
        else:
            seq_parts[seq_name].append(line)
    fa_dict = {name: ''.join(parts) for name, parts in seq_parts.items()}
# Show the result: wrapped sequence lines are joined into one string per ID.
print(fa_dict)
{'V350033524L1C001R00100020323/1': 'AAGCTGTCCCATCAATAGCTGCCGCTGAAGGGTGGGGCTGGATGGCGTAAGCTACAGCTGAAGGAAGAACGTGAGCACGAGGCACTGAGGTGATTGGCTG', 'V350033524L1C001R00100069491/1': 'AAGCTGTCCAATCAATAGCTGCCGCTGAAGGGTGGGGCTGGATGGCGTAAGCTACAGCTGAAGGAAGAACGTGAGCACGAGGCACTGAGGTGATTGGCTG', 'V350033524L1C001R00100073913/1': 'AAGCTGTCCTATCAATAGCTGCCGCTGAAGGGTGGGGCTGGATGGCGTAAGCTACAGCTCAAGGAAGAACGTGAGCACGAGGCACTGAGGTGATTGGCTG', 'V350033524L1C001R00100149315/1': 'AAGCTGTCCTATCAATAGCTGCCGCTGAAGGGTGGGGCTGGATGGCGTAAGCTACAGCTCAAGGAAGAACGTGAGCACGAGGCACTGAGGTGATTGGCTG', 'V350033524L1C001R00100216048/1': 'AAGCTGTCCAATCAATAGCTGCCGCTGAAGGGTGGGGCTGGATGGCGTAAGCTACAGCTCAAGGAAGAACGTGAGCACGAGGCACTGAGGTGATTGGCTG'}
计算每条序列的 GC 含量及计算每条序列长度
# Report, for every record in fa_dict, the percentage of G bases, the
# percentage of C bases, and the sequence length.
for name, seq in fa_dict.items():
    # Print the record name (followed by an empty line).
    print(name + '\n')
    # Sequence length, computed once and reused below.
    length = len(seq)
    # Count G and C bases.
    G_base = seq.count('G')
    C_base = seq.count('C')
    # Percentages rounded to two decimals; a record with an empty sequence
    # reports 0.0 instead of raising ZeroDivisionError.
    G_per = round(G_base / length * 100, 2) if length else 0.0
    C_per = round(C_base / length * 100, 2) if length else 0.0
    # Print the results; the second line is labelled "C" because it reports
    # the C percentage.
    print("G percent is: " + str(G_per) + "%")
    print("C percent is: " + str(C_per) + "%")
    print("lenth: " + str(length))
V350033524L1C001R00100020323/1
G percent is: 37.0%
G percent is: 20.0%
lenth: 100
V350033524L1C001R00100069491/1
G percent is: 37.0%
G percent is: 19.0%
lenth: 100
V350033524L1C001R00100073913/1
G percent is: 36.0%
G percent is: 20.0%
lenth: 100
V350033524L1C001R00100149315/1
G percent is: 36.0%
G percent is: 20.0%
lenth: 100
V350033524L1C001R00100216048/1
G percent is: 36.0%
G percent is: 20.0%
lenth: 100