Linux与生物信息

脚本 | Python | 提取第[12]个或第3个密码子建树

2022-04-07  本文已影响0人  shwzhao

马兜铃基因组文章 Aristolochia fimbriata genome 采用了很多策略进行建树,比如只提取第 1 和第 2 个密码子,还给出了提取脚本。

参考:https://github.com/yihenghu/Aristolochia_fimbriata_genome_analysis

因为每个人对中间文件的命名方式不同,所以对脚本做了一点点改变,可以在命令行输入前后缀。

$ ls
OG00001.fa  OG00002.fa  OG00003.fa
$ python3 get_concatenated_codon.py OG fa
$ ls
OG00001.fa  OG00001.fa.12  OG00001.fa.3  OG00002.fa  OG00002.fa.12  OG00002.fa.3  OG00003.fa  OG00003.fa.12  OG00003.fa.3
$ cat OG00001.fa*
>gene1
ATGATGATGATGATGATGATGATGATGATG
>gene2
ATCATCATCATCATCATCATCATCATCATC
>gene3
ATAATAATAATAATAATAATAATAATAATA
>gene1
ATATATATATATATATATAT
>gene2
ATATATATATATATATATAT
>gene3
ATATATATATATATATATAT
>gene1
GGGGGGGGGG
>gene2
CCCCCCCCCC
>gene3
AAAAAAAAAA
import sys
import glob
import re
from Bio import SeqIO

fa_file = '*.' + str(sys.argv[2])
fa_file1 = str(sys.argv[1]) + '(\d+).' + str(sys.argv[2])
fa_file2 = str(sys.argv[1]) + '%s.' + str(sys.argv[2])
out_file12 = fa_file2 + '.12'
out_file3 = fa_file2 + '.3'

def rm_3(seq):
    new = ''
    for index, i in enumerate(str(seq)):
        if (index + 1) % 3 != 0:
            new += i
    return new


def save_3(seq):
    new = ''
    for index, i in enumerate(str(seq)):
        if (index + 1) % 3 == 0:
            new += i
    return new


fileID_list = []
for file in glob.glob(fa_file):
    fileID = re.search(
            fa_file1,
            file).group(1)
    fileID_list.append(fileID)

# Coalescent-based tree estimated from concatenated alignments of single gene first codon and second codon positions
for fileID in sorted(fileID_list):
    ID2seq = {}
    seqRs = SeqIO.parse(fa_file2 % fileID, 'fasta')
    for seqR in seqRs:
        ID2seq[seqR.id] = rm_3(seqR.seq)

    with open(out_file12 % fileID, 'w') as fw:
        for ID, seq in ID2seq.items():
            fw.write('>' + ID + '\n' + seq + '\n')

# Coalescent-based tree estimated from concatenated alignments of single gene third codon positions
for fileID in sorted(fileID_list):
    ID2seq = {}
    seqRs = SeqIO.parse(fa_file2 % fileID, 'fasta')
    for seqR in seqRs:
        ID2seq[seqR.id] = save_3(seqR.seq)

    with open(out_file3 % fileID, 'w') as fw:
        for ID, seq in ID2seq.items():
            fw.write('>' + ID + '\n' + seq + '\n')
上一篇 下一篇

猜你喜欢

热点阅读