python3.7模块pyfaidx处理fasta文件

2019-10-11  本文已影响0人  wangsb_2020

1.pyfaidx

2.使用

from pyfaidx import Fasta

# Open the FASTA file.
genes = Fasta('cds.fa')

# Names of all sequences in the file.
print(genes.keys())

# Look the record up once and reuse it for the remaining examples.
gene_id = 'Ntab0307420.1'
record = genes[gene_id]

# Sequence and names of a single gene.
print(record)
print(record.long_name)
print(record.name)

# Slicing a single gene: the full range, then the first ten positions.
full_slice = record[:]
print(full_slice.fancy_name)
print(record[0:10])

# Length of a single gene.
print(len(record))

3.应用实例

方案一:
from pyfaidx import Fasta

# Read the FASTA file.
pep = Fasta('./pep.fa')

# Write every record to a new file as ">name length" followed by the sequence
# on a single line. The `with` block closes (and therefore flushes) the output
# file even if an error occurs part-way through; the original left it open.
with open("pep_len.fa", "w") as pep_len:
    for n in pep.keys():
        record = pep[n]
        print(">{} {}\n{}".format(n, len(record), record), file=pep_len)
方案二:
def write_fasta_with_lengths(src_path, dst_path):
    """Rewrite a FASTA file with each sequence's length added to its header.

    Every record is written to ``dst_path`` as two lines: the first
    whitespace-delimited token of the original header (including the
    leading ``>``) followed by a space and the sequence length, then the
    whole sequence joined onto a single line.

    Records are written in the order their headers first appear. If a header
    name occurs more than once, the later record replaces the earlier one.
    Blank lines are ignored and trailing whitespace on sequence lines is
    dropped.

    Args:
        src_path: Path of the FASTA file to read.
        dst_path: Path of the file to write.

    Raises:
        ValueError: If sequence data appears before the first header line.
    """
    records = {}
    chunks = None  # pieces of the record currently being read
    with open(src_path, 'r') as fr:
        for line in fr:
            if line.startswith('>'):
                # Keep only the identifier; drop any description after it.
                chunks = records[line.split()[0]] = []
                continue
            text = line.rstrip()
            if not text:
                continue
            if chunks is None:
                raise ValueError(
                    "sequence data found before the first '>' header")
            # Collect pieces and join once instead of repeated string +=.
            chunks.append(text)

    with open(dst_path, 'w') as fw:
        for name, pieces in records.items():
            sequence = ''.join(pieces)
            print("{} {}\n{}".format(name, len(sequence), sequence), file=fw)


if __name__ == "__main__":
    write_fasta_with_lengths('cds.fa', 'cds_len.fa')
#原始文件
$ head pep.fa
>Ntab0075600.1
MNNNIEAMENNEKRASSIRDADSEKEPQINYRGVKAMPFIIGNETFEKLGAIGTLSNLLV
YLTTVFNLKHITATTLINVFNGTTNFATLLGAFLSDTYFGRYKTLGFASIMSFLGLFVIA
LTAVFKNLHPPHCESKDISNCIGPTGWQMAFLLSGFGLLIIGAAGIRPCNLAFGADQFNP
NTESGKRGINSFFNWYFFTLTFAQMVSVTLVVYVQSDVSWSIGLAIPAIFMLISCFLFFG
GTKIYVKVKPEGSPLTSVVQVLVVSIKKRRLKLPEQPLKSLFSYTPPKSINSKLSYTHQF
RFLDKAAIVTPEDQIKSDGSAANQWNLCSLQQVEEAKCVVRVIPIWAAAIVYHVGIIQQQ
QFVVFQALQSNRHLGNSNFQIPAATYTIFSMLSLTLWLPIYDRIIVPLLRRLTGKEGGIT
ILQRMGIGIFLIVLSSLVSAFIEERRRKLVFTNPAVGVHSERGLVSSMSALWLVPQLCLA
GLAEAFCAIGQVEFYYKQFPENMRSIAGSFFFLGMAASSYLNSFLISIVHHTTEKAKTGN
#结果
$ head -4 pep_len.fa
>Ntab0075600.1 600
MNNNIEAMENNEKRASSIRDADSEKEPQINYRGVKAMPFIIGNETFEKLGAIGTLSNLLVYLTTVFNLKHITATTLINVFNGTTNFATLLGAFLSDTYFGRYKTLGFASIMSFLGLFVIALTAVFKNLHPPHCESKDISNCIGPTGWQMAFLLSGFGLLIIGAAGIRPCNLAFGADQFNPNTESGKRGINSFFNWYFFTLTFAQMVSVTLVVYVQSDVSWSIGLAIPAIFMLISCFLFFGGTKIYVKVKPEGSPLTSVVQVLVVSIKKRRLKLPEQPLKSLFSYTPPKSINSKLSYTHQFRFLDKAAIVTPEDQIKSDGSAANQWNLCSLQQVEEAKCVVRVIPIWAAAIVYHVGIIQQQQFVVFQALQSNRHLGNSNFQIPAATYTIFSMLSLTLWLPIYDRIIVPLLRRLTGKEGGITILQRMGIGIFLIVLSSLVSAFIEERRRKLVFTNPAVGVHSERGLVSSMSALWLVPQLCLAGLAEAFCAIGQVEFYYKQFPENMRSIAGSFFFLGMAASSYLNSFLISIVHHTTEKAKTGNWLPEDLNKGKLDYFYFLITALGILNVVYFIICARWYKYKGNDETSSVGLEMERQNVEKHF
>Ntab0075620.1 593
MIIIILSDMEVDKVPARDEPKYGGIKAMPFIIGNETFEKLGTIGTSANLLVYLTTVFNMKSITATNLINVFNGTCNFGTLLGAFLSDTYLGRYKTLGIASISSFTGMMFLALTAAISKLHPPHCGTAANSTCLEPTTGQLAFLLCGFGFLVIGASGIRPCNLAFGADQFNPNTESGRRGVNSFFNWYYFTFTFAMMVSLTVIVYIQSSVSWAIGLAIPTFLMFLSCVFFFVGTKIYVMILPEGSPLTSFAQVLVAAIKKRRLQLPEQPQNTLFNHVSINTINTELPYTDQFRFLNKASIITPEDRIKEDGSAANPWKLCSIQQVEEVKCVVRVFPIWIAGLIYYIVLVQMQAYVVFQALQSDRRLRNTSTFKIPAASYAVFQMLSMTIWIPIYDRIIVPFLQKITKKEAGITVLQKMGIGLFIAVFTMLVSAVVETRRRNVAFSHPTLGIETRRGEISAMPANWLIPQLALAGLSEAFTVIAQVEFFYKQFPENMRSFAGSFLFCGFALASYMSSFLISIVHKTTRTSDTENWLAEDLNKGRLDYFYYLVAALEVLNLGYFLICAKWYKYKGTQNDHNLEIALEKLEPTKPLV

还有些其他功能……

上一篇下一篇

猜你喜欢

热点阅读