python3.7模块pyfaidx处理fasta文件
2019-10-11 本文已影响0人
wangsb_2020
1.pyfaidx
- 官网: https://github.com/mdshw5/pyfaidx
- cite: Shirley MD, Ma Z, Pedersen B, Wheelan S. Efficient “pythonic” access to FASTA files using pyfaidx. PeerJ PrePrints 3:e1196. 2015.
2.使用
from pyfaidx import Fasta

# Load the FASTA file.
genes = Fasta('cds.fa')

# List the names of every sequence in the file.
print(genes.keys())

# Look up one gene by ID: print the record itself, then its
# long_name and name attributes.
gene_id = 'Ntab0307420.1'
print(genes[gene_id])
print(genes[gene_id].long_name)
print(genes[gene_id].name)

# Slice the gene: fancy_name of the full-range slice, then positions 0-9.
print(genes[gene_id][:].fancy_name)
print(genes[gene_id][0:10])

# Length of the gene's sequence.
print(len(genes[gene_id]))
3.应用实例
- 将多行fasta文件转为单行,并增加序列长度信息
方案一:
from pyfaidx import Fasta

# Read the multi-line FASTA file.
pep = Fasta('./pep.fa')

# Write every record as a ">name length" header followed by the whole
# sequence on a single line. The with-block closes the output file even
# if a record fails to format, so buffered data is never lost.
with open("pep_len.fa", "w") as pep_len:
    for n in pep.keys():
        record = pep[n]
        print(">{} {}\n{}".format(n, len(record), record), file=pep_len)
方案二:
def add_length_to_fasta(src='cds.fa', dst='cds_len.fa'):
    """Rewrite a multi-line FASTA file as single-line records with lengths.

    Each output record is a header ``<name> <length>`` (``<name>`` is the
    first whitespace-delimited token of the input header, including the
    leading ``>``) followed by the full sequence on one line.

    Args:
        src: Path of the FASTA file to read.
        dst: Path of the file to write.

    Raises:
        ValueError: If sequence data appears before the first header line.
    """
    # Header token -> list of sequence chunks; joined once at write time
    # instead of growing a string with repeated "+=".
    records = {}
    name = None
    with open(src, 'r') as fr:
        for line in fr:
            line = line.rstrip('\n')
            if line.startswith('>'):
                name = line.split()[0]
                # A repeated header name restarts that record, as before.
                records[name] = []
            elif line:
                if name is None:
                    raise ValueError(
                        "sequence data found before the first '>' header "
                        "in {!r}".format(src)
                    )
                records[name].append(line)

    with open(dst, 'w') as fw:
        for header, chunks in records.items():
            sequence = ''.join(chunks)
            print("{} {}\n{}".format(header, len(sequence), sequence), file=fw)


if __name__ == '__main__':
    add_length_to_fasta('cds.fa', 'cds_len.fa')
- 结果
#原始文件
$ head pep.fa
>Ntab0075600.1
MNNNIEAMENNEKRASSIRDADSEKEPQINYRGVKAMPFIIGNETFEKLGAIGTLSNLLV
YLTTVFNLKHITATTLINVFNGTTNFATLLGAFLSDTYFGRYKTLGFASIMSFLGLFVIA
LTAVFKNLHPPHCESKDISNCIGPTGWQMAFLLSGFGLLIIGAAGIRPCNLAFGADQFNP
NTESGKRGINSFFNWYFFTLTFAQMVSVTLVVYVQSDVSWSIGLAIPAIFMLISCFLFFG
GTKIYVKVKPEGSPLTSVVQVLVVSIKKRRLKLPEQPLKSLFSYTPPKSINSKLSYTHQF
RFLDKAAIVTPEDQIKSDGSAANQWNLCSLQQVEEAKCVVRVIPIWAAAIVYHVGIIQQQ
QFVVFQALQSNRHLGNSNFQIPAATYTIFSMLSLTLWLPIYDRIIVPLLRRLTGKEGGIT
ILQRMGIGIFLIVLSSLVSAFIEERRRKLVFTNPAVGVHSERGLVSSMSALWLVPQLCLA
GLAEAFCAIGQVEFYYKQFPENMRSIAGSFFFLGMAASSYLNSFLISIVHHTTEKAKTGN
#结果
$ head -4 pep_len.fa
>Ntab0075600.1 600
MNNNIEAMENNEKRASSIRDADSEKEPQINYRGVKAMPFIIGNETFEKLGAIGTLSNLLVYLTTVFNLKHITATTLINVFNGTTNFATLLGAFLSDTYFGRYKTLGFASIMSFLGLFVIALTAVFKNLHPPHCESKDISNCIGPTGWQMAFLLSGFGLLIIGAAGIRPCNLAFGADQFNPNTESGKRGINSFFNWYFFTLTFAQMVSVTLVVYVQSDVSWSIGLAIPAIFMLISCFLFFGGTKIYVKVKPEGSPLTSVVQVLVVSIKKRRLKLPEQPLKSLFSYTPPKSINSKLSYTHQFRFLDKAAIVTPEDQIKSDGSAANQWNLCSLQQVEEAKCVVRVIPIWAAAIVYHVGIIQQQQFVVFQALQSNRHLGNSNFQIPAATYTIFSMLSLTLWLPIYDRIIVPLLRRLTGKEGGITILQRMGIGIFLIVLSSLVSAFIEERRRKLVFTNPAVGVHSERGLVSSMSALWLVPQLCLAGLAEAFCAIGQVEFYYKQFPENMRSIAGSFFFLGMAASSYLNSFLISIVHHTTEKAKTGNWLPEDLNKGKLDYFYFLITALGILNVVYFIICARWYKYKGNDETSSVGLEMERQNVEKHF
>Ntab0075620.1 593
MIIIILSDMEVDKVPARDEPKYGGIKAMPFIIGNETFEKLGTIGTSANLLVYLTTVFNMKSITATNLINVFNGTCNFGTLLGAFLSDTYLGRYKTLGIASISSFTGMMFLALTAAISKLHPPHCGTAANSTCLEPTTGQLAFLLCGFGFLVIGASGIRPCNLAFGADQFNPNTESGRRGVNSFFNWYYFTFTFAMMVSLTVIVYIQSSVSWAIGLAIPTFLMFLSCVFFFVGTKIYVMILPEGSPLTSFAQVLVAAIKKRRLQLPEQPQNTLFNHVSINTINTELPYTDQFRFLNKASIITPEDRIKEDGSAANPWKLCSIQQVEEVKCVVRVFPIWIAGLIYYIVLVQMQAYVVFQALQSDRRLRNTSTFKIPAASYAVFQMLSMTIWIPIYDRIIVPFLQKITKKEAGITVLQKMGIGLFIAVFTMLVSAVVETRRRNVAFSHPTLGIETRRGEISAMPANWLIPQLALAGLSEAFTVIAQVEFFYKQFPENMRSFAGSFLFCGFALASYMSSFLISIVHKTTRTSDTENWLAEDLNKGRLDYFYYLVAALEVLNLGYFLICAKWYKYKGTQNDHNLEIALEKLEPTKPLV