统计外显子个数与内含子长度(Python)
2022-07-07 本文已影响0人
曹草BioInfo
面向对象。
只有在一个基因含有两个或两个以上外显子时才会有内含子。
输入为gff3文件
import re
class gene():
    """One gene together with its exon intervals and the gaps derived from them.

    All fields are public and are filled in by the caller:

    * ``id``      -- identifier of the gene (``None`` until assigned).
    * ``exonnum`` -- exon counter; maintained by the caller, not by ``insert``.
    * ``exon``    -- list of ``(start, end)`` tuples, ordered by start when
      populated through ``insert``.
    * ``start`` / ``end`` -- boundaries of the gene.
    * ``intron``  -- list of ``(left, right)`` boundary pairs produced by
      ``calcintron``.
    """

    def __init__(self):
        self.id = None
        self.exonnum = 0
        self.exon = []
        self.start = 0
        self.end = 0
        self.intron = []

    def calcintron(self):
        """Rebuild ``self.intron`` from the gene boundaries and ``self.exon``.

        The result holds, in order:

        1. ``(self.start, first exon start)`` -- the leading flank,
        2. ``(exon[i] end, exon[i+1] start)`` for every adjacent exon pair,
        3. ``(last exon end, self.end)`` -- the trailing flank.

        The list is replaced on every call, so calling the method repeatedly
        does not accumulate duplicate entries.  With no exons recorded the
        list is simply emptied instead of raising ``IndexError``.

        NOTE(review): the two flank entries are not gaps between exons;
        confirm that callers really want them reported alongside the introns.

        Returns:
            None. The result is stored in ``self.intron``.
        """
        if not self.exon:
            self.intron[:] = []
            return None
        bounds = [(self.start, self.exon[0][0])]
        bounds.extend(
            (left[1], right[0])
            for left, right in zip(self.exon, self.exon[1:])
        )
        bounds.append((self.exon[-1][1], self.end))
        # Slice assignment keeps the same list object for anyone holding a
        # reference to ``self.intron`` while discarding earlier results.
        self.intron[:] = bounds
        return None

    def insert(self, tup, index=0):
        """Insert exon ``tup`` into ``self.exon`` ordered by start coordinate.

        Args:
            tup: ``(start, end)`` pair describing the exon.
            index: position at which the linear scan begins (default ``0``).

        The tuple is placed before the first exon, at or after ``index``,
        whose start is greater than or equal to ``tup[0]``; if there is none
        it is appended.  ``self.exonnum`` is not touched here.

        Returns:
            None.
        """
        while index < len(self.exon) and tup[0] > self.exon[index][0]:
            index += 1
        self.exon.insert(index, tup)
        return None
def read(key, line):
    """Extract one value from a single tab-separated GFF3 feature line.

    Args:
        key: either a 1-based column number (``int``, expected 1..9) or the
            string ``'ID'``.
        line: the raw text of one line, with or without its line ending.

    Returns:
        * ``False`` when the line does not consist of exactly nine fields
          (for example comment/directive lines or blank lines).
        * The text of column ``key`` when ``key`` is an integer.
        * For ``key == 'ID'``: the leading ``ID=<name>`` token of the ninth
          column, *including* the ``ID=`` prefix, or ``None`` when that
          column does not start with such a token.
        * ``None`` for any other key.

    NOTE(review): the ID pattern only accepts letters, digits and ``_``, so
    an identifier containing other characters (e.g. ``.``, ``-``, ``:``) is
    cut at the first such character -- confirm this is intended.
    """
    # Drop the trailing line ending before splitting so that the last line of
    # a file (which may lack a newline) is parsed like every other line.
    fields = re.split('[\f\n\r\t\v]+', line.rstrip('\f\n\r\t\v'))
    if len(fields) != 9:
        return False
    if isinstance(key, int):
        return fields[key - 1]
    if key == 'ID':
        # The attributes live in the ninth column.
        match = re.match('ID=[A-Za-z0-9_]+', fields[8])
        return match.group() if match else None
    return None
# Driver: collect every gene of the input GFF3 file together with its exons,
# then report the exon count and, for genes with at least two exons, the
# lengths of the intervals computed by gene.calcintron().

lis = []        # gene objects in the order their 'gene' lines appear
current = None  # gene that subsequent 'exon' lines are attributed to

# The context manager closes the file even if parsing raises.
with open('work6_input2.gff3', 'r') as f:
    # NOTE(review): keep the loop variable named `each` unless read() is
    # confirmed to rely only on its `line` argument.
    for each in f:
        # Column 3 is the feature type; read() yields False for lines that
        # are not feature records, which matches neither branch below.
        typ = read(3, each)
        if typ == 'gene':
            current = gene()
            # The ID is only needed for gene lines, so it is only parsed here.
            current.id = read('ID', each)
            current.start = int(read(4, each))
            current.end = int(read(5, each))
            lis.append(current)
        elif typ == 'exon':
            if current is None:
                # Previously this surfaced as a NameError on an unbound name.
                raise ValueError('exon line encountered before any gene line')
            current.exonnum += 1
            # Insert the exon so the list stays ordered by start coordinate.
            current.insert((int(read(4, each)), int(read(5, each))))

for record in lis:
    if record.exonnum >= 2:
        record.calcintron()
        # NOTE(review): each value is simply right - left of a stored pair.
        # The first and last pairs are the flanks between the gene bounds and
        # the outer exons, and GFF3 coordinates are 1-based inclusive, so for
        # an inner gap this is one more than the number of bases strictly
        # between the two exons -- confirm which value is wanted.
        leng = [right - left for left, right in record.intron]
        print('{}的外显子数量为{}, 内含子长度为{}'.format(record.id, record.exonnum, leng))
    else:
        print('%s的外显子数量为%d'%(record.id, record.exonnum))