根据GFF3文件统计外显子大小和数量以及内含子大小
2020-05-09 本文已影响0人
东风008
每个基因的外显子起始与结束的位置,保存为1.txt,注意需要打开1.txt编辑删除第一行的空行
output_file = open("/home/yt/Desktop/silkworm3.1.5/data/1.txt", "w")
with open('/home/yt/Desktop/silkworm3.1.5/data/silkDB3.1.5.gff3', 'r') as f:
for line in f:
line = line.rstrip("\n") # 删除行尾的换行符
array = line.split("\t")
sub_array = array[8].split(";")
name = sub_array[1].replace('name=','')
if array[2] == 'gene':
output_file.write("\n")
output_file.write(name + "\t" + array[0] + "\t" + array[3] + "\t" + array[4] + "\t" + array[6] + "\t")
if array[2] == 'exon':
output_file.write(array[3] + "\t" + array[4] + "\t")
output_file.close()
f.close()
计算每个外显子的大小
with open('/home/yt/Desktop/silkworm3.1.5/data/1.txt', 'r') as f:
for line in f:
lin = line.strip().split()
a = len(lin)
for i in range(6, a, 2):
exon = int(lin[i]) - int(lin[i-1]) + 1
print(lin[0], exon)
f.close()
计算每个内含子的大小
with open('/home/yt/Desktop/silkworm3.1.5/data/1.txt', 'r') as f:
for line in f:
lin = line.strip().split()
a = len(lin)
if a == 7:
print(lin[0], '0')
if a > 7:
if lin[4] == '+':
for i in range(7, a, 2):
intron = abs(int(lin[i]) - int(lin[i-1]) - 1)
print(lin[0], intron)
if lin[4] == '-':
for i in range(8, a, 2):
intron = abs(int(lin[i]) + 1 - int(lin[i - 3]))
print(lin[0], intron)
f.close()
统计每个基因外显子的数量
output_file = open("/home/yt/Desktop/silkworm3.1.5/data/2.txt", "w")
with open('/home/yt/Desktop/silkworm3.1.5/data/1.txt', 'r') as f:
for line in f:
lin = line.strip().split()
a = len(lin)
n = (a - 5)/2
output_file.write(lin[0]+ "\t" + str(n) + "\n")
output_file.close()
f.close()