根据GTF文件用python画基因的多个转录本(第五题)
2020-02-28 本文已影响0人
多啦A梦詹
import re
import sys
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import matplotlib.patches as patches
from matplotlib.patches import Rectangle
from collections import OrderedDict
matplotlib.style.use("ggplot")
args = sys.argv
geneCord = OrderedDict()
GRCh38Exon = OrderedDict()
GRCh38UTR = OrderedDict()
def main(genename=args[2]):
with open(args[1],'r') as fp_gtf:
for line in fp_gtf:
if line.startswith("#"):
continue
line_list = line.strip("\n").split("\t")
chr = line_list[0]
type = line_list[2]
start = int(line_list[3])
end = int(line_list[4])
orientation = line_list[6]
attr = line_list[8]
if type == "gene":
genename = re.search(r'gene_name "([^;]+)";?', attr).group(1)
geneCord[genename] = [start,end]
GRCh38Exon[genename] = OrderedDict()
GRCh38UTR[genename] = OrderedDict()
elif type == "transcript":
trptname = re.search(r'transcript_name "([^;]+)";?', attr).group(1)
genename = re.search(r'gene_name "([^;]+)";?', attr).group(1)
if genename not in geneCord:
continue
GRCh38Exon[genename][trptname] = []
GRCh38Exon[genename][trptname].append(orientation)
GRCh38UTR[genename][trptname] = []
elif type == "exon":
trptname = re.search(r'transcript_name "([^;]+)";?', attr).group(1)
genename = re.search(r'gene_name "([^;]+)";?', attr).group(1)
if genename not in geneCord:
continue
if trptname not in GRCh38Exon[genename]:
continue
GRCh38Exon[genename][trptname].extend([start,end])
elif type in ["five_prime_utr","three_prime_utr"]:
trptname = re.search(r'transcript_name "([^;]+)";?', attr).group(1)
genename = re.search(r'gene_name "([^;]+)";?', attr).group(1)
if genename not in geneCord:
continue
if trptname not in GRCh38UTR[genename]:
continue
GRCh38UTR[genename][trptname].extend([start,end])
gene_model(args[2])
#绘制基因模型
def gene_model(genename):
fig = plt.figure()
ax = fig.add_subplot(111)
trptNum = 0
#from numpy import *
for trpt, exon in GRCh38Exon[genename].items():
trptNum += 1
if exon[0] == '+':
col = 'red'
elif exon[0] == '-':
col = 'blue'
for i in range(1,len(exon),2):
#print(exon)
rect = Rectangle((exon[i], trptNum-0.1), exon[i+1]-exon[i], 0.2, color=col, fill=True)
if i < len(exon)-2:
ax.plot([exon[i+1],exon[i+2]],[trptNum,trptNum],color='black',linewidth=1)
trptNum = 0
for trpt, utr in GRCh38UTR[genename].items():
trptNum += 1
if utr == []:
continue
for j in range(0,len(utr),2):
#画UTR区 [Rectangle((x,y),width,height)]
rect = Rectangle((utr[j], trptNum-0.1), utr[j+1]-utr[j], 0.2, color='black', fill=True)
ax.add_patch(rect)
ax.set_xlabel(genename, color='blue', fontsize=14, fontweight='bold')
ax.yaxis.set_major_locator(ticker.FixedLocator(range(1,trptNum+1)))
ax.set_yticklabels(GRCh38Exon[genename].keys(),fontweight='bold')
plt.xlim(geneCord[genename])
plt.ylim([0.5,trptNum+0.5])
plt.show()
if __name__ == "__main__":
main(args[2])
用命令行:
python transcript.py Homo_sapiens.GRCh38.87.chr.gtf TP53
结果如下:
Figure_1.png