根据GTF文件用python画基因的多个转录本(第五题)

2020-02-28  本文已影响0人  多啦A梦詹
import re
import sys
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import matplotlib.patches as patches
from matplotlib.patches import Rectangle
from collections import OrderedDict
matplotlib.style.use("ggplot")

args = sys.argv

geneCord = OrderedDict()
GRCh38Exon = OrderedDict()
GRCh38UTR = OrderedDict()

def main(genename=args[2]):
    with open(args[1],'r') as fp_gtf:
        for line in fp_gtf:
            if line.startswith("#"):
                continue
            line_list = line.strip("\n").split("\t")
            chr = line_list[0]
            type = line_list[2]
            start = int(line_list[3])
            end = int(line_list[4])
            orientation = line_list[6]
            attr = line_list[8]

            if type == "gene":
                genename = re.search(r'gene_name "([^;]+)";?', attr).group(1)
                geneCord[genename] = [start,end]
                GRCh38Exon[genename] = OrderedDict()
                GRCh38UTR[genename] = OrderedDict()

            elif type == "transcript":
                trptname = re.search(r'transcript_name "([^;]+)";?', attr).group(1)
                genename = re.search(r'gene_name "([^;]+)";?', attr).group(1)
                if genename not in geneCord:
                    continue
                GRCh38Exon[genename][trptname] = []
                GRCh38Exon[genename][trptname].append(orientation)
                GRCh38UTR[genename][trptname] = []

            elif type == "exon":
                trptname = re.search(r'transcript_name "([^;]+)";?', attr).group(1)
                genename = re.search(r'gene_name "([^;]+)";?', attr).group(1)
                if genename not in geneCord:
                    continue
                if trptname not in GRCh38Exon[genename]:
                    continue
                GRCh38Exon[genename][trptname].extend([start,end])

            elif type in ["five_prime_utr","three_prime_utr"]:
                trptname = re.search(r'transcript_name "([^;]+)";?', attr).group(1)
                genename = re.search(r'gene_name "([^;]+)";?', attr).group(1)
                if genename not in geneCord:
                    continue
                if trptname not in GRCh38UTR[genename]:
                    continue
                GRCh38UTR[genename][trptname].extend([start,end])
    gene_model(args[2])

#绘制基因模型
def gene_model(genename):
    fig = plt.figure()
    ax = fig.add_subplot(111)
    trptNum = 0
    #from numpy import *
    for trpt, exon in GRCh38Exon[genename].items():
        trptNum += 1
        if exon[0] == '+':
            col = 'red'
        elif exon[0] == '-':
            col = 'blue'
        for i in range(1,len(exon),2):
            #print(exon)
            rect = Rectangle((exon[i], trptNum-0.1), exon[i+1]-exon[i], 0.2, color=col, fill=True)
            if i < len(exon)-2:
                ax.plot([exon[i+1],exon[i+2]],[trptNum,trptNum],color='black',linewidth=1)

    trptNum = 0
    for trpt, utr in GRCh38UTR[genename].items():
        trptNum += 1
        if utr == []:
            continue
        for j in range(0,len(utr),2):
        #画UTR区 [Rectangle((x,y),width,height)]
            rect = Rectangle((utr[j], trptNum-0.1), utr[j+1]-utr[j], 0.2, color='black', fill=True)
            ax.add_patch(rect)

    ax.set_xlabel(genename, color='blue', fontsize=14, fontweight='bold')
    ax.yaxis.set_major_locator(ticker.FixedLocator(range(1,trptNum+1)))
    ax.set_yticklabels(GRCh38Exon[genename].keys(),fontweight='bold')

    plt.xlim(geneCord[genename])
    plt.ylim([0.5,trptNum+0.5])
    plt.show()

if __name__ == "__main__":
    main(args[2])

用命令行:

python transcript.py Homo_sapiens.GRCh38.87.chr.gtf TP53

结果如下:


Figure_1.png
上一篇 下一篇

猜你喜欢

热点阅读