Python

python脚本:将gtf文件以染色体为单位进行拆分

2022-07-19  本文已影响0人  小明的数据分析笔记本

只能用于linux系统,因为用到了grep命令

import os
import argparse
import subprocess
from itertools import product
from multiprocessing import Pool

def get_chromosome_num(in_file):
    """Return the distinct chromosome names found in a GTF file.

    The chromosome name is the first tab-separated field of each record.
    Lines whose stripped text starts with ``#`` are treated as comments and
    skipped, as are lines whose first field is empty (e.g. blank lines).

    Args:
        in_file: Path of the GTF file to scan.

    Returns:
        A list of chromosome names in order of first appearance, without
        duplicates.
    """
    chromo = []
    # The set gives O(1) membership tests; the list preserves first-seen order.
    seen = set()
    with open(in_file, 'r') as fr:
        for line in fr:
            record = line.strip()
            if record.startswith("#"):
                continue
            # Parse the first column once instead of re-splitting per check.
            name = record.split("\t", 1)[0]
            if name and name not in seen:
                seen.add(name)
                chromo.append(name)

    return chromo

def split_gtf(chromo_list,gtf_file_path,output_folder):
    """Split a GTF file into one ``<chromosome>.gtf`` file per chromosome.

    A record belongs to a chromosome only when its first tab-separated field
    equals the chromosome name exactly. The previous implementation ran
    ``grep <name> <file>`` through a shell, which matched the name as an
    unanchored regular expression anywhere on the line (so ``chr1`` also
    captured ``chr10`` records and any record mentioning ``chr1`` in its
    attributes) and broke on paths containing spaces or shell metacharacters.

    Args:
        chromo_list: Chromosome names to extract.
        gtf_file_path: Path of the input GTF file.
        output_folder: Directory that receives the per-chromosome files; it is
            created if it does not exist.

    Notes:
        Every name in ``chromo_list`` gets an output file, which is empty when
        the input has no matching record. Existing files with the same name
        are overwritten. Comment lines (stripped text starting with ``#``) are
        not copied to any output.
    """
    os.makedirs(output_folder,exist_ok=True)

    out_paths = {
        name: os.path.join(output_folder, name + ".gtf") for name in chromo_list
    }
    # Create/truncate every target up front so the main pass can always append.
    for path in out_paths.values():
        with open(path, "w"):
            pass

    # Only one output handle is kept open at a time: it is swapped whenever the
    # chromosome changes, so inputs with thousands of scaffolds cannot exhaust
    # the open-file limit, and unsorted inputs still work via append mode.
    current_name = None
    current_out = None
    try:
        with open(gtf_file_path, "r") as gtf:
            for line in gtf:
                record = line.strip()
                if record.startswith("#"):
                    continue
                name = record.split("\t", 1)[0]
                if name not in out_paths:
                    continue
                if name != current_name:
                    if current_out is not None:
                        current_out.close()
                    current_out = open(out_paths[name], "a")
                    current_name = name
                current_out.write(line)
    finally:
        if current_out is not None:
            current_out.close()

def final_run():
    """Command-line entry point: parse the options and split the GTF file.

    Options:
        -g / --gtf            input GTF file (required)
        -o / --output-folder  folder that receives the per-chromosome files
                              (required)

    The chromosome names found in the input are printed before the split is
    performed.
    """
    # Kept verbatim: RawDescriptionHelpFormatter prints this text as written.
    author_note = '''
        @author: MingYan
        @contact: mingyan24@126.com
        '''

    cli = argparse.ArgumentParser(
        description="split gtf",
        epilog=author_note,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    cli.add_argument("-g", "--gtf", required=True, help="specify the input gtf")
    cli.add_argument("-o", "--output-folder", required=True, help="specify the output folder")
    # NOTE: a "-nt/--num-threads" option and a multiprocessing.Pool based split
    # were drafted here earlier but are disabled; the split runs serially.
    options = cli.parse_args()

    chromosomes = get_chromosome_num(options.gtf)
    print(chromosomes)

    split_gtf(chromosomes, options.gtf, options.output_folder)

# Script entry point: runs only when this file is executed directly (not when
# it is imported), performs the split, then prints a completion message.
if __name__ == "__main__":
    final_run()

    print("Congratulations!")

没啥实际应用,就是为了学python

使用

python split_gtf_according_to_chromosome_num.py -g ../20220712/gtf/GCF_000146045.2_R64_genomic.gtf -o output_mingyan_1
上一篇下一篇

猜你喜欢

热点阅读