用python解析最新版的KEGG信息(第六题)

2020-03-02  本文已影响0人  多啦A梦詹

测试文件 hsa00001.keg 从 KEGG 官网下载。

import re
import os
import csv
from collections import OrderedDict

# Work inside the directory that holds the downloaded hsa00001.keg file.
os.chdir('D:/python')

# Line patterns for the four levels (A > B > C > D) of the .keg hierarchy.
# They are raw strings so that \d, \s and \[ reach the regex engine untouched
# instead of being parsed as (invalid) string escapes, and they are compiled
# once here rather than looked up again for every input line.
_A_LINE = re.compile(r'A\d+\s(.+)')        # "A<digits> <name>"
_B_LINE = re.compile(r'B\s+\d+\s(.+)')     # "B  <digits> <name>"
_C_LINE = re.compile(r'C\s+(\d+)\s(.+)')   # "C    <id> <name> [...]"
_D_LINE = re.compile(r'D\s+(\d+)\s(.+)')   # "D      <id> <name>"
_BRACKET_TAG = re.compile(r'\s\[.+\]')     # trailing " [...]" on a C-line name

# Nested result:
#   {class A name: {class B name: {pathway label: [gene IDs, gene names]}}}
dit = {}

with open('hsa00001.keg', 'r') as f:
    for line in f:
        line = line.rstrip()

        if line.startswith('A'):
            # New top-level class; later B/C/D lines attach to it.
            class_A = _A_LINE.search(line).group(1)
            dit[class_A] = {}

        elif line.startswith('B'):
            # A bare "B" line carries no name and is skipped.
            if line != "B":
                mch = _B_LINE.search(line)
                # Commas become semicolons so the name stays a single field
                # when the CSV is written later.
                class_B = mch.group(1).replace(',', ';')
                dit[class_A][class_B] = {}

        elif line.startswith('C'):
            mch = _C_LINE.match(line)
            pathID = mch.group(1)
            # Drop the bracketed suffix from the pathway name.
            pathName = _BRACKET_TAG.sub('', mch.group(2))
            # Same comma -> semicolon protection as for class B names.
            pathway_name = pathName.replace(',', ';')
            pathway = 'hsa' + pathID + ':' + pathway_name
            # Two parallel lists: index 0 holds gene IDs, index 1 gene names.
            dit[class_A][class_B][pathway] = [[], []]

        elif line.startswith('D'):
            # Only the text before the first ';' is parsed.
            head = line.partition(';')[0]
            mch = _D_LINE.search(head)
            geneID = mch.group(1)
            geneName = mch.group(2)
            gene_ids, gene_names = dit[class_A][class_B][pathway]
            gene_ids.append(geneID)
            gene_names.append(geneName)

# Write one CSV row per pathway:
#   class A, class B, pathway label, ';'-joined gene IDs, ';'-joined gene names
#
# Rows go through csv.writer (instead of a hand-built ','.join) so that a
# field containing a comma or a quote is quoted rather than silently shifting
# the columns.  newline='' lets the csv module control line endings itself;
# with the default dialect every row ends in '\r\n'.
with open('cleaned_KEGG.csv', 'w', newline='') as f:
    mycsv = csv.writer(f)
    for ka, va in dit.items():
        for kb, vb in va.items():
            for kc, vc in vb.items():
                geneid = ';'.join(vc[0])
                genename = ';'.join(vc[1])
                mycsv.writerow([ka, kb, kc, geneid, genename])

# Re-read the cleaned CSV and report two numbers: how many pathways have at
# least one gene, and how many distinct gene IDs appear across all of them.
numpathway = 0
allgenes = []
# The name is spelled exactly as it is when the file is written
# ('cleaned_KEGG.csv'); a different letter case only works on
# case-insensitive filesystems.  newline='' is the csv module's documented
# way to open files it reads.
with open('cleaned_KEGG.csv', 'r', newline='') as f:
    mycsv = csv.reader(f)
    for line in mycsv:
        # Column 3 holds the ';'-joined gene IDs.  Skip blank/short rows and
        # pathways whose gene list is empty.
        if len(line) < 4 or line[3] == '':
            continue
        ko_num = line[3].split(';')
        numpathway += 1
        # extend() grows the list in place instead of copying it on each row.
        allgenes.extend(ko_num)
print(numpathway, len(set(allgenes)))
上一篇 下一篇

猜你喜欢

热点阅读