用 Python 解析最新版的 KEGG 信息(第六题)
2020-03-02 本文已影响0人
多啦A梦詹
测试文件 hsa00001.keg 从 KEGG 官网下载。
import re
import os
import csv
from collections import OrderedDict
# Switch to the working directory; the relative file names used by this
# script resolve against it.
os.chdir('D:/python')

# One pattern per hierarchy level, compiled once instead of on every line.
# Raw strings keep \d, \s and \[ from being read as (invalid) string escapes.
_LEVEL_A = re.compile(r'A\d+\s(.+)')
_LEVEL_B = re.compile(r'B\s+\d+\s(.+)')
_LEVEL_C = re.compile(r'C\s+(\d+)\s(.+)')
_LEVEL_D = re.compile(r'D\s+(\d+)\s(.+)')
# Trailing " [...]" annotation that is stripped from a C-level name.
_BRACKET_TAG = re.compile(r'\s\[.+\]')


def _require(match, line):
    """Return *match*, or raise ValueError quoting the line that failed."""
    if match is None:
        raise ValueError('unrecognised line in hsa00001.keg: %r' % line)
    return match


# Nested result:
#   dit[class_A][class_B][pathway] = [[gene IDs...], [gene names...]]
# Each A/B/C line opens a new entry under the most recently seen parent;
# each D line appends to the most recently seen pathway.
dit = {}
with open('hsa00001.keg', 'r') as f:
    for line in f:
        line = line.rstrip()
        if line.startswith('A'):
            class_A = _require(_LEVEL_A.search(line), line).group(1)
            dit[class_A] = {}
        elif line.startswith('B'):
            if line == "B":
                # A line holding only "B" carries no name; skip it.
                continue
            mch = _require(_LEVEL_B.search(line), line)
            # Commas become semicolons so the name stays a single CSV field.
            class_B = mch.group(1).replace(',', ';')
            dit[class_A][class_B] = {}
        elif line.startswith('C'):
            mch = _require(_LEVEL_C.match(line), line)
            pathID = mch.group(1)
            pathName = _BRACKET_TAG.sub('', mch.group(2))
            # Same comma -> semicolon substitution as for class B names.
            pathway = 'hsa' + pathID + ':' + pathName.replace(',', ';')
            dit[class_A][class_B][pathway] = [[], []]
        elif line.startswith('D'):
            # Only the text before the first ';' is parsed: the numeric ID
            # and whatever follows it up to that ';'.
            head = line.split(';')[0]
            mch = _require(_LEVEL_D.search(head), line)
            geneID = mch.group(1)
            geneName = mch.group(2)
            dit[class_A][class_B][pathway][0].append(geneID)
            dit[class_A][class_B][pathway][1].append(geneName)
# Flatten the nested mapping into one CSV row per pathway:
#   class A, class B, pathway, ';'-joined gene IDs, ';'-joined gene names.
# Rows go through csv.writer so that a field containing a comma or a quote
# is quoted instead of silently shifting the columns. newline='' lets the
# csv module control the row terminator itself.
with open('cleaned_KEGG.csv', 'w', newline='') as f:
    mycsv = csv.writer(f)
    for ka, va in dit.items():
        for kb, vb in va.items():
            for kc, vc in vb.items():
                geneid = ';'.join(vc[0])
                genename = ';'.join(vc[1])
                mycsv.writerow([ka, kb, kc, geneid, genename])
# Re-read the cleaned table and report two numbers: how many pathways have
# at least one gene, and how many distinct gene IDs occur across them.
numpathway = 0
allgenes = []
# The name must match the file written earlier in this script exactly
# ('cleaned_KEGG.csv'); a different letter case only works on
# case-insensitive filesystems.
with open('cleaned_KEGG.csv', 'r', newline='') as f:
    for row in csv.reader(f):
        # Column 3 holds the ';'-joined gene IDs. Skip rows that are too
        # short (e.g. blank lines) or that list no genes.
        if len(row) < 4 or row[3] == '':
            continue
        numpathway += 1
        # extend() appends in place; rebuilding the list with '+' on every
        # row would copy everything collected so far each time.
        allgenes.extend(row[3].split(';'))
print(numpathway, len(set(allgenes)))