根据Knumbers注释爬取KEGG对应KO号和pathway
2021-08-18 本文已影响0人
精神秃头生信小伙儿
背景
手头有各个基因对应的KEGG K number注释文件(注释可以参考KAAS注释流程),但没有这些K number所属的通路编号(即koXXXXX形式的pathway ID,下文简称KO号),需要先得到相应的KO号才能做富集分析
Python script
用selenium+xpath爬取,试了下发现KEGG最大接受的输入knumbers数量为7000左右,所以分割数据进行爬取(这里没有用线程池爬,要加速可以用多线程),注意脚本中chrome driver改成自己的地址即可。
参数共三个:-kn,输入文件路径,文件内为用换行符分隔的K number;-k2ko,输出文件路径,保存爬取到的K number与KO号的对应关系;-t2n,输出文件路径,保存爬取到的KO号及其pathway name描述。
得到文件后就可以很方便的用R整理数据,以及超几何检验/现成R包(例如ClusterProfiler)进行富集
from selenium import webdriver
# No visual interface
from selenium.webdriver.chrome.options import Options
# Avoid test
from selenium.webdriver import ChromeOptions
from lxml import etree
import os
import argparse
def setoption():
    """
    Build the two Chrome option objects used when launching the browser.

    :return: a tuple ``(headless_opts, automation_opts)``. The first object
        excludes the ``enable-logging`` switch and adds the ``--headless``
        and ``--disable-gpu`` arguments; the second excludes the
        ``enable-automation`` switch.
    """
    # Options for running without a visible browser window.
    headless_opts = Options()
    headless_opts.add_experimental_option('excludeSwitches', ['enable-logging'])
    for flag in ("--headless", "--disable-gpu"):
        headless_opts.add_argument(flag)

    # Separate options object that drops the "enable-automation" switch.
    automation_opts = ChromeOptions()
    automation_opts.add_experimental_option("excludeSwitches", ["enable-automation"])

    return headless_opts, automation_opts
def mapkn2ko(knsearch, k2kopath, term2namepath):
    """
    Submit a batch of K numbers to the KEGG KO search page and save the result.

    The result page is parsed with lxml. For every ``<li>`` entry found, one
    line per linked K number is appended to ``k2kopath`` as
    ``<knumber>\\t<ko>``, and one line is appended to ``term2namepath`` as
    ``<ko>\\t<ko_name>``.

    :param knsearch: kn string (the text typed into the search textarea)
    :param k2kopath: path to save k2ko file (opened in append mode)
    :param term2namepath: path to save term2name file (opened in append mode)
    :raises IndexError: if an entry on the result page lacks the expected
        link text, label text, or ``:`` separator in a K number link
    """
    chrome_options, option = setoption()
    # NOTE(review): both option objects are passed on, as in the original
    # script; some Selenium releases appear to honour only one of
    # ``chrome_options``/``options`` -- confirm against the installed version.
    bro = webdriver.Chrome(executable_path=r"/YourPathTo/chromedriver.exe",
                           chrome_options=chrome_options,
                           options=option)
    try:
        bro.get("https://www.genome.jp/kegg/ko.html")
        # ``find_element("xpath", ...)`` is the generic locator call; it works
        # on releases that no longer ship ``find_element_by_xpath``.
        search_input = bro.find_element(
            "xpath",
            '//*[@id="content"]/div[2]/div[1]/form/table/tbody/tr[2]/td/textarea')
        search_input.send_keys(knsearch)
        btn = bro.find_element(
            "xpath",
            '//*[@id="content"]/div[2]/div[1]/form/table/tbody/tr[3]/td/input[3]')
        btn.click()
        # The result page is read from the second window handle.
        bro.switch_to.window(bro.window_handles[1])
        page_text = bro.page_source
    finally:
        # Always shut the browser down, even when a lookup above fails,
        # so no Chrome/chromedriver process is left behind.
        bro.quit()

    tree = etree.HTML(page_text)
    k2ko_rows = []
    term2name_rows = []
    for li in tree.xpath('/html/body/div[2]/li'):
        ko = li.xpath('./a[1]/text()')[0]
        # Keep only the label text in front of the first " (".
        ko_name = li.xpath('./text()')[0].split(" (")[0]
        for kn in li.xpath('./div/a//text()'):
            # Link text has a prefix before ":"; keep the part after it.
            k2ko_rows.append(kn.split(":")[1] + "\t" + ko + "\n")
        term2name_rows.append(ko + "\t" + ko_name + "\n")

    # Write each output once per batch instead of reopening it per entry.
    with open(k2kopath, "a") as k2ko:
        k2ko.writelines(k2ko_rows)
    with open(term2namepath, "a") as term2name:
        term2name.writelines(term2name_rows)
def kninput(knfile, chunk_size=7000):
    """
    Read a K number file and split it into batches of lines.

    Each batch is returned as a single string made of up to ``chunk_size``
    consecutive lines of the file (line endings preserved), ready to be
    submitted as one query. No empty batch is ever produced, so an empty
    file yields an empty list.

    :param knfile: path to a text file with one K number per line
    :param chunk_size: maximum number of lines per batch (default 7000)
    :return: list of batch strings, in file order
    :raises ValueError: if ``chunk_size`` is smaller than 1
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be at least 1")

    batches = []
    current = []
    with open(knfile, "rt") as knf:
        for line in knf:
            current.append(line)
            # Flush as soon as the batch is full so it never exceeds the limit.
            if len(current) == chunk_size:
                batches.append("".join(current))
                current = []
    # Keep the final partial batch, but never emit an empty one.
    if current:
        batches.append("".join(current))
    return batches
def getoptions(argv=None):
    """
    Get options from command line

    :param argv: optional list of argument strings to parse; when ``None``
        (the default) argparse reads the process command line
    :return: knfilepath, k2kofilepath, t2nfilepath
    :raises TypeError: if any of '-kn', '-k2ko' or '-t2n' is missing or empty
    """
    parser = argparse.ArgumentParser(description="Get k2ko and term2name by inputting a file including knumbers")
    parser.add_argument("-kn", "--knumber", help="Path to knumbers file")
    parser.add_argument("-k2ko", "--knumber2pathway",
                        help="File path to save the relationship between knumbers and pathway names")
    parser.add_argument("-t2n", "--term2name",
                        help="File path to save the pathway name and corresponding description")
    args = parser.parse_args(argv)

    # Checked in this order so the first missing option is the one reported.
    required = (
        ("-kn", args.knumber),
        ("-k2ko", args.knumber2pathway),
        ("-t2n", args.term2name),
    )
    for flag, value in required:
        if not value:
            raise TypeError("Missing option '{}', please check your input".format(flag))
    return args.knumber, args.knumber2pathway, args.term2name
def mainprocess():
    """
    Run the whole pipeline: parse options, reset outputs, scrape each batch.
    """
    knfile, k2kofile, t2nfile = getoptions()

    # Results are appended batch by batch, so delete output files left over
    # from an earlier run before starting.
    for stale_output in (k2kofile, t2nfile):
        if os.path.isfile(stale_output):
            os.remove(stale_output)

    batches = kninput(knfile)
    for batch in batches:
        mapkn2ko(batch, k2kofile, t2nfile)
# Run the pipeline only when this file is executed as a script.
if __name__ == "__main__":
    mainprocess()
富集分析可以参考:
https://www.jianshu.com/p/8ee9a71d056e?utm_campaign=maleskine...