KEGG数据库数据下载

2023-11-23  本文已影响0人  wo_monic

KEGG 数据库提供 REST API 供用户批量访问,文档见 https://www.kegg.jp/kegg/rest/keggapi.html
使用 R 语言的 KEGGREST 包可以帮助我们下载数据。

# Install the Bioconductor packages used by this script.
BiocManager::install("KEGGREST")
BiocManager::install("fmcsR")
#BiocManager::install("RbioRXN")
#devtools::install_git("https://github.com/cran/RbioRXN.git")
## Load packages
library(KEGGREST)
#library(RbioRXN)
## Show which databases KEGG exposes
listDatabases()

# Replace the Bioconductor build of KEGGREST with the patched fork.
# NOTE(review): presumably this fork works around the network errors mentioned
# in the article -- confirm before relying on it.
detach(package:KEGGREST, unload = TRUE)
library(devtools)
devtools::install_github("https://github.com/kozo2/KEGGREST/tree/patch-1")
# Re-attach KEGGREST: the detach() above removed it from the search path, so
# listDatabases() / keggList() below would not be found without this call.
library(KEGGREST)
library(httr)
# Route HTTP traffic through a local proxy. The url is your own machine's
# address and the port is the proxy's local port; drop this line if you can
# reach the KEGG server directly.
set_config(use_proxy(url="127.0.0.1",port=10809))
# Show all lists available from KEGG
listDatabases()
## Fetch every entry of a given database type
pathway <- keggList("pathway")
compound <- keggList("compound")
reaction <- keggList("reaction")

library(plyr)
#这个是RbioRXN包的函数,因为安装不上,所以我直接把它的函数源码拿过来用
#' Download KEGG entries by id and flatten them into one data frame
#'
#' Adapted from the `get.kegg.byId()` source of the RbioRXN package. Ids are
#' sent to `keggGet()` in batches of at most 10, and each returned entry is
#' flattened into a one-row data frame:
#'   * every `DBLINKS` item "db: id" becomes its own column `db` holding `id`,
#'     and the `DBLINKS` field itself is dropped;
#'   * `PATHWAY` items are rendered as "name: value" and joined with "///";
#'   * `REFERENCE` keeps the `REFERENCE` field of the first reference only;
#'   * any other field with more than one value is joined with "///".
#'
#' @param keggId Character vector of KEGG identifiers.
#' @return A data frame with one row per entry returned by `keggGet()`, with
#'   missing values replaced by `""`. An empty data frame when nothing was
#'   returned (including when `keggId` is empty).
get.kegg.byId <-
  function(keggId) {
    batch_size <- 10L

    # Turn one parsed KEGG entry (a named list) into a one-row data frame.
    flatten_entry <- function(entry) {
      for (field in names(entry)) {
        if (field == 'DBLINKS') {
          for (link in entry[['DBLINKS']]) {
            parts <- unlist(strsplit(link, ': '))
            entry[[parts[1]]] <- parts[2]
          }
        } else if (field == 'PATHWAY') {
          pathways <- entry[['PATHWAY']]
          entry[['PATHWAY']] <- paste(
            paste(names(pathways), pathways, sep = ': '),
            collapse = '///'
          )
        } else if (field == 'REFERENCE') {
          entry[['REFERENCE']] <- paste(
            entry[['REFERENCE']][[1]]$REFERENCE,
            collapse = '///'
          )
        } else if (length(entry[[field]]) > 1) {
          entry[[field]] <- paste(entry[[field]], collapse = '///')
        }
      }
      entry[['DBLINKS']] <- NULL
      as.data.frame(entry, stringsAsFactors = FALSE)
    }

    n_ids <- length(keggId)
    rows <- list()
    i <- 1L
    while (i <= n_ids) {
      cat('processing', keggId[i], '\n')
      # Clamp the upper bound: the last batch may hold fewer than 10 ids, and
      # indexing past the end of the vector would send NA ids to keggGet().
      last <- min(i + batch_size - 1L, n_ids)
      query <- keggGet(keggId[i:last])

      # seq_along() keeps the loop from running when a batch returns nothing.
      for (l in seq_along(query)) {
        rows[[length(rows) + 1L]] <- flatten_entry(query[[l]])
      }
      i <- i + batch_size
    }

    if (length(rows) == 0L) {
      return(data.frame())
    }

    # Bind all rows in a single call instead of re-binding the growing table
    # after every entry; rbind.fill() pads columns missing from a row with NA.
    kegg <- rbind.fill(rows)
    kegg[is.na(kegg)] <- ''
    kegg
  }

批量获取 KEGG 的所有反应和化合物数据
以下是 RbioRXN 包 get.kegg.all 函数的源码:

    # Download every KEGG reaction and compound and bundle them in `keggAll`
    # (adapted from the body of RbioRXN's get.kegg.all).

    # List all compound / reaction ids and strip the database prefix, if any.
    cmp <- keggList("compound")
    cmpId <- sub('cpd:', '', names(cmp))
    reactionEntry <- sub('rn:', '', names(keggList("reaction")))

    # Fetch and flatten the full records; blank out any remaining NA cells.
    keggReaction <- get.kegg.byId(reactionEntry)
    keggReaction[is.na(keggReaction)] <- ""

    keggCompound <- get.kegg.byId(cmpId)
    keggCompound[is.na(keggCompound)] <- ""

    # Among the reactions that carry a reference, keep only the first row of
    # each ENTRY. duplicated() compares ids exactly, whereas the former
    # grep(id, ...) loop treated each id as a regular expression.
    referIndex <- grep('.+', keggReaction$REFERENCE)
    referId <- keggReaction[referIndex, 'ENTRY']
    redundantIndex <- referIndex[duplicated(referId)]

    # Negative indexing with an empty vector would drop every row, so only
    # subset when there is something to remove.
    if (length(redundantIndex) > 0) {
      keggReaction_unique <- keggReaction[-redundantIndex, ]
    } else {
      keggReaction_unique <- keggReaction
    }

    result <- list(
      reaction = keggReaction_unique,
      compound = keggCompound
    )
    cat('# of reactions:', nrow(keggReaction_unique), '\n')
    cat('# of compounds:', nrow(keggCompound), '\n')
    keggAll <- result
## Save the combined reaction and compound data for later sessions
save(keggAll, file = "keggAll.Rdata")
### Export each table to CSV in the working directory.
### The file names previously started with a stray space (" reaction.csv"),
### which produced files that are awkward to open from a shell or file dialog.
reaction <- keggAll$reaction
write.csv(reaction, "reaction.csv")

compound <- keggAll$compound
write.csv(compound, "compound.csv")

如果下载时遇到 403 错误,通常是网络问题,可以更换 IP 地址后重试。
本文修改自 https://cloud.tencent.com/developer/article/1800280 的源码,解决了包安装不上和网络错误的问题。

上一篇下一篇

猜你喜欢

热点阅读