R语言R绘图

R爬虫实战--爬取MalaCards疾病数据库信息

2021-03-27  本文已影响0人  小贝学生信

之前只知道可以用Python写爬虫,偶然了解到R语言也有相应的R包可以完成网页爬取。使用下来觉得R爬虫比较方便(尤其是我目前对R更熟悉)。

参考视频教程:https://www.bilibili.com/video/BV1nE411M7h2?share_source=copy_web

1、准备

(1)安装并加载R包(未安装的包需先用install.packages()安装)

library("rvest")
library("xml2")
library("dplyr")
library("stringr")

(2)谷歌浏览器与SelectorGadget插件

(3)关于MalaCards数据库

2、获取疾病名

关键三步骤
(1)read_html(url链接)获取链接索引的网页内容
(2)html_nodes(xpath=***)获取目标网页数据
(3)html_text()返回文本信息

# Scrape the A-Z disease index pages of MalaCards and stack them into a single
# data.frame. For every capital letter the "showAll" listing page is
# downloaded, the text of its <table> nodes is extracted, and the first table
# is reshaped into a 3-column table whose header comes from the page itself.
Mala_id <- lapply(LETTERS, function(i) {
  print(i) # progress indicator: the letter currently being fetched

  # Step 1: download the listing page and pull the text of every <table> node.
  link2id <- paste0("https://www.malacards.org/malalist/", i, "?showAll=true")
  raw_info <- read_html(link2id, encoding = "UTF-8") %>%
    html_nodes(xpath = "//table") %>%
    html_text()

  # Step 2: clean the raw text (the patterns were found by trial and error).
  # " {4,}" matches runs of at least four spaces and removes them; remaining
  # runs of two or more spaces collapse to one; the text is then split into
  # cells on Windows-style line breaks.
  ID_info <- str_replace_all(raw_info, " {4,}", "") %>%
    str_replace_all(" {2,}", " ") %>%
    strsplit("\r\n")

  # Step 3: reshape the cells of the first table into rows of three columns.
  # TRUE/FALSE are spelled out because T/F are ordinary, reassignable variables.
  mat <- as.data.frame(
    matrix(ID_info[[1]], byrow = TRUE, ncol = 3),
    stringsAsFactors = FALSE
  )
  # The first row holds the column titles: promote it to the header, drop it.
  colnames(mat) <- mat[1, ]
  mat <- mat[-1, ]
  mat[, 1] <- trimws(mat[, 1]) # strip leading/trailing whitespace
  mat[, 2] <- trimws(mat[, 2])
  mat
})
Mala_id <- do.call(rbind, Mala_id)
# Renumber the rows 1..n; seq_len() stays valid even when the table is empty,
# whereas 1:nrow() would produce c(1, 0).
rownames(Mala_id) <- seq_len(nrow(Mala_id))

3、获取疾病分类信息

# Disease symbols taken from the index table; each one is looked up on its own
# MalaCards card page to collect the "MalaCards categories" classification.
Symbol <- Mala_id$Symbol

# Split a run-together category string such as
# " Rare diseasesGenetic diseases" into c("Rare disease", "Genetic disease").
# `drop_last = TRUE` discards the final piece, which is needed when the text
# is followed by the next section's label (e.g. "Anatomical:").
split_category_text <- function(txt, drop_last = FALSE) {
  parts <- strsplit(txt, "diseases")[[1]]
  if (drop_last) {
    parts <- parts[-length(parts)]
  }
  str_replace(paste0(parts, "disease"), "^ ", "")
}

# Example card URL: https://www.malacards.org/card/inflammatory_bowel_disease_1
# id <- Symbol[1]  # handy for stepping through the function body by hand
# Only the first 100 symbols are scraped here as a demonstration.
#
# Each element of the result is list(ID, global, Anatomical); `global` and/or
# `Anatomical` are NULL when the card does not list that kind of category.
res_classf <- lapply(Symbol[1:100], function(id) {
  print(paste0(id, " ", which(Symbol == id), "/", length(Symbol)))

  # Build the URL slug from the disease name (column 1 of the row whose symbol
  # matches): separators become "_", repeated "_" collapse, quotes are removed.
  # NOTE(review): "[ ,-/]" is a character range from "," to "/", so it also
  # matches "." -- confirm that this is intended.
  ID_sle <- str_replace_all(
    Mala_id[which(Mala_id[, 2] == id), ][, 1], "[ ,-/]", "_"
  ) %>%
    str_replace_all("_{2,}", "_") %>%
    str_replace_all("[',]", "")
  card_url <- paste0("https://www.malacards.org/card/", ID_sle)

  # Fetch the card, retrying a bounded number of times. inherits() is required
  # here: a successful read_html() returns an object with two classes, so
  # `class(web) == "try-error"` has length 2 and is an error as a while()
  # condition in R >= 4.2.
  max_tries <- 5
  tries <- 1
  web <- try(read_html(card_url), silent = TRUE)
  while (inherits(web, "try-error") && tries < max_tries) {
    print(web)
    Sys.sleep(1) # brief pause so a failing server is not hammered
    web <- try(read_html(card_url), silent = TRUE)
    tries <- tries + 1
  }

  global <- NULL
  Anatomical <- NULL

  if (inherits(web, "try-error")) {
    # Give up on this card instead of looping forever; the warning keeps the
    # failure visible, and the result has the same shape as "no categories".
    warning("Could not download ", card_url, " after ", max_tries,
      " attempts",
      call. = FALSE
    )
  } else {
    res0 <- web %>%
      html_nodes(
        xpath = '//*[contains(concat( " ", @class, " " ), concat( " ", "tab", " " ))]'
      ) %>%
      html_text()

    Mc <- str_detect(res0, "MalaCards categories")
    if (sum(Mc) == 1) {
      res1 <- (res0[Mc] %>% str_replace_all(" {2,}", "") %>% strsplit("\r\n"))[[1]]
      if (length(res1) == 7) {
        # Case 1: both Global and Anatomical categories are present, e.g. APM002
        # [1] ""                                   "MalaCards categories: "
        # [3] "Global:"                            " Rare diseasesGenetic diseasesAnatomical:"
        # [5] " Nephrological diseasesBone diseases"
        # [6] "See all MalaCards categories (disease lists)"
        # [7] ""
        # Element 4 ends with the "Anatomical:" label, hence drop_last.
        global <- split_category_text(res1[4], drop_last = TRUE)
        Anatomical <- split_category_text(res1[5])
      } else if (any(str_detect(res1, "Anatomical"))) {
        # Case 2: only Anatomical categories, e.g. ACH002
        # [3] "Anatomical:"   [4] " Bone diseases"
        Anatomical <- split_category_text(res1[4])
      } else if (any(str_detect(res1, "Global"))) {
        # Case 3: only Global categories, e.g. ACT206
        # [3] "Global:"   [4] " Fetal diseasesRare diseases"
        global <- split_category_text(res1[4])
      }
      # Any other layout falls through with both fields NULL, so every element
      # of the result has the same structure (previously a bare NULL).
    }
    # Case 4 (e.g. ABD009): no category tab on the card -- both fields NULL.
  }

  result <- list(ID = id, global = global, Anatomical = Anatomical)
  print(result)
  result
})
names(res_classf) <- Mala_id$Symbol[1:100]
str(res_classf)
# Scrape the ICD10 entry from the card page of every symbol in `myid`.
# NOTE(review): `myid` is not defined in the code shown here -- presumably a
# vector of MalaCards symbols prepared elsewhere; confirm before running.
# NOTE(review): entries 7 and 36 are skipped, presumably because their pages
# could not be processed -- confirm.
# NOTE(review): this reuses the name `res_classf`, so it overwrites the
# category results if both steps run in the same session.
#
# Each element of the result is list(ID, ICD10); ICD10 is NULL when the card
# does not have exactly one tab mentioning "ICD10".
res_classf <- lapply(myid[c(-7, -36)], function(id) {
  print(paste0(id, " ", which(myid == id), "/", length(myid)))

  # Build the URL slug from the disease name: spaces and hyphens become "_",
  # apostrophes and commas are removed.
  ID_sle <- str_replace_all(
    Mala_id[which(Mala_id[, 2] == id), ][, 1], "[ -]", "_"
  ) %>%
    str_replace_all("[',]", "")
  web <- read_html(paste0("https://www.malacards.org/card/", ID_sle))
  res0 <- web %>%
    html_nodes(
      xpath = '//*[contains(concat( " ", @class, " " ), concat( " ", "tab", " " ))]'
    ) %>%
    html_text()

  Mc <- str_detect(res0, "ICD10")
  ICD10 <- NULL
  if (sum(Mc) == 1) {
    res1 <- res0[Mc] %>%
      str_replace_all(" {2,}", "") %>%
      strsplit("\r\n")
    # The fourth line of the cleaned tab text is used as the ICD10 value;
    # a single trailing space is stripped.
    ICD10 <- str_replace(res1[[1]][4], " $", "")
  }

  result <- list(ID = id, ICD10 = ICD10)
  print(result)
  result
})
str(res_classf)
# Name the results with the same subset that was scraped; assigning the full
# `myid` would fail because it is two elements longer than the result list.
names(res_classf) <- myid[c(-7, -36)]

4、其它

# Fetch a single card page, handing read_html() a url() connection rather
# than the URL string itself.
"https://www.malacards.org/card/alopecia_androgenetic_1" %>%
  url() %>%
  read_html()
上一篇下一篇

猜你喜欢

热点阅读