正则表达式 爬虫SpiderMan

爬取NCBI中GEO中的数据

2020-03-01  本文已影响0人  一只烟酒僧

获取GEO中GSE网页的信息

# Collect series-level (GSE) metadata for every accession in `GEO_id`.
# This excerpt relies on objects created elsewhere: `GEO_id`, `table_dir`,
# `download_sample_info`, `sample_info_dir`, `sample_list` (a list) and
# `GSE_information` (a data frame), plus the RCurl, XML and stringr packages.

# Text of every node matched by `xpath`; NA when nothing matches, so a field
# that is missing from the page cannot break the data.frame() call below.
node_text <- function(tree, xpath) {
  values <- unlist(lapply(getNodeSet(tree, path = xpath), xmlValue))
  if (length(values) == 0) NA_character_ else values
}

# Turn the <tr> nodes matched by `xpath` into a character matrix with one row
# per table row. Returns NULL when the xpath matches nothing.
parse_sample_rows <- function(tree, xpath) {
  rows <- getNodeSet(tree, path = xpath)
  if (length(rows) == 0) {
    return(NULL)
  }
  # sapply() is kept on purpose: the number of cells per row depends on the
  # page, so no fixed vapply() template can be given.
  cells <- sapply(rows, function(x) {
    as.vector(str_split(xmlValue(x), "\n", simplify = TRUE))
  })
  if (!is.matrix(cells)) {
    stop("unexpected sample table layout", call. = FALSE)
  }
  # The third newline-separated piece of every row is discarded.
  t(cells[-3, , drop = FALSE])
}

for (i in seq_along(GEO_id)) {
  url <- paste("https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=",
               GEO_id[i], sep = "")
  GEO <- getURL(url, .encoding = "utf-8")
  # Keep a verbatim copy of the page. write.table() would wrap the HTML in
  # quoted table syntax, so writeLines() is used instead.
  writeLines(GEO, paste(table_dir, "GEO", GEO_id[i], ".html", sep = ""),
             useBytes = TRUE)
  GEO_tree <- htmlParse(GEO, asText = TRUE, encoding = "utf-8")

  status <- node_text(
    GEO_tree, '//td[contains(text(),"Status")]/following-sibling::td'
  )
  # Only the first matching title cell is used.
  title <- node_text(
    GEO_tree,
    '//tr/child::td[contains(text(),"Title")]/following-sibling::td'
  )[1]

  # Rows that are visible on the page.
  sample_visual <- parse_sample_rows(
    GEO_tree, '//td[contains(@onmouseout,"geo_empty_help")]/table//tr'
  )
  if (!is.null(sample_visual)) {
    # Rows whose first cell mentions a GPL accession are removed. A logical
    # mask is used because x[-grep(...), ] drops every row when grep() finds
    # nothing; drop = FALSE keeps a single remaining row as a matrix.
    is_gpl <- grepl("GPL", sample_visual[, 1])
    sample_visual <- sample_visual[!is_gpl, , drop = FALSE]
  }
  # Rows inside the nested <div> (the collapsed part of the list).
  sample_hide <- parse_sample_rows(
    GEO_tree, '//td[contains(@onmouseout,"geo_empty_help")]/div/table//tr'
  )
  sample <- rbind(sample_visual, sample_hide)
  if (is.null(sample)) {
    sample <- matrix(character(0), nrow = 0, ncol = 1)
  }
  sample_list[[i]] <- sample

  Organism <- node_text(
    GEO_tree, '//tr/td[contains(text(),"Organism")]/following-sibling::td'
  )
  Experiment.type <- node_text(
    GEO_tree,
    '//tr/td[contains(text(),"Experiment type")]/following-sibling::td'
  )
  summary <- node_text(
    GEO_tree, '//tr/td[contains(text(),"Summary")]/following-sibling::td'
  )
  Overall.design <- node_text(
    GEO_tree,
    '//tr/td[contains(text(),"Overall design")]/following-sibling::td'
  )
  Citation <- node_text(
    GEO_tree, '//td[contains(text(),"Citation")]/following-sibling::td/span'
  )

  # The link target is always extracted so that `sample_info` exists for the
  # summary row even when nothing is downloaded.
  matrix_link <- getNodeSet(
    GEO_tree, path = '//a[contains(text(),"Series Matrix File")]'
  )
  sample_info <- unlist(lapply(matrix_link, xmlGetAttr, name = "href"))
  if (length(sample_info) == 0) {
    sample_info <- NA_character_
  }

  # Download the series-matrix archives.
  if (isTRUE(download_sample_info) && !is.null(sample_info_dir) &&
        !anyNA(sample_info)) {
    sample_info_dir_new <- paste(sample_info_dir, GEO_id[i], sep = "")
    dir.create(path = sample_info_dir_new, showWarnings = FALSE)
    sample_name <- character(0)
    for (dir_url in sample_info) {
      # Pick the whitespace-separated listing tokens that mention "GSE" and
      # keep the part in front of the carriage return.
      listing <- getURL(dir_url)
      tokens <- as.character(str_split(listing, " ", simplify = TRUE))
      tokens <- tokens[grep("GSE", tokens)]
      if (length(tokens) == 0) {
        next
      }
      files <- str_split(tokens, "\\r", simplify = TRUE)[, 1]
      for (j in seq_along(files)) {
        download.file(url = paste(dir_url, files[j], sep = ""),
                      destfile = paste(sample_info_dir_new, files[j],
                                       sep = "/"),
                      mode = "wb")
      }
      sample_name <- c(sample_name, files)
    }
    cat(paste(paste(GEO_id[i],"的样本信息文件名为",paste(sample_name,collapse = "、"),sep=""),
              paste("保存在",sample_info_dir,"中",sep=""),
              sep="\n"))
  }

  GSE_information_sub <- data.frame(GEO_ID = GEO_id[i],
                                    Status = status,
                                    Title = title,
                                    Organism = Organism,
                                    Experiment.type = Experiment.type,
                                    Summary = summary,
                                    Overall.design = Overall.design,
                                    PMID = Citation,
                                    sample_NO = nrow(sample),
                                    sample_info_url = sample_info,
                                    stringsAsFactors = FALSE)
  GSE_information <- rbind(GSE_information, GSE_information_sub)
  # Pause between series pages.
  Sys.sleep(5)
}

获取GEO中GSM的信息

#-------------------------------------------------------
# Function2: fetch sample-level (GSM) information
#-------------------------------------------------------
# This excerpt relies on objects created elsewhere: `GSE_information`,
# `sample_list`, `table_dir` and `download_GSM_info`, plus the RCurl, XML and
# xlsx packages.

# Text of every node matched by `xpath`; NA when nothing matches.
node_text <- function(tree, xpath) {
  values <- unlist(lapply(getNodeSet(tree, xpath), xmlValue))
  if (length(values) == 0) NA_character_ else values
}

#-------------------------------------------------------
# Function3: progress bar
#-------------------------------------------------------
# Formats a fraction `num` as "Running:----NN%". Defined once here rather than
# on every loop iteration; the percentage is rounded down after adding 0.001
# and clamped to 0..100.
running_time <- function(num = num) {
  persent <- floor(max(min(num + 0.001, 1), 0) * 100)
  jindu <- paste(rep("--", round(persent / 3)), collapse = "")
  paste("Running:", jindu, persent, "%", sep = "")
}

write.xlsx(as.data.frame(GSE_information),
           paste(table_dir, "GSM_information.xlsx", sep = ""),
           sheetName = "GSE_information",
           col.names = TRUE)

# Make every entry a matrix so that nrow() and [n, 1] work even when an entry
# was stored as NULL or collapsed to a plain vector (single sample).
sample_list <- lapply(sample_list, function(x) {
  if (is.null(x)) {
    return(matrix(character(0), nrow = 0, ncol = 1))
  }
  if (is.null(dim(x))) {
    return(matrix(x, nrow = 1))
  }
  x
})
sum_GSM <- vapply(sample_list, nrow, integer(1))
sum_GSM_sum <- sum(sum_GSM)
sum_GSM_sum

if (isTRUE(download_GSM_info)) {
  # Column names of the result and the matching row labels on the GSM page.
  info_name <- c("Title", "Organism", "Source.name", "Characteristics",
                 "Treatment.protocol", "Growth.protocol",
                 "Extracted.molecule", "Extraction.protocol",
                 "Library.strategy", "Library.source", "Library.selection",
                 "Instrument.model", "Description", "Data.processing")
  info_key <- c("Title", "Organism", "Source name", "Characteristics",
                "Treatment protocol", "Growth protocol",
                "Extracted molecule", "Extraction protocol",
                "Library strategy", "Library source", "Library selection",
                "Instrument model", "Description", "Data processing")

  for (m in seq_along(sample_list)) {
    n_samples <- nrow(sample_list[[m]])
    sample_rows <- vector("list", n_samples)
    for (n in seq_len(n_samples)) {
      GSMid <- sample_list[[m]][n, 1]
      url <- paste("https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=",
                   GSMid, sep = "")
      GSM <- getURL(url, .encoding = "utf-8")
      GSM_url <- htmlParse(GSM, asText = TRUE)

      # Fields that are absent from the page come back as NA.
      info_list <- lapply(info_key, function(key) {
        xpath <- paste('//td[contains(text(),"', key,
                       '")]/following-sibling::td', sep = "")
        node_text(GSM_url, xpath)
      })
      names(info_list) <- info_name

      sra_all <- getNodeSet(
        GSM_url, '//td[contains(text(),"SRA")]/following-sibling::td/a'
      )
      sra <- unlist(lapply(sra_all, xmlValue))
      # Only the href is taken; xmlAttrs() would return every attribute.
      sra_url <- unlist(lapply(sra_all, xmlGetAttr, name = "href"))
      if (length(sra) == 0) {
        sra <- NA_character_
      }
      if (length(sra_url) == 0) {
        sra_url <- NA_character_
      }

      sample <- data.frame(GSM_ID = GSMid,
                           Title = info_list$Title,
                           Organism = info_list$Organism,
                           Source.name = info_list$Source.name,
                           Characteristics = info_list$Characteristics,
                           Treatment.protocol = info_list$Treatment.protocol,
                           Growth.protocol = info_list$Growth.protocol,
                           Extracted.molecule = info_list$Extracted.molecule,
                           Extraction.protocol = info_list$Extraction.protocol,
                           # Was scraped but never written out before.
                           Library.strategy = info_list$Library.strategy,
                           Library.source = info_list$Library.source,
                           Description = info_list$Description,
                           Library.selection = info_list$Library.selection,
                           Instrument.model = info_list$Instrument.model,
                           Data.processing = info_list$Data.processing,
                           sra_id = sra,
                           sra_url = sra_url,
                           stringsAsFactors = FALSE)
      sample_rows[[n]] <- sample

      # Samples finished in earlier series plus the current one, as a
      # fraction of all samples.
      jindu <- (sum(sum_GSM[seq_len(m - 1)]) + n) / sum_GSM_sum
      print(running_time(jindu))
    }

    # Bind once per series instead of growing a data frame inside the loop.
    sample_info <- if (n_samples > 0) {
      do.call(rbind, sample_rows)
    } else {
      data.frame()
    }

    # Fall back to the position when `sample_list` carries no names, so that
    # sheet names stay unique.
    sheet_id <- names(sample_list)[m]
    if (is.null(sheet_id) || is.na(sheet_id)) {
      sheet_id <- m
    }
    if (nrow(sample_info) > 0) {
      write.xlsx(sample_info,
                 paste(table_dir, "GSM_information.xlsx", sep = ""),
                 sheetName = paste(sheet_id, "_sampleinformation", sep = ""),
                 append = TRUE, row.names = FALSE)
    }
  }
}

汇总写成函数

library(RCurl)
library(XML)
library(stringr)
library(rvest)
library(xlsx)
#' Scrape GEO series (GSE) pages and, optionally, their sample (GSM) pages
#'
#' For every accession the series page is fetched, saved as HTML, and parsed
#' into one summary row. The sample table of each series is collected and, if
#' requested, every sample page is scraped as well. Results are written below
#' `<table_dir>/myGEO_search/`: `GSE_information.txt`, and
#' `GSM_information.xlsx` with one sheet for the series summary and one sheet
#' per series for its samples.
#'
#' @param GEO_id Character vector of GSE accessions; duplicates are removed.
#' @param table_dir Directory under which `myGEO_search/` is created.
#' @param download_sample_info If TRUE, download the files found behind the
#'   "Series Matrix File" link of each series.
#' @param download_GSM_info If TRUE, scrape every GSM page.
#' @param sample_info_dir Directory for the downloaded series-matrix files
#'   (one sub-directory per accession). Required if and only if
#'   `download_sample_info` is TRUE.
#' @return The series summary data frame, invisibly.
GEO_get <- function(GEO_id = GEO_id, table_dir = ".",
                    download_sample_info = TRUE, download_GSM_info = TRUE,
                    sample_info_dir = NULL) {
  # The default `GEO_id = GEO_id` cannot be evaluated; fail with a clear
  # message instead of a promise-recursion error.
  if (missing(GEO_id) || length(GEO_id) == 0) {
    stop("GEO_id must contain at least one accession", call. = FALSE)
  }
  if (isTRUE(download_sample_info) && is.null(sample_info_dir)) {
    stop("快交出保存样本信息文件的地址,不然不给你下载")
  }
  if (!isTRUE(download_sample_info) && !is.null(sample_info_dir)) {
    stop("download_sample_info设为true啊小兄弟")
  }
  GEO_id <- unique(GEO_id)

  # ---- local helpers -------------------------------------------------------

  # Append "/" unless the path is empty or already ends in a separator, so
  # that "." becomes "./myGEO_search/" rather than ".myGEO_search/".
  with_trailing_slash <- function(path) {
    if (!nzchar(path) || grepl("[/\\\\]$", path)) {
      path
    } else {
      paste(path, "/", sep = "")
    }
  }

  # Text of every node matched by `xpath`; NA when nothing matches, so a field
  # that is missing from the page cannot break a data.frame() call.
  node_text <- function(tree, xpath) {
    values <- unlist(lapply(getNodeSet(tree, path = xpath), xmlValue))
    if (length(values) == 0) NA_character_ else values
  }

  # Turn the <tr> nodes matched by `xpath` into a character matrix with one
  # row per table row. Returns NULL when the xpath matches nothing.
  parse_sample_rows <- function(tree, xpath) {
    rows <- getNodeSet(tree, path = xpath)
    if (length(rows) == 0) {
      return(NULL)
    }
    # sapply() is kept on purpose: the number of cells per row depends on the
    # page, so no fixed vapply() template can be given.
    cells <- sapply(rows, function(x) {
      as.vector(str_split(xmlValue(x), "\n", simplify = TRUE))
    })
    if (!is.matrix(cells)) {
      stop("unexpected sample table layout", call. = FALSE)
    }
    # The third newline-separated piece of every row is discarded.
    t(cells[-3, , drop = FALSE])
  }

  # Progress bar: formats a fraction as "Running:----NN%". The percentage is
  # rounded down after adding 0.001 and clamped to 0..100.
  running_time <- function(num = num) {
    persent <- floor(max(min(num + 0.001, 1), 0) * 100)
    jindu <- paste(rep("--", round(persent / 3)), collapse = "")
    paste("Running:", jindu, persent, "%", sep = "")
  }

  #-------------------------------------------------------
  # Function1: fetch series-level (GSE) information
  #-------------------------------------------------------
  table_dir <- paste(with_trailing_slash(table_dir), "myGEO_search/", sep = "")
  dir.create(table_dir, showWarnings = FALSE, recursive = TRUE)
  if (!is.null(sample_info_dir)) {
    sample_info_dir <- with_trailing_slash(sample_info_dir)
  }
  GSE_information <- data.frame()
  sample_list <- list()

  for (i in seq_along(GEO_id)) {
    url <- paste("https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=",
                 GEO_id[i], sep = "")
    GEO <- getURL(url, .encoding = "utf-8")
    # Keep a verbatim copy of the page. write.table() would wrap the HTML in
    # quoted table syntax, so writeLines() is used instead.
    writeLines(GEO, paste(table_dir, "GEO", GEO_id[i], ".html", sep = ""),
               useBytes = TRUE)
    GEO_tree <- htmlParse(GEO, asText = TRUE, encoding = "utf-8")

    status <- node_text(
      GEO_tree, '//td[contains(text(),"Status")]/following-sibling::td'
    )
    # Only the first matching title cell is used.
    title <- node_text(
      GEO_tree,
      '//tr/child::td[contains(text(),"Title")]/following-sibling::td'
    )[1]

    # Rows that are visible on the page.
    sample_visual <- parse_sample_rows(
      GEO_tree, '//td[contains(@onmouseout,"geo_empty_help")]/table//tr'
    )
    if (!is.null(sample_visual)) {
      # Rows whose first cell mentions a GPL accession are removed. A logical
      # mask is used because x[-grep(...), ] drops every row when grep() finds
      # nothing; drop = FALSE keeps a single remaining row as a matrix.
      is_gpl <- grepl("GPL", sample_visual[, 1])
      sample_visual <- sample_visual[!is_gpl, , drop = FALSE]
    }
    # Rows inside the nested <div> (the collapsed part of the list).
    sample_hide <- parse_sample_rows(
      GEO_tree, '//td[contains(@onmouseout,"geo_empty_help")]/div/table//tr'
    )
    sample <- rbind(sample_visual, sample_hide)
    if (is.null(sample)) {
      sample <- matrix(character(0), nrow = 0, ncol = 1)
    }
    sample_list[[i]] <- sample

    Organism <- node_text(
      GEO_tree, '//tr/td[contains(text(),"Organism")]/following-sibling::td'
    )
    Experiment.type <- node_text(
      GEO_tree,
      '//tr/td[contains(text(),"Experiment type")]/following-sibling::td'
    )
    summary <- node_text(
      GEO_tree, '//tr/td[contains(text(),"Summary")]/following-sibling::td'
    )
    Overall.design <- node_text(
      GEO_tree,
      '//tr/td[contains(text(),"Overall design")]/following-sibling::td'
    )
    Citation <- node_text(
      GEO_tree, '//td[contains(text(),"Citation")]/following-sibling::td/span'
    )

    # The link target is always extracted so that `sample_info` exists for
    # the summary row even when download_sample_info is FALSE (it used to be
    # undefined in that case and the data.frame() call failed).
    matrix_link <- getNodeSet(
      GEO_tree, path = '//a[contains(text(),"Series Matrix File")]'
    )
    sample_info <- unlist(lapply(matrix_link, xmlGetAttr, name = "href"))
    if (length(sample_info) == 0) {
      sample_info <- NA_character_
    }

    # Download the series-matrix archives.
    if (isTRUE(download_sample_info) && !is.null(sample_info_dir) &&
          !anyNA(sample_info)) {
      sample_info_dir_new <- paste(sample_info_dir, GEO_id[i], sep = "")
      dir.create(path = sample_info_dir_new, showWarnings = FALSE,
                 recursive = TRUE)
      sample_name <- character(0)
      for (dir_url in sample_info) {
        # Pick the whitespace-separated listing tokens that mention "GSE" and
        # keep the part in front of the carriage return.
        listing <- getURL(dir_url)
        tokens <- as.character(str_split(listing, " ", simplify = TRUE))
        tokens <- tokens[grep("GSE", tokens)]
        if (length(tokens) == 0) {
          next
        }
        files <- str_split(tokens, "\\r", simplify = TRUE)[, 1]
        for (j in seq_along(files)) {
          download.file(url = paste(dir_url, files[j], sep = ""),
                        destfile = paste(sample_info_dir_new, files[j],
                                         sep = "/"),
                        mode = "wb")
        }
        sample_name <- c(sample_name, files)
      }
      cat(paste(paste(GEO_id[i],"的样本信息文件名为",paste(sample_name,collapse = "、"),sep=""),
                paste("保存在",sample_info_dir,"中",sep=""),
                sep="\n"))
    }

    GSE_information_sub <- data.frame(GEO_ID = GEO_id[i],
                                      Status = status,
                                      Title = title,
                                      Organism = Organism,
                                      Experiment.type = Experiment.type,
                                      Summary = summary,
                                      Overall.design = Overall.design,
                                      PMID = Citation,
                                      sample_NO = nrow(sample),
                                      sample_info_url = sample_info,
                                      stringsAsFactors = FALSE)
    GSE_information <- rbind(GSE_information, GSE_information_sub)
    # Pause between series pages.
    Sys.sleep(5)
  }
  write.table(GSE_information,
              paste(table_dir, "GSE_information.txt", sep = ""))
  names(sample_list) <- GEO_id

  #-------------------------------------------------------
  # Function2: fetch sample-level (GSM) information
  #-------------------------------------------------------
  write.xlsx(as.data.frame(GSE_information),
             paste(table_dir, "GSM_information.xlsx", sep = ""),
             sheetName = "GSE_information",
             col.names = TRUE)
  sum_GSM <- vapply(sample_list, nrow, integer(1))
  sum_GSM_sum <- sum(sum_GSM)

  if (isTRUE(download_GSM_info)) {
    # Column names of the result and the matching row labels on the GSM page.
    info_name <- c("Title", "Organism", "Source.name", "Characteristics",
                   "Treatment.protocol", "Growth.protocol",
                   "Extracted.molecule", "Extraction.protocol",
                   "Library.strategy", "Library.source", "Library.selection",
                   "Instrument.model", "Description", "Data.processing")
    info_key <- c("Title", "Organism", "Source name", "Characteristics",
                  "Treatment protocol", "Growth protocol",
                  "Extracted molecule", "Extraction protocol",
                  "Library strategy", "Library source", "Library selection",
                  "Instrument model", "Description", "Data processing")

    for (m in seq_along(sample_list)) {
      n_samples <- nrow(sample_list[[m]])
      gsm_rows <- vector("list", n_samples)
      for (n in seq_len(n_samples)) {
        GSMid <- sample_list[[m]][n, 1]
        url <- paste("https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=",
                     GSMid, sep = "")
        GSM <- getURL(url, .encoding = "utf-8")
        GSM_url <- htmlParse(GSM, asText = TRUE)

        # Fields that are absent from the page come back as NA.
        info_list <- lapply(info_key, function(key) {
          xpath <- paste('//td[contains(text(),"', key,
                         '")]/following-sibling::td', sep = "")
          node_text(GSM_url, xpath)
        })
        names(info_list) <- info_name

        sra_all <- getNodeSet(
          GSM_url, '//td[contains(text(),"SRA")]/following-sibling::td/a'
        )
        sra <- unlist(lapply(sra_all, xmlValue))
        # Only the href is taken; xmlAttrs() would return every attribute.
        sra_url <- unlist(lapply(sra_all, xmlGetAttr, name = "href"))
        if (length(sra) == 0) {
          sra <- NA_character_
        }
        if (length(sra_url) == 0) {
          sra_url <- NA_character_
        }

        gsm_rows[[n]] <- data.frame(
          GSM_ID = GSMid,
          Title = info_list$Title,
          Organism = info_list$Organism,
          Source.name = info_list$Source.name,
          Characteristics = info_list$Characteristics,
          Treatment.protocol = info_list$Treatment.protocol,
          Growth.protocol = info_list$Growth.protocol,
          Extracted.molecule = info_list$Extracted.molecule,
          Extraction.protocol = info_list$Extraction.protocol,
          # Was scraped but never written out before.
          Library.strategy = info_list$Library.strategy,
          Library.source = info_list$Library.source,
          Description = info_list$Description,
          Library.selection = info_list$Library.selection,
          Instrument.model = info_list$Instrument.model,
          Data.processing = info_list$Data.processing,
          sra_id = sra,
          sra_url = sra_url,
          stringsAsFactors = FALSE
        )

        # Samples finished in earlier series plus the current one, as a
        # fraction of all samples.
        jindu <- (sum(sum_GSM[seq_len(m - 1)]) + n) / sum_GSM_sum
        print(running_time(jindu))
      }

      # Bind once per series instead of growing a data frame in the loop;
      # a series without samples gets no sheet.
      if (n_samples > 0) {
        write.xlsx(do.call(rbind, gsm_rows),
                   paste(table_dir, "GSM_information.xlsx", sep = ""),
                   sheetName = paste(names(sample_list)[m],
                                     "_sampleinformation", sep = ""),
                   append = TRUE, row.names = FALSE)
      }
    }
  }

  invisible(GSE_information)
}

获取附加材料文件

######################################################## 
#-------------------------------------------------------
# Topic:爬取并下载NCBI中GEO的附件文件
# Author:Wang Haiquan
# Date:Sun Mar  1 17:25:03 2020
# Mail:mg1835020@smail.nju.edu.cn
#-------------------------------------------------------
########################################################


library(RCurl)
library(rvest)
library(XML)

#' List (and optionally download) the supplementary files of a GEO series
#'
#' Fetches the GSE page, reads the rows that follow the "Supplementary file"
#' header, and pairs the first cell of each row with the first link in its
#' third cell.
#'
#' @param GSEid A single GSE accession.
#' @param download_file If TRUE, download every listed file.
#' @return A data frame with columns `item` (first-cell text), `item_url`
#'   (absolute URL) and `idex` (the prefix that was added to the link; empty
#'   for links that already carry a scheme such as ftp). Zero rows when the
#'   page lists no supplementary links.
get_supply <- function(GSEid = GSEid, download_file = FALSE) {
  # The default `GSEid = GSEid` cannot be evaluated; fail with a clear message.
  if (missing(GSEid)) {
    stop("GSEid must be supplied", call. = FALSE)
  }
  site_root <- "https://www.ncbi.nlm.nih.gov/"
  item_xpath <- '//strong[contains(text(),"Supplementary file")]/parent::td/parent::tr/following-sibling::tr/td[1]'
  url_xpath <- '//strong[contains(text(),"Supplementary file")]/parent::td/parent::tr/following-sibling::tr/td[3]/a[1]'

  url <- paste("https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=",
               GSEid, sep = "")
  GSE_web <- htmlParse(getURL(url, .encoding = "utf-8"), asText = TRUE)

  # Only the href is taken; xmlAttrs() would return every attribute of the
  # link and can turn the result into a matrix.
  href <- vapply(getNodeSet(GSE_web, url_xpath), xmlGetAttr, character(1),
                 name = "href", default = NA_character_)
  item <- vapply(getNodeSet(GSE_web, item_xpath), function(node) {
    value <- xmlValue(node)
    if (length(value) == 0) NA_character_ else value
  }, character(1))

  # More first cells than links can be matched; keep the leading ones. With
  # no link at all, seq_along() yields an empty table instead of item[1:0].
  item <- item[seq_along(href)]
  has_link <- !is.na(href)
  item <- item[has_link]
  href <- href[has_link]

  # Links that already carry a scheme (ftp:, https:, ...) are left alone;
  # relative ones are resolved against the NCBI site root without producing
  # a double slash.
  is_absolute <- grepl("^[A-Za-z][A-Za-z0-9+.-]*:", href)
  idex <- ifelse(is_absolute, "", site_root)
  item_url <- ifelse(is_absolute, href,
                     paste(site_root, sub("^/+", "", href), sep = ""))
  supply <- data.frame(item = item,
                       item_url = as.character(item_url),
                       idex = as.character(idex),
                       stringsAsFactors = FALSE)

  # Note: files are downloaded into the current working directory.
  if (isTRUE(download_file)) {
    for (i in seq_len(nrow(supply))) {
      download.file(supply$item_url[i], supply$item[i], mode = "wb")
    }
  }
  supply
}
`````
上一篇下一篇

猜你喜欢

热点阅读