爬取NCBI GEO数据库中的数据
2020-03-01 本文已影响0人
一只烟酒僧
获取GEO中GSE网页的信息
# Excerpt: for every accession in GEO_id, fetch its GEO accession page, scrape
# the series-level fields and the sample table, optionally download the files
# behind the "Series Matrix File" link, and append one row to GSE_information.
# NOTE(review): GEO_id, table_dir, sample_list, GSE_information,
# download_sample_info and sample_info_dir are not defined in this excerpt;
# presumably they come from the enclosing GEO_get() function - confirm.
for (i in 1:length(GEO_id)) {
# Build the accession page URL and fetch the page as text.
url=paste("https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=",GEO_id[i],sep = "")
GEO<-getURL(url,.encoding = "utf-8")
# Bare expression: nothing is auto-printed inside a for loop.
GEO
# Save the fetched text as <table_dir>GEO<id>.html (written with write.table).
write.table(GEO,paste(table_dir,"GEO",GEO_id[i],".html",sep = ""))
GEO_tree<-htmlParse(GEO,encoding = "utf-8")
# Each field is read from the <td> that follows the <td> containing its label.
status<-getNodeSet(GEO_tree,path = '//td[contains(text(),"Status")]/following-sibling::td')
status<-sapply(status,xmlValue)
title<-getNodeSet(GEO_tree,path = '//tr/child::td[contains(text(),"Title")]/following-sibling::td')
# Only the first "Title" match is used.
title<-xmlValue(title[[1]])
# Table rows nested under a <div> inside the matching cell
# (presumably the collapsed part of the sample list - confirm).
sample_hide<-getNodeSet(GEO_tree,path = '//td[contains(@onmouseout,"geo_empty_help")]/div/table//tr')
# Split every row's text on newlines; sapply binds the pieces column-wise.
sample_hide<-sapply(sample_hide,function(x){x=xmlValue(x);x=unlist(str_split(x,"\n",simplify = T))})
# Table rows directly inside the matching cell.
sample_visual<-getNodeSet(GEO_tree,path ='//td[contains(@onmouseout,"geo_empty_help")]/table//tr' )
sample_visual<-sapply(sample_visual,function(x){x=xmlValue(x);x=unlist(str_split(x,"\n",simplify = T))})
# Drop the third split piece of every row, then transpose to one row per entry.
sample_visual<-t(sample_visual[-3,])
# Remove the rows whose first column contains "GPL".
sample_visual<-sample_visual[-grep("GPL",sample_visual[,1]),]
# Append the nested rows when there are any.
if(length(sample_hide)>0){
sample_hide<-t(sample_hide[-3,])
sample<-rbind(sample_visual,sample_hide)
}else{sample=sample_visual}
sample_list[[i]]<-sample
Organism<-getNodeSet(GEO_tree,path = '//tr/td[contains(text(),"Organism")]/following-sibling::td')
Organism<-sapply(Organism,xmlValue)
Experiment.type<-getNodeSet(GEO_tree,path = '//tr/td[contains(text(),"Experiment type")]/following-sibling::td')
Experiment.type<-sapply(Experiment.type,xmlValue)
Experiment.type
summary<-getNodeSet(GEO_tree,path = '//tr/td[contains(text(),"Summary")]/following-sibling::td')
summary<-sapply(summary,xmlValue)
Overall.design<-getNodeSet(GEO_tree,path = '//tr/td[contains(text(),"Overall design")]/following-sibling::td')
Overall.design<-sapply(Overall.design,xmlValue)
# The citation is taken from the <span> inside the cell after "Citation".
Citation<-getNodeSet(GEO_tree,path = '//td[contains(text(),"Citation")]/following-sibling::td/span')
Citation<-sapply(Citation,xmlValue)
# Download the compressed sample information files
if(download_sample_info==T&!is.null(sample_info_dir)){
# Take the href of the "Series Matrix File" link(s).
sample_info=getNodeSet(GEO_tree,path = '//a[contains(text(),"Series Matrix File")]')
sample_info=sapply(sample_info,xmlAttrs)
sample_info=t(sample_info)
sample_info<-sample_info[,which(colnames(sample_info)=="href")]
# Fetch that URL and pick file names out of the response: split on spaces,
# keep the tokens containing "GSE", and cut each one at the first "\r".
sample_web<-getURL(sample_info)
sample_name<-str_split(sample_web," ",simplify = T)
sample_name<-t(sample_name)
sample_name<-as.character(sample_name)[grep("GSE",sample_name)]
sample_name<-unlist(str_split(sample_name,"\\r",simplify = T)[,1])
sample_info_url<-paste(sample_info,sample_name,sep = "")
# The target directory is sample_info_dir and the id pasted together with no
# separator, so sample_info_dir needs its own trailing slash.
dir.create(path = paste(sample_info_dir,GEO_id[i],sep = ""))
for (j in 1:length(sample_info_url)) {
sample_info_dir_new=paste(sample_info_dir,GEO_id[i],sep = "")
download.file(url = sample_info_url[j],
destfile = paste(sample_info_dir_new,sample_name[j],sep = "/"),
mode = "wb")
}
# Report the downloaded file names and the directory they were saved in.
cat(paste(paste(GEO_id[i],"的样本信息文件名为",paste(sample_name,collapse = "、"),sep=""),
paste("保存在",sample_info_dir,"中",sep=""),
sep="\n"))
}
# One summary row for this series.
# NOTE(review): sample_info is only assigned inside the download branch above,
# so this call depends on that branch having run.
GSE_information_sub<-data.frame(GEO_ID=GEO_id[i],
Status=status,
Title=title,
Organism=Organism,
Experiment.type=Experiment.type,
Summary=summary,
Overall.design=Overall.design,
PMID=Citation,
sample_NO=dim(sample)[1],
sample_info_url=sample_info
)
GSE_information<-rbind(GSE_information,GSE_information_sub)
# Wait 5 seconds before requesting the next accession.
Sys.sleep(5)
}
获取GEO中GSM的信息
#-------------------------------------------------------
# Function2: fetch GSM information
#-------------------------------------------------------
# Excerpt: write the series table to the workbook, then (optionally) fetch the
# page of every sample listed in sample_list and append one sheet per series.
# NOTE(review): GSE_information, sample_list, table_dir and download_GSM_info
# are not defined in this excerpt; presumably they come from the enclosing
# GEO_get() function - confirm.
write.xlsx(as.data.frame(GSE_information),paste(table_dir,"GSM_information.xlsx",sep = ""),
sheetName="GSE_information",
col.names = T)
# Row count of each sample table, then the last cumulative sum (grand total).
sum_GSM<-sapply(sample_list,function(x){return(dim(x)[1])},simplify = T)
sum_GSM_sum<-cumsum(sum_GSM)
sum_GSM_sum<-rev(sum_GSM_sum)[1]
sum_GSM_sum
if (download_GSM_info==T) {
for (m in 1:length(sample_list)) {
# Collects the rows for the samples of series m.
sample_info<-data.frame()
for (n in 1:(dim(sample_list[[m]])[1])){
# The first column of the sample table holds the accession to look up.
GSMid<-sample_list[[m]][n,1]
url=paste("https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=",GSMid,sep = "")
GSM<-getURL(url,.encoding = "utf-8")
GSM_url<-htmlParse(GSM)
# info_key holds the labels searched for on the page; info_name holds the
# matching list names (same order, dots instead of spaces).
info_name<-c("Title","Organism","Source.name","Characteristics","Treatment.protocol",
"Growth.protocol","Extracted.molecule","Extraction.protocol","Library.strategy","Library.source",
"Library.selection","Instrument.model","Description","Data.processing")
info_key<-c("Title","Organism","Source name","Characteristics","Treatment protocol",
"Growth protocol","Extracted molecule","Extraction protocol","Library strategy","Library source",
"Library selection","Instrument model","Description","Data processing")
info_list<-list()
# Each value is read from the <td> following the <td> containing the label.
for (a in 1:length(info_key)) {
xpath<-paste('//td[contains(text(),"',info_key[a],'")]/following-sibling::td',sep = "")
info_node<-getNodeSet(GSM_url,xpath)
info_node<-sapply(info_node,xmlValue)
info_list[[a]]<-info_node
}
names(info_list)<-info_name
# Link text and attributes of the anchors in the cell after the "SRA" label.
sra_all<-getNodeSet(GSM_url,'//td[contains(text(),"SRA")]/following-sibling::td/a')
sra<-sapply(sra_all,xmlValue)
sra_url<-sapply(sra_all,xmlAttrs)
info_list$sra<-sra
info_list$sra_url<-sra_url
# Replace every field that was not found with the string "NA"
info_list<-lapply(info_list,function(x){if(length(x)==0){return("NA")}else(return(x))})
# One row for this sample. Library.strategy is scraped above but is not among
# the columns built here.
sample=data.frame(GSM_ID=GSMid,
Title=info_list$Title,
Organism=info_list$Organism,
Source.name=info_list$Source.name,
Characteristics=info_list$Characteristics,
Treatment.protocol=info_list$Treatment.protocol,
Growth.protocol=info_list$Growth.protocol,
Extracted.molecule=info_list$Extracted.molecule,
Extraction.protocol=info_list$Extraction.protocol,
Library.source=info_list$Library.source,
Description=info_list$Description,
Library.selection=info_list$Library.selection,
Instrument.model=info_list$Instrument.model,
Data.processing=info_list$Data.processing,
sra_id=info_list$sra,
sra_url=info_list$sra_url)
sample_info<-rbind(sample_info,sample)
#-------------------------------------------------------
# Function3: show a progress bar
#-------------------------------------------------------
# Formats a fraction as "Running:" + dashes + "<percent>%". The function is
# redefined on every pass of the inner loop.
running_time<-function(num=num){
# Nudge the value up by 0.001, insert it into the 0, 0.01, ..., 1 grid, and
# take the grid value that sorts immediately before it as the percentage.
num=num+0.001
a=seq(0,100,1)/100
b=c(a,num)
b=b[order(b)]
n=which(b==num)
persent=b[n-1]*100
# One "--" per three percentage points (rounded).
jindu<-paste(rep("--",round(persent/3)),collapse = "")
left<-"Running:"
right<-paste(persent,"%",sep = "")
final<-paste(left,jindu,right,sep = "")
return(final)
}
# Fraction done = (samples in all earlier series + n) / total sample count.
if(m>1){jindu<-(rev(cumsum(sum_GSM[1:(m-1)]))[1]+n)/sum_GSM_sum}else{jindu<-n/sum_GSM_sum}
print(running_time(jindu))
}
# Append the rows of series m as a new sheet named "<name>_sampleinformation".
write.xlsx(sample_info,paste(table_dir,"GSM_information.xlsx",sep = ""),
sheetName = paste(names(sample_list)[m],"_sampleinformation",sep = ""),
append = T,row.names = F)
}
}
汇总写成函数
library(RCurl)
library(XML)
library(stringr)
library(rvest)
library(xlsx)
# URL of the GEO accession viewer page for one accession id.
.geo_acc_url <- function(acc) {
  paste0("https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=", acc)
}

# Text of every node matched by `xpath`; character(0) when nothing matches.
.geo_node_text <- function(doc, xpath) {
  vapply(getNodeSet(doc, xpath), xmlValue, character(1))
}

# href of every node matched by `xpath`; NA for a node without that attribute.
.geo_node_href <- function(doc, xpath) {
  vapply(
    getNodeSet(doc, xpath),
    function(node) as.character(xmlGetAttr(node, "href", NA_character_)),
    character(1)
  )
}

# Substitute a placeholder for an empty scrape result so that data.frame()
# always receives at least one value per column.
.geo_or_na <- function(x, missing_value = NA_character_) {
  if (length(x) == 0) missing_value else x
}

# Drop trailing path separators so file.path() does not double them.
.geo_trim_dir <- function(path) {
  trimmed <- sub("[/\\\\]+$", "", path)
  if (nzchar(trimmed)) trimmed else path
}

# Turn sample-table <tr> nodes into a character matrix with one row per <tr>.
# Returns NULL when there are no nodes.
.geo_sample_rows <- function(nodes) {
  if (length(nodes) == 0) {
    return(NULL)
  }
  cells <- sapply(nodes, function(node) {
    unlist(str_split(xmlValue(node), "\n", simplify = TRUE))
  })
  if (!is.matrix(cells)) {
    stop("Unexpected sample table layout on the GEO page", call. = FALSE)
  }
  # The third newline-separated piece of each row is discarded;
  # drop = FALSE keeps the matrix shape when there is only a single row.
  t(cells[-3, , drop = FALSE])
}

# Progress line: "Running:" + one "--" per three percent + "<percent>%".
.geo_progress <- function(fraction) {
  percent <- min(100, max(0, floor((fraction + 0.001) * 100)))
  paste0("Running:", strrep("--", round(percent / 3)), percent, "%")
}

#' Scrape GEO series (GSE) pages and, optionally, their sample (GSM) pages
#'
#' For every accession in `GEO_id` the accession page is fetched, saved as
#' HTML, and scraped for the series-level fields and the sample table. The
#' series table is written to `GSE_information.txt` and to the
#' `GSE_information` sheet of `GSM_information.xlsx`, both inside
#' `<table_dir>/myGEO_search`. When `download_GSM_info` is TRUE every sample
#' page is scraped as well and one `<GSE>_sampleinformation` sheet per series
#' is appended to the workbook.
#'
#' @param GEO_id Character vector of GSE accessions; duplicates are dropped.
#' @param table_dir Directory in which the `myGEO_search` folder is created.
#' @param download_sample_info If TRUE, download the files listed behind the
#'   "Series Matrix File" link into `<sample_info_dir>/<GSE>`.
#' @param download_GSM_info If TRUE, scrape every GSM page.
#' @param sample_info_dir Download directory; required when
#'   `download_sample_info` is TRUE and must be NULL otherwise.
#' @return `NULL`, invisibly. The function is called for its side effects.
GEO_get <- function(GEO_id = GEO_id, table_dir = ".", download_sample_info = TRUE,
                    download_GSM_info = TRUE, sample_info_dir = NULL) {
  download_sample_info <- isTRUE(as.logical(download_sample_info))
  download_GSM_info <- isTRUE(as.logical(download_GSM_info))
  if (download_sample_info && is.null(sample_info_dir)) {
    stop("快交出保存样本信息文件的地址,不然不给你下载")
  }
  if (!download_sample_info && !is.null(sample_info_dir)) {
    stop("download_sample_info设为true啊小兄弟")
  }
  # The default `GEO_id = GEO_id` refers to itself, so an omitted argument
  # would otherwise fail with a recursive-promise error.
  if (missing(GEO_id)) {
    stop("GEO_id must be supplied", call. = FALSE)
  }
  GEO_id <- unique(GEO_id)
  if (length(GEO_id) == 0) {
    stop("GEO_id is empty: nothing to fetch", call. = FALSE)
  }
  #-------------------------------------------------------
  # Function1: fetch GSE information
  #-------------------------------------------------------
  # file.path() inserts the separator, so the default "." gives
  # "./myGEO_search" rather than the hidden directory ".myGEO_search/".
  if (!nzchar(table_dir)) {
    table_dir <- "."
  }
  table_dir <- file.path(.geo_trim_dir(table_dir), "myGEO_search")
  dir.create(table_dir, showWarnings = FALSE, recursive = TRUE)
  xlsx_path <- file.path(table_dir, "GSM_information.xlsx")

  gse_rows <- vector("list", length(GEO_id))
  sample_list <- vector("list", length(GEO_id))
  cell_xpath <- '//td[contains(@onmouseout,"geo_empty_help")]'
  for (i in seq_along(GEO_id)) {
    gse_id <- GEO_id[i]
    GEO <- getURL(.geo_acc_url(gse_id), .encoding = "utf-8")
    # Save the page verbatim; write.table() would add a header and quoting.
    writeLines(GEO, file.path(table_dir, paste0("GEO", gse_id, ".html")),
               useBytes = TRUE)
    GEO_tree <- htmlParse(GEO, encoding = "utf-8")

    # Every field is the <td> following the <td> that contains its label.
    status <- .geo_or_na(.geo_node_text(
      GEO_tree, '//td[contains(text(),"Status")]/following-sibling::td'
    ))
    # Only the first "Title" match is used.
    title <- .geo_or_na(.geo_node_text(
      GEO_tree, '//tr/child::td[contains(text(),"Title")]/following-sibling::td'
    ))[1]
    Organism <- .geo_or_na(.geo_node_text(
      GEO_tree, '//tr/td[contains(text(),"Organism")]/following-sibling::td'
    ))
    Experiment.type <- .geo_or_na(.geo_node_text(
      GEO_tree, '//tr/td[contains(text(),"Experiment type")]/following-sibling::td'
    ))
    summary <- .geo_or_na(.geo_node_text(
      GEO_tree, '//tr/td[contains(text(),"Summary")]/following-sibling::td'
    ))
    Overall.design <- .geo_or_na(.geo_node_text(
      GEO_tree, '//tr/td[contains(text(),"Overall design")]/following-sibling::td'
    ))
    Citation <- .geo_or_na(.geo_node_text(
      GEO_tree, '//td[contains(text(),"Citation")]/following-sibling::td/span'
    ))

    # Rows directly in the cell's table, plus rows nested under a <div>.
    visible <- .geo_sample_rows(
      getNodeSet(GEO_tree, paste0(cell_xpath, "/table//tr"))
    )
    hidden <- .geo_sample_rows(
      getNodeSet(GEO_tree, paste0(cell_xpath, "/div/table//tr"))
    )
    if (!is.null(visible)) {
      # Logical indexing: x[-grep(...), ] would drop every row when no row
      # contains "GPL".
      visible <- visible[!grepl("GPL", visible[, 1]), , drop = FALSE]
    }
    gsm_table <- rbind(visible, hidden)
    if (is.null(gsm_table)) {
      gsm_table <- matrix(character(0), nrow = 0, ncol = 2)
    }
    sample_list[[i]] <- gsm_table

    # The link is read from the already parsed page whether or not the files
    # are downloaded, so the summary row below can always be built.
    matrix_href <- .geo_node_href(
      GEO_tree, '//a[contains(text(),"Series Matrix File")]'
    )
    matrix_href <- matrix_href[!is.na(matrix_href)]
    # Download the compressed sample information files
    if (download_sample_info && length(matrix_href) > 0) {
      gse_dir <- file.path(.geo_trim_dir(sample_info_dir), gse_id)
      dir.create(gse_dir, showWarnings = FALSE, recursive = TRUE)
      sample_name <- character(0)
      for (href in matrix_href) {
        # File names are the space-separated tokens of the response that
        # contain "GSE", cut at the first line break.
        tokens <- unlist(strsplit(getURL(href), " ", fixed = TRUE))
        tokens <- tokens[grepl("GSE", tokens)]
        file_names <- vapply(
          strsplit(tokens, "[\r\n]"),
          function(parts) parts[1],
          character(1)
        )
        for (file_name in file_names) {
          download.file(
            url = paste0(href, file_name),
            destfile = file.path(gse_dir, file_name),
            mode = "wb"
          )
        }
        sample_name <- c(sample_name, file_names)
      }
      cat(paste(
        paste(gse_id, "的样本信息文件名为", paste(sample_name, collapse = "、"), sep = ""),
        paste("保存在", sample_info_dir, "中", sep = ""),
        sep = "\n"
      ))
    }

    gse_rows[[i]] <- data.frame(
      GEO_ID = gse_id,
      Status = status,
      Title = title,
      Organism = Organism,
      Experiment.type = Experiment.type,
      Summary = summary,
      Overall.design = Overall.design,
      PMID = Citation,
      sample_NO = nrow(gsm_table),
      sample_info_url = .geo_or_na(matrix_href),
      stringsAsFactors = FALSE
    )
    # Pause between series pages.
    Sys.sleep(5)
  }
  GSE_information <- do.call(rbind, gse_rows)
  write.table(GSE_information, file.path(table_dir, "GSE_information.txt"))
  names(sample_list) <- GEO_id
  #-------------------------------------------------------
  # Function2: fetch GSM information
  #-------------------------------------------------------
  write.xlsx(as.data.frame(GSE_information), xlsx_path,
             sheetName = "GSE_information",
             col.names = TRUE)
  total_gsm <- sum(vapply(sample_list, nrow, integer(1)))
  if (download_GSM_info) {
    info_name <- c("Title", "Organism", "Source.name", "Characteristics",
                   "Treatment.protocol", "Growth.protocol", "Extracted.molecule",
                   "Extraction.protocol", "Library.strategy", "Library.source",
                   "Library.selection", "Instrument.model", "Description",
                   "Data.processing")
    info_key <- c("Title", "Organism", "Source name", "Characteristics",
                  "Treatment protocol", "Growth protocol", "Extracted molecule",
                  "Extraction protocol", "Library strategy", "Library source",
                  "Library selection", "Instrument model", "Description",
                  "Data processing")
    field_xpath <- paste0(
      '//td[contains(text(),"', info_key, '")]/following-sibling::td'
    )
    sra_xpath <- '//td[contains(text(),"SRA")]/following-sibling::td/a'
    done <- 0
    for (m in seq_along(sample_list)) {
      gsm_table <- sample_list[[m]]
      gsm_rows <- vector("list", nrow(gsm_table))
      for (n in seq_len(nrow(gsm_table))) {
        GSMid <- gsm_table[n, 1]
        GSM <- getURL(.geo_acc_url(GSMid), .encoding = "utf-8")
        GSM_tree <- htmlParse(GSM, encoding = "utf-8")
        # Fields that are not found are recorded as the string "NA".
        info_list <- lapply(field_xpath, function(xp) {
          .geo_or_na(.geo_node_text(GSM_tree, xp), "NA")
        })
        names(info_list) <- info_name
        # Library.strategy is scraped but is not one of the written columns;
        # the column set and order are kept as they were.
        gsm_rows[[n]] <- data.frame(
          GSM_ID = GSMid,
          Title = info_list[["Title"]],
          Organism = info_list[["Organism"]],
          Source.name = info_list[["Source.name"]],
          Characteristics = info_list[["Characteristics"]],
          Treatment.protocol = info_list[["Treatment.protocol"]],
          Growth.protocol = info_list[["Growth.protocol"]],
          Extracted.molecule = info_list[["Extracted.molecule"]],
          Extraction.protocol = info_list[["Extraction.protocol"]],
          Library.source = info_list[["Library.source"]],
          Description = info_list[["Description"]],
          Library.selection = info_list[["Library.selection"]],
          Instrument.model = info_list[["Instrument.model"]],
          Data.processing = info_list[["Data.processing"]],
          sra_id = .geo_or_na(.geo_node_text(GSM_tree, sra_xpath), "NA"),
          sra_url = .geo_or_na(.geo_node_href(GSM_tree, sra_xpath), "NA"),
          stringsAsFactors = FALSE
        )
        #-------------------------------------------------------
        # Function3: show a progress bar
        #-------------------------------------------------------
        done <- done + 1
        print(.geo_progress(done / total_gsm))
      }
      # A series without samples gets no sheet.
      if (length(gsm_rows) > 0) {
        write.xlsx(do.call(rbind, gsm_rows), xlsx_path,
                   sheetName = paste0(names(sample_list)[m], "_sampleinformation"),
                   append = TRUE, row.names = FALSE)
      }
    }
  }
  invisible(NULL)
}
获取附加材料文件
########################################################
#-------------------------------------------------------
# Topic:爬取并下载NCBI中GEO的附件文件
# Author:Wang Haiquan
# Date:Sun Mar 1 17:25:03 2020
# Mail:mg1835020@smail.nju.edu.cn
#-------------------------------------------------------
########################################################
library(RCurl)
library(rvest)
library(XML)
#' List (and optionally download) the supplementary files of a GEO series
#'
#' Fetches the GEO accession page of `GSEid` and reads the table rows that
#' follow the "Supplementary file" header: the first cell of each row is taken
#' as the file name and the first link in the third cell as its URL.
#'
#' @param GSEid GEO accession whose page is scraped.
#' @param download_file If TRUE, download every listed file into the current
#'   working directory, using the scraped file name as the destination.
#' @return A data.frame with the columns `item` (file name), `item_url`
#'   (absolute URL) and `idex` (the prefix that was prepended to the scraped
#'   link; "" when the link was already absolute).
get_supply <- function(GSEid = GSEid, download_file = FALSE) {
  # The default `GSEid = GSEid` refers to itself, so an omitted argument would
  # otherwise fail with a recursive-promise error.
  if (missing(GSEid)) {
    stop("GSEid must be supplied", call. = FALSE)
  }
  ncbi_root <- "https://www.ncbi.nlm.nih.gov/"
  row_xpath <- paste0(
    '//strong[contains(text(),"Supplementary file")]',
    "/parent::td/parent::tr/following-sibling::tr"
  )
  url <- paste0("https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=", GSEid)
  GSE_web <- htmlParse(getURL(url, .encoding = "utf-8"))

  # Read only the href attribute, so anchors carrying further attributes still
  # yield exactly one value each.
  item_url <- vapply(
    getNodeSet(GSE_web, paste0(row_xpath, "/td[3]/a[1]")),
    function(node) as.character(xmlGetAttr(node, "href", NA_character_)),
    character(1)
  )
  item <- vapply(
    getNodeSet(GSE_web, paste0(row_xpath, "/td[1]")),
    xmlValue,
    character(1)
  )
  # Keep as many names as there are links; seq_along() is empty-safe.
  item <- item[seq_along(item_url)]

  # Links that are already absolute (ftp..., http:, https:) get no prefix;
  # relative links are resolved against the NCBI root without doubling the
  # slash.
  absolute <- grepl("^(ftp|https?:)", item_url)
  idex <- rep(ncbi_root, length(item_url))
  idex[absolute] <- ""
  item_url[!absolute] <- sub("^/+", "", item_url[!absolute])

  supply <- data.frame(
    item = item,
    item_url = paste0(idex, item_url),
    idex = idex,
    stringsAsFactors = FALSE
  )
  # Note: files are downloaded into the current working directory.
  if (isTRUE(as.logical(download_file))) {
    for (k in seq_len(nrow(supply))) {
      download.file(supply$item_url[k], supply$item[k], mode = "wb")
    }
  }
  supply
}
`````