RCurl包爬取豆瓣电影id和IMDB电影号id

2017-06-18 本文已影响0人 485b1aca799e
爬取豆瓣id和IMDB_id


#输入电影名字、导演、演员信息，爬取豆瓣id和IMDB_id
#输入信息必须经过严格清洗，不允许出现空格，导演和演员字符长度严格小于等于6


#### 计算程序的运行时间
timestart<-Sys.time();
#打印开始时间
print(timestart)
####这块写你要运行的程序


#报头设置非常重要，爬虫一定要伪装，另外for循环一定要间隔休息
library(xlsx)
library(readxl)
library(plyr)
library(sqldf)
library(data.table)
library(RCurl)
library(XML)
library(stringr)
#伪装报头
myheader<-c(
  "User-Agent"="Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.1.6) ",
  "Accept"="text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
  "Accept-Language"="en-us",
  "Connection"="keep-alive",
  "Accept-Charset"="GB2312,utf-8;q=0.7,*;q=0.7"
)


########定义编辑距离函数#############
Fun <- function(x,y){
  library(stringr)
  
  m <- str_length(x)
  n <- str_length(y)
  
  x <- str_split(x,pattern = "")[[1]];
  y <- str_split(y,pattern = "")[[1]];
  
  M <- matrix(0,nrow = m+1,ncol = n+1);
  rownames(M) <- c(" ",x)
  colnames(M) <- c(" ",y)
  
  for(i in 1:(m+1)) M[i,1] <- i-1; 
  for(j in 1:(n+1)) M[1,j] <- j-1; 
  
  for(i in 2:(m+1)){
    for(j in 2:(n+1)){
      if(x[i-1]==y[j-1]) cost=0 else cost=1;
      M[i,j]=min(M[i-1,j]+1,M[i,j-1]+1,M[i-1,j-1]+cost)
    }
  }
  #返回字符串的相似度
  return(round(1-M[m+1,n+1]/(m+n),2));
  
}





#url <- "https://movie.douban.com/"

#text=c("碟中谍","狮子王","魔戒3","星际穿越","火星救援","碟中谍2","职业特工队2","谍影重重2","碟中谍5")
#text="哈利波特与魔法石"
#text="少年派的奇幻漂流"
#text="哈利波特与死亡圣器(下)"
#text="手机"
#text <- t(c("加勒比海盗1：黑珍珠号的诅咒","冯小刚"," 张国立葛优范"))
#text <- t(c("哈利波特与死亡圣器(下)" ,        "大飞",   "廖智苗皓钧"))
#输入参数
#text <- y[1:100,c(1,2,3)]
#i=2
#抽样测试
#text <- text[sample(2901,200,replace = F),]

##################测试###########################
#text <- as.data.frame(t(z[1,]),stringsAsFactors = F)

###################匹配分类##############
#A <- "完全匹配"
#B <- "多个匹配但前五结果唯一"
#C <- "返回一个结果标题不匹配但详情页匹配"
#D <- "前五结果多个但是匹配上了"
#E <- "完全不匹配"
#F <- "前五结果多个但是没有匹配上"


url <- paste("https://movie.douban.com/subject_search?search_text=",text[,1],"&cat=1002",sep = "")
url_douban <- NULL;
url_douban_id <- NULL;
imdb_id <- NULL;
class <- NULL;



#输入数据英文小括号改写为正则表达式\\(和\\)
text[,1]<- str_replace(str_replace(text[,1],pattern = "\\(",replacement = "\\\\("),pattern = "\\)",replacement = "\\\\)");

#i=1
for(i in 1:length(url)){
  wp<-getURL(url[i],.encoding="utf-8",followlocation=T,httpheader=myheader)
  doc <- htmlParse(wp,asText=T,encoding="UTF-8")#解析
  #text[i]为电影名字
  
  title<- xpathSApply(doc,"//div[@class='pl2']//a",xmlValue)#搜索页所有的结果标题  
  title <- str_replace_all(title,pattern = "·",replacement = "")
  
  
  pipei <- str_detect(title,paste(" ",text[i,1]," {0,2}(\\(.{2,3}\\))?(：.{0,8})?","[\n | /]",sep = ""))
  
  #如果匹配列表个数等于1
  if(length(which(pipei))==1){
    subscript <- which(pipei)[1]#匹配上电影的下标
    
    url_douban <- xpathSApply(doc,"//div[@class='pl2']/a",xmlGetAttr,"href")[subscript]#进入搜索结果的链接
    url_douban_id[i] <- str_split(url_douban,pattern = "/")[[1]][5]#豆瓣的id号
    wp1<-getURL(url_douban,.encoding="utf-8",followlocation=T,httpheader=myheader)
    doc1 <- htmlParse(wp1,asText=T,encoding="UTF-8")#解析
    if(length(xpathSApply(doc1,path="//div[@id='info']//a[@target='_blank' and @rel='nofollow'][last()]",xmlValue))==0){ imdb_id[i] <- "000"}
    else{imdb_id[i] <- xpathSApply(doc1,path="//div[@id='info']//a[@target='_blank' and @rel='nofollow'][last()]",xmlValue)}
    class[i] <- "A"
  }
  else if(length(which(pipei))>1){
    if(length(which(pipei[1:5]))==1){
      subscript <- which(pipei)[1]#匹配上电影的下标
      
      url_douban <- xpathSApply(doc,"//div[@class='pl2']/a",xmlGetAttr,"href")[subscript]#进入搜索结果的链接
      url_douban_id[i] <- str_split(url_douban,pattern = "/")[[1]][5]#豆瓣的id号
      wp1<-getURL(url_douban,.encoding="utf-8",followlocation=T,httpheader=myheader)
      doc1 <- htmlParse(wp1,asText=T,encoding="UTF-8")#解析
      
        if(length(xpathSApply(doc1,path="//div[@id='info']//a[@target='_blank' and @rel='nofollow'][last()]",xmlValue))==0) imdb_id[i] <- "000"
      else {imdb_id[i] <     xpathSApply(doc1,path="//div[@id='info']//a[@target='_blank' and @rel='nofollow'][last()]",xmlValue)}
      class[i] <- "B"
    } 
    else{
      #########需要进行二次匹配###########
      url_pipei <-xpathSApply(doc,"//div[@class='pl2']//a",xmlGetAttr,"href")[which(pipei)]; 
      
      imdb_id_temp <- NULL;
      xishu <- NULL;
      for(n in 1:length(url_pipei)){
        
        wp1<-getURL(url_pipei[n],.encoding="utf-8",followlocation=T,httpheader=myheader)
        doc1 <- htmlParse(wp1,asText=T,encoding="UTF-8")
        #导演
        daoyan <- xpathSApply(doc1,path = "//div[@id='info']//span",xmlValue)[1]
        # if(is.null(daoyan)){daoyan <- ""}
        daoyan <- str_replace_all(daoyan,pattern = "导演:","")
        daoyan <- str_replace_all(daoyan,pattern = " ","")
        daoyan <- str_replace_all(daoyan,pattern = "/","")
        if(length(daoyan)==0){daoyan <- " "}
        if(str_length(daoyan)>6)
        {daoyan <- substring(daoyan,1,6)}
        
        #主演
        zhuyan <- xpathSApply(doc1,path = "//div[@id='info']//span[@class='actor']",xmlValue)
        if(length(zhuyan)==0){zhuyan <- "abcdef"}
        zhuyan <- str_replace_all(zhuyan,pattern = "主演:","")
        zhuyan <- str_replace_all(zhuyan,pattern = " ","")
        zhuyan <- str_replace_all(zhuyan,pattern = "/","")
        
        if(str_length(zhuyan)>6) {zhuyan <- substring(zhuyan,1,6)}
        
        p <- xpathSApply(doc1,path="//div[@id='info']//a[@target='_blank' and @rel='nofollow'][last()]",xmlValue)
        
        if(length(p)==0){imdb_id_temp[n] <-"000"}
        else{imdb_id_temp[n] <-xpathSApply(doc1,path="//div[@id='info']//a[@target='_blank' and @rel='nofollow'][last()]",xmlValue)}
        
        #加权系数计算
        xishu[n] <- 0.6*Fun(text[i,2],daoyan)+0.4*Fun(text[i,3],zhuyan)###需要将输入参数改为三个变量的数据框
        
      }
      subscript <- which.max(xishu)
      if(xishu[subscript]>0.5)
      {
        url_douban_id[i] <- str_split(url_pipei[subscript],pattern = "/")[[1]][5]
        imdb_id[i] <- imdb_id_temp[subscript]
        class[i] <- "D"
      }
      else{
        url_douban_id[i] <- 0;
        imdb_id[i] <- 0;
        class[i] <- "F"
      }
    }
  }
  else {
    if(length(pipei)==1)
    {
      url_douban <- xpathSApply(doc,"//div[@class='pl2']/a",xmlGetAttr,"href")[1]
      wp1<-getURL(url_douban,.encoding="utf-8",followlocation=T,httpheader=myheader)
      doc1 <- htmlParse(wp1,asText=T,encoding="UTF-8")#解析
      if(length(xpathSApply(doc1,path = "//div[@id='info']",xmlValue))!=0){
      text_another_name <- str_extract(xpathSApply(doc1,path = "//div[@id='info']",xmlValue),pattern = "又名:.*IMDb链接")
      if(is.na(text_another_name)){
        text_another_name <- "aaaaaa"
      }
      }
      else{
        text_another_name <- "aaaaaa"
      }
      if(str_detect(text_another_name,text[i,1]))
      {
        url_douban_id[i] <- str_split(url_douban,pattern = "/")[[1]][5]#豆瓣的id号
       if(length(xpathSApply(doc1,path="//div[@id='info']//a[@target='_blank' and @rel='nofollow'][last()]",xmlValue))!=0){
         imdb_id[i] <- xpathSApply(doc1,path="//div[@id='info']//a[@target='_blank' and @rel='nofollow'][last()]",xmlValue)}
        else{imdb_id[i] <- "000"}
        class[i] <- "C"
      }
      else {
        url_douban_id[i] <- NA;
        imdb_id[i] <- NA;
        class[i] <- "E";
      }
    }
    else{
      url_douban_id[i] <- NA;
      imdb_id[i] <- NA;
      class[i] <- "E";
    }
  }  
  #每一次循环休息2秒左右
  Sys.sleep(2+runif(1,0,1))
}

#整理成数据框
x <- data.frame(text[,1],url_douban_id,imdb_id,class)


#如果匹配列表返回值前五个出现相同的匹配结果，则返回id=0;考虑将结果范围缩小到
#如果列表返回值是1，但是不匹配名称，则获得链接，进入详情信息页面
#对搜索列表的电影名称进行精简修改，注意英文名字需要加上分隔符，比如哈利波特、珀西杰克逊等
#标题第二个字段好像是没有进行匹配的，需要进行修改

#计算程序结束时间
timeend<-Sys.time()
#打印结束时间
print(timeend)
runningtime<-timeend-timestart
#输出时间消耗 
print(runningtime)
RCurl包爬取豆瓣电影id和IMDB电影号id

爬取豆瓣id和IMDB_id

猜你喜欢

热点阅读