豆瓣电影TOP250 Scrapy双向爬取

2018-11-12  本文已影响43人  我的袜子都是洞

水平爬取:沿着“下一页”链接逐页翻页,依次获取每一个列表页。
垂直爬取:从每个列表页中提取各部电影详情页的网址,再进入详情页抓取具体数据。

爬虫代码:

import scrapy
import urllib
from douban_movie.items import DoubanMovieItem

class MovieSpider(scrapy.Spider):
    """Crawl the Douban Top 250 movie list in two directions.

    Horizontal: follow the "next page" link from one listing page to the next.
    Vertical: follow every movie link on a listing page to its detail page
    and extract an item from it.
    """

    # Spider name
    name = 'movie'
    # Seed URL: the first listing page
    start_urls = ['https://movie.douban.com/top250']

    def parse(self, response):
        """Handle a listing page.

        Yields one request for the next listing page (when a "next" link is
        present) and one request per movie detail link found on the page.
        """
        # Horizontal crawl: pagination link, None when there is no next page.
        next_page = response.xpath("//div/span[@class='next']/a/@href").extract_first()
        if next_page is not None:
            # response.urljoin resolves the href against the page URL; this
            # avoids relying on `urllib.parse` being loaded by a bare
            # `import urllib`.
            yield scrapy.Request(response.urljoin(next_page), callback=self.parse)

        # Vertical crawl: one request per detail-page link on this page.
        for url in response.xpath("//div[@class='item']/div/a/@href").extract():
            yield scrapy.Request(url, callback=self.parse_item)

    def parse_item(self, response):
        """Handle a movie detail page and yield one ``DoubanMovieItem``.

        Each field is the first text node matched by its XPath, or None when
        nothing matches.
        """
        item = DoubanMovieItem()
        # First <span> of the page heading.
        item['name'] = response.xpath("//div[@id='content']/h1/span[1]/text()").extract_first()
        # Text of the `rating_num` element -- presumably the rating score.
        item['stars'] = response.xpath("//div/strong[@class='ll rating_num']/text()").extract_first()
        # Text of the span inside the `rating_people` link -- presumably the
        # number of people who rated; verify against the live page.
        item['comment'] = response.xpath("//a[@class='rating_people']/span/text()").extract_first()
        yield item
上一篇 下一篇

猜你喜欢

热点阅读