
Python Crawlers: Cross-Page Scraping with Scrapy

2017-02-07  罗罗攀

I woke up at 2 a.m. yesterday, read 向右奔跑's article, and decided to try cross-page data scraping with Scrapy, using Jianshu's seven-day trending list as the example.

1 The items.py code

from scrapy.item import Item, Field

class SevendayItem(Item):
    article_url = Field()  # article link, scraped from the trending list page
    author = Field()       # the remaining fields come from each article's detail page
    article = Field()      # article title
    date = Field()         # publish time
    word = Field()         # word count
    view = Field()         # view count
    comment = Field()      # comment count
    like = Field()         # like count
    gain = Field()         # reward (tip) count

As you can see, the fields I want do not all live on one page: the article URL comes from the trending list, while everything else comes from each article's detail page. That calls for cross-page crawling, and the Scrapy mechanism for it is Request.meta, sketched below.
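
Here is a minimal sketch of the meta-passing pattern. The callback names parse_listing and parse_detail are illustrative, not part of the project code:

from scrapy.http import Request

# Whatever you attach to a Request's meta dict in one callback
# comes back on response.meta in the next callback.
def parse_listing(self, response):
    for href in response.xpath('//ul[@class="note-list"]/li/div/a/@href').extract():
        url = response.urljoin(href)
        yield Request(url, meta={'article_url': url}, callback=self.parse_detail)

def parse_detail(self, response):
    # Read back the value stashed on the listing page.
    article_url = response.meta['article_url']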

2 Create sevendayspider.py

import re
import json

import requests
from scrapy.spiders import CrawlSpider
from scrapy.http import Request
from sevenday.items import SevendayItem


class SevendaySpider(CrawlSpider):
    name = 'sevenday'
    start_urls = ['http://www.jianshu.com/trending/weekly']

    def parse(self, response):
        # Each <li> on the trending page is one article teaser.
        infos = response.xpath('//ul[@class="note-list"]/li')

        for info in infos:
            article_url_part = info.xpath('div/a/@href').extract()[0]
            # The href is a relative path like /p/..., so let Scrapy join it
            # with the base URL instead of concatenating strings (which would
            # produce a double slash).
            article_url = response.urljoin(article_url_part)
            # Pass the article URL along to the detail-page callback via meta.
            yield Request(article_url, meta={'article_url': article_url},
                          callback=self.parse_item)

        # Queue the remaining trending pages; Scrapy's duplicate filter drops
        # requests for pages that have already been scheduled.
        urls = ['http://www.jianshu.com/trending/weekly?page={}'.format(i) for i in range(1, 11)]
        for url in urls:
            yield Request(url, callback=self.parse)

    def parse_item(self, response):
        item = SevendayItem()

        item['article_url'] = response.meta['article_url']

        html = response.body.decode('utf-8')
        # Author, title, date, and word count sit in the visible HTML.
        author = response.xpath('//span[@class="name"]/a/text()').extract()[0]
        article = response.xpath('//h1[@class="title"]/text()').extract()[0]
        date = response.xpath('//span[@class="publish-time"]/text()').extract()[0]
        word = response.xpath('//span[@class="wordage"]/text()').extract()[0]
        # View, comment, and like counts only appear in a JSON blob embedded
        # in the page source, so pull them out with regular expressions.
        view = re.findall(r'"views_count":(.*?),', html, re.S)[0]
        comment = re.findall(r'"comments_count":(.*?)}', html, re.S)[0]
        like = re.findall(r'"likes_count":(.*?),', html, re.S)[0]
        # The article id is needed to query the rewards endpoint.
        article_id = re.findall(r'{"id":(.*?),', html, re.S)[0]
        gain_url = 'http://www.jianshu.com/notes/{}/rewards?count=20'.format(article_id)
        # Note: requests.get blocks Scrapy's event loop. It is fine for a
        # small crawl like this one; see the non-blocking sketch after the listing.
        wb_data = requests.get(gain_url)
        json_data = json.loads(wb_data.text)
        gain = json_data['rewards_count']

        item['author'] = author
        item['article'] = article
        item['date'] = date
        item['word'] = word
        item['view'] = view
        item['comment'] = comment
        item['like'] = like
        item['gain'] = gain

        yield item
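
One caveat about parse_item above: the requests.get call is synchronous, so the whole crawler stalls while each rewards request completes. A more Scrapy-native variant chains a second Request and carries the half-built item through meta. This is a sketch, not a drop-in patch, and parse_gain is a hypothetical callback name:

    def parse_item(self, response):
        item = SevendayItem()
        # ... fill in article_url/author/article/date/word/view/comment/like as above ...
        article_id = re.findall(r'{"id":(.*?),', response.body.decode('utf-8'), re.S)[0]
        gain_url = 'http://www.jianshu.com/notes/{}/rewards?count=20'.format(article_id)
        # Hand the partially filled item to the next callback instead of blocking.
        yield Request(gain_url, meta={'item': item}, callback=self.parse_gain)

    def parse_gain(self, response):
        item = response.meta['item']
        item['gain'] = json.loads(response.text)['rewards_count']
        yield item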

Between that article and my code the approach should be clear, so I won't belabor the explanation.
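
Assuming the Scrapy project is named sevenday, as the from sevenday.items import line suggests, the spider can be run from the project root and the items dumped to CSV with Scrapy's built-in feed export:

scrapy crawl sevenday -o sevenday.csv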

Results
