糗事百科的scrapy爬取

2018-11-09  本文已影响0人  楚糖的糖
# -*- coding: utf-8 -*-
import scrapy


class QiuqiuSpider(scrapy.Spider):
    """Spider that scrapes posts from the qiushibaike.com "8hr" listing pages.

    Listing pages 1 through 12 are requested explicitly in
    ``start_requests``; ``parse22`` yields one dict per post found under the
    ``content-left`` container of each page.
    """

    name = 'qiuqiu'

    def start_requests(self):
        """Yield one request per listing page (pages 1 through 12)."""
        for page in range(1, 13):
            url = "https://www.qiushibaike.com/8hr/page/%s/" % page
            yield scrapy.Request(url=url, callback=self.parse22)

    def parse22(self, response):
        """Yield a dict of extracted fields for every post on a listing page.

        Each dict has the keys ``url_1`` (the page URL), ``face`` (list of
        avatar URLs), ``name``, ``age``, ``content``, ``haha_count`` and
        ``ping_count``. Scalar fields are ``None`` when the selector matches
        nothing.
        """
        author = ".//div[@class='author clearfix']"
        for post in response.xpath("//div[@id='content-left']/div"):
            item = {}
            item["url_1"] = response.url
            # Avatar, user name, user age, content, "funny" vote count,
            # comment count.
            # urljoin resolves protocol-relative ("//host/...") and relative
            # sources against the page URL and leaves absolute URLs intact,
            # instead of blindly prefixing "https:".
            item["face"] = [
                response.urljoin(src)
                for src in post.xpath(author + "//img/@src").extract()
            ]
            item["name"] = post.xpath(author + "//h2/text()").extract_first()
            item["age"] = post.xpath(author + "/div/text()").extract_first()
            item["content"] = post.xpath(
                ".//div[@class='content']/span/text()"
            ).extract_first()
            # Both counters are searched among the descendants of *this* post
            # (".//"), like every other field. A "../" path would leave the
            # post node and evaluate against the shared parent, giving the
            # same result for every post.
            # NOTE(review): assumes each counter is the first <i> inside its
            # stats span -- confirm against the live markup.
            item["haha_count"] = post.xpath(
                ".//span[@class='stats-vote']//i/text()"
            ).extract_first()
            item["ping_count"] = post.xpath(
                ".//span[@class='stats-comments']//i/text()"
            ).extract_first()
            # Hand the item to Scrapy (pipelines / feed exports) rather than
            # only printing it.
            yield item
上一篇 下一篇

猜你喜欢

热点阅读