day8: scrapy - crawling 58.com rental listings and storing them in redis


zufang.py

# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from chuzu.items import ChuzuItem
from scrapy_redis.spiders import RedisCrawlSpider


class ZufangSpider(RedisCrawlSpider):
    name = 'zufang'
    allowed_domains = ['58.com']
    # the start URL is read from redis instead of a hard-coded start_urls list
    # start_urls = ["http://sz.58.com/chuzu/"]
    redis_key = "zufang:start_urls"

    rules = (
        # follow pagination links (e.g. .../pn2/) and parse every listing page
        Rule(LinkExtractor(allow=r'pn\d+/'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        # one <li> per rental listing on the page
        li_list = response.xpath("//div[@class='listBox']/ul/li")
        for li in li_list:
            item = ChuzuItem()
            item["title"] = li.xpath("./div[@class='des']/h2/a/text()").extract_first()
            item["room"] = li.xpath("./div[@class='des']/p[1]/text()").extract_first()
            item["img"] = li.xpath("./div[@class='img_list']//img/@src").extract_first()
            item["price"] = li.xpath(".//div[@class='money']/b/text()").extract_first()
            # build the absolute link to the listing's detail page
            next_url = "http:" + li.xpath("./div[@class='des']/h2/a/@href").extract_first()
            # request the detail page, passing the partially filled item along via meta
            yield scrapy.Request(url=next_url, callback=self.parse_next, meta={"item": item})

    def parse_next(self, response):
        # pick up the item passed along from parse_item
        item = response.meta["item"]
        item["phone"] = response.xpath("//span[@class='house-chat-txt']/text()").extract_first()
        item["address"] = response.xpath("//span[@class='dz']/text()").extract_first()

        yield item
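
Because the spider reads its start URL from redis (redis_key = "zufang:start_urls") rather than from start_urls, the crawl sits idle until that key is seeded. A minimal sketch of seeding it, assuming the redis-py package and the REDIS_HOST/REDIS_PORT values from settings.py below:

import redis

# connect to the same redis instance configured in settings.py
r = redis.Redis(host="10.36.131.78", port=6379)
# push the listing page into the spider's start-urls list; every waiting
# spider instance pops from this key and begins crawling
r.lpush("zufang:start_urls", "http://sz.58.com/chuzu/")

The same effect can be had from redis-cli with: lpush zufang:start_urls http://sz.58.com/chuzu/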

items.py

import scrapy


class ChuzuItem(scrapy.Item):
    # listing title
    title = scrapy.Field()
    # room information
    room = scrapy.Field()
    # image
    img = scrapy.Field()
    # price
    price = scrapy.Field()
    # agent (jingjiren)
    jingjiren = scrapy.Field()
    # address
    address = scrapy.Field()
    # phone number
    phone = scrapy.Field()

settings.py

ITEM_PIPELINES = {
    # the project's own pipeline
    'chuzu.pipelines.ChuzuPipeline': 300,
    # scrapy_redis pipeline that pushes items into redis
    'scrapy_redis.pipelines.RedisPipeline': 400,
}

# scrapy_redis components
# deduplication filter backed by redis
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# scrapy_redis scheduler
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# keep the request queue in redis so the crawl can be paused and resumed
SCHEDULER_PERSIST = True

# redis connection settings
REDIS_HOST = "10.36.131.78"
REDIS_PORT = 6379
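
pipelines.py is not shown in this post, but ITEM_PIPELINES registers chuzu.pipelines.ChuzuPipeline at priority 300, ahead of RedisPipeline at 400. A minimal sketch of what that class might look like; the body is an assumption, only the class name comes from the settings above:

class ChuzuPipeline:
    def process_item(self, item, spider):
        # runs before RedisPipeline (300 < 400), so any cleaning done here
        # is reflected in what RedisPipeline later writes to redis
        return item

With SCHEDULER_PERSIST = True the request queue and dupefilter survive in redis across restarts, and RedisPipeline serializes each yielded item and pushes it to the zufang:items list (scrapy_redis defaults the key to "<spider name>:items").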