Day 8 — Scrapy spider for 58.com rental listings (58同城租房), with results stored in Redis via scrapy-redis.
2018-08-23
zufang.py
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from chuzu.items import ChuzuItem
from scrapy_redis.spiders import RedisCrawlSpider
class ZufangSpider(RedisCrawlSpider):
    """Distributed CrawlSpider for 58.com rental listings.

    Start URLs are consumed from the Redis list named by ``redis_key``
    (push e.g. ``http://sz.58.com/chuzu/`` to start the crawl), so
    ``start_urls`` stays commented out.
    """
    name = 'zufang'
    allowed_domains = ['58.com']
    # start_urls = ["http://sz.58.com/chuzu/"]
    redis_key = "zufang:start_urls"
    rules = (
        # Follow pagination links such as .../pn2/ and parse every listing page.
        Rule(LinkExtractor(allow=r'pn\d+/'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        """Extract one ChuzuItem per listing, then request its detail page."""
        listings = response.xpath("//div[@class='listBox']/ul/li")
        for li in listings:
            item = ChuzuItem()
            item["title"] = li.xpath("./div[@class='des']/h2/a/text()").extract_first()
            item["room"] = li.xpath("./div[@class='des']/p[1]/text()").extract_first()
            item["img"] = li.xpath("./div[@class='img_list']//img/@src").extract_first()
            item["price"] = li.xpath(".//div[@class='money']/b/text()").extract_first()
            # Link to the second-level (detail) page. extract_first() returns
            # None when the xpath matches nothing, and "http:" + None would
            # raise TypeError — guard and yield the partial item instead.
            href = li.xpath("./div[@class='des']/h2/a/@href").extract_first()
            if not href:
                yield item
                continue
            # Request the detail page, passing the partially-filled item along.
            yield scrapy.Request(url="http:" + href,
                                 callback=self.parse_next,
                                 meta={"item": item})

    def parse_next(self, response):
        """Fill in phone and address from the detail page, then yield the item."""
        item = response.meta["item"]
        item["phone"] = response.xpath("//span[@class='house-chat-txt']/text()").extract_first()
        item["address"] = response.xpath("//span[@class='dz']/text()").extract_first()
        yield item
items.py
import scrapy
class ChuzuItem(scrapy.Item):
    """Item holding one 58.com rental listing."""
    title = scrapy.Field()      # listing title
    room = scrapy.Field()       # room / layout description
    img = scrapy.Field()        # cover image URL
    price = scrapy.Field()      # monthly rent
    jingjiren = scrapy.Field()  # agent (jingjiren) name
    address = scrapy.Field()    # street address
    phone = scrapy.Field()      # contact phone number
settings.py (excerpt, from line 67)
ITEM_PIPELINES = {
'chuzu.pipelines.ChuzuPipeline': 300,
# Redis pipeline: stores every scraped item in Redis as well
"scrapy_redis.pipelines.RedisPipeline":400
}
# scrapy-redis components
# Redis-backed request-fingerprint dedupe filter (shared across workers)
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# scrapy-redis scheduler: the request queue lives in Redis
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# Keep the Redis queue/dupe set on close so the crawl can be paused and resumed
SCHEDULER_PERSIST = True
# Redis connection info
REDIS_HOST = "10.36.131.78"
REDIS_PORT = 6379