Scraping Nationwide Housing Listings with the Scrapy Framework
2019-06-26 · 米兰的小铁匠
Result
(screenshot: the exported JSON file)
Preface
- This project scrapes housing listings for all 658 cities nationwide from the Fang.com (房天下) site, covering both new homes and second-hand homes!
- Scrapy is built on the Twisted asynchronous networking framework, which speeds up downloading.
Docs: http://scrapy-chs.readthedocs.io/zh_CN/1.0/intro/overview.html
Scrapy workflow
The flow can be described as follows:
- The scheduler hands requests --> engine --> downloader middleware --> downloader
- The downloader sends the request and fetches a response --> downloader middleware --> engine --> spider middleware --> spider
- The spider extracts URLs and assembles them into Request objects --> spider middleware --> engine --> scheduler
- The spider extracts scraped data --> engine --> pipeline
- The pipeline processes and saves the data
(diagram: Scrapy's architecture and data flow)
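To make the flow concrete, here is a minimal generic sketch (the spider name and URL are placeholders, not part of this project): a callback either yields items, which travel to the pipeline, or yields new Requests, which go back to the scheduler.

import scrapy

class DemoSpider(scrapy.Spider):
    name = 'demo'                          # placeholder name
    start_urls = ['https://example.com']   # placeholder URL

    def parse(self, response):
        # a data item flows: spider --> engine --> pipeline
        yield {'title': response.xpath('//title/text()').get()}
        # a new request flows: spider --> engine --> scheduler
        for href in response.xpath('//a/@href').getall():
            yield scrapy.Request(response.urljoin(href), callback=self.parse)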
Main
- Create a Scrapy project and spider (with the standard CLI: scrapy startproject myspider, then scrapy genspider fang fang.com inside the project).
- First, set up random request headers in the middlewares file (I only added a single User-Agent, so my IP got throttled during later testing, painful...). This link has more User-Agent strings you can add! A sketch of such a middleware follows below.
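A minimal sketch of such a random User-Agent downloader middleware (the class name and the short USER_AGENTS list are illustrative; in practice you would paste in many more strings):

import random

class RandomUserAgentMiddleware(object):
    # illustrative list -- extend it with more real User-Agent strings
    USER_AGENTS = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.1.1 Safari/605.1.15',
        'Mozilla/5.0 (X11; Linux x86_64; rv:66.0) Gecko/20100101 Firefox/66.0',
    ]

    def process_request(self, request, spider):
        # attach a randomly chosen User-Agent to every outgoing request
        request.headers['User-Agent'] = random.choice(self.USER_AGENTS)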
- Then wire it up in the settings file; a sketch of the relevant entries follows below. Don't forget to keep the robots protocol switched on and fetch data politely!
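A minimal sketch of the relevant settings.py entries, assuming the project is named myspider; the middleware class is the illustrative one above, the pipeline class appears later in this post, and the priority numbers and delay are just reasonable choices:

# settings.py (excerpt)
ROBOTSTXT_OBEY = True    # honour robots.txt, as suggested above

DOWNLOAD_DELAY = 1       # slow down a little to be polite to the site

DOWNLOADER_MIDDLEWARES = {
    'myspider.middlewares.RandomUserAgentMiddleware': 543,
}

ITEM_PIPELINES = {
    'myspider.pipelines.MyspiderPipeline': 300,
}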
- Define the data objects in the items file:
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy

# Define the fields we want to capture
# New homes
class NewHouseItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    # province
    province = scrapy.Field()
    # city
    city = scrapy.Field()
    # number of bedrooms (X居)
    room = scrapy.Field()
    # floor area
    area = scrapy.Field()
    # district
    district = scrapy.Field()
    # address
    address = scrapy.Field()
    # sale status
    sale = scrapy.Field()
    # price
    price = scrapy.Field()
    # link to the listing detail page
    origin_url = scrapy.Field()
    # name of the residential complex
    name = scrapy.Field()
# Second-hand homes
class ESFHouseItem(scrapy.Item):
    # province
    province = scrapy.Field()
    # city
    city = scrapy.Field()
    # layout (X rooms, Y halls)
    room = scrapy.Field()
    # built-up area
    area = scrapy.Field()
    # floor
    floor = scrapy.Field()
    # address
    address = scrapy.Field()
    # price per square metre
    unit = scrapy.Field()
    # total price
    price = scrapy.Field()
    # year built
    year = scrapy.Field()
    # name of the residential complex
    name = scrapy.Field()
    # orientation
    toward = scrapy.Field()
    # link to the listing detail page
    origin_url = scrapy.Field()
The key part
- Create a .py file in the spiders folder and write the crawler (comments included):
import re
import scrapy
from myspider.items import NewHouseItem, ESFHouseItem
# only needed for the distributed (scrapy-redis) variant described at the end
from scrapy_redis.spiders import RedisSpider

class FangSpider(scrapy.Spider):
    name = 'fang'                   # spider name
    allowed_domains = ['fang.com']  # crawl scope
    # the URL the crawl starts from
    start_urls = ['https://www.fang.com/SoufunFamily.htm']

    # collect the listing index URLs for new homes and second-hand homes
    def parse(self, response):
        trs = response.xpath("//div[@class='outCont']//tr")
        province = None
        for tr in trs:
            tds = tr.xpath(".//td[not(@class)]")
            province_td = tds[0]
            province_text = province_td.xpath(".//text()").get()
            province_text = re.sub(r"\s", "", province_text or "")
            # an empty cell means "same province as the previous row"
            if province_text:
                province = province_text
            # skip the overseas section ("其它" = other)
            if province == "其它":
                continue
            city_td = tds[1]
            city_links = city_td.xpath(".//a")
            for city_link in city_links:
                city = city_link.xpath(".//text()").get()
                city_url = city_link.xpath(".//@href").get()
                # build the new-home listing URL from the city URL
                url_module = city_url.split("//")
                scheme = url_module[0]
                domain = url_module[1]
                # Beijing is a special case with its own subdomains
                if "bj." in domain:
                    newhouse_url = 'https://newhouse.fang.com/house/s/'
                    esf_url = 'https://esf.fang.com/'
                else:
                    newhouse_url = scheme + "//" + "newhouse." + domain + "house/s/"
                    # build the second-hand listing URL
                    esf_url = scheme + "//" + "esf." + domain
                # yield new Request objects back to the scheduler
                yield scrapy.Request(url=newhouse_url, callback=self.parse_newhouse,
                                     meta={"info": (province, city)})
                yield scrapy.Request(url=esf_url, callback=self.parse_esf,
                                     meta={"info": (province, city)})
    # parse a new-home listing page
    def parse_newhouse(self, response):
        province, city = response.meta.get('info')
        # each li holds one residential complex
        # contains(): matches when the class attribute contains the given string
        lis = response.xpath("//div[contains(@class,'nl_con')]/ul/li")
        for li in lis:
            name = li.xpath(".//div[@class='nlcd_name']/a/text()").get()
            if name:
                name = name.strip()
            house_type_list = li.xpath(".//div[contains(@class,'house_type')]/a/text()").getall()
            # strip whitespace characters
            house_type_list = list(map(lambda x: re.sub(r"\s", "", x), house_type_list))
            # keep only the "X居" (bedroom count) entries
            room = list(filter(lambda x: x.endswith("居"), house_type_list))
            # floor area
            area = "".join(li.xpath(".//div[contains(@class,'house_type')]/text()").getall())
            area = re.sub(r"\s|-|/", "", area)
            # address and district
            address = li.xpath(".//div[@class='address']/a/@title").get()
            district_text = "".join(li.xpath(".//div[@class='address']/a//text()").getall())
            district = re.search(r".*\[(.+)\].*", district_text)
            if district:
                district = district.group(1)
            # sale status
            sale = li.xpath(".//div[contains(@class,'fangyuan')]/span/text()").get()
            # price (drop whitespace and the "广告" ad marker)
            price = ''.join(li.xpath(".//div[@class='nhouse_price']//text()").getall())
            price = re.sub(r"\s|广告", "", price)
            # link to the listing detail page
            origin_url = li.xpath(".//div[@class='nlcd_name']/a/@href").get()
            item = NewHouseItem(name=name, province=province, city=city, room=room, area=area,
                                sale=sale, price=price, address=address, district=district,
                                origin_url=origin_url)
            yield item
        # follow the next-page link
        next_url = response.xpath("//div[@class='page']//a[@class='next']//@href").get()
        if next_url:
            yield scrapy.Request(url=response.urljoin(next_url), callback=self.parse_newhouse,
                                 meta={'info': (province, city)})
    # parse a second-hand listing page
    def parse_esf(self, response):
        province, city = response.meta.get('info')
        dls = response.xpath("//div[@class='shop_list shop_list_4']/dl")
        for dl in dls:
            item = ESFHouseItem(province=province, city=city)
            # name of the residential complex
            name = dl.xpath(".//p[@class='add_shop']/a/text()").get()
            if name:
                item['name'] = name.strip()
            # listing attributes: layout / floor / orientation / area / year
            infos = dl.xpath(".//p[@class='tel_shop']/text()").getall()
            infos = list(map(lambda x: re.sub(r"\s", "", x), infos))
            for info in infos:
                if '厅' in info:
                    item['room'] = info
                elif '层' in info:
                    item['floor'] = info
                elif '向' in info:
                    item['toward'] = info
                elif '㎡' in info:
                    item['area'] = info
                elif info:
                    item['year'] = info.replace('建', '')
            # address
            item['address'] = dl.xpath(".//p[@class='add_shop']/span/text()").get()
            # total price
            item['price'] = ''.join(dl.xpath(".//dd[@class='price_right']/span[1]//text()").getall()).replace('\r\n', '').replace(' ', '')
            # price per square metre
            item['unit'] = ''.join(dl.xpath(".//dd[@class='price_right']/span[2]//text()").getall()).replace('\r\n', '').replace(' ', '')
            # link to the listing detail page
            detail_url = dl.xpath(".//h4[@class='clearfix']/a/@href").get()
            if detail_url:
                item['origin_url'] = response.urljoin(detail_url)
            yield item
        # follow the next-page link (guard against a missing link on the last page)
        next_url = response.xpath("//div[@class='page_al']/p[1]/a/@href").get()
        if next_url:
            yield scrapy.Request(url=response.urljoin(next_url), callback=self.parse_esf,
                                 meta={'info': (province, city)})
- The scraped data is held in dict-like Item objects; we can also store it in JSON files.
- Write the storage code in the pipelines file to save everything to JSON:
from scrapy.exporters import JsonLinesItemExporter
from myspider.items import NewHouseItem, ESFHouseItem

# save the scraped items into JSON files, one file per item type
class MyspiderPipeline(object):
    def __init__(self):
        self.newhouse_fp = open('newhouse.json', 'wb')
        self.esfhouse_fp = open('esfhouse.json', 'wb')
        self.newhouse_exporter = JsonLinesItemExporter(self.newhouse_fp,
                                                       ensure_ascii=False)
        self.esfhouse_exporter = JsonLinesItemExporter(self.esfhouse_fp,
                                                       ensure_ascii=False)

    def process_item(self, item, spider):
        # route each item to the exporter for its type
        if isinstance(item, NewHouseItem):
            self.newhouse_exporter.export_item(item)
        elif isinstance(item, ESFHouseItem):
            self.esfhouse_exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.newhouse_fp.close()
        self.esfhouse_fp.close()
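Note that JsonLinesItemExporter writes each item as a single JSON object per line, so the output files can later be processed line by line without loading everything into memory.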
It looks roughly like this:
(screenshot: the exported JSON file)
A lot of data, right? And housing is so expensive....
- If you need more throughput, the crawl can be run distributed, deployed across servers!
- Install scrapy-redis and host the message queue on a server; a virtual machine can serve as that server. Upload the project files, and of course install all the required packages first.
Notes on the distributed version
- The spider inherits from RedisSpider instead of scrapy.Spider.
- It gains a redis_key and loses start_urls: in a distributed crawl, if every machine requested start_url itself, those requests would be duplicated.
- It follows the distributed crawlspider pattern from scrapy_redis.
- It has an extra __init__ method; that method is not required, and allowed_domains can be specified manually instead. A sketch of the changes follows below.
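A minimal sketch of those changes, assuming a reachable Redis instance; the redis_key name and the connection values are placeholders:

# spider changes: inherit from RedisSpider and read the start URL from Redis
from scrapy_redis.spiders import RedisSpider

class FangSpider(RedisSpider):
    name = 'fang'
    allowed_domains = ['fang.com']
    # no start_urls here; instead push the start URL into Redis once:
    #   redis-cli> lpush fang:start_urls https://www.fang.com/SoufunFamily.htm
    redis_key = 'fang:start_urls'

# settings.py additions for scrapy_redis
SCHEDULER = "scrapy_redis.scheduler.Scheduler"              # Redis-backed shared scheduler
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"  # shared request de-duplication
SCHEDULER_PERSIST = True                                    # keep the queue between runs
REDIS_HOST = '127.0.0.1'                                    # placeholder: the Redis server
REDIS_PORT = 6379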
Finally
Directory structure: (screenshot)
- Create a start file in the project root to launch the program; see the sketch below.
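A minimal sketch of such a start file (commonly called start.py; the name is just a convention):

# start.py -- launch the spider without typing the command each time
from scrapy import cmdline

cmdline.execute("scrapy crawl fang".split())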