scrapy汽车之家(配置pipelines)
2019-08-02 本文已影响0人
小董不太懂
之前一直在搞模拟登录,发现爬虫的水越来越深,js是这个世界上最恶心的语言,各种加密,各种反爬,怪我太菜,被一系列反爬折磨得苦不堪言。不得不说,爬虫是被培训机构炒起来的,而培训机构背后的水很深,爬虫入门容易,想深入很难。这几天看了好多从网上得到的培训视频,不得不说,基本都是垃圾,所以我建议,除了大神的网课,其他的垃圾网课不看也罢,以为爬个妹子图就会爬虫了吗???还是要脚踏实地。
今天写一个关于scrapy抓取汽车之家宝马五系的图片,以谋求进一步熟悉对scrapy的操作。
目标网址:https://car.autohome.com.cn/pic/series/65.html#pvareaid=3454438



其次,我们想按照类别,将图片信息存放在对应的类别下面
# -*- coding: utf-8 -*-
import scrapy
from BMW.items import BmwItem
class Bmw5Spider(scrapy.Spider):
    """Crawl the BMW 5-series picture gallery on car.autohome.com.cn."""
    name = 'BMW5'
    allowed_domains = ['car.autohome.com.cn']
    start_urls = ['https://car.autohome.com.cn/pic/series/65.html#pvareaid=3454438']

    def parse(self, response):
        # Skip the first "uibox" (the panorama section); keep the rest.
        boxes = response.xpath('//div[@class="column grid-16"]/div[@class="uibox"]')[1:]
        for box in boxes:
            # Category title of this section, plus its thumbnail image srcs.
            title = box.xpath('.//div[@class="uibox-title"]/a/text()').get()
            img_srcs = box.xpath('.//ul/li/a/img/@src').getall()
            # src values are relative/protocol-relative; make them absolute.
            absolute_urls = [response.urljoin(src) for src in img_srcs]
            yield BmwItem(category=title, urls=absolute_urls)
然后就是items和settings的配置。
# settings.py (excerpt) -- project identity, politeness and pipeline wiring.
BOT_NAME = 'BMW'

SPIDER_MODULES = ['BMW.spiders']
NEWSPIDER_MODULE = 'BMW.spiders'

# Do not honour robots.txt for this crawl.
ROBOTSTXT_OBEY = False

# Browser-like headers so the site serves the regular HTML.
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36',
}

# Route scraped items through the hand-written pipeline (lower = earlier).
ITEM_PIPELINES = {
    'BMW.pipelines.BmwPipeline': 300,
}
import scrapy
class BmwItem(scrapy.Item):
    """One gallery category scraped by the spider."""
    category = scrapy.Field()  # section title text
    urls = scrapy.Field()      # list of absolute image URLs in that section
重头戏是关于pipelines的配置
从上面的settings,可以看到我们已经开启了pipelines
import os
from urllib import request
class BmwPipeline(object):
    """Download every image URL of an item into images/<category>/."""

    def __init__(self):
        # images/ lives next to the package: parent of this file's directory.
        self.path = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'images')
        # makedirs(exist_ok=True) is race-free and also creates missing
        # parent directories, unlike the bare os.mkdir it replaces.
        os.makedirs(self.path, exist_ok=True)

    def process_item(self, item, spider):
        """Fetch each URL in item['urls'] into a per-category folder.

        Returns the item unchanged so later pipelines still see it.
        """
        category = item['category']
        urls = item['urls']
        category_path = os.path.join(self.path, category)
        os.makedirs(category_path, exist_ok=True)
        for url in urls:
            # Keep only the part after the last '__' as the file name.
            image_name = url.split('__')[-1]
            # NOTE(review): urlretrieve blocks and has no error handling;
            # one failed download aborts the whole item. Scrapy's own
            # ImagesPipeline (used later in this article) avoids this.
            request.urlretrieve(url, os.path.join(category_path, image_name))
        return item

我要接着尝试其他下载方法
比如scrapy自带的异步下载法




重点又来了,自定义pipelines
# -*- coding: utf-8 -*-
# settings.py (excerpt) for the ImagesPipeline-based variant.
BOT_NAME = 'BMW'

SPIDER_MODULES = ['BMW.spiders']
NEWSPIDER_MODULE = 'BMW.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'BMW (+http://www.yourdomain.com)'

# Do not honour robots.txt for this crawl.
ROBOTSTXT_OBEY = False

DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36',
}

ITEM_PIPELINES = {
    # The hand-written pipeline and the stock ImagesPipeline are kept here,
    # disabled, for comparison with the customised subclass that is enabled.
    #'BMW.pipelines.BmwPipeline': 300,
    # 'scrapy.pipelines.images.ImagesPipeline' :1
    'BMW.pipelines.BMWPipelineimage': 1,
}

# Root folder for downloaded images, read by the images pipeline.
import os
IMAGES_STORE = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'images')
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import os
from urllib import request
from scrapy.pipelines.images import ImagesPipeline
from BMW import settings
class BMWPipelineimage(ImagesPipeline):
    """ImagesPipeline subclass that files each image under its category."""

    def get_media_requests(self, item, info):
        """Called before the download requests are sent.

        Delegates to the parent to build the requests, then attaches the
        item to each request object so file_path() can read its category.
        """
        request_objs = super(BMWPipelineimage, self).get_media_requests(item, info)
        for request_obj in request_objs:
            request_obj.item = item
        return request_objs

    def file_path(self, request, response=None, info=None):
        """Called when an image is stored; returns its destination path."""
        # The parent returns a path of the form 'full/<hash>.jpg'.
        path = super(BMWPipelineimage, self).file_path(request, response, info)
        category = request.item['category']
        images_store = settings.IMAGES_STORE
        category_path = os.path.join(images_store, category)
        # makedirs(exist_ok=True) is race-free and also works when
        # IMAGES_STORE itself does not exist yet (os.mkdir would raise).
        os.makedirs(category_path, exist_ok=True)
        # Drop the 'full/' prefix so only the hashed file name remains.
        image_name = path.replace('full/', '')
        return os.path.join(category_path, image_name)
