scrapy汽车之家(配置pipelines)
2019-08-02 本文已影响0人
小董不太懂
之前一直在搞模拟登录,发现爬虫的水越来越深,js是这个世界上最恶心的语言,各种加密,各种反爬,怪我太菜,被一系列反爬折磨得苦不堪言。不得不说,爬虫是被培训机构炒起来的,而培训机构背后的水很深,爬虫入门容易,想深入很难。这几天看了好多从网上得到的培训视频,不得不说,基本都是垃圾,所以我建议,除了大神的网课,其他的垃圾网课不看也罢,以为爬个妹子图就会爬虫了吗???还是要脚踏实地。
今天写一个关于scrapy抓取汽车之家宝马五系的图片,以谋求进一步熟悉对scrapy的操作。
目标网址:https://car.autohome.com.cn/pic/series/65.html#pvareaid=3454438



其次,我们想按照类别,将图片信息存放在对应的类别下面
# -*- coding: utf-8 -*-
import scrapy
from BMW.items import BmwItem
class Bmw5Spider(scrapy.Spider):
    """Crawl the BMW 5-series picture gallery on car.autohome.com.cn."""
    name = 'BMW5'
    allowed_domains = ['car.autohome.com.cn']
    start_urls = ['https://car.autohome.com.cn/pic/series/65.html#pvareaid=3454438']

    def parse(self, response):
        # Skip the first "uibox" (the panorama section); keep the rest.
        boxes = response.xpath('//div[@class="column grid-16"]/div[@class="uibox"]')[1:]
        for box in boxes:
            # Category title of this section, plus its thumbnail image srcs.
            title = box.xpath('.//div[@class="uibox-title"]/a/text()').get()
            img_srcs = box.xpath('.//ul/li/a/img/@src').getall()
            # src values are relative/protocol-relative; make them absolute.
            absolute_urls = [response.urljoin(src) for src in img_srcs]
            yield BmwItem(category=title, urls=absolute_urls)
然后就是items和settings的配置。
# settings.py (excerpt) -- project identity, politeness and pipeline wiring.
BOT_NAME = 'BMW'

SPIDER_MODULES = ['BMW.spiders']
NEWSPIDER_MODULE = 'BMW.spiders'

# Do not honour robots.txt for this crawl.
ROBOTSTXT_OBEY = False

# Browser-like headers so the site serves the regular HTML.
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36',
}

# Route scraped items through the hand-written pipeline (lower = earlier).
ITEM_PIPELINES = {
    'BMW.pipelines.BmwPipeline': 300,
}
import scrapy
class BmwItem(scrapy.Item):
    """One gallery category scraped by the spider."""
    category = scrapy.Field()  # section title text
    urls = scrapy.Field()      # list of absolute image URLs in that section
重头戏是关于pipelines的配置
从上面的settings,可以看到我们已经开启了pipelines
import os
from urllib import request
class BmwPipeline(object):
    """Download every image URL of an item into images/<category>/."""

    def __init__(self):
        # images/ lives next to the package: parent of this file's directory.
        self.path = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'images')
        # makedirs(exist_ok=True) is race-free and also creates missing
        # parent directories, unlike the bare os.mkdir it replaces.
        os.makedirs(self.path, exist_ok=True)

    def process_item(self, item, spider):
        """Fetch each URL in item['urls'] into a per-category folder.

        Returns the item unchanged so later pipelines still see it.
        """
        category = item['category']
        urls = item['urls']
        category_path = os.path.join(self.path, category)
        os.makedirs(category_path, exist_ok=True)
        for url in urls:
            # Keep only the part after the last '__' as the file name.
            image_name = url.split('__')[-1]
            # NOTE(review): urlretrieve blocks and has no error handling;
            # one failed download aborts the whole item. Scrapy's own
            # ImagesPipeline (used later in this article) avoids this.
            request.urlretrieve(url, os.path.join(category_path, image_name))
        return item

我要接着尝试其他下载方法
比如scrapy自带的异步下载法




重点又来了,自定义pipelines
# -*- coding: utf-8 -*-
# settings.py (excerpt) for the ImagesPipeline-based variant.
BOT_NAME = 'BMW'

SPIDER_MODULES = ['BMW.spiders']
NEWSPIDER_MODULE = 'BMW.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'BMW (+http://www.yourdomain.com)'

# Do not honour robots.txt for this crawl.
ROBOTSTXT_OBEY = False

DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36',
}

ITEM_PIPELINES = {
    # The hand-written pipeline and the stock ImagesPipeline are kept here,
    # disabled, for comparison with the customised subclass that is enabled.
    #'BMW.pipelines.BmwPipeline': 300,
    # 'scrapy.pipelines.images.ImagesPipeline' :1
    'BMW.pipelines.BMWPipelineimage': 1,
}

# Root folder for downloaded images, read by the images pipeline.
import os
IMAGES_STORE = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'images')
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import os
from urllib import request
from scrapy.pipelines.images import ImagesPipeline
from BMW import settings
class BMWPipelineimage(ImagesPipeline):
    """ImagesPipeline subclass that files each image under its category."""

    def get_media_requests(self, item, info):
        """Called before the download requests are sent.

        Delegates to the parent to build the requests, then attaches the
        item to each request object so file_path() can read its category.
        """
        request_objs = super(BMWPipelineimage, self).get_media_requests(item, info)
        for request_obj in request_objs:
            request_obj.item = item
        return request_objs

    def file_path(self, request, response=None, info=None):
        """Called when an image is stored; returns its destination path."""
        # The parent returns a path of the form 'full/<hash>.jpg'.
        path = super(BMWPipelineimage, self).file_path(request, response, info)
        category = request.item['category']
        images_store = settings.IMAGES_STORE
        category_path = os.path.join(images_store, category)
        # makedirs(exist_ok=True) is race-free and also works when
        # IMAGES_STORE itself does not exist yet (os.mkdir would raise).
        os.makedirs(category_path, exist_ok=True)
        # Drop the 'full/' prefix so only the hashed file name remains.
        image_name = path.replace('full/', '')
        return os.path.join(category_path, image_name)
