
Python (77): Pipelines, Logging, and Full-Site Crawling

2022-03-17  Lonelyroots


import scrapy
import logging
from myspider02.items import Myspider02Item, TaocheParamenterConfig

logger = logging.getLogger(__name__)


class TaocheSpider(scrapy.Spider):
    name = 'taoche'
    allowed_domains = ['taoche.com']
    start_urls = ['https://changsha.taoche.com/bmw/']

    # URL template
    url = 'https://changsha.taoche.com/bmw/?page=%d'

    count = 0

    def parse(self, response):
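        # Read the total number of pages from the pagination bar, then request every page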
        max_page = response.xpath('//div[@class="paging-box the-pages"]/div/a[last()-1]/text()').extract_first()
        # logger.error(max_page)
        for page in range(1, int(max_page) + 1):
            new_url = self.url % page
            # Manually request every page, handing the URL to the scheduler (as a Request object)
            """
                How to hand a request to the scheduler:
                    yield scrapy.Request()
                url: the request URL
                callback: the function that processes the response data
                meta: passes extra data along with the request
                    every request carries the meta dict {'page': page}
                    which is forwarded to the response:
                    response.meta = meta
                    response.meta['page']
            """
            yield scrapy.Request(url=new_url, callback=self.parse_taoche, meta={'page': page})

    def parse_taoche(self, response):
        # logger.error(f'{response.meta["page"]}')
        # Get the list of cars on each page in turn
        car_list = response.xpath('//div[@id="container_base"]/ul/li')
        for car in car_list:
            # count is used for testing
            # self.count += 1
            # logger.error(self.count)
            CarFigure = car.xpath('./div[1]/div/a/img/@src').extract_first()
            Title = car.xpath('./div[2]/a/span/text()').extract_first()
            RegisterYear = car.xpath('./div[2]/p/i[1]/text()').extract_first()
            mileage = car.xpath('./div[2]/p/i[2]/text()').extract_first()
            city = car.xpath('./div[2]/p/i[3]/text()').extract_first().strip()
            selling_price = car.xpath('./div[2]/div[1]/i[1]/text()').extract_first()
            price = car.xpath('.//div[@class="price"]/i[2]/text()').extract_first()

            item = Myspider02Item()
            item['CarFigure'] = CarFigure
            item['Title'] = Title
            item['RegisterYear'] = RegisterYear
            item['mileage'] = mileage
            item['city'] = city
            item['selling_price'] = selling_price
            item['price'] = price

            # logger.error(item)

            # Get the URL of each car's detail page
            detail_url = car.xpath('./div[1]/div/a/@href').extract_first()
            yield scrapy.Request(url=detail_url, callback=self.parse_detail, meta={'item': item})

    def parse_detail(self, response):
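        # The displacement/gearbox attributes arrive as one slash-separated string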
        attrs = response.xpath('/html/body/div[9]/div[1]/div[2]/div[4]/div/dl[3]/dd/text()').extract_first()
        displacement, gearbox = attrs.split('/')

        BrandModel = response.xpath('/html/body/div[9]/div[10]/div[2]/div[1]/ul/li[1]/span/a/text()').extract_first()
        SourceLocation = response.xpath('/html/body/div[9]/div[10]/div[2]/div[1]/ul/li[2]/span/text()').extract_first()

        taocheParamenterConfig = TaocheParamenterConfig()
        taocheParamenterConfig['displacement'] = displacement
        taocheParamenterConfig['gearbox'] = gearbox
        taocheParamenterConfig['BrandModel'] = BrandModel
        taocheParamenterConfig['SourceLocation'] = SourceLocation

        # Foreign-key-style association: attach the detail item to the main item
        item = response.meta['item']
        item['detail'] = taocheParamenterConfig

        # logger.error(item)

        yield item
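
With the spider above, the whole listing is crawled page by page. Here is a minimal sketch of launching it from a plain script instead of the scrapy crawl taoche command; it assumes the standard Scrapy project layout, with this spider saved under myspider02/spiders/:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from myspider02.spiders.taoche import TaocheSpider  # assumption: standard spiders/ layout

# Build a crawler process from the project's settings.py so pipelines and logging apply,
# then schedule the spider and block until the crawl finishes.
process = CrawlerProcess(get_project_settings())
process.crawl(TaocheSpider)
process.start()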

16_管道、日志与全站爬取/myspider02/myspider02/items.py:

import scrapy


class Myspider02Item(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    CarFigure = scrapy.Field()
    Title = scrapy.Field()
    RegisterYear = scrapy.Field()
    mileage = scrapy.Field()
    city = scrapy.Field()
    selling_price = scrapy.Field()
    price = scrapy.Field()
    detail = scrapy.Field()


class TaocheParamenterConfig(scrapy.Item):
    displacement = scrapy.Field()
    gearbox = scrapy.Field()
    BrandModel = scrapy.Field()
    SourceLocation = scrapy.Field()
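
Note that Myspider02Item nests a second Item (TaocheParamenterConfig) in its detail field. dict(item) only converts the outer Item, so the nested one stays a scrapy.Item, which some pymongo versions refuse to BSON-encode. A defensive sketch that flattens one level before storage (serialize_item is a hypothetical helper, not part of the project):

import scrapy

def serialize_item(item):
    """Hypothetical helper: convert an Item, including nested Item values, to plain dicts."""
    data = dict(item)
    for key, value in data.items():
        if isinstance(value, scrapy.Item):
            data[key] = dict(value)  # e.g. the TaocheParamenterConfig stored under 'detail'
    return data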

16_管道、日志与全站爬取/myspider02/myspider02/MyMongoDB.py:

from pymongo import MongoClient


class MyMongoDB:
    def __init__(self, database, collection):
        # Connect only once; never put this inside a loop!
        # MongoDB connection
        conn = MongoClient('localhost', 8881)  # connect to the MongoDB server
        db = conn[database]
        self.my_set = db[collection]

    def insert(self, data, onlyOne=True):
        if not isinstance(onlyOne, bool):
            raise TypeError
        self.my_set.insert_one(data) if onlyOne else self.my_set.insert_many(data)

    def find(self, query=None, onlyOne=True):
        if not isinstance(onlyOne, bool):
            raise TypeError
        return self.my_set.find_one(query) if onlyOne else self.my_set.find(query)

    def update(self, data, new_data, onlyOne=True):
        if not isinstance(onlyOne, bool):
            raise TypeError
        self.my_set.update_one(data, {'$set': new_data}) if onlyOne else self.my_set.update_many(data,
                                                                                                 {'$set': new_data})

    def delete(self, data, onlyOne=True):
        if not isinstance(onlyOne, bool):
            raise TypeError
        self.my_set.delete_one(data) if onlyOne else self.my_set.delete_many(data)
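
A quick usage sketch of the wrapper; it assumes a MongoDB instance is actually listening on localhost:8881, as configured above:

db = MyMongoDB('taoche', 'car')
db.insert({'Title': 'test car', 'price': '10.00'})    # insert_one
print(db.find({'Title': 'test car'}))                 # find_one
db.update({'Title': 'test car'}, {'price': '9.50'})   # update_one with $set
db.delete({'Title': 'test car'})                      # delete_one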

16_管道、日志与全站爬取/myspider02/myspider02/pipelines.py:

from itemadapter import ItemAdapter
from myspider02.MyMongoDB import MyMongoDB


class Myspider02Pipeline:
    mongoDB = None

    def open_spider(self, spider):
        if spider.name == "taoche":
            print('Start crawling')
            self.mongoDB = MyMongoDB('taoche', 'car')

    def process_item(self, item, spider):
        if spider.name == "taoche":
            self.mongoDB.insert(dict(item))
        return item

    def close_spider(self, spider):
        if spider.name == "taoche":
            print('Finished crawling')
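
None of this runs unless the pipeline is registered in the project's settings.py. A minimal sketch (300 is just the conventional middle priority; the logging settings are assumptions that pair with the logger.error() calls used for debugging in the spider):

# settings.py (sketch)
ITEM_PIPELINES = {
    'myspider02.pipelines.Myspider02Pipeline': 300,
}

# Optional: quiet the console and keep errors in a file
LOG_LEVEL = 'ERROR'        # assumption
LOG_FILE = 'taoche.log'    # assumption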

That's the end of this article! I hope you'll keep supporting this Python series! I can teach you Python in six months; message me privately with any questions about this article! New articles will be published every day, so follow me if you like them! A fellow young developer keeping you company while you learn Python! No matter how busy I get, the updates will continue. Let's keep at it together!

Editor: Lonelyroots
