Scrapy爬虫实战 - 卡车

2020-01-15  本文已影响0人  诺之林
pipenv --version
# pipenv, version 2018.10.13

mkdir truck && cd truck

vim Pipfile
[[source]]
url = "https://mirrors.aliyun.com/pypi/simple"
verify_ssl = true
name = "pypi"

[requires]
python_version = "3.7"

[packages]
scrapy = "*"

[dev-packages]
pylint = "*"
"autopep8" = "*"
gi python >> .gitignore

pipenv install

pipenv run scrapy version
# Scrapy 1.8.0

pipenv run scrapy startproject truck
mv truck temp && mv temp/* . && rm -rf temp

pipenv run scrapy genspider che che.com

vim truck/spiders/che.py
# -*- coding: utf-8 -*-

import scrapy


class CheSpider(scrapy.Spider):
    """Spider that crawls truck listing pages, drills into each model's
    spec ("配置") page, and extracts the model name plus its key/value
    specification table.

    Crawl flow: start_requests -> parse (listing + pagination)
    -> parse_category (model links) -> parse_detail (name + specs).
    """

    name = 'che'
    # Running total of detail pages parsed, used only for progress output.
    count = 0
    # Placeholders: the original article redacted the real site; replace
    # with the actual domain / site root before running.
    allowed_domains = ['*']
    base_url = "*"

    def start_requests(self):
        """Seed one request per truck-category listing page."""
        urls = [
            '*',  # 牵引车 (tractor trucks)
            '*',  # 载货车 (cargo trucks)
            '*',  # 自卸车 (dump trucks)
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        """Parse a listing page: follow each '配置' (spec) link, then the
        '下一页' (next page) link if present.

        Uses response.xpath() directly: the original built
        scrapy.Selector(text=response.body), but response.body is bytes
        and Selector's text argument requires str, which raises.
        """
        next_page = response.xpath(
            '//a[contains(text(),"下一页")]/@href').extract()
        categories = response.xpath(
            '//div[@class="caption"]/h2/a/text()').extract()
        refs = response.xpath(
            '//a[contains(text(),"配置")]/@href').extract()
        # zip() pairs each link with its caption and tolerates a length
        # mismatch (the old categories[i] lookup raised IndexError when
        # refs was longer than categories).
        for category, ref in zip(categories, refs):
            yield scrapy.Request(url=self.base_url + ref.strip(),
                                 callback=self.parse_category,
                                 meta={'category': category})
        if next_page:
            yield scrapy.Request(url=self.base_url + next_page[0],
                                 callback=self.parse)

    def parse_category(self, response):
        """Parse a category page: follow every model link to its detail
        page, propagating the category name via request meta."""
        refs = response.xpath(
            '//div[@class="title-bar"]/h5/a/@href').extract()
        category = response.meta.get('category')
        for ref in refs:
            yield scrapy.Request(url=self.base_url + ref.strip(),
                                 callback=self.parse_detail,
                                 meta={'category': category})

    def parse_detail(self, response):
        """Parse a detail page: extract the model name and the alternating
        key/value cells of the second spec table."""
        names = response.xpath(
            '//h1[@class="conttan_a_l"]/a/text()').extract()
        params = response.xpath(
            '//div[@class="sppic"][2]//td/div/text()').extract()
        # The table alternates key/value cells; pairing the even and odd
        # slices also tolerates a trailing unpaired cell (the old
        # params[i + 1] raised IndexError on odd-length lists).
        info = {key.strip(':'): value
                for key, value in zip(params[::2], params[1::2])}
        self.count += 1
        # Log via the spider's logger (honors Scrapy's LOG_LEVEL) instead
        # of bare print(); guard names[0] against an empty match.
        self.logger.info('%s %s %s %s #%d',
                         response.meta.get('category'), info, response.url,
                         names[0] if names else '', self.count)
pipenv run scrapy crawl che

vim README.md
pipenv install

pipenv run scrapy crawl che
上一篇 | 下一篇

猜你喜欢

热点阅读