Scrapy Crawler in Practice - Trucks
2020-01-15
诺之林
pipenv --version
# pipenv, version 2018.10.13
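If the version check fails because pipenv is missing, one common way to get it (an assumption about your setup, not part of the original post) is:

pip install --user pipenv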
mkdir truck && cd truck
vim Pipfile
[[source]]
url = "https://mirrors.aliyun.com/pypi/simple"
verify_ssl = true
name = "pypi"
[requires]
python_version = "3.7"
[packages]
scrapy = "*"
[dev-packages]
pylint = "*"
"autopep8" = "*"
gi python >> .gitignore  # gi is the gitignore.io CLI helper; generates a Python .gitignore
pipenv install
pipenv run scrapy version
# Scrapy 1.8.0
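Note that pipenv install only resolves [packages]; to also pull in the pylint and autopep8 dev tools declared above, run:

pipenv install --dev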
pipenv run scrapy startproject truck
mv truck temp && mv temp/* . && rm -rf temp  # flatten the generated project into the current directory
pipenv run scrapy genspider che che.com
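At this point the flattened project should look roughly like this (the standard Scrapy 1.8 startproject layout, plus the Pipfile):

.
├── Pipfile
├── Pipfile.lock
├── scrapy.cfg
└── truck
    ├── __init__.py
    ├── items.py
    ├── middlewares.py
    ├── pipelines.py
    ├── settings.py
    └── spiders
        ├── __init__.py
        └── che.py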
vim truck/spiders/che.py
# -*- coding: utf-8 -*-
import scrapy


class CheSpider(scrapy.Spider):
    name = 'che'
    count = 0
    allowed_domains = ['*']  # real domains are masked in the original post
    base_url = "*"

    def start_requests(self):
        urls = [
            '*',  # tractor trucks
            '*',  # cargo trucks
            '*',  # dump trucks
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        body = scrapy.Selector(text=response.text)
        # pagination link whose text contains "下一页" ("next page")
        next_page = body.xpath('//a[contains(text(),"下一页")]/@href').extract()
        categories = body.xpath(
            '//div[@class="caption"]/h2/a/text()').extract()
        # detail links whose text contains "配置" ("specs")
        refs = body.xpath('//a[contains(text(),"配置")]/@href').extract()
        for i, ref in enumerate(refs):
            yield scrapy.Request(url=(self.base_url + ref.strip()),
                                 callback=self.parse_category,
                                 meta={'category': categories[i]})
        if next_page:
            yield scrapy.Request(url=(self.base_url + next_page[0]),
                                 callback=self.parse)

    def parse_category(self, response):
        body = scrapy.Selector(text=response.text)
        refs = body.xpath('//div[@class="title-bar"]/h5/a/@href').extract()
        for ref in refs:
            yield scrapy.Request(url=(self.base_url + ref.strip()),
                                 callback=self.parse_detail,
                                 meta={'category': response.meta.get('category')})

    def parse_detail(self, response):
        body = scrapy.Selector(text=response.text)
        name = body.xpath('//h1[@class="conttan_a_l"]/a/text()').extract()
        params = body.xpath(
            '//div[@class="sppic"][2]//td/div/text()').extract()
        # parameter cells alternate label, value, label, value, ...
        info = {}
        for i in range(0, len(params) - 1, 2):
            info[params[i].strip(':')] = params[i + 1]
        print(response.meta.get('category'))
        print(info)
        print(response.url)
        print(name[0] if name else '')
        self.count += 1
        print(self.count)
pipenv run scrapy crawl che
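The spider above only prints its results. A minimal variation (a sketch, not from the original post) is to have parse_detail yield a dict item instead, so Scrapy's feed exports can persist the data:

    def parse_detail(self, response):
        body = scrapy.Selector(text=response.text)
        name = body.xpath('//h1[@class="conttan_a_l"]/a/text()').extract()
        params = body.xpath(
            '//div[@class="sppic"][2]//td/div/text()').extract()
        # same label/value pairing as above, written as a dict comprehension
        info = {params[i].strip(':'): params[i + 1]
                for i in range(0, len(params) - 1, 2)}
        yield {
            'category': response.meta.get('category'),
            'name': name[0] if name else None,
            'url': response.url,
            'params': info,
        }

With items being yielded, the crawl can write straight to a file:

pipenv run scrapy crawl che -o trucks.json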
vim README.md  # record the quick-start commands:
pipenv install
pipenv run scrapy crawl che
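A final suggestion beyond what the post covers: before pointing the spider at a real site, it is worth throttling it in truck/settings.py. These are standard Scrapy settings; the values here are only illustrative:

ROBOTSTXT_OBEY = True  # respect the target site's robots.txt
DOWNLOAD_DELAY = 1.0  # pause between requests to the same domain
CONCURRENT_REQUESTS_PER_DOMAIN = 4
AUTOTHROTTLE_ENABLED = True  # back off automatically if the site slows down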