Python Learning, Day 85: Storing Image Paths and Serializing Items to JSON
1. The item_completed() method

- Syntax: item_completed(results, item, info)
- When all of the image requests for a single item have completed (whether the downloads succeeded or failed), ImagesPipeline.item_completed() is called. item_completed() must return the output that will be sent on to subsequent item pipeline stages, so it must either return or drop the item; by default, item_completed() returns all items unchanged.
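To make the loop in step 2 easier to follow, here is a sketch of the shape of the results argument; the values are invented, but the keys are the ones Scrapy documents for ImagesPipeline:

# Each element of `results` is a (success, info) tuple. On success,
# `info` is a dict describing the downloaded file; on failure it is
# a Twisted Failure describing the error.
results = [
    (True, {
        'url': 'https://example.com/a.jpg',   # URL the image was fetched from
        'path': '2019/0a1b2c.jpg',            # path relative to IMAGES_STORE
        'checksum': 'b9628c4ab9b595f72f280b90c4fd093d',  # md5 of the image bytes
    }),
]
# Keep only the paths of the images that downloaded successfully:
image_paths = [info['path'] for ok, info in results if ok]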
2. Overriding item_completed() in the pipeline

- Override item_completed() in our ImagePipeline subclass to capture the path the image was saved under (file_path() is also overridden, so images are stored under a per-year directory):
import hashlib
from datetime import datetime

from scrapy import Request
from scrapy.pipelines.images import ImagesPipeline
from scrapy.utils.python import to_bytes


class ImagePipeline(ImagesPipeline):
    def file_path(self, request, response=None, info=None):
        ## start of deprecation warning block (can be removed in the future)
        def _warn():
            from scrapy.exceptions import ScrapyDeprecationWarning
            import warnings
            warnings.warn('ImagesPipeline.image_key(url) and file_key(url) methods are deprecated, '
                          'please use file_path(request, response=None, info=None) instead',
                          category=ScrapyDeprecationWarning, stacklevel=1)

        # check if called from image_key or file_key with url as first argument
        if not isinstance(request, Request):
            _warn()
            url = request
        else:
            url = request.url

        # detect if file_key() or image_key() methods have been overridden
        if not hasattr(self.file_key, '_base'):
            _warn()
            return self.file_key(url)
        elif not hasattr(self.image_key, '_base'):
            _warn()
            return self.image_key(url)
        ## end of deprecation warning block

        image_guid = hashlib.sha1(to_bytes(url)).hexdigest()  # change to request.url after deprecation
        # Use the current year as the directory name instead of the default
        return '{}/{}.jpg'.format(datetime.now().year, image_guid)

    def item_completed(self, results, item, info):
        # Collect the storage paths of the successfully downloaded images
        values = [value['path'] for ok, value in results if ok]
        # Use the first path, or a placeholder if every download failed
        item['image_path'] = values.pop(0) if values else 'default.jpg'
        return item
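For this pipeline to actually run, it has to be registered in settings.py together with a storage directory. A minimal sketch, assuming the project module XKD_Dribbble_Spider used in step 7 (the priority value 1 and the directory name are illustrative):

# settings.py
ITEM_PIPELINES = {
    'XKD_Dribbble_Spider.pipelines.ImagePipeline': 1,
}
IMAGES_STORE = 'images'          # directory downloaded images are saved under
IMAGES_URLS_FIELD = 'image_url'  # item field holding the list of image URLs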
3. Create an md5 helper function

- We can use Python's built-in hashlib.md5 to hash the URL. First, in the same directory as the project's settings file, create a package called utils, and inside it a module named md5_tool (the spider in step 5 imports from utils.md5_tool). To use it, import md5 from hashlib, instantiate md5(), feed the URL in with update(), and extract the digest with hexdigest(). We can also use isinstance() to check the type of the incoming value and, for a str, use encode() to convert the Unicode text into bytes first:
from hashlib import md5


def get_md5(url):
    if isinstance(url, str):
        # hashlib works on bytes, so encode str (Unicode) input first
        url = url.encode()
    obj = md5()
    obj.update(url)
    return obj.hexdigest()


if __name__ == '__main__':
    print(get_md5('www.baidu.com'))
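However long the URL is, hexdigest() always returns a fixed 32-character hex string, which makes url_id convenient as a fixed-width key for deduplicating pages later on. For example (the URL is illustrative):

assert len(get_md5('https://dribbble.com/stories')) == 32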
4. Add fields to the item
import scrapy


class XkdDribbbleSpiderItem(scrapy.Item):
    title = scrapy.Field()
    image_url = scrapy.Field()
    date = scrapy.Field()
    # storage path of the downloaded image
    image_path = scrapy.Field()
    # URL of the page the item was scraped from
    url = scrapy.Field()
    # md5 hash of the page URL
    url_id = scrapy.Field()
5. Yield the item from the spider
import scrapy
from urllib import parse
from scrapy.http import Request
from datetime import datetime

from ..items import XkdDribbbleSpiderItem
from ..utils.md5_tool import get_md5


class DribbbleSpider(scrapy.Spider):
    name = 'dribbble'
    allowed_domains = ['dribbble.com']
    start_urls = ['https://dribbble.com/stories']

    def parse(self, response):
        # Select the teaser links; each one leads to a story page
        a_selectors = response.css('div.teaser a')
        for a_selector in a_selectors:
            image_url = a_selector.css('img::attr(src)').extract()[0]
            page_url = a_selector.css('::attr(href)').extract()[0]
            # Follow the story page, carrying the image URL along in meta
            yield Request(url=parse.urljoin(response.url, page_url),
                          callback=self.parse_analyse,
                          meta={'a_image_url': image_url})

    def parse_analyse(self, response):
        title = response.css('header h1::text').extract_first()
        image_url = response.meta.get('a_image_url')
        date_raw = response.css('p span.date::text').extract()[0]
        date_str = date_raw.strip()
        date = datetime.strptime(date_str, '%b %d, %Y').date()
        item = XkdDribbbleSpiderItem()
        item['title'] = title
        # ImagesPipeline expects a list of URLs, so wrap the single URL
        item['image_url'] = [image_url]
        item['date'] = date
        item['url'] = response.url
        item['url_id'] = get_md5(response.url)
        # Hand the item off to the pipelines for persistence
        yield item
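The spider can now be run from the project root with scrapy crawl dribbble. Equivalently, as a sketch, it can be launched from a plain Python script via Scrapy's CrawlerProcess API (the file name run.py is just a suggestion):

# run.py
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
process.crawl('dribbble')  # the spider's `name` attribute
process.start()            # blocks until the crawl finishes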
6. Create a JsonSavePipeline to write items to a file
import codecs
import json


class JsonSavePipeline:
    def process_item(self, item, spider):
        file = codecs.open('blog.json', mode='a')
        # Convert the item yielded by the spider into a plain dict
        dict_item = dict(item)
        # Serialize the dict as one JSON line; default=str is needed
        # because item['date'] is a datetime.date, which json cannot
        # serialize on its own
        line = json.dumps(dict_item, ensure_ascii=False, default=str) + '\n'
        # Append the line to the file
        file.write(line)
        file.close()
        # Return the item so later pipelines can still process it
        return item
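Opening and closing the file for every single item works, but it is wasteful. A common refinement (a sketch, not from the original post) opens the file once per crawl using the open_spider/close_spider hooks that Scrapy calls on every pipeline:

import codecs
import json


class JsonSavePipeline:
    def open_spider(self, spider):
        # Called once when the crawl starts: open the file a single time
        self.file = codecs.open('blog.json', mode='a')

    def close_spider(self, spider):
        # Called once when the crawl ends: flush and close the file
        self.file.close()

    def process_item(self, item, spider):
        self.file.write(json.dumps(dict(item), ensure_ascii=False, default=str) + '\n')
        return item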
7. Register JsonSavePipeline in the settings file

Add the pipeline to the ITEM_PIPELINES dict in settings.py. Lower numbers run earlier, so ImagePipeline should keep a smaller number than JsonSavePipeline; that way image_path is already filled in by the time the item is written out:

ITEM_PIPELINES = {
    'XKD_Dribbble_Spider.pipelines.ImagePipeline': 1,
    'XKD_Dribbble_Spider.pipelines.JsonSavePipeline': 2,
}