python爬虫实战教程:下载某奶茶加盟品牌产品图片
2017-12-07 本文已影响147人
chengcxy
最近一直在写 web 相关的东西,想自己完成一个购物系统的 demo。产品分类需要一些图片,而我没有美学细胞,一看到处理图片头就大,只好找个相关的网站爬一下:根据产品分类创建目录,以图片的 alt 属性命名图片,并保存为 jpg 格式。
一.代码
import os
import shutil
import requests
from lxml import etree
# Absolute directory containing this script; downloaded pictures are stored
# beneath it. abspath() is applied first because __file__ may be a relative
# path (e.g. a bare file name when the script is launched from its own
# directory), in which case dirname() alone would return '' and calls such
# as os.chdir('') would fail.
BASEDIR = os.path.dirname(os.path.abspath(__file__))
class AshuiSpider(object):
    """Scrape product pictures from www.ashui1998.com.

    The product menu page lists categories; each category page lists
    pictures. One directory per category is created under BASEDIR and every
    picture is saved there as ``<alt text>.jpg``.
    """

    # Seconds to wait on each HTTP request. requests applies no timeout by
    # default, so a stalled connection would otherwise block indefinitely.
    timeout = 10

    # Path separators and characters that are illegal in file names on
    # common file systems.
    _UNSAFE_CHARS = '\\/:*?"<>|'

    @classmethod
    def _safe_name(cls, name, fallback='unnamed'):
        """Return *name* reduced to a safe single path component.

        Names come from scraped HTML, so they are untrusted: a value such as
        '..' or 'a/b' must never reach shutil.rmtree() or open() verbatim.
        Unsafe characters become '_'; empty, '.' and '..' results are
        replaced by *fallback*.
        """
        cleaned = ''.join(
            '_' if ch in cls._UNSAFE_CHARS else ch for ch in name
        ).strip()
        if cleaned in ('', '.', '..'):
            return fallback
        return cleaned

    def _fetch(self, url):
        """GET *url* using the class-level timeout and return the response."""
        return requests.get(url, timeout=self.timeout)

    def parse_product(self):
        """Yield one dict per product category found on the product menu page.

        Each dict has the keys 'product_name' (link text) and 'product_url'
        (link href). Raises requests.HTTPError if the menu page returns an
        error status.
        """
        url = 'http://www.ashui1998.com/product/'
        response = self._fetch(url)
        response.raise_for_status()
        selector = etree.HTML(response.text)
        for info in selector.xpath('//div[@class="nr"]/h4/a'):
            names = info.xpath('text()')
            hrefs = info.xpath('@href')
            # Skip anchors without text or href instead of raising IndexError.
            if not names or not hrefs:
                continue
            yield {'product_name': names[0], 'product_url': hrefs[0]}

    def parse_pic(self, product_items):
        """Yield one dict per picture for every category in *product_items*.

        *product_items* is an iterable of dicts as produced by
        parse_product(). For each category the target directory is
        (re)created, the category page is parsed, and every picture is
        downloaded. Yielded dicts have the keys 'pic_url', 'pic_name',
        'file' (destination path) and 'pic_content' (raw bytes).

        Raises requests.HTTPError if a category page returns an error
        status; pictures that do not return HTTP 200 are reported and
        skipped.
        """
        for product_item in product_items:
            # Build the storage path explicitly rather than os.chdir()-ing,
            # so the process working directory is left untouched.
            picture_dir = os.path.join(
                BASEDIR, self._safe_name(product_item['product_name'])
            )
            if os.path.exists(picture_dir):
                # Remove the directory whether or not it is empty.
                shutil.rmtree(picture_dir)
            os.mkdir(picture_dir)

            # Parse the picture links on the category page.
            response = self._fetch(product_item['product_url'])
            response.raise_for_status()
            selector = etree.HTML(response.text)
            pics = selector.xpath(
                '//div[@class="pro_main"]/dl[@class="pd_index_dl"]/dt/a'
            )
            for pic in pics:
                srcs = pic.xpath('img/@src')
                alts = pic.xpath('img/@alt')
                # An anchor without an <img src/alt> cannot be named or
                # downloaded; skip it instead of raising IndexError.
                if not srcs or not alts:
                    continue
                pic_url, pic_name = srcs[0], alts[0]

                pic_response = self._fetch(pic_url)
                if pic_response.status_code != 200:
                    # Do not save an error page under a .jpg name.
                    print('skip %s: HTTP %s' % (pic_url, pic_response.status_code))
                    continue

                yield {
                    'pic_url': pic_url,
                    'pic_name': pic_name,
                    'file': os.path.join(
                        picture_dir, self._safe_name(pic_name) + '.jpg'
                    ),
                    'pic_content': pic_response.content,
                }

    def run(self):
        """Crawl every category and write each downloaded picture to disk."""
        product_items = self.parse_product()
        pic_items = self.parse_pic(product_items)
        for pic_item in pic_items:
            with open(pic_item['file'], 'wb') as pic:
                pic.write(pic_item['pic_content'])
if __name__ == '__main__':
    # Start the crawl only when executed as a script, not when imported.
    AshuiSpider().run()
二.图片下载保存
奶茶图.png
三.号外
新搭的微信公众号pythonfan,接入了图灵机器人,图片 语音 都支持回复...
image.png
本文git地址:https://github.com/chengcxy/spiders/blob/master/load_pics.py
喜欢的给个star,土豪的来个微信打赏~~~