python 抓取网页中的图片并下载

2019-08-21 · 本文已影响0人 · 精神病患者link常

方法一：获取网页内容并写入本地 txt 文件，再用正则表达式从网页源码中提取图片地址，逐个下载到 img 目录

# -*- coding: UTF-8 -*-

import os
import re
import socket
import urllib
import urllib2
import urlparse

# socket.setdefaulttimeout(60) # 全局超时时间设置

# 打开网页，将网页内容写入到 textName 文件中
def openHTML(htmlUrl, textName):
    """Fetch a page, log its source, and download every .jpg it references.

    The page source, a separator line and the list of resolved image URLs
    are appended to ``<textName>.txt``. The images are then saved as
    ``img/1.jpg``, ``img/2.jpg``, ... in the order they appear in the page.

    Args:
        htmlUrl: URL of the page to scrape.
        textName: Base name (without extension) of the log file.
    """
    # Open the page with a 60 second timeout; always release the connection.
    page = urllib2.urlopen(htmlUrl, timeout=60)
    try:
        htmlcode = page.read()
    finally:
        page.close()
    # Decode %xx escapes in the page source.
    htmlcode = urllib2.unquote(htmlcode)

    # Capture the value of every quoted src attribute ending in .jpg.
    # Excluding quote characters keeps a match inside a single attribute.
    reg_img = re.compile(r'src=[\'"]([^\'"]+?\.jpg)')

    # urljoin resolves protocol-relative (//host/a.jpg) and site-relative
    # paths against the page URL and leaves absolute URLs untouched.
    imglistNew = [urlparse.urljoin(htmlUrl, i)
                  for i in reg_img.findall(htmlcode)]

    # "a+" appends, so repeated runs accumulate in the same log file.
    # Everything is written before downloading so the URL list is kept
    # even if a download fails later.
    with open('%s.txt' % textName, "a+") as logfile:
        logfile.write(htmlcode)
        logfile.write('\n--------------------------------------\n\n\n')
        logfile.write('\n'.join(imglistNew))

    # urlretrieve does not create missing directories.
    if not os.path.isdir('img'):
        os.makedirs('img')

    # urlretrieve(url, filename, reporthook): files are numbered from 1.
    for name, item in enumerate(imglistNew, 1):
        urllib.urlretrieve(item, 'img/%s.jpg' % name, downLoadCallBack)

def downLoadCallBack(blocknum, blocksize, totalsize):
    """Print download progress; usable as a ``urlretrieve`` report hook.

    Args:
        blocknum: Number of blocks transferred so far.
        blocksize: Size of one block in bytes.
        totalsize: Total size of the file in bytes. May be zero or
            negative when the size is not known, in which case no
            percentage is printed.
    """
    print('blocknum= %s' % blocknum)
    print('blocksize= %s' % blocksize)
    print('totalsize= %s' % totalsize)
    if totalsize <= 0:
        # Without a usable total a percentage would divide by zero or
        # come out negative, so report it as unknown instead.
        print('图片下载进度=未知')
        return
    # blocknum * blocksize can overshoot the total, so cap the value at 100.
    progress = min(float(blocknum) * blocksize / totalsize * 100, 100.0)
    print('图片下载进度=%0.2f' % progress + '%')
    print("%.2f%%" % progress)  # "%%" in a format string emits a literal "%"

# Run method one on the listing page: the page source is appended to html.txt.
openHTML('https://www.feizl.com/feizhuliu/hui/', 'html')

# Download one image directly to 123.jpg, passing downLoadCallBack as the progress hook.
urllib.urlretrieve('https://pic.feizl.com/upload/allimg/190814/gxtxvyz3ngavqaa.jpg', '123.jpg', downLoadCallBack)

image.png

方法二：使用 BeautifulSoup 解析网页，找出所有带 src 属性的 img 标签并打印其 src

from bs4 import BeautifulSoup

# Install the dependency first: pip install --user beautifulsoup4


def SoupHtml(htmlUrl):
    """Print every <img> tag that has a src attribute, then each src value.

    Args:
        htmlUrl: URL of the page to inspect.
    """
    # Open the page with a 60 second timeout; always release the connection.
    page = urllib2.urlopen(htmlUrl, timeout=60)
    try:
        htmlcode = page.read()
    finally:
        page.close()
    # Decode %xx escapes in the page source.
    htmlcode = urllib2.unquote(htmlcode)

    soup = BeautifulSoup(htmlcode, "html.parser")

    # {"src": True} keeps only <img> tags that carry a src attribute.
    imgs = soup.find_all('img', {"src": True})
    print(imgs)

    for item in imgs:
        print(item.get('src'))  # get() returns the attribute value


# Run method two on the listing page; it prints the <img> tags and their src values.
SoupHtml('https://www.feizl.com/feizhuliu/hui/')

python 抓取网页中的图片并下载

猜你喜欢

热点阅读