# 爬取百度贴吧图片 (Scrape images from Baidu Tieba)
#
# 2019-09-26 本文已影响0人 yousa_
import re
# 正则表达式模块，用来匹配图片地址
import urllib.request
# 用来获取HTML源码
import sys
import os
import re

def geturls(path):
    """Read the list of page URLs stored in the text file at *path*.

    The file is read as UTF-8, one entry per line.  The first three
    characters of every line are discarded (presumably a fixed-width
    prefix such as a list marker -- confirm against the input file).
    Surrounding whitespace, including the trailing newline, is stripped
    so the result can be handed straight to ``urllib.request.urlopen``,
    and lines that are empty after this clean-up are skipped.

    :param path: path of the text file holding the URL list
    :return: list of cleaned URL strings, in file order
    """
    with open(path, 'r', encoding='utf-8') as f1:
        # Slice first, then strip: the prefix is positional, while the
        # newline/whitespace to remove sits at the end of the line.
        candidates = (line[3:].strip() for line in f1)
        urls = [url for url in candidates if url]
    print('get urls list ready!')
    return urls

def getHtml(urls):
    """Lazily download the raw body of every URL in *urls*.

    This is a generator: each page is fetched only when the consumer asks
    for the next item.  The zero-based index of the page is printed just
    before it is requested, and a completion message is printed once all
    URLs have been processed.

    :param urls: iterable of URL strings
    :return: generator yielding each response body as ``bytes``
    """
    for num, url in enumerate(urls):
        print(num)
        # Close the response as soon as the body is read so connections
        # are not left open while the consumer processes the page.
        with urllib.request.urlopen(url) as page:
            html = page.read()
        yield html
    print('get htmls list ready!')

def getpic(htmls, dir):
    """Download every matching image referenced by the given pages.

    Each page is decoded as UTF-8 and searched for ``src`` attributes
    that match ``https://imgsa....jpg``.  Every match is downloaded and
    stored in *dir* as ``<counter>.jpg``, where the counter starts at 0
    and advances once per matched image -- also when a download fails, so
    a failure leaves a gap in the numbering.

    A failed download is reported on stdout and does not stop the run.

    :param htmls: iterable (typically a generator) of page bodies as bytes
    :param dir: output directory; created if it does not exist yet
    :return: None
    """
    os.makedirs(dir, exist_ok=True)

    # Compiled once; the pattern is the same for every page.
    imgre = re.compile(r'src="(https://imgsa.*?\.jpg)"')

    imgName = 0
    # A plain for-loop visits every page exactly once and ends cleanly
    # when the iterator is exhausted.  Calling __next__() both in a loop
    # condition and in the loop body would skip every other page and end
    # with an uncaught StopIteration.
    for html in htmls:
        imList = imgre.findall(html.decode('utf-8'))
        print(imList)

        for imgPath in imList:
            target = os.path.join(dir, str(imgName) + ".jpg")
            try:
                # Fetch first, write second: a failed download must not
                # leave an empty file behind.
                with urllib.request.urlopen(imgPath) as response:
                    data = response.read()
                with open(target, 'wb') as f:
                    f.write(data)
                print(imgPath)
            except Exception:
                # Deliberately best-effort: one bad image must not abort
                # the whole crawl.
                print(imgPath + " error")
            imgName += 1

def main():
    """Run the crawl: read the URL list, fetch each page, save its images.

    The URL list comes from ``_防诈骗.txt`` in the current working
    directory and the images are written to the ``pic`` directory.
    """
    page_urls = geturls('_防诈骗.txt')
    # getHtml is a generator, so pages are fetched on demand while getpic
    # iterates over them.
    getpic(getHtml(page_urls), 'pic')


# Run the crawl only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()
# 爬取百度贴吧图片 (Scrape images from Baidu Tieba)
#
# 猜你喜欢 (You may also like)
#
# 热点阅读 (Trending reads)