3.爬虫基础之批量爬取图片

2018-10-25  本文已影响0人  IPhone2

1.正则表达式


元字符【单字符】

. [and] \d \D \s \S

修饰符

* + ? {m} {m,n} {m,}

边界符

^ $ \A \B

贪婪模式

.*

非贪婪模式

.*?

模式修正

re.S 单行

re.M 多行

re.I 忽略大小写


2.XPath语法


层级定位:根据标签的层级关系进行查找

属性定位:根据属性查找标签


4.爬取妹子图代码


from time import sleep
from urllib import request, parse
import re

# 业务函数,处理url
def handler_url(url, page, num):
    """Build the request object for one page of one gallery.

    Args:
        url: Base address that the gallery number is appended to.
        page: Gallery number; converted with ``str`` and appended to ``url``.
        num: Page number inside the gallery. Page 1 has no suffix; any other
            value is appended as ``'/' + str(num)``.

    Returns:
        A ``urllib.request.Request`` for the resulting address, carrying
        fixed ``User-Agent`` and ``Referer`` headers.
    """
    target = url + str(page)
    if num != 1:
        # Only pages after the first carry an explicit page suffix.
        target = target + '/' + str(num)

    # The same headers are sent whichever page is requested.
    browser_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
        'Referer': 'https://www.baidu.com/link?url=dORiYkjnb0AkMxSoE4UzQYAiVlhvcutBR6sSxgYQY-y&wd=&eqid=961cc7e80003f1a6000000065bd05902',
    }
    return request.Request(url=target, headers=browser_headers)

# 业务函数,发起请求
def request_data(req):
    """Send a request and return the response body decoded as UTF-8 text.

    Args:
        req: Anything ``urllib.request.urlopen`` accepts, i.e. a URL string
            or a ``urllib.request.Request`` object.

    Returns:
        The full response body as a ``str``.

    Raises:
        urllib.error.URLError: If the request fails (propagated from
            ``urlopen``).
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    # The context manager closes the response even when read() or decode()
    # raises; previously the response was never closed.
    with request.urlopen(req) as res:
        return res.read().decode('utf-8')

# 业务函数,解析
def anylasis(html):
    """Yield the image address found after each article block in a page.

    Args:
        html: Page source as a string.

    Yields:
        The ``src`` value of the first ``<img`` tag that follows each
        ``<div class="article">`` opening tag, in document order.
    """
    # re.S lets '.' match newlines, so the lazy '.*?' can span the lines
    # between the div's opening tag and the image tag.
    yield from re.findall(r'<div class="article">.*?<img src="(.*?)"', html, re.S)
# 主函数
def main():
    """Interactively download the images of a range of galleries.

    Prompts for a first and last gallery number (inclusive), then requests
    pages 1 through 50 of each gallery, extracts the image addresses from
    every page and saves each image under ``./images/`` as
    ``<counter>.jpg``, with the counter starting at 9540.

    Raises:
        ValueError: If either prompt is answered with a non-integer.
        urllib.error.URLError: If a page or image request fails; the error
            is not caught here, so the run stops at the first failure.
    """
    # Function-scope import: only this function needs os.
    import os

    url = "http://www.mmjpg.com/mm/"
    image_dir = './images/'
    # Headers sent with every image download; built once because they never
    # change between iterations.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
        'Referer': 'https://www.baidu.com/link?url=dORiYkjnb0AkMxSoE4UzQYAiVlhvcutBR6sSxgYQY-y&wd=&eqid=961cc7e80003f1a6000000065bd05902',
    }

    start = int(input('请输入起始页:'))
    end = int(input('请输入终止页:'))
    print('开始下载...')

    # open(..., 'wb') does not create missing directories, so make sure the
    # target directory exists before the first image is written.
    os.makedirs(image_dir, exist_ok=True)

    img_name = 9540
    for page in range(start, end + 1):
        for num in range(1, 51):
            # Build the request for this gallery page and fetch its HTML.
            page_req = handler_url(url, page, num)
            html = request_data(page_req)
            # Distinct names for the page and image requests/responses, so
            # the sequence being iterated is never rebound inside the loop.
            for img in anylasis(html):
                img_req = request.Request(img, headers=headers)
                # Close the response as soon as its body has been read.
                with request.urlopen(img_req) as img_res:
                    data = img_res.read()
                with open(image_dir + str(img_name) + '.jpg', 'wb') as fp:
                    fp.write(data)
                print("正在下载:" + img)
                img_name += 1
                # Short pause between image downloads.
                sleep(0.1)
    print("下载结束!")

 
# Start the downloader only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()
上一篇下一篇

猜你喜欢

热点阅读