
Since you happened upon this article, here are 2,000 galleries of girl pics for you!

2019-07-13  by 不学无术丶

This story starts a little while back. After learning to scrape novels (plain text) with Python, I shifted my target to scraping images. A web search for "Python image crawler" turned up mostly tutorials for scraping girl-picture sites, so clearly my fellow netizens have quite the appetite for beautiful things! Hahaha... And with that, my goal was set: scrape the entire Mzitu site! A fair bit of tinkering later, this article came out of it.

Step 1: Page analysis

On a list page such as https://www.mzitu.com/page/2/, every gallery is an <li> element under the <ul id="pins"> node: the <a> inside the first <span> carries the gallery title, and the <li>'s first <a> href points to the gallery's own page. The page number sits right in the URL, so paging through the whole site only means changing that number.
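Before writing the full crawler, it is worth verifying the two XPaths against a live list page. A minimal check like the following can be run in a REPL (page 2 is just an example, and the User-Agent string is shortened here):

import requests
from lxml import etree

headers = {'User-Agent': 'Mozilla/5.0', 'Referer': 'https://www.mzitu.com/'}
response = requests.get('https://www.mzitu.com/page/2/', headers=headers)
html = etree.HTML(response.text)
print(html.xpath('//*[@id="pins"]/li/span[1]/a/text()')[:3])   # first few gallery titles
print(html.xpath('//*[@id="pins"]/li/a/@href')[:3])            # first few gallery links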

Step 2: Approach

The crawler splits into two functions: get_url() collects the gallery titles and links from one list page, and download() walks through one gallery, requesting each image page in turn and saving the image file.

def get_url(page_num):
    response = requests.get('https://www.mzitu.com/page/{}/'.format(page_num), headers=headers)
    html = etree.HTML(response.text)
    pics_titles = html.xpath('//*[@id="pins"]/li/span[1]/a/text()')        # gallery titles
    pics_links = html.xpath('//*[@id="pins"]/li/a/@href')                  # gallery links
    directory = r'D:\Mzitu'                                                # create this folder on drive D beforehand (any path will do)
    for pics_title, pics_link in zip(pics_titles, pics_links):             # pair each title with its link
        try:
            if os.path.isdir(directory):
                os.mkdir(os.path.join(directory, pics_title))              # one folder per gallery, named after its title
                download(pics_title, pics_link, page_num)                  # download the gallery, passing title, link and page number
        except Exception:
            pass                                                           # skip galleries that already exist or fail to download


def download(pics_title, pics_link, page_num):
    response0 = requests.get(pics_link, headers=headers)
    html_0 = etree.HTML(response0.text)
    max_page = html_0.xpath('/html/body/div[2]/div[1]/div[4]/a[5]/span/text()')[0]   # number of images in this gallery
    for j in range(1, int(max_page) + 1):
        url = pics_link + '/' + str(j)                                               # page URL of the j-th image
        response1 = requests.get(url, headers=headers)
        html_1 = etree.HTML(response1.text)
        pic_link = html_1.xpath('/html/body/div[2]/div[1]/div[3]/p/a/img/@src')[0]   # direct link to the image file
        response2 = requests.get(pic_link, headers=headers)                          # request the image itself
        file_name = os.path.join(r'D:\Mzitu', pics_title, pics_title + str(j) + '.jpg')  # file named after the gallery title plus index
        with open(file_name, 'wb') as f:                                             # save the image to disk
            f.write(response2.content)
        print('page %s' % page_num, pics_title, 'image %s saved' % j)
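A quick way to try the two functions on one list page before adding threads is a tiny driver like this (my addition, assuming the imports and the headers dict shown in the full code below):

if __name__ == '__main__':
    get_url(1)    # scrape every gallery listed on page 1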

The two steps above are already enough to pull down the galleries one by one. The speed is only so-so, but if you can live with that, you will still get decent results. Finally, a word on speeding the crawler up with multiple threads (I only know the basics here myself, so pointers from more experienced readers are welcome!):

Full code:

# -*- coding: UTF-8 -*-
import requests
from lxml import etree
import threading
import queue
import os


headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36',
    'Referer': 'https://www.mzitu.com/',  # the site checks the Referer header; image requests typically fail without it
}


def get_url(page_num):
    response = requests.get('https://www.mzitu.com/page/{}/'.format(page_num), headers=headers)
    html = etree.HTML(response.text)
    pics_titles = html.xpath('//*[@id="pins"]/li/span[1]/a/text()')        # gallery titles
    pics_links = html.xpath('//*[@id="pins"]/li/a/@href')                  # gallery links
    directory = r'D:\Mzitu'                                                # create this folder on drive D beforehand (any path will do)
    for pics_title, pics_link in zip(pics_titles, pics_links):             # pair each title with its link
        try:
            if os.path.isdir(directory):
                os.mkdir(os.path.join(directory, pics_title))              # one folder per gallery, named after its title
                download(pics_title, pics_link, page_num)                  # download the gallery, passing title, link and page number
        except Exception:
            pass                                                           # skip galleries that already exist or fail to download


def download(pics_title, pics_link, page_num):
    response0 = requests.get(pics_link, headers=headers)
    html_0 = etree.HTML(response0.text)
    max_page = html_0.xpath('/html/body/div[2]/div[1]/div[4]/a[5]/span/text()')[0]   # number of images in this gallery
    for j in range(1, int(max_page) + 1):
        url = pics_link + '/' + str(j)                                               # page URL of the j-th image
        response1 = requests.get(url, headers=headers)
        html_1 = etree.HTML(response1.text)
        pic_link = html_1.xpath('/html/body/div[2]/div[1]/div[3]/p/a/img/@src')[0]   # direct link to the image file
        response2 = requests.get(pic_link, headers=headers)                          # request the image itself
        file_name = os.path.join(r'D:\Mzitu', pics_title, pics_title + str(j) + '.jpg')  # file named after the gallery title plus index
        with open(file_name, 'wb') as f:                                             # save the image to disk
            f.write(response2.content)
        print('page %s' % page_num, pics_title, 'image %s saved' % j)


class MM(threading.Thread):               # subclass of threading.Thread; one instance scrapes one list page
    def __init__(self, page_num, q):
        threading.Thread.__init__(self)
        self.queue = q                    # shared queue, used to cap the number of live threads
        self.page_num = page_num          # the list-page number this thread handles

    def run(self):
        try:
            get_url(self.page_num)        # do the actual scraping
        except Exception as e:
            print(e)
        finally:
            self.queue.get()              # free one slot in the queue...
            self.queue.task_done()        # ...and mark the task as done


def main():
    q = queue.Queue(10)                   # at most 10 concurrent threads; decent speed, tune as you like
    for page_num in range(1, 227):        # list pages start at 1 (the original looped over 226 pages)
        q.put(page_num)                   # blocks while the queue is full, throttling thread creation
        t = MM(page_num, q)
        t.start()
    q.join()                              # wait until every page has been processed


if __name__ == '__main__':
    main()
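A note on the design: the bounded Queue here is really a makeshift semaphore. q.put(page_num) blocks once ten page numbers are in flight, and each thread frees a slot in its finally block, so at most ten threads scrape at the same time. On Python 3, the standard library's concurrent.futures gives the same throttling with less machinery; a minimal alternative sketch (my own, not from the original post):

from concurrent.futures import ThreadPoolExecutor

def main_pool():
    # the pool caps concurrency at 10 worker threads, replacing the Queue trick
    with ThreadPoolExecutor(max_workers=10) as pool:
        pool.map(get_url, range(1, 227))  # one task per list page

Exiting the with block waits for every worker to finish, which is exactly what q.join() does above.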
[Result screenshot]