Python3实战:批量下载妹子图片

2018-11-13  本文已影响51人  我的袜子都是洞

目标网站:点击进入

网站截图 下载过程图

说明:
代码来源「福利向」Python妹子图爬虫(一)
不使用框架,简单上手

实例代码:

import requests
from lxml import etree
import time
from selenium import webdriver
import os

# Root directory for all downloaded galleries. Built with an empty final
# component so it keeps a trailing separator — callers concatenate the
# gallery name directly onto it (see Spider.download_pic). The original
# joined with './pictures/', which left a spurious './' segment in the path.
PICTURES_PATH = os.path.join(os.getcwd(), 'pictures', '')
# Browser-like request headers; the Referer matters because the image host
# rejects requests that do not appear to come from the site itself.
headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/65.0.3325.181 Safari/537.36',
    'Referer': "http://www.mmjpg.com"
}

class Spider(object):
    """Crawl mmjpg.com listing pages and download every image of each gallery.

    Pipeline: get_page_urls() -> get_girl_urls() -> get_pic_urls(), with
    download_pic() invoked once per gallery.
    """

    def __init__(self):
        # Number of listing pages to crawl (page 1 is the site root).
        self.page_num = 40
        # Listing-page URLs, seeded with the front page.
        self.page_urls = ['http://www.mmjpg.com/']
        # Gallery detail-page URLs collected from all listing pages.
        self.girl_urls = []
        # Title of the gallery currently being processed.
        self.girl_name = ''
        # Image URLs of the gallery currently being processed.
        self.pic_urls = []

    def get_page_urls(self):
        """Append listing-page URLs for pages 2..page_num to self.page_urls.

        Page 1 is the seed URL added in __init__, so when page_num == 1
        nothing is appended (the original's `elif ... pass` was redundant).
        """
        for n in range(2, int(self.page_num) + 1):
            self.page_urls.append('http://www.mmjpg.com/home/' + str(n))

    def get_girl_urls(self):
        """Collect every gallery URL from every listing page into self.girl_urls."""
        for page_url in self.page_urls:
            # Send the browser-like headers and bound the request so one slow
            # page cannot hang the whole crawl.
            html = requests.get(page_url, headers=headers, timeout=10).content
            selector = etree.HTML(html)
            self.girl_urls += selector.xpath('//span[@class="title"]/a/@href')

    def get_pic_urls(self):
        """Open each gallery in Chrome, expand all images, then download them.

        A real browser is needed because the image list is injected by
        JavaScript after clicking the "show all" control.
        """
        driver = webdriver.Chrome()
        try:
            for girl_url in self.girl_urls:
                driver.get(girl_url)
                time.sleep(3)
                # Click the "show all" control so every image enters the DOM.
                # NOTE(review): find_element_by_xpath was removed in Selenium 4;
                # with Selenium >= 4 use driver.find_element(By.XPATH, ...).
                driver.find_element_by_xpath('//em[@class="ch all"]').click()
                # Give the lazy-loaded images time to appear before reading
                # the rendered page source.
                time.sleep(3)
                selector = etree.HTML(driver.page_source)
                self.girl_name = selector.xpath('//div[@class="article"]/h2/text()')[0]
                self.pic_urls = selector.xpath('//div[@id="content"]/img/@data-img')
                try:
                    self.download_pic()
                except Exception as e:
                    print("{}保存失败".format(self.girl_name) + str(e))
        finally:
            # Always release the browser, even if a gallery fails mid-run
            # (the original leaked the Chrome process).
            driver.quit()

    def download_pic(self):
        """Download every image of the current gallery into its own folder.

        Files are named 1.jpg, 2.jpg, ... and existing files are skipped so
        an interrupted run can be resumed.
        """
        # exist_ok replaces the original bare try/except, which silently
        # swallowed *all* mkdir errors, not just "already exists".
        os.makedirs(PICTURES_PATH, exist_ok=True)
        girl_path = PICTURES_PATH + self.girl_name
        os.makedirs(girl_path, exist_ok=True)
        for img_name, pic_url in enumerate(self.pic_urls, start=1):
            pic_path = girl_path + '/' + str(img_name) + '.jpg'
            if os.path.isfile(pic_path):
                print("{}第{}张已存在".format(self.girl_name, img_name))
                continue
            # The Referer header is required; without it the site serves a
            # placeholder image instead of the real one.
            img_data = requests.get(pic_url, headers=headers, timeout=10)
            with open(pic_path, 'wb') as f:
                print("正在保存{}第{}张".format(self.girl_name, img_name))
                f.write(img_data.content)




def main():
    """Run the full crawl: build the page list, collect galleries, download."""
    crawler = Spider()
    crawler.get_page_urls()
    crawler.get_girl_urls()
    crawler.get_pic_urls()


if __name__ == '__main__':
    main()
上一篇下一篇

猜你喜欢

热点阅读