简单爬虫,尽情爬取妹子图片

2018-11-08  本文已影响0人  宋西厚
import requests
import redis
import re
from hashlib import md5
from urllib.parse import urljoin

class My_Flie:
    # path 为图片的保存路径 redis_host 为redis的主机ip
    def __init__(self,url,path=r'E:\img',redis_host='192.168.16.117',port=6379,password=None):
        self.redis_client=redis.Redis(host=redis_host,port=port,password=password)
        self.start_url=url
        self.request=requests.request
        self.headers={
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3514.0 Safari/537.36'
        }
        self.path=path

    # 下载
    def download_html(self,url,method='get'):
        response=self.request(url=url,method=method,headers=self.headers)
        return response
    #解析 将url 存到redis的list里
    #利用redis的set去重
    def parse_html(self,response,source_url):
        page_url_list = re.findall(r'(?<=href=").*?(?=")',response.text)
        img_url_list=re.findall(r'(?<=src=").*?jpg(?=")',response.text)

        for url in page_url_list:
            if not url.startswith('http'):
                url=urljoin(source_url,url)
            if self.fliter_request(url):
                self.redis_client.lpush('my_flie:request_url_list',url)

        for url in img_url_list:
            if not url.startswith('http'):
                url=urljoin(source_url,url)
            if self.fliter_request(url):
                self.redis_client.lpush('my_flie:request_url_list',url)


    def save_img(self,response,url):
        with open(f'{self.path}\{self.md5_url(url)}.jpg','wb') as f:
            f.write(response.content)

    def fliter_request(self,url):
        flag=self.redis_client.sadd('my_flie:set',url)
        return flag

    def run(self):
        response=self.download_html(self.start_url)
        self.parse_html(response,self.start_url)
        count=1
        while True:
            try:
                url=self.redis_client.rpop('my_flie:request_url_list').decode()
                response=self.download_html(url)
                if url.endswith('html'):
                    self.parse_html(response,url)
                    print(f"下载页面")
                else:
                    self.save_img(response,url)
                    print(f"下载{count}张")
                    count+=1
            except Exception as e:
                print(e)


    def md5_url(self,url):
        m=md5()
        m.update(url.encode())
        return m.hexdigest()

    def close(self):
        self.redis_client.delete('my_flie:request_url_list')
        self.redis_client.delete('my_flie:set')


if __name__ == "__main__":
    url='http://www.meizitu.com'   # 这里可以换成你想要爬取的网址
    me=My_Flie(url)
    me.run()






上一篇下一篇

猜你喜欢

热点阅读