
Web Crawling: Maintaining a Proxy Pool, Part 1 (Fetcher Module)

2019-09-29  八戒无戒

This post documents how I maintain a pool of free proxies. The pool consists of four modules:

  • Fetcher module: scrapes the most recently published free proxies from a number of free-proxy websites, downloads the pages and parses the proxies out of them
  • Storage module: saves the proxies collected by the fetcher module into a Redis database (a minimal sketch of the interface the fetcher relies on follows this list)
  • Checker module: tests each proxy in the Redis database to see whether it is still usable and assigns it a weight accordingly
  • Scheduler module: ties the fetcher, storage and checker modules together and wraps them up
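
The storage, checker and scheduler modules are covered in later parts of this series. Because the fetcher code below instantiates ProxyRedisClient and calls its add() method, here is a minimal sketch of the interface it relies on, so the listing can be read and run on its own. Proxies are kept in a Redis sorted set so the checker can later raise or lower each proxy's weight; the key name, initial score and connection defaults are assumptions of mine, not the original storage module.

# Minimal sketch of the storage client the fetcher expects (assumed interface;
# the real storage module is described in the next part of the series).
import redis

REDIS_KEY = "proxies"      # assumed name of the sorted set holding the pool
INITIAL_SCORE = 10         # assumed starting weight for a freshly crawled proxy


class ProxyRedisClient(object):
    def __init__(self, host="localhost", port=6379, db=0):
        # decode_responses=True so proxies come back as str rather than bytes
        self.db = redis.StrictRedis(host=host, port=port, db=db, decode_responses=True)

    def add(self, proxy, score=INITIAL_SCORE):
        """Insert a proxy with the initial weight unless it is already in the pool."""
        if self.db.zscore(REDIS_KEY, proxy) is None:
            return self.db.zadd(REDIS_KEY, {proxy: score})

With this stub in place, the fetcher's run() method can push proxies into Redis without the rest of the pool being written yet.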

Key topics involved:

  • metaclasses
  • operating a Redis database from Python (the redis library)
  • using the requests library
  • using pyquery
  • basic use of the aiohttp asynchronous HTTP framework
  • multithreading and multiprocessing

Fetcher Module

# -*- coding: utf-8 -*-
"""
__author__ = 'bingo'
__date__ = '2019/9/7'
# code is far away from bugs with the god animal protecting
    I love animals. They taste delicious.
             ┏┓   ┏┓
            ┏┛┻━━━┛┻┓
            ┃     ☃ ┃
            ┃  ┳┛  ┗┳  ┃
            ┃      ┻   ┃
            ┗━┓      ┏━┛
                ┃    ┗━━━┓
                ┃  神兽保 ┣┓
                ┃ 永无BUG┏┛
                ┗ ┓┏━┳┓┏┛
                  ┃┫┫  ┃┫┫
                  ┗┻┛  ┗┻┛
"""
import random
import asyncio
import requests
import time
import redis
import aiohttp
from pyquery import PyQuery as pq
from redis import ResponseError
from requests import ConnectTimeout
from concurrent.futures import ThreadPoolExecutor
from multiprocessing import Process
from flask import Flask
# NOTE: random, time, asyncio, aiohttp, redis/ResponseError, Flask and Process are not
# used by the fetcher below; they are presumably imported for the storage, checker and
# scheduler modules covered in later parts of this series.

# Fetcher module
class ProxyMeta(type):
    def __new__(cls, name, bases, attrs):
        crawl_count = 0
        attrs["__CrawlFunc__"] = []

        # Collect every method whose name starts with "crawl_" (the proxy-scraping functions)
        for k, v in attrs.items():
            if k.startswith("crawl_"):
                func = "self.{}()".format(k)
                attrs["__CrawlFunc__"].append(func)
                crawl_count += 1

        # Record how many crawl_* methods the fetcher class has
        attrs["__CrawlFuncCount__"] = crawl_count
        return type.__new__(cls, name, bases, attrs)
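
# For illustration (a note added here, not in the original): once ProxyMeta has built
# the CrawlerGetter class below, the class ends up with
#     CrawlerGetter.__CrawlFunc__      == ['self.crawl_66daili()', 'self.crawl_iphai()', ...]
#     CrawlerGetter.__CrawlFuncCount__ == 7
# run() then eval()s each of these strings against the instance to obtain the
# corresponding proxy generator.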


class CrawlerGetter(object, metaclass=ProxyMeta):

    def __init__(self):
        headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X x.y; rv:42.0) Gecko/20100101 Firefox/42.0'}
        self.headers = headers
        self.proxy_count = 0
        self.db_client = ProxyRedisClient()

    def get_page(self, url, encoding):
        try:
            res = requests.get(url, headers=self.headers, timeout=2.5)
            if res.status_code == 200:
                res.encoding = encoding
                html = res.text
                return html
            else:
                return None
        except (ConnectTimeout, requests.RequestException):
            # treat any network error (timeout, refused connection, etc.) as a miss
            return None

    def crawl_66daili(self):
        """
        66 Proxy (www.66ip.cn)
        :return:
        """
        i = 0
        url = "http://www.66ip.cn/{page}.html"
        for page in range(1, 11):
            html = self.get_page(url.format(page=page), 'gbk')
            if html:
                p = pq(html)
                doc = p(".containerbox table tr:gt(0)")
                for item in doc.items():
                    proxy_ip = item("td:first-child").text()
                    proxy_port = item("td:nth-child(2)").text()
                    if proxy_ip and proxy_port:
                        proxy = ":".join([proxy_ip, proxy_port])
                        i += 1
                        print("【66代理%s】:%s" % (i, proxy))
                        self.proxy_count += 1
                        yield proxy
                    else:
                        pass
            else:
                print("【66代理】获取代理失败page:%s" % page)
                continue

    def crawl_iphai(self):
        """
        IPhai Proxy (www.iphai.com)
        :return:
        """
        i = 0
        urls = ["http://www.iphai.com/free/ng", "http://www.iphai.com/free/wg"]
        for url in urls:
            html = self.get_page(url, 'utf8')
            if html:
                p = pq(html)
                doc = p(".table-responsive table tr:gt(0)")
                for item in doc.items():
                    proxy_ip = item("td:first-child").text()
                    proxy_port = item("td:nth-child(2)").text()
                    if proxy_ip and proxy_port:
                        proxy = ":".join([proxy_ip, proxy_port])
                        i += 1
                        print("【IP海代理%s】:%s" % (i, proxy))
                        self.proxy_count += 1
                        yield proxy
                    else:
                        pass
            else:
                print("【IP海代理】获取代理失败: %s" % url)
                continue

    def crawl_qiyun(self):
        """
        Qiyun Proxy (www.qydaili.com)
        :return:
        """
        i = 0
        url = "http://www.qydaili.com/free/?action=china&page={page}"
        for page in range(1, 11):
            html = self.get_page(url.format(page=page), "utf8")
            if html:
                p = pq(html)
                doc = p(".table tbody tr")
                for item in doc.items():
                    proxy_ip = item("td:first-child").text()
                    proxy_port = item("td:nth-child(2)").text()
                    if proxy_ip and proxy_port:
                        proxy = ":".join([proxy_ip, proxy_port])
                        i += 1
                        print("【齐云代理%s】:%s" % (i, proxy))
                        self.proxy_count += 1
                        yield proxy
                    else:
                        pass
            else:
                print("【齐云代理】获取代理失败page:%s" % page)
                continue

    def crawl_89daili(self):
        """
        89ip free proxies (www.89ip.cn)
        :return:
        """
        i = 0
        url = "http://www.89ip.cn/index_{page}.html"
        for page in range(1, 21):
            html = self.get_page(url.format(page=page), "utf8")
            if html:
                p = pq(html)
                doc = p(".layui-table tbody tr")
                for item in doc.items():
                    proxy_ip = item("td:first-child").text()
                    proxy_port = item("td:nth-child(2)").text()
                    if proxy_ip and proxy_port:
                        proxy = ":".join([proxy_ip, proxy_port])
                        i += 1
                        print("【89免费代理%s】:%s" % (i, proxy))
                        self.proxy_count += 1
                        yield proxy
                    else:
                        pass
            else:
                print("【89免费代理】获取代理失败page:%s" % page)
                continue

    def crawl_kuaidaili(self):
        """
        Kuaidaili free proxies (www.kuaidaili.com)
        :return:
        """
        i = 0
        url = "https://www.kuaidaili.com/free/inha/{page}/"
        for page in range(1, 11):
            html = self.get_page(url.format(page=page), "utf8")
            if html:
                p = pq(html)
                doc = p("table tbody tr")
                for item in doc.items():
                    proxy_ip = item("td:first-child").text()
                    proxy_port = item("td:nth-child(2)").text()
                    if proxy_ip and proxy_port:
                        proxy = ":".join([proxy_ip, proxy_port])
                        i += 1
                        print("【快代理%s】:%s" % (i, proxy))
                        self.proxy_count += 1
                        yield proxy
                    else:
                        pass
            else:
                print("【快代理】获取代理失败page:%s" % page)
                continue

    def crawl_yundaili(self):
        """
        Yun Proxy (www.ip3366.net)
        :return:
        """
        i = 0
        url = "http://www.ip3366.net/free/?stype=1&page={page}"
        for page in range(1, 8):
            html = self.get_page(url.format(page=page), "gb2312")
            if html:
                p = pq(html)
                doc = p("table tbody tr")
                for item in doc.items():
                    proxy_ip = item("td:first-child").text()
                    proxy_port = item("td:nth-child(2)").text()
                    if proxy_ip and proxy_port:
                        proxy = ":".join([proxy_ip, proxy_port])
                        i += 1
                        print("【云代理%s】:%s" % (i, proxy))
                        self.proxy_count += 1
                        yield proxy
                    else:
                        pass
            else:
                print("【云代理】获取代理失败page:%s" % page)
                continue

    def crawl_xicidaili(self):
        """
        Xici Proxy (www.xicidaili.com)
        :return:
        """
        i = 0
        url = "https://www.xicidaili.com/nn/{page}"
        for page in range(1, 6):
            html = self.get_page(url.format(page=page), "utf8")
            if html:
                p = pq(html)
                doc = p(".proxies table tr:gt(0)")
                for item in doc.items():
                    proxy_ip = item("td:nth-child(2)").text()
                    proxy_port = item("td:nth-child(3)").text()
                    if proxy_ip and proxy_port:
                        proxy = ":".join([proxy_ip, proxy_port])
                        i += 1
                        print("【西刺代理%s】:%s" % (i, proxy))
                        self.proxy_count += 1
                        yield proxy
                    else:
                        pass
            else:
                print("【西刺代理】获取代理失败page:%s" % page)
                continue

    def run(self):
        """
        Get a proxy generator from each site's crawl method and store the proxies into the Redis database using a thread pool
        :return:
        """
        crawl_funcs_list = []
        try:
            executor = ThreadPoolExecutor(max_workers=10)
            for crawl_func_name in self.__CrawlFunc__:
                crawl_funcs_list.append(eval(crawl_func_name))
            for crawl_func in crawl_funcs_list:
                executor.submit(self.to_redis_db, crawl_func)
            executor.shutdown()
        except Exception as e:
            print("ERROR:", e)

    def to_redis_db(self, generation):
        """
        Take a generator that yields proxy IPs and store each proxy into the Redis proxy pool
        :param generation:
        :return:
        """
        proxies_generation = generation
        for proxy in proxies_generation:
            self.db_client.add(proxy)
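
To try the fetcher on its own, a minimal usage sketch (not part of the original listing; it assumes the ProxyRedisClient stub shown earlier and a Redis server on localhost) looks like this:

# Run one round of crawling: run() consumes every crawl_* generator in a thread
# pool and pushes each proxy it finds into Redis via ProxyRedisClient.add().
if __name__ == '__main__':
    crawler_getter = CrawlerGetter()
    crawler_getter.run()
    print("proxies fetched this round: %s" % crawler_getter.proxy_count)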
