开源项目

Python构建私有代理IP库

2018-07-16  本文已影响51人  kaliarch

一、背景

在用Python写爬虫的时候,经常会遇到爬虫与反爬虫的博弈:高强度、高频次地爬取网页信息,一般会给网站服务器带来巨大压力与性能损耗,故同一个IP不断爬取网页信息,很快就可能被网站管理员封掉。因此我们可以搭建自己的代理IP库,不停地更换自己的IP去爬取网页,避免因为同一IP被封而影响爬虫的进行。将爬取到的IP信息进行判断,筛选出可用的代理地址存入数据库MySQL/Redis/Mongodb/Memcache,后期需要使用代理IP时,直接从私有库中获取,以逸待劳。

二、相关资料

2.1 使用的Python模块

2.2 相关参考链接

Redis可参考Redis-3.2主从复制与集群搭建
Mongodb可参考Mongodb基础
Memcache可参考Memcached 安装脚本(附服务器自启动)
Python基础爬虫可参考利用Python搜索51CTO推荐博客并保存至Excel

三、代码示例

3.1 github地址

PROXIES

3.2 代码

image

a.spider.py

#!/bin/env python
# -*- coding:utf-8 -*-
# _author:kaliarch

import requests
from bs4 import BeautifulSoup
import random

class GetProxyIP:
    """Scrape candidate proxies from xicidaili.com and keep the ones that respond."""

    # URLs requested through a candidate proxy to decide whether it works.
    CHECK_URLS = ['http://www.baidu.com', 'http://www.taobao.com', 'https://cloud.tencent.com/']

    def __init__(self, page=10):
        """
        :param page: exclusive upper bound of the listing pages to crawl;
                     pages 1 .. page-1 are fetched (see get_ip)
        """
        self._page = page
        self.url_head = 'http://www.xicidaili.com/wt/'

    def get_ip(self):
        """
        Download the listing pages and collect every proxy found in the tables.

        :return: res_pool, a list of strings shaped like 'scheme://host:port'
        """
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36"}
        res_pool = []
        # range() excludes its upper bound, so GetProxyIP(2) crawls page 1 only.
        for pagenum in range(1, self._page):
            url = self.url_head + str(pagenum)
            # A timeout keeps one unresponsive listing page from hanging the crawl.
            response = requests.get(url, headers=headers, timeout=10)
            soup = BeautifulSoup(response.text, "html.parser")
            for row in soup.find_all('tr'):
                cells = row.find_all('td')
                # Rows with fewer than six cells (e.g. the header row) carry no proxy.
                if len(cells) < 6:
                    continue
                # Cell 5 is used as the scheme, cell 1 as the host, cell 2 as the port.
                res_pool.append(cells[5].text.lower() + '://' + cells[1].text + ':' + cells[2].text)
        return res_pool

    @staticmethod
    def _proxy_scheme(ip):
        """Return 'https' for an 'https://...' proxy URL, otherwise 'http'."""
        return 'https' if ip.startswith('https://') else 'http'

    @classmethod
    def _build_proxies(cls, ip):
        """Return the requests-style proxies mapping {scheme: proxy_url} for *ip*."""
        return {cls._proxy_scheme(ip): ip}

    @classmethod
    def _check_urls_for(cls, scheme):
        """Return the check URLs whose scheme matches *scheme*."""
        return [url for url in cls.CHECK_URLS if url.startswith(scheme + '://')]

    def right_proxies(self, res_pool):
        """
        Probe every candidate and keep those that return a successful response.

        :param res_pool: list of 'scheme://host:port' strings, as built by get_ip
        :return: right_pool, a list of {scheme: proxy_url} dicts usable as the
                 ``proxies`` argument of requests
        """
        right_pool = []
        for ip in res_pool:
            scheme = self._proxy_scheme(ip)
            proxies = {scheme: ip}
            # requests picks a proxy by the scheme of the *target* URL, so the
            # check URL must share the proxy's key or the request bypasses the
            # proxy and always "succeeds".
            check_url = random.choice(self._check_urls_for(scheme))
            # NOTE(review): an 'https://' proxy URL may make recent urllib3
            # versions speak TLS to the proxy itself - confirm against the
            # installed requests/urllib3 version.
            try:
                response = requests.get(check_url, proxies=proxies, timeout=1)
            except Exception:
                # Best effort: any failure simply means the proxy is unusable.
                continue
            # Only a non-error status counts; error responses are rejected.
            if response.ok:
                right_pool.append(proxies)
                print('add ip %s' % proxies)
        return right_pool

if __name__ == '__main__':
    # Build the crawler; the constructor argument is the page setting.
    crawler = GetProxyIP(2)
    # Scrape the candidates, keep the ones that pass the check, then show them.
    usable = crawler.right_proxies(crawler.get_ip())
    print(usable)

b.db.conf

[mysql]
HOST = 172.20.6.100
PORT = 3306
USER = root
PASSWD = mysqladmin
DB = pydb
TABLE = pytab
CHARSET = utf8

[redis]
HOST = 172.20.6.100
PORT = 6379
PASSWD = redisadmin

[memcache]
HOST = 172.20.6.100
PORT = 11211

[mongodb]
HOST = 172.20.6.100
PORT = 27017
DB = db1
USER = mongoadmin
PASSWD = mongopwd

c.save_mysql.py

#!/bin/env python
# -*- coding:utf-8 -*-
# _author:kaliarch

import pymysql
import configparser
import spider

class MysqlOper:
    """Store a list of proxy mappings in the MySQL table configured in db.conf."""

    def __init__(self, result_list):
        """
        Read the [mysql] section of db.conf and remember the data to store.

        :param result_list: list of {protocol: proxy_url} dicts
        """
        config = configparser.ConfigParser()
        config.read('db.conf')
        self.host = config['mysql']['HOST']
        self.port = int(config['mysql']['PORT'])
        self.user = config['mysql']['USER']
        self.passwd = config['mysql']['PASSWD']
        self.db = config['mysql']['DB']
        self.table = config['mysql']['TABLE']
        self.charset = config['mysql']['CHARSET']
        self.result_list = result_list

    @staticmethod
    def _quote_identifier(name):
        """Return *name* wrapped in backticks, with embedded backticks doubled."""
        return '`' + str(name).replace('`', '``') + '`'

    @staticmethod
    def _iter_rows(result_list):
        """Flatten a list of {protocol: content} dicts into (protocol, content) tuples."""
        return [(prot, cont) for values in result_list for prot, cont in values.items()]

    def mysql_save(self):
        """
        Create the configured table if needed and insert every proxy into it.

        Exits the process with status 1 when the connection cannot be opened.
        A row that fails to insert is reported and skipped.

        :return: None
        """
        try:
            # Keyword arguments: positional connection parameters are not
            # accepted by newer PyMySQL releases.
            conn = pymysql.connect(host=self.host, user=self.user, password=self.passwd,
                                   database=self.db, port=self.port, charset=self.charset)
        except Exception as e:
            print("connect dbserver fail,Please see information:")
            print(e)
            raise SystemExit(1)

        # Table names cannot be bound as query parameters, so quote the
        # configured name instead of hard-coding it.
        table = self._quote_identifier(self.table)
        try:
            with conn.cursor() as cursor:
                cursor.execute(
                    'create table if not exists ' + table +
                    ' (id int unsigned not null primary key auto_increment,'
                    ' protocol varchar(10),content varchar(50))')
                # Write the collected proxies; values stay parameterized.
                for prot, cont in self._iter_rows(self.result_list):
                    try:
                        cursor.execute(
                            'insert into ' + table + ' (protocol,content) values (%s,%s)',
                            (prot, cont))
                    except pymysql.MySQLError as e:
                        print("insert db occer error", e)
            # Autocommit is off by default; without this the inserts are discarded.
            conn.commit()
        finally:
            conn.close()

if __name__ == "__main__":
    # Crawl, filter the responding proxies, and hand them to the MySQL writer.
    crawler = spider.GetProxyIP(3)
    candidates = crawler.get_ip()
    MysqlOper(crawler.right_proxies(candidates)).mysql_save()

d.save_redis.py

#!/bin/env python
# -*- coding:utf-8 -*-
# _author:kaliarch

import redis
import random
import configparser
import spider

class RedisOper:
    """Store proxy mappings in Redis under integer keys and read one back at random."""

    # Key holding how many proxies the last redis_save() wrote.
    COUNT_KEY = 'proxy_count'

    def __init__(self):
        """
        Read the [redis] section of db.conf and open a pooled connection.
        """
        config = configparser.ConfigParser()
        config.read('db.conf')
        self.host = config['redis']['HOST']
        self.port = int(config['redis']['PORT'])
        self.passwd = config['redis']['PASSWD']
        self.pool = redis.ConnectionPool(host=self.host, port=self.port, password=self.passwd)
        self.redis_helper = redis.Redis(connection_pool=self.pool)
        self.pipe = self.redis_helper.pipeline(transaction=True)

    @staticmethod
    def _encode(proxies):
        """Serialize a proxies dict to a JSON string that Redis can store."""
        import json
        return json.dumps(proxies, sort_keys=True)

    @staticmethod
    def _decode(raw):
        """
        Turn a stored value back into a dict.

        :param raw: bytes/str read from Redis, or None
        :return: the decoded object, None when *raw* is None, or the plain
                 text when the value is not valid JSON
        """
        import json
        if raw is None:
            return None
        if isinstance(raw, bytes):
            raw = raw.decode('utf-8')
        try:
            return json.loads(raw)
        except ValueError:
            return raw

    def redis_save(self, result_list):
        """
        Save every proxy under the key of its index, plus the total count.

        :param result_list: list of {scheme: proxy_url} dicts
        :return: None
        """
        total = 0
        for num, cont in enumerate(result_list):
            # Queue on the pipeline so execute() really sends the batch; a dict
            # is serialized first because it is not a storable value type.
            self.pipe.set(num, self._encode(cont))
            total = num + 1
        self.pipe.set(self.COUNT_KEY, total)
        self.pipe.execute()

    def redis_gain(self):
        """
        Fetch one stored proxy at random.

        :return: proxies dict, or None when nothing is stored under the key
        """
        count = self.redis_helper.get(self.COUNT_KEY)
        if count is None:
            # No counter key: fall back to the fixed 0..10 key range.
            num = random.randint(0, 10)
        else:
            total = int(count)
            if total <= 0:
                return None
            num = random.randrange(total)
        return self._decode(self.redis_helper.get(num))


if __name__ == '__main__':
    # Crawl and filter the proxies, store them in Redis, then read one back.
    crawler = spider.GetProxyIP(2)
    usable = crawler.right_proxies(crawler.get_ip())
    store = RedisOper()
    store.redis_save(usable)
    print(store.redis_gain())

e.save_mongodb.py

#!/bin/env python
# -*- coding:utf-8 -*-
# _author:kaliarch

import configparser
import spider
from pymongo import MongoClient


class MongodbOper:
    """Store proxy mappings in the MongoDB collection ``myset`` and read one back."""

    def __init__(self):
        """
        Read the [mongodb] section of db.conf and create the client.

        Credentials are passed to MongoClient and checked against the
        ``admin`` database, the same database the former authenticate() call
        used. The client connects lazily, so bad credentials surface on the
        first operation rather than here.
        """
        config = configparser.ConfigParser()
        config.read('db.conf')
        self.host = config['mongodb']['HOST']
        self.port = config['mongodb']['PORT']
        self.db = config['mongodb']['DB']
        self.user = config['mongodb']['USER']
        self.pwd = config['mongodb']['PASSWD']
        # Database.authenticate() no longer exists in current PyMongo;
        # credentials belong on the client.
        self.client = MongoClient(self.host, int(self.port),
                                  username=self.user, password=self.pwd,
                                  authSource='admin')
        self.db_auth = self.client.admin
        self.DB = self.client[self.db]
        self.collection = self.DB.myset

    def mongodb_save(self, result_list):
        """
        Insert every proxy mapping as one document.

        :param result_list: list of {scheme: proxy_url} dicts
        :return: None
        """
        # Copy each dict: inserting adds an ``_id`` field to the document
        # object, which would otherwise leak into the caller's data.
        documents = [dict(values) for values in result_list]
        # insert_many() rejects an empty list, so there is nothing to do.
        if not documents:
            return
        self.collection.insert_many(documents)

    def mongodb_gain(self):
        """
        Fetch a single stored document.

        :return: the document returned by find_one(), or None when the
                 collection is empty
        """
        return self.collection.find_one()


if __name__ == '__main__':
    # Crawl and filter the proxies, store them in MongoDB, then read one back.
    crawler = spider.GetProxyIP(2)
    usable = crawler.right_proxies(crawler.get_ip())
    store = MongodbOper()
    store.mongodb_save(usable)
    print(store.mongodb_gain())

f.save_memcache.py

#!/bin/env python
# -*- coding:utf-8 -*-
# _author:kaliarch

import memcache
import random
import configparser
import spider

class MemcacheOper:
    """Store proxy mappings in Memcached under index keys and read one back at random."""

    # Key holding how many proxies the last memcache_save() wrote.
    COUNT_KEY = 'proxy_count'

    def __init__(self):
        """
        Read the [memcache] section of db.conf and create the client.
        """
        config = configparser.ConfigParser()
        config.read('db.conf')
        self.host = config['memcache']['HOST']
        self.port = config['memcache']['PORT']
        self.mcoper = memcache.Client([self.host + ':' + self.port], debug=True)

    @staticmethod
    def _pick_index(count):
        """Return a random index in [0, count), or None when count is not positive."""
        if not count or count <= 0:
            return None
        return random.randrange(count)

    def memcache_save(self, result_list):
        """
        Save every proxy under the string key of its index, plus the total count.

        :param result_list: list of {scheme: proxy_url} dicts
        :return: None
        """
        total = 0
        for num, cont in enumerate(result_list):
            self.mcoper.set(str(num), cont)
            total = num + 1
        # Remember how many entries exist so memcache_gain() can pick a valid key.
        self.mcoper.set(self.COUNT_KEY, total)

    def memcache_gain(self):
        """
        Fetch one stored proxy at random.

        :return: proxies, or None when nothing is stored under the key
        """
        count = self.mcoper.get(self.COUNT_KEY)
        if count is None:
            # No counter key: fall back to the fixed 0..10 key range.
            num = random.randint(0, 10)
        else:
            num = self._pick_index(int(count))
            if num is None:
                return None
        return self.mcoper.get(str(num))


if __name__ == '__main__':
    # Crawl and filter the proxies, store them in Memcached, then read one back.
    crawler = spider.GetProxyIP(2)
    usable = crawler.right_proxies(crawler.get_ip())
    store = MemcacheOper()
    store.memcache_save(usable)
    print(store.memcache_gain())

四、效果展示

单独运行spider.py可以查看到爬取并筛选出的可用ip池

image
运行其他保存文件,可以进入对应数据库查看存储的信息。
MySQL
image
Redis
image
Mongodb
image
Memcache
image
至此我们就利用Python构建了一个属于自己的私有代理库,在进行爬取的时候可方便地从数据库中获取使用。
上一篇下一篇

猜你喜欢

热点阅读