python 代理ip爬取,ip代理,数据库存储,去重,验证。

2019-04-17  本文已影响0人  wangcc_sd

本文是对goubanjia的网站的一个综合性爬取。

实现代理ip爬取,存入数据库,使用代理,去重,验证。

首先是ip_request.py

# -*-coding:utf-8 -*-
# BY WANGCC
from bs4 import BeautifulSoup
import requests

from bs4 import BeautifulSoup
import requests
from ip_to_mysql import mysql_proxies,mysql_delete
url = 'http://ip.tool.chinaz.com/'


def str2header(headers_raw):
    """Parse a raw HTTP-header dump (one "Key: Value" per line) into a dict.

    Lines without a colon (e.g. the request line "GET / HTTP/1.1") are
    skipped.  Returns None when headers_raw is None.
    """
    if headers_raw is None:
        return None
    parsed = {}
    for line in headers_raw.splitlines():
        key, sep, value = line.partition(':')
        if sep:  # only keep lines that actually contain a colon
            parsed[key.strip()] = value.strip()
    return parsed


# Raw header string captured from browser dev tools / a packet sniffer
r_h = ''' 

GET / HTTP/1.1
Host: www.baidu.com
Connection: keep-alive
Cache-Control: max-age=0
Upgrade-Insecure-Requests: 1
User-Agent: Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) 
Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8
Accept-Encoding: gzip, deflate, br
Accept-Language: zh-CN,zh;q=0.9,en;q=0.8
Cookie: *** 

'''
# headers dict is ready; pass it directly as requests.get(url, headers=headers)
headers = str2header(r_h)

# Pull one proxy out of the database and use it for a request against a
# "what is my IP" page; drop the proxy from the DB if our real IP leaks.
proxy_url = mysql_proxies()            # e.g. 'http://117.191.11.108:80'
scheme = proxy_url.split(":")[0]       # 'http' or 'https'
proxies = {
    scheme: proxy_url,
    }
r = requests.get(url, proxies=proxies)
soup = BeautifulSoup(r.text, 'lxml')
parent_node = soup.find(class_="IpMRig-tit")
# BUG FIX: the original tested the ip string against a list of bs4 Tag
# objects ('x' not in [Tag, ...]), which is always True, so the check was
# a no-op.  Compare against the extracted text of each <dd> instead.
dd_texts = [dd.get_text() for dd in parent_node.find_all('dd')]
if all('58.87.119.xxxx' not in text for text in dd_texts):
    print('success')
else:
    # BUG FIX: pass the ip string the DB stores, not the proxies dict.
    mysql_delete(proxy_url)

这个模块是对代理ip的一个验证:把抓包得到的header字符串解析成requests可用的字典,带上header通过代理访问查IP页面,检验代理是否真正生效。

ip_to_mysql.py

# -*-coding:utf-8 -*-
# BY WANGCC

import pymysql,datetime
import logger

log = logger.Logger("debug")

# MySQL connection settings shared by all the helpers below.
DB_CONFIG = {
    "host": "127.0.0.1",
    "port": 3306,
    "user": "admin",
    "passwd": "*******",  # redacted for publication
    "db": "ip_Original",
    "charset": "utf8"
}


def mysql(ip_list):
    """Insert each proxy ip from ip_list into table ip_original, skipping
    ips that are already present.  One shared timestamp is stored per batch.

    :param ip_list: iterable of proxy url strings, e.g. 'http://1.2.3.4:80'
    """
    # 打开数据库连接 (open one connection for the whole batch)
    db = pymysql.connect(
        host=DB_CONFIG["host"],
        port=DB_CONFIG["port"],
        user=DB_CONFIG["user"],
        passwd=DB_CONFIG["passwd"],
        db=DB_CONFIG["db"],
        charset=DB_CONFIG["charset"])
    cursor = db.cursor()
    date = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    try:
        for ip in ip_list:
            # Parameterized queries instead of %-string formatting: avoids
            # SQL injection and quoting bugs with the stored urls.
            cursor.execute("select count(*) from ip_original where ip=%s", (ip,))
            if cursor.fetchone()[0] == 0:
                try:
                    cursor.execute(
                        "insert into ip_original(ip,date) values (%s,%s)",
                        (ip, date))
                    log.info(ip+'insert to ip_original success!')
                    # 提交到数据库执行
                    db.commit()
                except Exception:
                    log.info('执行sql-->insert fail: ' + ip)
                    # 发生错误时回滚
                    db.rollback()
            else:
                log.info(ip+': is existence !!',)
    finally:
        # 关闭数据库连接 — also runs when a query raises
        cursor.close()
        db.close()

# Fetch a single proxy ip from the database
def mysql_proxies():
    """Return one proxy ip string from ip_original, or None if the table
    is empty (the original raised IndexError in that case)."""
    # 打开数据库连接
    db = pymysql.connect(
        host=DB_CONFIG["host"],
        port=DB_CONFIG["port"],
        user=DB_CONFIG["user"],
        passwd=DB_CONFIG["passwd"],
        db=DB_CONFIG["db"],
        charset=DB_CONFIG["charset"])
    try:
        cursor = db.cursor()
        # Select the ip column by name (the original used a positional
        # index into `select *`, which breaks if the schema changes) and
        # LIMIT 1 so the server does not ship the whole table.
        cursor.execute("select ip from ip_original limit 1")
        row = cursor.fetchone()
        cursor.close()
    finally:
        # 关闭数据库连接
        db.close()
    return row[0] if row else None


# Delete one row from the database
def mysql_delete(proxies):
    """Delete the given proxy ip from ip_original and return it.

    BUG FIXES vs. the original: the delete is now committed (it was
    silently rolled back when the connection closed), and the fetch after
    DELETE — which always raised IndexError, since DELETE produces no
    result set — is gone.  The success message now logs the actual ip
    instead of the literal string 'proxies', and is logged after the
    statement runs.
    """
    # 打开数据库连接
    db = pymysql.connect(
        host=DB_CONFIG["host"],
        port=DB_CONFIG["port"],
        user=DB_CONFIG["user"],
        passwd=DB_CONFIG["passwd"],
        db=DB_CONFIG["db"],
        charset=DB_CONFIG["charset"])
    try:
        cursor = db.cursor()
        # Parameterized to avoid SQL injection / quoting problems.
        cursor.execute("delete from ip_original where ip = %s", (proxies,))
        db.commit()
        log.info('delete data ' + proxies + ' success')
        cursor.close()
    finally:
        # 关闭数据库连接
        db.close()
    return proxies

if  __name__=="__main__":
    # Seed list of proxies (scraped elsewhere); uncomment mysql(ip_list)
    # to load them, then mysql_proxies() pulls one back as a smoke test.
    ip_list = ['http://117.191.11.108:80', 'http://134.209.15.143:8080', 'http://157.230.232.130:80',
               'http://111.206.6.100:80', 'http://159.138.5.222:80', 'http://178.128.12.118:8080',
               'http://83.142.126.147:80', 'http://150.109.55.190:83', 'http://165.227.62.167:8080',
               'http://167.114.153.18:80', 'http://39.137.69.10:8080', 'http://111.206.6.101:80',
               'http://165.227.29.189:8080', 'http://175.139.252.192:80', 'http://103.42.213.176:8080',
               'http://211.23.149.29:80', 'http://211.23.149.28:80', 'http://47.94.57.119:80',
               'http://175.139.252.194:80', 'http://47.94.217.37:80']
    #mysql(ip_list)
    mysql_proxies()

本模块实现对代理ip的存储、提取和删除三种数据库操作。

logger.py

# -*-coding:utf-8 -*-
# BY WANGCC
import logging
import os
import sys
import time


class Logger:
    """Thin wrapper around logging.Logger that wires up a dated file handler
    and (optionally) a console handler.

    Unknown attributes (info, debug, warning, ...) are delegated to the
    underlying logging.Logger via __getattr__.
    """

    def __init__(self, set_level="INFO",
                 name=None,
                 log_name=None,
                 log_path=None,
                 use_console=True):
        """
        :param set_level: 日志级别["NOTSET"|"DEBUG"|"INFO"|"WARNING"|"ERROR"|"CRITICAL"],默认为INFO
        :param name: 日志中打印的name,默认为运行程序的name
        :param log_name: 日志文件的名字,默认为当前时间(年-月-日.log)
        :param log_path: 日志文件夹的路径,默认为logger.py同级目录中的log文件夹
        :param use_console: 是否在控制台打印,默认为True
        """
        # BUG FIX: the original evaluated name/log_name/log_path as default
        # argument values, i.e. ONCE at import time — a long-running process
        # would keep writing to the start-date log file forever.  Use None
        # sentinels and compute the defaults per call instead.
        if name is None:
            name = os.path.split(os.path.splitext(sys.argv[0])[0])[-1]
        if log_name is None:
            log_name = time.strftime("%Y-%m-%d.log", time.localtime())
        if log_path is None:
            log_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "log")
        if not set_level:
            set_level = self._exec_type()  # falsy level -> pick from run mode
        self.__logger = logging.getLogger(name)
        # Unknown level names fall back to INFO (same as the original
        # hasattr/getattr dance).
        self.__logger.setLevel(getattr(logging, set_level.upper(), logging.INFO))
        if not os.path.exists(log_path):  # create the log directory lazily
            os.makedirs(log_path)
        formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
        handlers = [logging.FileHandler(os.path.join(log_path, log_name))]
        if use_console:
            handlers.append(logging.StreamHandler())
        for handler in handlers:
            handler.setFormatter(formatter)
            self.__logger.addHandler(handler)

    def __getattr__(self, item):
        # Delegate info/debug/error/... to the wrapped logging.Logger.
        return getattr(self.logger, item)

    @property
    def logger(self):
        return self.__logger

    @logger.setter
    def logger(self, func):
        self.__logger = func

    def _exec_type(self):
        # DEBUG when running inside an IPython/PyDev console, INFO otherwise.
        return "DEBUG" if os.environ.get("IPYTHONENABLE") else "INFO"


这个是对日志的一个封装,这样用起来方便些。

总结,目前整体程序可优化的空间很大,这算是1.0版本

上一篇下一篇

猜你喜欢

热点阅读