西刺代理 (Xici Proxy) IP Pool
2017-12-26 · 领悟悟悟
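This post builds a small proxy IP pool from 西刺代理 (xicidaili.com): the spider crawls the high-anonymity proxy listing, keeps only proxies verified today, writes them into a MongoDB collection, then checks each one concurrently with a thread pool and marks the ones that actually respond as available.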
import datetime
import threadpool
import requests
from fake_useragent import UserAgent
from pymongo import MongoClient
from bs4 import BeautifulSoup
from urllib.parse import urljoin

try:
    # Python 3.x
    from urllib.parse import quote_plus
except ImportError:
    # Python 2.x
    from urllib import quote_plus

ua = UserAgent()

# MongoDB connection settings
PORT = "27017"
IP = "192.168.101.203"
USERNAME = ""
PASSWORD = ""
class MyMongoOperator(object):
    def __init__(self, userName=USERNAME, password=PASSWORD, port=PORT, IP=IP):
        if userName:
            uri = "mongodb://%s:%s@%s:%s" % (
                quote_plus(userName), quote_plus(password), IP, port)
        else:
            uri = "mongodb://%s:%s" % (IP, port)
        self.client = MongoClient(uri)

    # Select the database and collection to operate on
    def connect2DB_COLL(self, dbname, collection):
        self.db = self.client[dbname]
        self.collection = self.db[collection]
        return self

    def find(self, *args, **kwargs):
        return self.collection.find(*args, **kwargs)

    def findOne(self, query):
        return self.collection.find_one(query)

    def insert(self, args):
        return self.collection.insert_one(args)

    def update(self, *args):
        return self.collection.update_one(*args)

    def remove(self, args):
        return self.collection.delete_many(args)

    def closeConn(self):
        self.client.close()


def getConn():
    obj = MyMongoOperator()
    obj.connect2DB_COLL('spider', 'IP_pool')
    return obj


# Module-level Mongo connection shared by the spider and the verifier
mongo = getConn()
class XCspider(object):
    def __init__(self):
        self.header = {
            'User-Agent': ua.random,
            'Referer': 'http://www.xicidaili.com/'
        }
        self.today = datetime.datetime.today()
        # Midnight of today, used to filter out stale proxies
        self.dayStart = datetime.datetime(year=self.today.year, month=self.today.month, day=self.today.day)

    def getTime(self, timeStr):
        # The site shows verification times like '17-12-26 10:23'
        time = datetime.datetime.strptime(timeStr, '%y-%m-%d %H:%M')
        return time

    def get(self, url):
        response = requests.get(url, headers=self.header)
        if response.status_code == 200:
            return response
        elif response.status_code == 404:
            raise requests.HTTPError('404 page not found')
        elif response.status_code == 500:
            raise requests.HTTPError('500 server error')
        else:
            raise requests.HTTPError('unexpected response status {}'.format(response.status_code))

    def parse(self, response):
        soup = BeautifulSoup(response.content, 'lxml')
        ip_list = soup.find_all('tr')
        if ip_list:
            ip_list = ip_list[1:]  # skip the table header row
            for ip in ip_list:
                info = ip.text.strip("'")
                ip_info = list(filter(lambda each: each, info.split('\n')))
                if len(ip_info) < 7:
                    print(ip_info)
                    continue
                # Only keep proxies verified today
                if self.getTime(ip_info[-1]) > self.dayStart:
                    # Save to database; requests expects lowercase scheme
                    # keys in the proxies dict, so lower() the type column
                    mongo.insert({
                        'proxy': {ip_info[4].lower(): "http://{}:{}".format(ip_info[0], ip_info[1])},
                        'area': ip_info[2],
                        'type': ip_info[3],
                        'time': ip_info[6],
                        'insertTime': datetime.datetime.now().strftime('%y-%m-%d %H:%M'),
                        'inspect': False,
                        'available': False,
                    })
                else:
                    # Rows are sorted by verification time, so stop paging
                    # as soon as we hit an entry from before today
                    return False
            next_page = soup.find('a', attrs={'class': 'next_page'})
            if next_page:
                next_url = urljoin('http://www.xicidaili.com/', next_page.attrs['href'])
                return next_url
    def clear(self):
        '''
        Empty the IP pool
        :return:
        '''
        mongo.remove({})

    def verify(self):
        # Check every unverified proxy concurrently with 10 worker threads
        pool = threadpool.ThreadPool(10)
        ipList = mongo.find({'inspect': False})
        # Don't name this variable `requests`: that would shadow the module
        reqs = threadpool.makeRequests(verify_ip, ipList)
        [pool.putRequest(req) for req in reqs]
        pool.wait()
        print('IP verification finished')
def verify_ip(each):
    headers = {
        'User-Agent': ua.random,
        'Referer': 'http://www.baidu.com/'
    }
    try:
        # Try the proxy against a real site with a short timeout
        response = requests.get('http://www.dianping.com/', proxies=each['proxy'], timeout=3, headers=headers)
    except Exception as e:
        print(e)
        mongo.update({'_id': each['_id']}, {'$set': {'inspect': True}})
    else:
        if response.status_code == 200:
            mongo.update({'_id': each['_id']}, {'$set': {'inspect': True, 'available': True}})
        else:
            mongo.update({'_id': each['_id']}, {'$set': {'inspect': True}})
def getProxy():
    xc = XCspider()
    xc.clear()
    response = xc.get(url='http://www.xicidaili.com/nn/')
    url = xc.parse(response)
    while url:
        print('\tcrawling: %s' % url)
        response = xc.get(url=url)
        url = xc.parse(response)


def verify():
    xc = XCspider()
    xc.verify()
def parse():
    # Offline helper: parse a locally saved copy of the listing page
    with open('西刺代理.html', 'r', encoding='utf-8') as f:
        data = f.read()
    soup = BeautifulSoup(data, 'lxml')
    ip_list = soup.find_all('tr')
    if ip_list:
        ip_list = ip_list[1:]
        for ip in ip_list:
            info = ip.text.strip("'")
            ip_info = list(filter(lambda each: each, info.split('\n')))
            # Save to database (left unimplemented in the original)
            pass
if __name__ == '__main__':
    # getProxy()
    # verify()
    # Quick manual test of a single proxy; note that requests expects a
    # lowercase 'http' key in the proxies dict
    headers = {
        'User-Agent': ua.random,
        'Referer': 'http://www.baidu.com/'
    }
    response = requests.get('http://www.dianping.com/', proxies={"http": "http://112.114.98.66:8118"},
                            headers=headers)
    mongo.closeConn()
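The threadpool package used above is an old third-party library. If it is unavailable, a roughly equivalent verifier can be written with the standard library's concurrent.futures; this is a minimal sketch under that assumption, reusing the verify_ip function from the listing:

from concurrent.futures import ThreadPoolExecutor

def verify_with_futures():
    # Same logic as XCspider.verify, but with a stdlib executor
    ipList = list(mongo.find({'inspect': False}))
    with ThreadPoolExecutor(max_workers=10) as pool:
        # map() is lazy; wrapping it in list() blocks until every
        # proxy has been checked
        list(pool.map(verify_ip, ipList))
    print('IP verification finished')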
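Once getProxy() and verify() have run, other crawlers can draw a working proxy from the pool. A minimal consumer sketch, where getRandomProxy is a hypothetical helper that is not part of the original script:

import random

def getRandomProxy(mongo):
    # Hypothetical helper: pick one verified proxy at random from the pool
    candidates = list(mongo.find({'available': True}))
    if not candidates:
        return None
    return random.choice(candidates)['proxy']

proxy = getRandomProxy(mongo)
if proxy:
    # The stored value is already a requests-style proxies dict
    response = requests.get('http://www.dianping.com/', proxies=proxy, timeout=3)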