爬虫笔记(二) - 关于Scrapy下载中间件(IP代理)
2017-05-04 本文已影响263人
Spareribs
代理网站
代码
这段代码有 bug:测试时发现 review_ips 函数好像并没有生效,所有代理都会被判定为可用(可能的原因见文末"遇到的问题":proxies 字典的键必须是小写的协议名)。
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
# @Time : 2017/4/27 18:58
# @Author : Spareribs
# @File : xicidaili.py
"""
import requests
from bs4 import BeautifulSoup
import threading
import Queue
class Get_ips():
    """Scrape free proxies from xicidaili.com and keep the ones that respond.

    Typical use is ``Get_ips(pages).main()``, which returns the list of
    proxy strings (``SCHEME://host:port``) that answered the probe request
    with HTTP 200.
    """

    def __init__(self, page):
        """Prepare the listing URLs and the state shared by the workers.

        :param page: number of listing pages to scrape.
        """
        # Proxies that passed the check, filled by ``review_ips``.
        self.ips = []
        # NOTE(review): page numbers start at 0 as in the original code;
        # confirm whether the site's first listing page is /nn/1 instead.
        self.urls = ["http://www.xicidaili.com/nn/" + str(i) for i in range(page)]
        self.header = {"User-Agent": 'Mozilla/5.0 (Windows NT 6.3; WOW64; rv:43.0) Gecko/20100101 Firefox/43.0'}
        # Candidate proxies waiting to be checked.
        self.q = Queue.Queue()
        # Guards ``self.ips`` while the worker threads append to it.
        self.Lock = threading.Lock()
        self.cookies = {"user_trace_token": "20170502200739-07d687303c1e44fa9c7f0259097266d6", }
        # URL every candidate proxy is asked to fetch.
        self.base_url = "https://www.baidu.com"

    @staticmethod
    def _build_proxies(ip):
        """Return a Requests ``proxies`` mapping that routes through ``ip``.

        Requests looks proxies up by the lower-case scheme of the *target*
        URL, so an upper-case key such as ``"HTTP"`` is silently ignored.
        The scheme of the proxy URL is lower-cased and the proxy is
        registered for both ``http`` and ``https`` targets so the probe
        request really goes through it.
        """
        scheme, sep, rest = ip.partition("://")
        proxy_url = scheme.lower() + sep + rest
        return {"http": proxy_url, "https": proxy_url}

    def get_ips(self):
        """Download every listing page and queue the proxies found on it.

        Each table row after the first one is turned into a
        ``SCHEME://host:port`` string built from cells 5, 1 and 2.
        """
        for url in self.urls:
            # A timeout keeps one stalled page from hanging the whole run.
            res = requests.get(url, headers=self.header, timeout=10)
            soup = BeautifulSoup(res.text, 'lxml')
            # The first <tr> is skipped, as in the original index loop.
            for row in soup.find_all('tr')[1:]:
                tds = row.find_all("td")
                if len(tds) < 6:
                    # Not a full proxy row; indexing cell 5 would fail.
                    continue
                ip_temp = "{0}://{1}:{2}".format(tds[5].contents[0], tds[1].contents[0], tds[2].contents[0])
                self.q.put(str(ip_temp))

    def review_ips(self):
        """Worker loop: take proxies off the queue and keep the working ones.

        A proxy is kept when the probe request to ``self.base_url`` sent
        through it returns status 200 within one second.
        """
        while True:
            try:
                # ``empty()`` followed by a blocking ``get()`` can hang when
                # another thread takes the last item in between.
                ip = self.q.get_nowait()
            except Queue.Empty:
                return
            proxy = self._build_proxies(ip)
            print(proxy)
            try:
                res = requests.get(self.base_url, proxies=proxy, timeout=1)
            except (requests.RequestException, ValueError):
                # Dead, slow or malformed proxy: drop it and move on.
                continue
            if res.status_code == 200:
                with self.Lock:
                    self.ips.append(ip)

    def main(self):
        """Scrape the listings, check all proxies on 40 threads, return the good ones."""
        self.get_ips()
        threads = [threading.Thread(target=self.review_ips) for _ in range(40)]
        for t in threads:
            t.start()
        for t in threads:
            t.join()
        return self.ips
def get_ip():
    """Collect working proxies, save them to ``iplist.txt`` and return them.

    Two listing pages are scraped. Every proxy that passes the check is
    written to ``iplist.txt`` in the current directory, one per line.

    :return: the list of working proxy strings.
    """
    my = Get_ips(2)
    getips_list = my.main()
    # The ``with`` block closes the file; no explicit close() is needed.
    with open("iplist.txt", "w") as f:
        f.writelines(str(getip) + "\n" for getip in getips_list)
    return getips_list


# Run the collection when the file is executed as a script.
if __name__ == "__main__":
    get_ip()
scrapy中代理中间件的使用方法
middlewares.py设置
class ProxyMiddleware(object):
    """Scrapy downloader middleware that sends each request through a random proxy."""

    # Candidate proxies in "SCHEME://host:port" form. They could instead be
    # read from the text file of checked proxies (strip the trailing newline
    # of every line when doing so).
    proxy_list = [
        "HTTP://171.13.37.172:808",
        "HTTPS://221.229.44.79:808",
    ]

    def process_request(self, request, spider):
        """Pick a random entry of ``proxy_list`` and set it as the request's proxy.

        :param request: the outgoing request; its ``meta['proxy']`` is set.
        :param spider: the spider issuing the request (unused).
        :return: ``None`` in every case.
        """
        # Local import so the snippet works even if the module does not
        # import ``random`` at the top.
        import random

        if not self.proxy_list:
            # random.choice() raises IndexError on an empty list; leave the
            # request without a proxy instead.
            return None
        ip = random.choice(self.proxy_list)
        print(ip)
        request.meta['proxy'] = ip
        return None
settings.py文件设置
# Enable the proxy middleware; the number is the order value Scrapy uses
# to position it among the other downloader middlewares.
DOWNLOADER_MIDDLEWARES = {'lagou.middlewares.ProxyMiddleware': 110}
Requests代理测试的方法
import requests

# Requests chooses the proxy by the scheme of the *target* URL, and the keys
# of ``proxies`` must be lower-case. The target below is https, so the proxy
# has to be registered under "https" as well; with only an "http" key the
# request would bypass the proxy entirely.
proxy_url = "http://110.80.142.147:808"
proxy = {"http": proxy_url, "https": proxy_url}
base_url = "https://www.lagou.com/jobs/positionAjax.json?city=%E5%B9%BF%E5%B7%9E&needAddtionalResult=false&kd=python&pn=1"
# Alternative target that echoes the caller's IP:
# base_url = "http://icanhazip.com"
# A timeout keeps a dead proxy from blocking the script forever.
res = requests.get(base_url, proxies=proxy, timeout=10)
print(res.status_code)
print(res.text)
遇到的问题
- 多线程处理
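- Requests 使用代理 proxies 失败的问题:血的教训,proxies 字典的键(协议名)必须小写,例如 proxy = {"http":"http://110.80.142.147:808"};写成 "HTTP" 时 Requests 匹配不到这个键,代理不会生效。另外 Requests 是按目标 URL 的协议来选择代理的,访问 https 地址时还需要提供 "https" 键。小写小写!!!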