python爬虫日记本Python爬虫Python数据采集与爬虫

爬虫笔记(二) - 关于Scrapy下载中间件(IP代理)

2017-05-04  本文已影响263人  Spareribs

代理网站

常用的代理网站有
西刺免费代理IP
IPRENT
米扑

代码

这段代码有bug,测试review_ips函数好像并没有生效

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
# @Time    : 2017/4/27 18:58
# @Author  : Spareribs
# @File    : xicidaili.py
"""

import requests
from bs4 import BeautifulSoup
import threading
import Queue


class Get_ips(object):
    """Scrape proxy addresses from xicidaili.com and keep the ones that respond.

    Typical use is ``Get_ips(pages).main()``, which fills the internal queue
    with scraped candidates, checks them from 40 worker threads and returns
    the list of candidates that answered with HTTP 200.
    """

    def __init__(self, page):
        """Prepare the listing URLs and the shared state used by the workers.

        :param page: number of listing pages to scrape; URLs are built for
            page indices ``0 .. page - 1``.
        """
        # Candidates that passed the check (appended to by worker threads).
        self.ips = []
        self.urls = ["http://www.xicidaili.com/nn/" + str(i) for i in range(page)]
        self.header = {"User-Agent": 'Mozilla/5.0 (Windows NT 6.3; WOW64; rv:43.0) Gecko/20100101 Firefox/43.0'}
        # Queue of scraped "SCHEME://host:port" strings waiting to be checked.
        self.q = Queue.Queue()
        # Guards self.ips while several workers append to it.
        self.Lock = threading.Lock()
        self.cookies = {"user_trace_token": "20170502200739-07d687303c1e44fa9c7f0259097266d6", }
        # URL requested through each candidate proxy to decide whether it works.
        self.base_url = "https://www.baidu.com"

    def get_ips(self):
        """Download every listing page and queue one candidate per table row."""
        for url in self.urls:
            res = requests.get(url, headers=self.header)
            soup = BeautifulSoup(res.text, 'lxml')
            rows = soup.find_all('tr')
            # The first <tr> is skipped, exactly as the original index loop did.
            for row in rows[1:]:
                tds = row.find_all("td")
                if len(tds) < 6:
                    # Not a full data row; indexing tds[5] would raise IndexError.
                    continue
                ip_temp = "{0}://{1}:{2}".format(tds[5].contents[0], tds[1].contents[0], tds[2].contents[0])
                self.q.put(str(ip_temp))

    def review_ips(self):
        """Worker loop: take candidates off the queue and keep the working ones.

        Returns when the queue is empty. A candidate is appended to
        ``self.ips`` (in its original scraped form) when the request to
        ``self.base_url`` through it returns status 200.
        """
        while True:
            # get_nowait() avoids the empty()/get() race: with many workers,
            # another thread can drain the queue between the two calls and
            # leave this one blocked in get() forever.
            try:
                ip = self.q.get_nowait()
            except Queue.Empty:
                return

            # requests only matches lower-case scheme keys in ``proxies``; the
            # scraped scheme is upper-case, so normalise it for the request.
            http_tag = ip.split(":")[0].lower()
            proxy = {http_tag: http_tag + ip[len(http_tag):]}
            print(proxy)
            # NOTE(review): requests chooses the proxy by the scheme of the
            # target URL, so an "http" entry is not used while base_url is
            # https - confirm which URL each proxy kind should be checked with.
            try:
                res = requests.get(self.base_url, proxies=proxy, timeout=1)
            except requests.RequestException:
                # Dead or slow proxy: drop the candidate and move on.
                continue

            if res.status_code == 200:
                # ``with`` always releases the lock. The original released it
                # only in the 200 branch, so the first non-200 response left
                # the lock held and blocked every other worker.
                with self.Lock:
                    self.ips.append(ip)

    def main(self):
        """Scrape the candidates, check them with 40 threads, return the good ones."""
        self.get_ips()
        threads = [threading.Thread(target=self.review_ips) for _ in range(40)]
        for t in threads:
            t.start()
        for t in threads:
            t.join()
        return self.ips


def get_ip():
    """Scrape two listing pages, validate the proxies and save the good ones.

    The surviving proxies are written to ``iplist.txt`` in the current
    working directory, one per line, and also returned.

    :return: the list produced by ``Get_ips(2).main()``.
    """
    my = Get_ips(2)
    getips_list = my.main()
    # The ``with`` block closes the file on exit, so no explicit close() is needed.
    with open("iplist.txt", "w") as f:
        for getip in getips_list:
            f.write(str(getip) + "\n")
    return getips_list

# Run the scrape-and-validate pipeline only when this file is executed as a script.
if __name__ == "__main__":
    get_ip()

scrapy中代理中间件的使用方法

middlewares.py设置

class ProxyMiddleware(object):
    """Downloader middleware that assigns a randomly chosen proxy to each request."""

    # Hard-coded sample proxies. NOTE(review): the original author intended
    # to read this list from the text file of generated proxy addresses
    # instead - confirm the file path before switching over.
    proxy_list = [
        "HTTP://171.13.37.172:808",
        "HTTPS://221.229.44.79:808",
    ]

    def process_request(self, request, spider):
        """Pick one entry of ``proxy_list`` and store it in ``request.meta['proxy']``.

        :param request: the outgoing request; its ``meta`` mapping is updated.
        :param spider: the spider issuing the request (unused).
        :return: None.
        """
        # Imported here because the snippet shows no file-level import of
        # ``random``; without it the call below raises NameError.
        import random

        ip = random.choice(self.proxy_list)
        print(ip)
        request.meta['proxy'] = ip

settings.py文件设置

# Enable the custom proxy middleware. The key is the dotted path to the class
# and 110 is the order value Scrapy uses to position it among the other
# downloader middlewares.
DOWNLOADER_MIDDLEWARES = {
    'lagou.middlewares.ProxyMiddleware': 110,
}

Requests代理测试的方法

import requests

# requests selects the proxy by the scheme of the *target* URL, and the keys
# must be lower-case. base_url below is https, so an "https" entry is needed;
# with only an "http" key the request bypasses the proxy and tests nothing.
proxy = {
    "http": "http://110.80.142.147:808",
    "https": "http://110.80.142.147:808",
}
base_url = "https://www.lagou.com/jobs/positionAjax.json?city=%E5%B9%BF%E5%B7%9E&needAddtionalResult=false&kd=python&pn=1"
# base_url = "http://icanhazip.com"
res = requests.get(base_url, proxies=proxy)
print(res.status_code)
print(res.text)

遇到的问题

  1. 多线程处理
  2. Requests 使用代理 proxies 失败的问题:血的教训,proxies 字典的键(协议名)必须小写,例如 proxy = {"http":"http://110.80.142.147:808"},小写小写!!!
上一篇下一篇

猜你喜欢

热点阅读