Scrapy Pipelines and Middleware: My Commonly Used Configuration Examples
2019-08-22
越大大雨天
Middleware usage examples:
Random User-Agent example
- Generate a random UA with fake_useragent
import fake_useragent

class UserAgentMiddleware(object):
    def __init__(self):
        # Build one fake_useragent.UserAgent instance for the middleware
        self.ua = fake_useragent.UserAgent()

    def process_request(self, request, spider):
        # Attach a freshly generated random User-Agent to every outgoing request
        request.headers['User-Agent'] = self.ua.random
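To make Scrapy actually call this class, it still has to be enabled in settings.py. The sketch below is a minimal example, assuming the middleware is saved in a hypothetical myproject/middlewares.py:
# settings.py -- the 'myproject.middlewares' path is an assumption
DOWNLOADER_MIDDLEWARES = {
    # disable the built-in User-Agent middleware
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    # enable the custom random-UA middleware shown above
    'myproject.middlewares.UserAgentMiddleware': 400,
}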
- Use the scrapy_fake_useragent component
pip install scrapy_fake_useragent
- Comment out the built-in UserAgentMiddleware and add scrapy_fake_useragent:
DOWNLOADER_MIDDLEWARES = {
    # disable the built-in UA middleware and use the random-UA component instead
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    'scrapy_fake_useragent.middleware.RandomUserAgentMiddleware': 400,
}
Random IP proxy example
- Example using Abuyun dynamic proxy IPs
import base64

# Abuyun proxy server
proxyServer = "http://http-dyn.abuyun.com:9020"

# Proxy tunnel authentication credentials
proxyUser = "H4U********B3D"
proxyPass = "761*******9BC6"
proxyAuth = "Basic " + base64.urlsafe_b64encode(bytes((proxyUser + ":" + proxyPass), "ascii")).decode("utf8")

# Proxy middleware
class ProxyMiddleware(object):
    def process_request(self, request, spider):
        # Route every request through the proxy and attach the tunnel auth header
        request.meta["proxy"] = proxyServer
        request.headers["Proxy-Authorization"] = proxyAuth
Response retry example
- Use process_response to retry requests whose pages came back with an error (a capped variant is sketched after the snippet below)
def process_response(self, request, response, spider):
    """Handle the response: retry on a non-200 status or an empty body."""
    if response.status != 200 or response.body == b"":
        # Copy the request and disable dupe filtering so it gets re-scheduled
        req = request.copy()
        req.dont_filter = True
        return req
    return response
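Returning a copied request unconditionally can retry a permanently broken page forever. Below is a minimal sketch of a capped variant (my own addition, using a hypothetical custom_retry_times counter kept in request.meta):
class LimitedRetryMiddleware(object):
    """Retry non-200 or empty responses, giving up after a few attempts."""
    MAX_RETRIES = 3  # assumed cap, tune per project

    def process_response(self, request, response, spider):
        if response.status != 200 or response.body == b"":
            retries = request.meta.get("custom_retry_times", 0)
            if retries < self.MAX_RETRIES:
                req = request.copy()
                req.meta["custom_retry_times"] = retries + 1
                req.dont_filter = True
                return req
        return response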
Pipeline usage examples
Custom deduplication example:
from scrapy.exceptions import DropItem

class DuplicatesPipeline(object):
    def __init__(self):
        # Sets of values already seen, one per dedup key
        self.card_num_seen = set()
        self.name_seen = set()

    def process_item(self, item, spider):
        if item["age"] == 0:
            # No usable age: deduplicate by name
            if item['name'] in self.name_seen:
                raise DropItem('Duplicate item found: %s' % item)
            else:
                self.name_seen.add(item['name'])
                print(self.name_seen)
        else:
            # Otherwise deduplicate by ID card number
            if item['card_num'] in self.card_num_seen:
                raise DropItem('Duplicate item found: %s' % item)
            else:
                self.card_num_seen.add(item['card_num'])
                print(self.card_num_seen)
        return item
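As with middleware, a pipeline only runs after it is registered in settings.py; a minimal sketch, assuming a hypothetical myproject/pipelines.py and an arbitrary priority:
# settings.py -- module path and priority are assumptions
ITEM_PIPELINES = {
    'myproject.pipelines.DuplicatesPipeline': 300,
}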
MySQL asynchronous Item write example:
from twisted.enterprise import adbapi

class MySQLPipeline(object):
    @classmethod
    def from_crawler(cls, crawler):
        # Read the connection parameters from the project settings (with defaults)
        cls.MYSQL_DB_NAME = crawler.settings.get("MYSQL_DB", 'dishonest')
        cls.HOST = crawler.settings.get("MYSQL_HOST", 'localhost')
        cls.PORT = crawler.settings.get("MYSQL_PORT", 3306)
        cls.USER = crawler.settings.get("MYSQL_USER", 'root')
        cls.PASSWD = crawler.settings.get("MYSQL_PASSWORD", 'mysql')
        return cls()

    def open_spider(self, spider):
        # Create a Twisted connection pool backed by pymysql
        self.dbpool = adbapi.ConnectionPool('pymysql', host=self.HOST, port=self.PORT,
                                            user=self.USER, passwd=self.PASSWD,
                                            db=self.MYSQL_DB_NAME, charset='utf8')

    def close_spider(self, spider):
        self.dbpool.close()

    def process_item(self, item, spider):
        # Run the insert in a pool thread; failures are routed to handler_error
        query = self.dbpool.runInteraction(self.insert_db, item)
        query.addErrback(self.handler_error, item, spider)
        return item

    def handler_error(self, failure, item, spider):
        print(failure)

    def insert_db(self, tx, item):
        values = (
            item["name"],
            item["card_num"],
            item["age"],
            item["area"],
            item["business_entity"],
            item["content"],
            item["publish_date"],
            item["publish_unit"],
            item["create_date"],
            item["update_date"],
        )
        sql = 'INSERT INTO dishonest(name,card_num,age,area,business_entity,content,publish_date,publish_unit,create_date,update_date) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'
        tx.execute(sql, values)
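Because from_crawler reads the MYSQL_* keys from the crawler settings, the matching entries (plus the pipeline registration) belong in settings.py; a minimal sketch with placeholder values:
# settings.py -- all values below are placeholders
MYSQL_HOST = 'localhost'
MYSQL_PORT = 3306
MYSQL_USER = 'root'
MYSQL_PASSWORD = 'mysql'
MYSQL_DB = 'dishonest'

ITEM_PIPELINES = {
    'myproject.pipelines.MySQLPipeline': 300,  # module path is an assumption
}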
MongoDB Item write example
from pymongo import MongoClient

class JdPipeline(object):
    def open_spider(self, spider):
        # Runs only once, when the spider is opened
        if spider.name == 'book':
            # An isinstance() check against the spider class would also work here
            self.client = MongoClient(host='127.0.0.1', port=27017)  # instantiate MongoClient
            self.db = self.client["jd"]        # database named "jd"
            self.collection = self.db["book"]  # operation object for the "book" collection

    def process_item(self, item, spider):
        if spider.name == 'book':
            # The document must be a plain dict before insertion;
            # if item is a BaseItem, convert it first with dict(item)
            item = dict(item)
            self.collection.insert_one(item)  # insert_one() replaces the deprecated insert()
            # insert_one adds an "_id" key to the dict; drop it before passing the item on
            item.pop("_id")
        return item

    def close_spider(self, spider):
        self.client.close()
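This pipeline only handles items from the spider named book (the spider.name checks above); registering it in settings.py looks the same as the other pipelines, with the module path again being an assumption:
# settings.py -- module path and priority are assumptions
ITEM_PIPELINES = {
    'myproject.pipelines.JdPipeline': 300,
}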