Writing an asynchronous MongoDB pipeline in Scrapy
2019-04-29 · 时间碎步
In the spider file, add a custom_settings dict that defines MONGO_URI, MONGO_DB, and MONGO_COL (they can also be configured in settings.py instead, as sketched after the block below):
custom_settings = {
    'MONGO_URI': 'mongodb://localhost:27017/',
    'MONGO_DB': 'corpus',
    'MONGO_COL': 'pmindia',
    'ITEM_PIPELINES': {
        'gov.pipelines.MongoPipeline': 300,
    },
}
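If you prefer project-wide configuration, the same keys can live in settings.py; a minimal sketch mirroring the spider settings above:

# settings.py -- equivalent project-wide configuration
MONGO_URI = 'mongodb://localhost:27017/'
MONGO_DB = 'corpus'
MONGO_COL = 'pmindia'

ITEM_PIPELINES = {
    'gov.pipelines.MongoPipeline': 300,
}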
Then, in pipelines.py:
import pymongo
from twisted.internet import defer, reactor


class MongoPipeline(object):
    def __init__(self, mongo_uri, mongo_db, mongo_col):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db
        self.col = mongo_col

    @classmethod
    def from_crawler(cls, crawler):
        # Read the connection settings (from custom_settings or settings.py).
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI', 'mongodb://localhost:27017/'),
            mongo_db=crawler.settings.get('MONGO_DB'),
            mongo_col=crawler.settings.get('MONGO_COL'),
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.mongodb = self.client[self.mongo_db]

    def close_spider(self, spider):
        # Scrapy calls close_spider (not spider_closed) on pipelines;
        # with the wrong name the client would never be closed.
        self.client.close()

    @defer.inlineCallbacks
    def process_item(self, item, spider):
        # Run the blocking insert in Twisted's thread pool so the reactor
        # (and the crawl) is not blocked, then wait for the Deferred to
        # fire before handing the item on to later pipelines.
        out = defer.Deferred()
        reactor.callInThread(self._insert, item, out)
        yield out
        defer.returnValue(item)

    def _insert(self, item, out):
        # Runs in a worker thread; fire the Deferred back on the
        # reactor thread via callFromThread.
        self.mongodb[self.col].insert_one(dict(item))
        reactor.callFromThread(out.callback, item)
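As a side note, the hand-rolled Deferred plus callInThread/callFromThread dance can be written more compactly with Twisted's deferToThread helper, which runs a blocking call in the reactor's thread pool and returns a Deferred that Scrapy will wait on; unlike the version above, it also turns exceptions raised in the insert into errbacks instead of leaving the Deferred unfired. A minimal sketch of the same process_item, assuming the rest of the class is unchanged (the helper name _do_insert is made up for illustration):

from twisted.internet.threads import deferToThread

def process_item(self, item, spider):
    # deferToThread wraps the blocking insert in the reactor's thread
    # pool and returns a Deferred; the callback restores the item so
    # later pipelines still receive it.
    return deferToThread(self._do_insert, dict(item)).addCallback(lambda _: item)

def _do_insert(self, data):
    # Runs in a worker thread, so a plain blocking pymongo call is fine.
    self.mongodb[self.col].insert_one(data)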