
Writing an asynchronous MongoDB pipeline in Scrapy

2019-04-29  时间碎步

In the spider file, add custom_settings to define MONGO_URI, MONGO_DB and MONGO_COL (they can also be configured in settings.py instead):

custom_settings = {
    'MONGO_URI': 'mongodb://localhost:27017/',
    'MONGO_DB': 'corpus',
    'MONGO_COL': 'pmindia',
    'ITEM_PIPELINES': {
        'gov.pipelines.MongoPipeline': 300,
    }
}
The pipeline itself lives at gov/pipelines.py (the path registered under ITEM_PIPELINES above):

import pymongo
from twisted.internet import defer, reactor


class MongoPipeline(object):
    def __init__(self, mongo_uri, mongo_db, mongo_col):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db
        self.col = mongo_col

    @classmethod
    def from_crawler(cls, crawler):
        # Read the connection settings defined in custom_settings (or settings.py).
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI', 'mongodb://localhost:27017/'),
            mongo_db=crawler.settings.get('MONGO_DB'),
            mongo_col=crawler.settings.get('MONGO_COL'),
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.mongodb = self.client[self.mongo_db]

    def close_spider(self, spider):
        # Scrapy calls close_spider on pipelines (spider_closed is a signal
        # handler and would never be invoked here), so the client is closed
        # reliably when the spider finishes.
        self.client.close()

    @defer.inlineCallbacks
    def process_item(self, item, spider):
        # Hand the blocking insert off to a reactor worker thread so it does
        # not stall Twisted's event loop, then wait for the Deferred to fire.
        out = defer.Deferred()
        reactor.callInThread(self._insert, item, out)
        yield out
        defer.returnValue(item)

    def _insert(self, item, out):
        # Runs in a worker thread. Deferreds must only be fired from the
        # reactor thread, hence callFromThread for both success and failure.
        try:
            self.mongodb[self.col].insert_one(dict(item))
        except Exception as e:
            reactor.callFromThread(out.errback, e)
        else:
            reactor.callFromThread(out.callback, item)
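
For comparison, the same thread-offloading idea can be written more compactly with twisted.internet.threads.deferToThread, which dispatches a function to the reactor's thread pool and returns a Deferred that fires with its result. Below is a minimal sketch reusing the MongoPipeline above; the subclass and _insert_sync names are made up for illustration:

from twisted.internet.threads import deferToThread


class MongoPipelineThreaded(MongoPipeline):
    def process_item(self, item, spider):
        # deferToThread runs the blocking call in Twisted's thread pool and
        # returns a Deferred; Scrapy waits on it and passes the fired value
        # (the item) on to the next pipeline stage.
        return deferToThread(self._insert_sync, item)

    def _insert_sync(self, item):
        # Plain blocking insert; an exception here becomes an errback, which
        # Scrapy logs as a pipeline failure for this item.
        self.mongodb[self.col].insert_one(dict(item))
        return item

This drops the manual Deferred and callFromThread plumbing, at the cost of less control over when and from which thread the Deferred is fired.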