Running multiple spiders with Scrapy

2019-01-21  丷菜菜呀

'''
Method 1:

CrawlerProcess starts the Twisted reactor internally, configures
logging, and shuts the reactor down automatically when done.
'''

from scrapy.crawler import CrawlerProcess

process = CrawlerProcess({
    'USER_AGENT': '*****',
})

# The project settings can also be passed in at initialization;
# name strings below resolve only when the settings define SPIDER_MODULES:
# from scrapy.utils.project import get_project_settings
# process = CrawlerProcess(get_project_settings())

first_spider_name = 'xxxxx'
second_spider_name = 'xxxxx'
third_spider_name = 'xxxxx'

process.crawl(first_spider_name)
process.crawl(second_spider_name)
process.crawl(third_spider_name)
process.start()  # blocks until all crawls have finished
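If the script lives outside a Scrapy project, spider names cannot be looked up, so the spider classes are passed to crawl() directly. A minimal sketch, assuming two hypothetical inline spiders (class names and URLs are placeholders):

import scrapy
from scrapy.crawler import CrawlerProcess

class FirstSpider(scrapy.Spider):
    name = 'first'
    start_urls = ['http://example.com']  # placeholder URL

    def parse(self, response):
        yield {'url': response.url, 'title': response.css('title::text').get()}

class SecondSpider(scrapy.Spider):
    name = 'second'
    start_urls = ['http://example.org']  # placeholder URL

    def parse(self, response):
        yield {'url': response.url}

process = CrawlerProcess({'USER_AGENT': 'Mozilla/5.0'})
process.crawl(FirstSpider)   # pass the class itself, not a name string
process.crawl(SecondSpider)
process.start()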

'''
Method 2: this approach is more involved. After the spiders finish,
the Twisted reactor must be stopped manually, by adding a callback to
the deferred returned by CrawlerRunner.crawl (or CrawlerRunner.join).
'''

from twisted.internet import reactor

from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging

configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})

runner = CrawlerRunner()

runner.crawl(first_spider_name)
runner.crawl(second_spider_name)
runner.crawl(third_spider_name)

d = runner.join()  # fires once every scheduled crawl has finished
d.addBoth(lambda _: reactor.stop())

reactor.run()  # blocks until reactor.stop() is called
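As an alternative to runner.join(), the per-crawl deferreds can be collected explicitly with Twisted's DeferredList; the effect here is the same. A minimal sketch, assuming the script runs inside a Scrapy project so the placeholder 'xxxxx' spider names resolve:

from twisted.internet import reactor, defer
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from scrapy.utils.project import get_project_settings

configure_logging()

# project settings are needed so that spider-name strings can be resolved
runner = CrawlerRunner(get_project_settings())

# 'xxxxx' are placeholders for spider names defined in your project
crawls = [runner.crawl(name) for name in ('xxxxx', 'xxxxx', 'xxxxx')]

d = defer.DeferredList(crawls)  # fires when all crawls are done
d.addBoth(lambda _: reactor.stop())
reactor.run()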

Method 3 also uses the CrawlerRunner class, but chains the crawls inside an inlineCallbacks coroutine: each yield waits for that crawl to finish before the next one starts, so the spiders run one after another instead of concurrently.

from twisted.internet import reactor, defer
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging

configure_logging()

runner = CrawlerRunner()

@defer.inlineCallbacks
def crawl():
    # each yield waits for that crawl to finish before starting the next
    yield runner.crawl(first_spider_name)
    yield runner.crawl(second_spider_name)
    yield runner.crawl(third_spider_name)
    reactor.stop()

crawl()
reactor.run()
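crawl() also forwards extra positional and keyword arguments to the spider's constructor, which pairs well with this sequential pattern when the same spider should run several times with different parameters. A minimal sketch, assuming a hypothetical CategorySpider whose 'category' argument selects the start URL:

import scrapy
from twisted.internet import reactor, defer
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging

class CategorySpider(scrapy.Spider):
    # hypothetical spider taking a 'category' constructor argument
    name = 'category'

    def __init__(self, category=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.start_urls = ['http://example.com/%s' % category]  # placeholder

    def parse(self, response):
        yield {'url': response.url}

configure_logging()
runner = CrawlerRunner()

@defer.inlineCallbacks
def crawl():
    # run the same spider class three times, one category after another
    for category in ('books', 'music', 'films'):
        yield runner.crawl(CategorySpider, category=category)
    reactor.stop()

crawl()
reactor.run()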
