Running multiple spiders with Scrapy
'''
Method 1:
CrawlerProcess starts the Twisted reactor internally, configures logging,
and arranges for the reactor to shut down automatically when the spiders finish.
'''
from scrapy.crawler import CrawlerProcess

process = CrawlerProcess({
    'USER_AGENT': '*****'
})
# The project settings can also be passed in at initialization, e.g.:
# from scrapy.utils.project import get_project_settings
# process = CrawlerProcess(get_project_settings())
first_spider_name = 'xxxxx'
second_spider_name = 'xxxxx'
third_spider_name = 'xxxxx'
# crawl() accepts a spider name (resolved via the project's spider loader)
# or a Spider class.
process.crawl(first_spider_name)
process.crawl(second_spider_name)
process.crawl(third_spider_name)
process.start()  # blocks here until all crawls are finished
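
The spider names above are placeholders; resolving a name string requires a Scrapy project on the path. Without a project, crawl() also accepts a Spider class directly. A minimal self-contained sketch (QuotesSpider and its URL are illustrative assumptions, not part of the original):

import scrapy
from scrapy.crawler import CrawlerProcess

class QuotesSpider(scrapy.Spider):
    # Hypothetical example spider; name and start_urls are placeholders.
    name = 'quotes'
    start_urls = ['https://quotes.toscrape.com/']

    def parse(self, response):
        # Yield one item per quote text on the page.
        for quote in response.css('div.quote span.text::text'):
            yield {'text': quote.get()}

process = CrawlerProcess({'USER_AGENT': '*****'})
process.crawl(QuotesSpider)  # pass the class itself, not an instance
process.start()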
'''
Method 2:
This approach is more involved: after the spiders finish, the Twisted reactor
must be shut down manually, by adding a callback to the Deferred the runner
returns (runner.join() below).
'''
from twisted.internet import reactor
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging

configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
runner = CrawlerRunner()
runner.crawl(first_spider_name)
runner.crawl(second_spider_name)
runner.crawl(third_spider_name)
d = runner.join()  # Deferred that fires once all crawls have finished
d.addBoth(lambda _: reactor.stop())
reactor.run()  # blocks until reactor.stop() is called
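
Note that all three crawls above run concurrently; runner.join() fires only after the last one finishes. If per-spider failures should be logged before the reactor stops, an errback can be attached to each crawl's Deferred. A sketch under the same assumptions (the spider names are still placeholders):

from twisted.internet import reactor
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging

configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
runner = CrawlerRunner()

for name in (first_spider_name, second_spider_name, third_spider_name):
    d = runner.crawl(name)
    # Log the Failure from this particular crawl; returning None from the
    # errback marks the error as handled.
    d.addErrback(lambda failure, n=name: print(f'{n} failed: {failure}'))

runner.join().addBoth(lambda _: reactor.stop())
reactor.run()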
'''
Method 3:
This also uses the CrawlerRunner class, but the implementation differs slightly:
the crawls are chained with inlineCallbacks, so the spiders run sequentially.
'''
from twisted.internet import reactor, defer
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging

configure_logging()
runner = CrawlerRunner()

@defer.inlineCallbacks
def crawl():
    # Each yield waits for the previous crawl to finish, so the spiders
    # run one after another rather than concurrently.
    yield runner.crawl(first_spider_name)
    yield runner.crawl(second_spider_name)
    yield runner.crawl(third_spider_name)
    reactor.stop()

crawl()
reactor.run()  # blocks until crawl() calls reactor.stop()
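
In every method above, passing a spider's name as a string only works when the process or runner can see the project's settings (SPIDER_MODULES); get_project_settings() is the usual way to supply them from a script run inside a Scrapy project. A short sketch, assuming such a project exists:

from twisted.internet import reactor
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from scrapy.utils.project import get_project_settings

configure_logging()
# get_project_settings() reads scrapy.cfg / settings.py, so the runner's
# spider loader can resolve spiders by their `name` attribute.
runner = CrawlerRunner(get_project_settings())

d = runner.crawl('xxxxx')  # placeholder spider name, as in the original
d.addBoth(lambda _: reactor.stop())
reactor.run()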