Scrapy Crawler Framework (4) ------ Crawling a Novel
2019-08-06
千喜Ya
ZwwSpider :

import scrapy


class ZwwSpider(scrapy.Spider):
    name = 'zww'
    allowed_domains = ['81zw.us']
    start_urls = ['https://www.81zw.us/book/606/10994091.html']

    def parse(self, response):
        # Chapter title and body text
        title = response.xpath('//h1/text()').extract_first()
        # The site pads paragraphs with spaces; turn them into line breaks
        content = ''.join(response.xpath('//div[@id="content"]/text()').extract()).replace(' ', '\n')
        yield {
            'title': title,
            'content': content
        }
        # Link to the next chapter
        next_url = response.xpath('//div[@class="bottem2"]/a[3]/@href').extract_first()
        # base_url = 'https://www.81zw.us/book/606/{}'.format(next_url)
        # Stop once the "next" link no longer points to a chapter page
        if next_url and next_url.find('.html') != -1:
            # response.urljoin automatically completes a relative URL
            yield scrapy.Request(response.urljoin(next_url), callback=self.parse)
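To try the spider without the scrapy crawl command, a minimal runner like the sketch below can drive it with CrawlerProcess. This is my own illustration, not part of the original post: the file name run.py, the spider's module path, and the assumption that the script sits inside the xiaoshuo project (so the pipeline's import path resolves) are all hypothetical.

# run.py -- hypothetical standalone runner; assumes it lives inside the
# xiaoshuo project so that 'xiaoshuo.pipelines.XiaoshuoPipeline' is importable.
from scrapy.crawler import CrawlerProcess

from xiaoshuo.spiders.zww import ZwwSpider  # the spider defined above (assumed module path)

process = CrawlerProcess(settings={
    'ITEM_PIPELINES': {'xiaoshuo.pipelines.XiaoshuoPipeline': 300},
})
process.crawl(ZwwSpider)
process.start()  # blocks until the crawl finishes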
XiaoshuoPipeline :

class XiaoshuoPipeline(object):
    def open_spider(self, spider):
        self.file = open('wddf.txt', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        title = item['title']
        content = item['content']
        info = title + '\n' + content + '\n'
        self.file.write(info)
        self.file.flush()  # flush after every item so text is written out immediately instead of sitting in the buffer
        return item

    def close_spider(self, spider):
        self.file.close()
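The same open_spider / process_item / close_spider lifecycle works for other output formats. As a sketch (my own variant, not from the original post; the class name, file name wddf.jl, and format choice are assumptions), here is a pipeline that writes one JSON object per line instead of plain text:

import json


class JsonLinesPipeline(object):
    """Hypothetical variant: persist each item as one JSON object per line."""

    def open_spider(self, spider):
        self.file = open('wddf.jl', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        # ensure_ascii=False keeps the Chinese text readable in the output file
        self.file.write(json.dumps(dict(item), ensure_ascii=False) + '\n')
        return item

    def close_spider(self, spider):
        self.file.close()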
settings.py :

ITEM_PIPELINES = {
    'xiaoshuo.pipelines.XiaoshuoPipeline': 300,
}
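The number 300 is a priority: items pass through all enabled pipelines in ascending order, lower values first (by convention in the 0-1000 range). If you also registered the hypothetical JsonLinesPipeline sketched above, the configuration might look like this:

ITEM_PIPELINES = {
    'xiaoshuo.pipelines.XiaoshuoPipeline': 300,
    'xiaoshuo.pipelines.JsonLinesPipeline': 400,  # runs after XiaoshuoPipeline
}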