scrapy爬虫--升级练习
scrapy startproject toscrape_book
scrapy genspider books books.toscrape.com
Created spider 'books' using template 'basic' in module:
toscrape_book.spiders.books
这个命令可以生成一个spider，并指定它要爬的网站
自动生成:
class BooksSpider(scrapy.Spider):
    """Spider skeleton as generated by ``scrapy genspider`` (basic template)."""

    name = 'books'
    # The sandbox site is served from books.toscrape.com (plural "books").
    allowed_domains = ['books.toscrape.com']
    start_urls = ['http://books.toscrape.com/']

    # Parse callback for the book listing page.
    def parse(self, response):
        pass
编写封装Item
class BooksItem(scrapy.Item):
    """Item class wrapping the information scraped for one book."""

    name = scrapy.Field()           # book title
    price = scrapy.Field()          # price
    review_rating = scrapy.Field()  # review rating
    review_num = scrapy.Field()     # number of reviews
    upc = scrapy.Field()            # product code (UPC)
    stock = scrapy.Field()          # quantity in stock
编写spider
#书籍列表页面解析函数
def parse(self, response):
    """Parse a book listing page.

    Yields one request per link extracted from ``article.product_pod h3``
    (handled by ``self.parse_book``), followed by a request for the first
    link found under ``ul.pager li.next`` when there is one (handled by
    ``self.parse`` again).
    """
    # Links to the individual book pages on this listing page.
    le = LinkExtractor(restrict_css='article.product_pod h3')
    for link in le.extract_links(response):
        yield scrapy.Request(link.url, callback=self.parse_book)

    # Link to the next listing page; the list may be empty, so guard it.
    le = LinkExtractor(restrict_css='ul.pager li.next')
    links = le.extract_links(response)
    if links:
        next_url = links[0].url
        yield scrapy.Request(next_url, callback=self.parse)
#书籍页面的解析函数
def parse_book(self, response):
    """Parse a single book page and yield a populated ``BooksItem``."""
    book = BooksItem()

    # Title, price and rating are read from the main product block.
    sel = response.css('div.product_main')
    book['name'] = sel.xpath('./h1/text()').extract_first()
    book['price'] = sel.css('p.price_color::text').extract_first()
    # The rating word is the second class name, e.g. "star-rating Three",
    # so the pattern needs the separating space to match.
    book['review_rating'] = sel.css('p.star-rating::attr(class)') \
        .re_first(r'star-rating ([A-Za-z]+)')

    # The remaining fields are read from rows of the striped table.
    sel = response.css('table.table.table-striped')
    book['upc'] = sel.xpath('(.//tr)[1]/td/text()').extract_first()
    # Second-to-last row; keep only the number inside "(N available)".
    book['stock'] = sel.xpath('(.//tr)[last()-1]/td/text()') \
        .re_first(r'\((\d+) available\)')
    # Last row holds the review count; it must not overwrite review_rating.
    book['review_num'] = sel.xpath('(.//tr)[last()]/td/text()').extract_first()

    yield book
设置输出顺序
# Order in which item fields are written to the exported feed.
FEED_EXPORT_FIELDS = [
    'upc',
    'name',
    'price',
    'stock',
    'review_rating',
    'review_num',
]
英文单词和阿拉伯数字的映射关系
class BookPipline(object):
    """Item pipeline that converts the rating word into an integer."""

    # English rating word -> numeric rating.
    review_rating_map = {
        'One': 1,
        'Two': 2,
        'Three': 3,
        'Four': 4,
        'Five': 5,
    }

    def process_item(self, item, spider):
        """Replace ``item['review_rating']`` with its numeric value.

        Items whose ``review_rating`` is missing or empty are returned
        unchanged. A non-empty rating that is not a key of
        ``review_rating_map`` raises ``KeyError``.

        Args:
            item: Mapping-like item supporting ``get`` and item assignment.
            spider: The spider that produced the item (unused).

        Returns:
            The same ``item`` object, modified in place.
        """
        rating = item.get('review_rating')
        if rating:
            item['review_rating'] = self.review_rating_map[rating]
        return item
scrapy crawl books -o books.csv