Scraping Douban Books with Scrapy
pkxutao · 2018-01-24
Scrapy is a powerful crawling framework that takes very little configuration to get going, so as an exercise I wrote a small spider that scrapes book information from Douban Books.
The overall flow: pick the detail page of any book as the entry point, scrape that book's details, then follow the books listed under the “喜欢读"xxx"的人也喜欢” (“people who like this book also like”) section at the bottom of the page, and repeat, which (in theory) eventually covers every book.
First, the spider file.
start_urls holds a single book as the entry point. The parse function parses the detail page, assigns the extracted fields to an item, and at the end uses yield scrapy.Request(url) to queue the detail-page URLs of similar books for later crawling. The code is as follows:
import scrapy
# from bs4 import BeautifulSoup
from test_spider.items import TestSpiderItem
import re


class DmozSpider(scrapy.spiders.Spider):
    itemCount = 0
    name = "dbbook"
    allowed_domains = ["douban.com"]
    start_urls = []
    start_urls.append("https://book.douban.com/subject/27609047/")
    print(start_urls)

    def parse(self, response):
        book = TestSpiderItem()
        imgUrl = response.xpath("//div[@id='mainpic']/a[@class='nbg']/@href").extract_first()
        name = response.xpath("//span[@property='v:itemreviewed']/text()").extract_first()
        score = response.xpath("//strong[@property='v:average']/text()").extract_first().strip()
        label = response.xpath("//a[@class=' tag']/text()").extract()
        book['url'] = response.url
        book['label'] = label
        book['score'] = score
        book['imgUrl'] = imgUrl
        book['name'] = name
        infos = response.xpath("//div[@id='info']")
        curType = ""  # which field the next text node belongs to
        # print(infos.xpath("./*"))
        # print(infos.xpath("./text()"))
        if "作者" in infos.extract_first():
            author = infos.xpath(".//a/text()").extract_first().strip()
            book['author'] = self.getFormatStr(author)
            # print("作者:", infos.xpath(".//a/text()").extract_first().strip())
        for info in infos.xpath("./*|./text()"):
            name = info.xpath("text()").extract_first()
            if name is not None:
                curType = ""
                # if "作者:" == name or "作者" == name:
                #     curType = "author"
                #     continue
                if "出版社:" == name:
                    curType = "press"
                    continue
                elif "出版年:" == name:
                    curType = "publishYear"
                    continue
                elif "页数:" == name:
                    curType = "pageCount"
                    continue
                elif "定价:" == name:
                    curType = "price"
                    continue
                elif "ISBN:" == name:
                    curType = "isbn"
                    continue
                elif "装帧:" == name:
                    curType = "binding"
                    continue
            span = info.extract()
            span = span.strip()              # strip surrounding whitespace
            span = span.replace("\n", "")    # drop newlines
            span = span.replace("<br>", "")  # drop <br> tags
            if len(span) != 0:
                # if curType == "author":
                #     book['author'] = self.getFormatStr(info.xpath("text()").extract_first())  # the author name is a special case
                if curType == "press":
                    book['press'] = span
                elif curType == "publishYear":
                    book['publishYear'] = span
                elif curType == "pageCount":
                    book['pageCount'] = int(re.sub(r"\D", "", span))  # TODO: keeps digits only, dropping the colon and unit
                elif curType == "price":
                    book['price'] = float(re.findall(r"\d+\.?\d*", span)[0])
                elif curType == "isbn":
                    book['isbn'] = span
                elif curType == "binding":
                    book['binding'] = span
        yield book
        # queue the detail pages of similar books
        similarUrls = response.xpath("//div[@id='db-rec-section']/div[@class='content clearfix']/dl/dt/a/@href").extract()
        for url in similarUrls:
            if self.itemCount < 10:
                # self.itemCount += 1  # the increment is commented out, so this cap is effectively disabled
                yield scrapy.Request(url)

    def getFormatStr(self, params):
        params = params.strip()            # strip surrounding whitespace
        params = params.replace(" ", "")
        params = params.replace("\n", "")  # drop newlines
        return params
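The spider imports TestSpiderItem from test_spider.items, which is not shown in this post. A minimal sketch of what that items.py might look like, assuming one scrapy.Field per attribute the spider assigns (the field names mirror the code above, everything else is a guess):

# test_spider/items.py -- a sketch; field names mirror the assignments in the spider.
import scrapy


class TestSpiderItem(scrapy.Item):
    url = scrapy.Field()
    imgUrl = scrapy.Field()
    name = scrapy.Field()
    author = scrapy.Field()
    score = scrapy.Field()
    label = scrapy.Field()
    press = scrapy.Field()
    publishYear = scrapy.Field()
    pageCount = scrapy.Field()
    price = scrapy.Field()
    isbn = scrapy.Field()
    binding = scrapy.Field()

With the item defined, the spider can be started from the project root with scrapy crawl dbbook.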
Next, storage. The pipeline writes the extracted information into the database. Some books are missing certain fields, so the INSERT statement has to be built dynamically from whichever fields are present. The code is as follows:
# -*- coding: utf-8 -*-
import pymysql


class TestSpiderPipeline(object):
    def __init__(self):
        self.conn = pymysql.connect("localhost", "root", "root", "douban_book", charset='utf8mb4')
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        # Build the column list and placeholder list only from the fields
        # that are actually present on this item.
        sql = "insert into book("
        placeHolder = ""
        selectKey = ""
        params = []
        if 'url' in item:
            if selectKey != "":
                selectKey += ","
            selectKey += "URL"
            if placeHolder != "":
                placeHolder += ","
            placeHolder += "%s"
            params.append(item['url'])
        if 'imgUrl' in item:
            if selectKey != "":
                selectKey += ","
            selectKey += "IMG_URL"
            if placeHolder != "":
                placeHolder += ","
            placeHolder += "%s"
            params.append(item['imgUrl'])
        if 'author' in item:
            if selectKey != "":
                selectKey += ","
            selectKey += "AUTHOR"
            if placeHolder != "":
                placeHolder += ","
            placeHolder += "%s"
            params.append(item['author'])
        if 'name' in item:
            if selectKey != "":
                selectKey += ","
            selectKey += "NAME"
            if placeHolder != "":
                placeHolder += ","
            placeHolder += "%s"
            params.append(item['name'])
        if 'press' in item:
            if selectKey != "":
                selectKey += ","
            selectKey += "PRESS"
            if placeHolder != "":
                placeHolder += ","
            placeHolder += "%s"
            params.append(item['press'])
        if 'score' in item and item['score'] != "":
            if selectKey != "":
                selectKey += ","
            selectKey += "SCORE"
            if placeHolder != "":
                placeHolder += ","
            placeHolder += "%s"
            params.append(item['score'])
        if 'pageCount' in item:
            if selectKey != "":
                selectKey += ","
            selectKey += "PAGE_COUNT"
            if placeHolder != "":
                placeHolder += ","
            placeHolder += "%s"
            params.append(item['pageCount'])
        if 'price' in item:
            if selectKey != "":
                selectKey += ","
            selectKey += "PRICE"
            if placeHolder != "":
                placeHolder += ","
            placeHolder += "%s"
            params.append(item['price'])
        if 'isbn' in item:
            if selectKey != "":
                selectKey += ","
            selectKey += "ISBN"
            if placeHolder != "":
                placeHolder += ","
            placeHolder += "%s"
            params.append(item['isbn'])
        if 'publishYear' in item:
            if selectKey != "":
                selectKey += ","
            selectKey += "PUBLISH_YEAR"
            if placeHolder != "":
                placeHolder += ","
            placeHolder += "%s"
            params.append(item['publishYear'])
        if 'binding' in item:
            if selectKey != "":
                selectKey += ","
            selectKey += "BINDING"
            if placeHolder != "":
                placeHolder += ","
            placeHolder += "%s"
            params.append(item['binding'])
        if 'label' in item:
            if selectKey != "":
                selectKey += ","
            selectKey += "LABEL"
            label = ",".join(item['label'])
            if placeHolder != "":
                placeHolder += ","
            placeHolder += "%s"
            params.append(label)
        sql += selectKey + ") VALUES(" + placeHolder + ")"
        # sql = "insert into book(URL,IMG_URL,AUTHOR,NAME,PRESS,SCORE,PAGE_COUNT,PRICE,ISBN,PUBLISH_YEAR,BINDING,LABEL) VALUES(%s,%s,%s,%s,%s,%s,%d,%s,%s,%s,%s,%s)"
        # label = ",".join(item['label'])
        self.cursor.execute(sql, params)
        # self.cursor.execute(sql, (item['url'], item['imgUrl'], item['author'], item['name'], item['press'], item['score'], item['pageCount'], item['price'], item['isbn'], item['publishYear'], item['binding'], label))
        self.conn.commit()
        return item

    def close_spider(self, spider):
        self.conn.close()
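The pipeline assumes a book table already exists in the douban_book database. The column names come from the INSERT built in process_item; the column types below are only assumptions, so this is just a sketch of a schema that would accept those inserts:

# create_table.py -- a sketch of a schema the pipeline above could insert into.
# Column names match process_item; the types and lengths are assumptions.
import pymysql

DDL = """
CREATE TABLE IF NOT EXISTS book (
    ID           INT AUTO_INCREMENT PRIMARY KEY,
    URL          VARCHAR(255),
    IMG_URL      VARCHAR(255),
    AUTHOR       VARCHAR(255),
    NAME         VARCHAR(255),
    PRESS        VARCHAR(255),
    SCORE        FLOAT,
    PAGE_COUNT   INT,
    PRICE        FLOAT,
    ISBN         VARCHAR(32),
    PUBLISH_YEAR VARCHAR(32),
    BINDING      VARCHAR(32),
    LABEL        VARCHAR(255)
) DEFAULT CHARSET=utf8mb4
"""

conn = pymysql.connect("localhost", "root", "root", "douban_book", charset="utf8mb4")
with conn.cursor() as cursor:
    cursor.execute(DDL)
conn.commit()
conn.close()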
With a 3-second download delay, my IP got banned after roughly 1,000 books... so I bought a proxy service on Taobao that rotates the proxy every two minutes, and the crawl happily resumed. The proxy setup:
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
import requests


class TestSpiderSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.
        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.
        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.
        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.
        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)

    def process_request(self, request, spider):
        # Downloader-middleware hook: attach a fresh proxy to every request.
        ip = self.get_proxy()
        print("Using proxy:", ip)
        request.meta['proxy'] = ip

    def get_proxy(self):
        # Fetch the proxy address and port from the vendor's API
        # ("xxx" stands in for the API URL, which is not shown here).
        return requests.get("xxx").text
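For the proxy (and the 3-second delay mentioned above) to take effect, the pieces have to be wired up in settings.py. Note that process_request is a downloader-middleware hook, so the class needs to be registered under DOWNLOADER_MIDDLEWARES rather than SPIDER_MIDDLEWARES for the proxy to be applied. A sketch, assuming the module paths implied by the test_spider project name (paths and priorities are assumptions):

# settings.py (excerpt) -- a sketch; module paths and priorities are assumptions.
DOWNLOAD_DELAY = 3  # the 3-second delay mentioned above

ITEM_PIPELINES = {
    'test_spider.pipelines.TestSpiderPipeline': 300,
}

DOWNLOADER_MIDDLEWARES = {
    'test_spider.middlewares.TestSpiderSpiderMiddleware': 543,
}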
After two days and nights of crawling, the proxy subscription expired and the crawl ended there.
