A First Spider Project Built with Scrapy
2019-03-20
lvyz0207
The first spider project
1. Create the project
scrapy startproject douban
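Running this generates a project skeleton roughly like the following (a sketch; the exact files can vary slightly between Scrapy versions):

douban/
    scrapy.cfg
    douban/
        __init__.py
        items.py
        middlewares.py
        pipelines.py
        settings.py
        spiders/
            __init__.py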
2. Analyze the pages and URLs
URL of page 2: http://book.douban.com/top250?start=25
URL of page 3: http://book.douban.com/top250?start=50
URL of page 4: http://book.douban.com/top250?start=75
It's easy to see that the number at the end of the URL increases in a regular pattern. Knowing that, the regular expression for the pagination URLs can be written like this:
r"http://book.douban.com/top250\?start=\d+"
Analyzing the book detail page URLs
Every book's detail page link looks the same except for the trailing string of digits, for example:
http://book.douban.com/subject/1071241/
So we can also write a regular expression for matching book detail page URLs on the list pages:
r"http://book.douban.com/subject/\d+"
3. Define the spider
cd douban
scrapy genspider books book.douban.com
# debug page elements interactively with the Scrapy shell
scrapy shell https://book.douban.com/subject/1013129/
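Inside the shell you can try the same XPath expressions the spider below relies on (a sketch, assuming the request succeeds; see the 403 note at the end of this post if it does not):

response.xpath("//div[@id='wrapper']/h1/span/text()").extract_first()                   # book title
response.xpath('//div[@id="interest_sectl"]/div/div[2]/strong/text()').extract_first()  # rating
response.xpath("//div[@id='info']//text()").extract()[:10]                              # first few info fields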
4. Define the data to scrape
import scrapy


class DoubanBookItem(scrapy.Item):
    """Fields to be scraped for each book."""
    name = scrapy.Field()                 # title
    author = scrapy.Field()               # author
    press = scrapy.Field()                # publisher
    date = scrapy.Field()                 # publication date
    page = scrapy.Field()                 # number of pages
    price = scrapy.Field()                # price
    score = scrapy.Field()                # reader rating
    ISBN = scrapy.Field()                 # ISBN
    author_profile = scrapy.Field()       # author bio
    content_description = scrapy.Field()  # content summary
    link = scrapy.Field()                 # detail page link
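For reference, a scrapy.Item behaves much like a dict, except that only the declared fields may be set (a small interactive sketch; the values are made up):

>>> item = DoubanBookItem()
>>> item["name"] = "红楼梦"
>>> dict(item)
{'name': '红楼梦'}
>>> item["publisher"] = "x"    # raises KeyError: undeclared field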
(Figure: Scrapy architecture diagram, scrapy原理图.png)
5. Write the spider code, books.py
# -*- coding: utf-8 -*-
# Crawl the top 250 books on Douban
import scrapy
from scrapy.selector import Selector
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

from douban.items import DoubanBookItem


class BooksSpider(CrawlSpider):
    name = 'books'
    allowed_domains = ['book.douban.com']
    start_urls = ['https://book.douban.com/top250']

    rules = (
        # list (pagination) pages
        Rule(LinkExtractor(allow=(r"https://book.douban.com/top250\?start=\d+",))),
        # book detail pages
        Rule(LinkExtractor(allow=(r"https://book.douban.com/subject/\d+",)), callback="books_parse"),
    )

    def books_parse(self, response):
        print("------------ page response ------------")
        sel = Selector(response=response)
        item = DoubanBookItem()
        item["name"] = sel.xpath("//div[@id='wrapper']/h1/span/text()").extract()[0].strip()
        item["score"] = sel.xpath('//div[@id="interest_sectl"]/div/div[2]/strong/text()').extract()[0]
        item["link"] = response.url

        # content summary: the last 'info' block under #link-report
        try:
            contents = sel.xpath("//div[@id='link-report']//div[@class='info']")[-1].xpath(".//p//text()").extract()
            item["content_description"] = "\n".join(contents)
        except IndexError:
            item["content_description"] = ""

        # author bio: the last 'intro' block in the related_info section
        try:
            profiles = sel.xpath("//div[@class='related_info']//div[@class='indent']//div[@class='intro']")[-1].xpath(".//p//text()").extract()
            item['author_profile'] = "\n".join(profiles)
        except IndexError:
            item['author_profile'] = ""

        # the #info block is a flat list of label/value text nodes
        datas = response.xpath("//div[@id='info']//text()").extract()
        datas = [data.strip() for data in datas]
        datas = [data for data in datas if data != ""]
        for i, data in enumerate(datas):
            print("index %d " % i, data)
        print("------------- looping over info fields -----------")
        for data in datas:
            if "作者" in data:
                # the value follows either the label itself or a separate colon node
                if ":" in data:
                    item["author"] = datas[datas.index(data) + 1]
                elif ':' not in data:
                    item["author"] = datas[datas.index(data) + 2]
            elif "出版社:" in data:
                item["press"] = datas[datas.index(data) + 1]
            elif "出版年:" in data:
                item["date"] = datas[datas.index(data) + 1]
            elif "页数" in data:
                item["page"] = datas[datas.index(data) + 1]
            elif "定价:" in data:
                item["price"] = datas[datas.index(data) + 1]
            elif "ISBN" in data:
                item["ISBN"] = datas[datas.index(data) + 1]
        return item
6. Store the scraped data in MySQL
# -*- coding: utf-8 -*-
import json
import logging

import MySQLdb
import MySQLdb.cursors
from twisted.enterprise import adbapi

logger = logging.getLogger(__name__)


class DoubanPipeline(object):
    """Write the scraped items to a JSON lines file."""

    def __init__(self):
        self.file = open("./books.json", "w", encoding="utf-8")

    def process_item(self, item, spider):
        line = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.file.write(line)
        return item


class MySQLPipeline(object):
    """Write the scraped items to MySQL through a Twisted connection pool."""

    def __init__(self):
        self.dbpool = adbapi.ConnectionPool("MySQLdb",
                                            host="127.0.0.1",
                                            db="douban_book",
                                            user="root",
                                            password="root",
                                            cursorclass=MySQLdb.cursors.DictCursor,
                                            charset="utf8",
                                            use_unicode=False)

    def process_item(self, item, spider):
        query = self.dbpool.runInteraction(self._conditional_insert, item)
        query.addErrback(self.handle_error)
        return item

    def _conditional_insert(self, tb, item):
        # tb is the transaction/cursor object supplied by runInteraction
        tb.execute("insert into books (name, author, press, date, page, price, score, ISBN, author_profile,"
                   " content_description, link) values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)",
                   (item["name"], item["author"], item["press"], item["date"],
                    item["page"], item["price"], item["score"], item["ISBN"],
                    item["author_profile"], item["content_description"], item["link"]))
        logger.debug("item data in db: %s", item)

    def handle_error(self, e):
        logger.error(e)
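The MySQL pipeline needs the MySQLdb driver; on Python 3 it is provided by the mysqlclient package (assuming you install it into the same environment Scrapy runs in):

pip install mysqlclient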
7. Create the database douban_book and the books table
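If the database does not exist yet, create it first; a minimal statement matching the utf8 charset used below might be:

create database if not exists douban_book default charset utf8;

Then create the table: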
create table douban_book.books (
    id int primary key auto_increment,
    name varchar(100) NOT NULL,
    author varchar(50) NULL,
    press varchar(100) NULL,
    date varchar(30) NULL,
    page varchar(30) NULL,
    price varchar(30) NULL,
    score varchar(30) NULL,
    ISBN varchar(30) NULL,
    author_profile varchar(1500) NULL,
    content_description varchar(1500) NULL,
    link varchar(255) NULL
) default charset=utf8;
8. Update the settings file
ITEM_PIPELINES = {
    # 'douban.pipelines.DoubanPipeline': 300,
    "douban.pipelines.MySQLPipeline": 400,
}

DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'zh-TW,zh-CN;q=0.9,zh;q=0.8,en;q=0.7',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36',
}
DOWNLOAD_DELAY = 3
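One more setting worth checking (an assumption about your generated project, not something from the settings shown above): the default project template usually sets ROBOTSTXT_OBEY = True, which makes Scrapy honor the site's robots.txt and may filter requests; whether to disable it is your own call:

ROBOTSTXT_OBEY = False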
9. Run the spider
scrapy crawl books
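If you only want a quick look at the output without MySQL, Scrapy's built-in feed export can also dump the items to a file, for example:

scrapy crawl books -o books.json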
10. Check the resulting data
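With the MySQL pipeline enabled, a quick query confirms that rows are being written (a sketch; pick whichever columns you care about):

select name, author, score from douban_book.books limit 10;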
Errors encountered
The Scrapy shell returns a 403 error during debugging.
The first approach is to append -s USER_AGENT='Mozilla/5.0' to the command.
The second approach is to change Scrapy's default user-agent value:
change
USER_AGENT = 'Scrapy/%s (+http://scrapy.org)' % import_module('scrapy').__version__
to
USER_AGENT = 'Mozilla/5.0 (Windows NT 5.1; rv:5.0) Gecko/20100101 Firefox/5.0'
On a Mac the path is:
cd /usr/local/lib/python3.6/site-packages/scrapy/settings
Running the shell again, the HTML is now fetched normally and the 403 error no longer appears.
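Instead of editing the installed Scrapy defaults, the same override can also live in the project's settings.py (the UA string below is just an example):

USER_AGENT = 'Mozilla/5.0 (Windows NT 5.1; rv:5.0) Gecko/20100101 Firefox/5.0'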