Web Crawlers

The First Crawler Project Built with Scrapy

2019-03-20  lvyz0207

The first crawler project

1. Create the project

scrapy startproject douban
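
This generates roughly the following layout (the exact set of files varies slightly between Scrapy versions):

douban/
    scrapy.cfg            # deploy configuration
    douban/
        __init__.py
        items.py          # item definitions (step 4)
        middlewares.py
        pipelines.py      # item pipelines (step 6)
        settings.py       # project settings (step 8)
        spiders/
            __init__.py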

2. Analyze the pages and URLs

URL of page 2: http://book.douban.com/top250?start=25
URL of page 3: http://book.douban.com/top250?start=50
URL of page 4: http://book.douban.com/top250?start=75

You will naturally notice that the number at the end of the URL increases in regular steps of 25. Knowing this, the regular expression for the pagination URLs can be written as:

r"http://book.douban.com/top250\?start=\d+"

Book detail page URL analysis
Looking at a few detail pages, you will find that every book's detail link looks the same, differing only in the trailing string of digits:

http://book.douban.com/subject/1071241/

So we can also derive the regular expression for finding book detail page URLs on the list pages:

r"http://book.douban.com/subject/\d+"

3. Define the spider

cd douban
scrapy genspider books book.douban.com
# debug page elements from the terminal to verify your selectors
scrapy shell https://book.douban.com/subject/1013129/
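
Inside the shell, `response` is already populated, so you can try out the XPath expressions the spider will use. For example, the two expressions below are the same ones used in books_parse further down:

# inside the scrapy shell session
response.xpath("//div[@id='wrapper']/h1/span/text()").extract_first()           # book title
response.xpath('//div[@id="interest_sectl"]/div/div[2]/strong/text()').extract_first()  # rating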

4. Define the data to scrape (items.py)

import scrapy

class DoubanBookItem(scrapy.Item):
    """Fields to be scraped for each book."""

    name = scrapy.Field()                  # book title
    author = scrapy.Field()                # author
    press = scrapy.Field()                 # publisher
    date = scrapy.Field()                  # publication date
    page = scrapy.Field()                  # page count
    price = scrapy.Field()                 # price
    score = scrapy.Field()                 # reader rating
    ISBN = scrapy.Field()                  # ISBN
    author_profile = scrapy.Field()        # author bio
    content_description = scrapy.Field()   # content summary
    link = scrapy.Field()                  # detail page URL
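
A scrapy.Item behaves much like a dict: fields are read and assigned with square brackets, and only declared fields are accepted. A minimal illustration (the values are just placeholders):

item = DoubanBookItem()
item["name"] = "Some Book"
item["score"] = "9.0"
print(dict(item))           # {'name': 'Some Book', 'score': '9.0'}
item["publisher"] = "x"     # KeyError: DoubanBookItem does not support field: publisher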

(Figure: Scrapy architecture diagram)

5. Write the spider code (books.py)

# -*- coding: utf-8 -*-
# Scrape the top 250 books on Douban.

import scrapy
from scrapy.selector import Selector
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from douban.items import DoubanBookItem


class BooksSpider(CrawlSpider):
    name = 'books'
    allowed_domains = ['book.douban.com']
    start_urls = ['https://book.douban.com/top250']

    rules = (
        # follow pagination links on the list pages
        Rule(LinkExtractor(allow=r"https://book.douban.com/top250\?start=\d+")),
        # follow each book's detail page and parse it with books_parse
        Rule(LinkExtractor(allow=r"https://book.douban.com/subject/\d+"), callback="books_parse"),
    )

    def books_parse(self, response):
        print("------------页面响应数据-------------")
        sel = Selector(response=response)
        item = DoubanBookItem()

        item["name"] = sel.xpath("//div[@id='wrapper']/h1/span/text()").extract()[0].strip()
        item["score"] = sel.xpath('//div[@id="interest_sectl"]/div/div[2]/strong/text()').extract()[0]
        item["link"] = response.url

        try:
            # the last div.info under #link-report holds the full content summary
            contents = sel.xpath("//div[@id='link-report']//div[@class='info']")[-1].xpath(".//p//text()").extract()
            item["content_description"] = "\n".join(contents)
        except IndexError:
            item["content_description"] = ""

        try:
            # the last div.intro in the related-info block holds the author bio
            profiles = sel.xpath("//div[@class='related_info']//div[@class='indent']//div[@class='intro']")[-1].xpath(".//p//text()").extract()
            item['author_profile'] = "\n".join(profiles)
        except IndexError:
            item['author_profile'] = ""

        # flatten the #info block into a list of non-empty text fragments
        datas = response.xpath("//div[@id='info']//text()").extract()
        datas = [data.strip() for data in datas]
        datas = [data for data in datas if data != ""]
        for i, data in enumerate(datas):
            print("index %d " %i, data)
            print("-------------循环页面数据-----------")

        for data in datas:
            if "作者" in data:
                # the "作者" label may carry the colon itself or get it as a separate
                # text node, so the author value sits one or two positions later
                if ":" in data:
                    item["author"] = datas[datas.index(data) + 1]
                elif ':' not in data:
                    item["author"] = datas[datas.index(data) + 2]
            elif "出版社:" in data:
                item["press"] = datas[datas.index(data) + 1]
            elif "出版年:" in data:
                item["date"] = datas[datas.index(data) + 1]
            elif "页数" in data:
                item["page"] = datas[datas.index(data) + 1]
            elif "定价:" in data:
                item["price"] = datas[datas.index(data) + 1]
            elif "ISBN" in data:
                item["ISBN"] = datas[datas.index(data) + 1]

        return item
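
If you want to iterate on books_parse without hitting Douban every time, one option is to feed it a fake response built from a saved page. This is only a sketch: it assumes you have saved a detail page as book.html (a hypothetical file) and uses scrapy.http.HtmlResponse:

from scrapy.http import HtmlResponse
from douban.spiders.books import BooksSpider

with open("book.html", "rb") as f:   # previously saved detail page (hypothetical)
    body = f.read()

response = HtmlResponse(url="https://book.douban.com/subject/1071241/",
                        body=body, encoding="utf-8")
item = BooksSpider().books_parse(response)
print(item["name"], item["score"])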

6. Store the scraped data in MySQL (pipelines.py)

# -*- coding: utf-8 -*-
import json
import logging

from twisted.enterprise import adbapi
import MySQLdb
import MySQLdb.cursors

logger = logging.getLogger(__name__)

class DoubanPipeline(object):
    """Write the scraped items to a JSON-lines file."""

    def __init__(self):
        self.file = open("./books.json", "w", encoding="utf-8")

    def process_item(self, item, spider):
        # ensure_ascii=False keeps the Chinese text readable in the output file
        line = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.file.write(line)
        return item

class MySQLPipeline(object):
    """Store the scraped items in MySQL via Twisted's adbapi connection pool."""

    def __init__(self):
        self.dbpool = adbapi.ConnectionPool(
            "MySQLdb",
            host="127.0.0.1",
            db="douban_book",
            user="root",
            password="root",
            cursorclass=MySQLdb.cursors.DictCursor,
            charset="utf8",
            use_unicode=False,
        )

    def process_item(self, item, spider):
        query = self.dbpool.runInteraction(self._conditional_insert, item)
        query.addErrback(self.handle_error)
        return item
    
    def _conditional_insert(self, tb, item):
        # tb is the adbapi transaction object, used like a regular DB cursor
        tb.execute(
            "insert into books (name, author, press, date, page, price, score, ISBN,"
            " author_profile, content_description, link)"
            " values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)",
            (item["name"], item["author"], item["press"], item["date"],
             item["page"], item["price"], item["score"], item["ISBN"],
             item["author_profile"], item["content_description"], item["link"]))

        logger.debug("item data in db: %s", item)

    def handle_error(self, e):
        logger.error(e)


7. Create the douban_book database and the books table

create table douban_book.books (
    id int primary key auto_increment,
    name varchar(100) NOT NULL,
    author varchar(50) NULL,
    press varchar(100) NULL,
    date varchar(30) NULL,
    page varchar(30) NULL,
    price varchar(30) NULL,
    score varchar(30) NULL,
    ISBN varchar(30) NULL,
    author_profile varchar(1500) NULL,
    content_description varchar(1500) NULL,
    link varchar(255) NULL
) default charset=utf8;

8. Update the settings file

ITEM_PIPELINES = {
#    'douban.pipelines.DoubanPipeline': 300,
    "douban.pipelines.MySQLPipeline" : 400,
}


DEFAULT_REQUEST_HEADERS = {
  'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
  'Accept-Language': 'zh-TW,zh-CN;q=0.9,zh;q=0.8,en;q=0.7',
  'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36',
}

DOWNLOAD_DELAY = 3

9. Run the spider

scrapy crawl books
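
As a quick sanity check, Scrapy's built-in feed exports can also dump the items straight to a file from the command line, without any pipeline:

scrapy crawl books -o books.csv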

10. Check the resulting data

(Screenshot: the scraped data in the books table.)
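
Besides eyeballing the table in a GUI client, a quick check from Python confirms the rows landed. This is only a sketch, reusing the same connection parameters as the pipeline above:

import MySQLdb

conn = MySQLdb.connect(host="127.0.0.1", user="root", password="root",
                       db="douban_book", charset="utf8")
cur = conn.cursor()
cur.execute("select count(*) from books")
print("rows:", cur.fetchone()[0])
cur.execute("select name, score from books limit 5")
for name, score in cur.fetchall():
    print(name, score)
conn.close()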

Troubleshooting

The Scrapy shell returns a 403 error when debugging.
The first fix is to add -s USER_AGENT='Mozilla/5.0' to the command.

The second fix is to change Scrapy's default user-agent value.

Change
USER_AGENT = 'Scrapy/%s (+http://scrapy.org)' % import_module('scrapy').__version__
to
USER_AGENT = 'Mozilla/5.0 (Windows NT 5.1; rv:5.0) Gecko/20100101 Firefox/5.0'

On macOS the default settings live at:

cd /usr/local/lib/python3.6/site-packages/scrapy/settings

Running the shell again, the HTML is fetched normally and the 403 error no longer appears.
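
A third option, which avoids editing Scrapy's installed files, is to set USER_AGENT in the project's own settings.py (here reusing the same UA string as in DEFAULT_REQUEST_HEADERS above):

# douban/settings.py
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'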
