python scrapy爬虫框架初体验【扒免费网站的小说】

2019-03-01 本文已影响0人来个第一次

【适用】

适用于爬虫初学者

【准备】

1、用 Anaconda 装好 Python 和爬虫环境（本文代码用到 Scrapy 和 PyMySQL）

2、略懂python

【目标】

爬免费小说网站的书籍，例如：https://www.kanshushenzhan.com/13238/

【爬虫思路】

get网页 --> 解析处理数据 --> 存储

这条思路适用于一般情况；有些网站的数据是通过接口动态加载的，这时需要直接请求网站的接口，才能获取完整的数据。

【步骤】

1、扒书籍详情页面
项目结构：

image.png

代码如下：

# kanshu_spider.py
import scrapy

# 引入item
from kanshu.items import KanshuItem
from scrapy.selector import Selector

class KanshuSpider(scrapy.Spider):
    """Scrape a book detail page: book metadata plus its chapter index.

    Each response yields a single ``KanshuItem`` whose ``section`` field is a
    list of ``(chapter_number, chapter_url, chapter_title)`` tuples.
    """

    name = "kanshu"

    def start_requests(self):
        """Yield one request per book detail page to crawl."""
        urls = [
            'https://www.kanshushenzhan.com/13238/',
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        """Extract the book metadata and chapter list from a detail page.

        Fields whose node is missing from the page are stored as ``None``
        instead of aborting the whole extraction.

        Args:
            response: the downloaded book detail page.

        Returns:
            The populated ``KanshuItem``.
        """
        self.logger.debug("来到这里")

        # Data container.
        item = KanshuItem()
        item['section'] = []

        item['book_name'] = response.css('.bookPhr > h2::text').extract_first()
        # The original title starts out identical to the scraped title.
        # (Reading item['name'] raised KeyError: no such field exists.)
        item['o_name'] = item['book_name']
        item['zuozhe'] = response.css('.bookPhr > dl > dd::text').extract_first()
        # urljoin resolves relative paths against the page URL and leaves
        # absolute URLs alone; guard against a missing <img> first.
        cover_src = response.css('.bookImg > img::attr(src)').extract_first()
        item['fenmian'] = response.urljoin(cover_src) if cover_src else None
        item['jieshao'] = response.css('.introCon > p::text').extract_first()
        item['source'] = 'kanshukanshu'

        if item['book_name'] is None:
            self.logger.warning("book title not found on %s", response.url)

        # Only the first matching <ul> holds the chapter index.
        chapter_lists = response.xpath('//*[@id="yuedu"]/div[2]/ul')
        links = chapter_lists[0].css('li > a') if chapter_lists else []
        for link in links:
            href = link.css('a::attr(href)').extract_first()
            if not href:
                # A link without a target cannot be crawled later; skip it.
                continue
            title = link.css('a::text').extract_first()
            chapter_no = len(item['section']) + 1
            item['section'].append((chapter_no, response.urljoin(href), title))
        return item

items

# items.py
import scrapy


class KanshuItem(scrapy.Item):
    """Container for one scraped book and its chapter index."""

    book_name = scrapy.Field()  # book title
    o_name = scrapy.Field()  # original book title
    jieshao = scrapy.Field()  # synopsis
    zuozhe = scrapy.Field()  # author
    source = scrapy.Field()  # source site
    fenmian = scrapy.Field()  # cover image
    section = scrapy.Field()  # chapter list; each entry is a tuple

pipelines

# pipelines.py
import pymysql


class KanshuPipeline(object):
    """Item pipeline that stores a scraped book and its chapters in MySQL.

    One row is inserted into ``book`` and one row per chapter into
    ``book_section``; both inserts are committed together or rolled back.
    """

    def __init__(self):
        """Open the MySQL connection and the cursor used for all inserts."""
        super().__init__()
        # NOTE(review): connection parameters are hard-coded here; consider
        # reading them from the project settings instead.
        self.connect = pymysql.connect(
            host='127.0.0.1',  # database host
            port=3306,  # database port
            db='testtest',  # database name
            user='root',  # database user
            passwd='fd',  # database password
            charset='utf8',  # connection encoding
            use_unicode=True)
        # All statements are executed through this cursor.
        self.cursor = self.connect.cursor()

    def process_item(self, item, spider):
        """Insert the book and its chapter rows in a single transaction.

        On any error the transaction is rolled back and the failure is logged
        with its traceback through ``spider.logger``; the item is still
        returned so the crawl keeps going.

        Args:
            item: mapping with the book fields and a ``section`` list of
                ``(chapter_number, url, title)`` tuples.
            spider: the spider that produced the item (used for logging).

        Returns:
            The unchanged ``item``.
        """
        try:
            book_row = (item['book_name'], item['o_name'], item['jieshao'],
                        item['zuozhe'], item['source'], item['fenmian'])
            self.cursor.execute(
                """
                INSERT INTO book(`book_name`, o_name, jieshao, zuozhe, source, fenmian) values (%s, %s, %s, %s, %s, %s)
                """, book_row)

            # Auto-increment id of the book row that was just inserted.
            book_id = self.connect.insert_id()

            # Column order is (book_id, s_name, section_url, chapter).
            section_rows = [(book_id, opt[2], opt[1], opt[0])
                            for opt in item['section']]
            if section_rows:
                self.cursor.executemany("""
                    INSERT INTO book_section(book_id, `s_name`, section_url, chapter) values (%s, %s, %s, %s)
                """, section_rows)

            # Commit the book and its chapters together.
            self.connect.commit()
        except Exception:
            # Exception (not a bare except) lets KeyboardInterrupt and
            # SystemExit propagate.
            self.connect.rollback()
            spider.logger.exception("发生异常1")
        return item

    def close_spider(self, spider):
        """Release the cursor and the connection when the spider finishes."""
        self.cursor.close()
        self.connect.close()

settings

# Register KanshuPipeline with Scrapy; 300 is its order value.
ITEM_PIPELINES = {"kanshu.pipelines.KanshuPipeline": 300}

此代码仅用于学习

python scrapy爬虫框架初体验【扒免费网站的小说】

猜你喜欢

热点阅读

python scrapy爬虫框架初体验 【扒免费网站的小说】

猜你喜欢

热点阅读

python scrapy爬虫框架初体验【扒免费网站的小说】