Scrapy crawler: shicimingju.com (诗词名句网), Tang dynasty poets and their works

2017-10-24  chengcxy

1. Project Structure

(screenshot: project structure)
main.py is the script that launches the crawler; tangshispider.py inside the spiders folder is the main spider program;
items.py defines the fields for data storage, matching the table-creation statements from the previous article (a hedged DDL sketch follows below);
pipelines.py is the pipeline that stores the data the spider submits; it routes each item into the right table according to which item class from items.py it is;
settings.py is the configuration script; the main change is enabling the pipeline.
Project git repo: https://github.com/chengcxy/scrapy_spiders/tree/master/tangshi
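
The items and pipeline below imply two MySQL tables. The authoritative CREATE TABLE statements live in the previous article; the sketch here is only an assumption of a schema the pipeline's INSERT statements would be compatible with (column names come from items.py, the types are guesses):

# Hypothetical DDL kept as plain strings; the real schema is defined in the
# previous article. Only the column names are taken from this project.
POEMERS_DDL = """
create table if not exists poemers (
    id int auto_increment primary key,
    chaodai varchar(20),
    poemer varchar(100),
    zuopins_total int,
    poemer_url varchar(255)
) default charset=utf8
"""

POEM_ZUOPIN_DDL = """
create table if not exists poem_zuopin (
    id int auto_increment primary key,
    poemer varchar(100),
    poemer_url varchar(255),
    zuopin_name varchar(255),
    name_words int,
    zuopin_content text,
    zuopin_words int,
    zuopin_url varchar(255)
) default charset=utf8
"""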

2. Code

2.1 items.py

# -*- coding: utf-8 -*-
from scrapy import Field, Item
# Fields for the poets table
class TangshiItem(Item):
    chaodai = Field()
    poemer = Field()
    zuopins_total = Field()
    poemer_url = Field()
# Fields for the works table
class PoemZuopin(Item):
    poemer = Field()
    poemer_url = Field()
    zuopin_name = Field()
    name_words = Field()
    zuopin_content = Field()
    zuopin_words = Field()
    zuopin_url = Field()
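
A Scrapy Item behaves like a dict with a fixed key set: only fields declared with Field() can be assigned, and anything else raises KeyError. That is why every column the pipeline inserts must be declared above. A quick illustration:

from tangshi.items import TangshiItem

item = TangshiItem()
item['poemer'] = '李白'   # declared field: fine
# item['dynasty'] = '唐'  # raises KeyError: TangshiItem does not support field: dynasty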

2.2 Main spider: create tangshispider.py under the spiders folder

# -*- coding: utf-8 -*-
from scrapy.http import Request
# No crawl rules are used, so the plain Spider base class is sufficient
from scrapy.spiders import Spider
from tangshi.items import TangshiItem, PoemZuopin

class TangShiSpider(Spider):
    name = 'tangshispider'
    start_urls = ['http://www.shicimingju.com/category/tangdaishiren/page/1']
    base_url = 'http://www.shicimingju.com/category/tangdaishiren/page/{}'
    allowed_domains = ['www.shicimingju.com']

    def parse(self, response):
        # The pager text looks like "(第x页/共y页)"; take the part after "/"
        # and strip the surrounding characters to get the total page count.
        total_page = response.xpath('//div[@class="yema"]/text()').extract_first().split('/')[1].replace('共', '').replace('页)', '')
        print(total_page)
        for i in range(1, int(total_page) + 1):
            poem_page_url = self.base_url.format(str(i))
            # Page 1 is also the start URL, so it must bypass the dupe filter
            # or the poets on page 1 would never reach parse_poem_page.
            yield Request(url=poem_page_url, callback=self.parse_poem_page, dont_filter=(i == 1))

    def parse_poem_page(self, response):
        print('parse_poem_page parsing ---> %s' % response.url)
        poems = response.xpath('//div[@class="shirenlist"]//a')
        for poem in poems:
            poemer_item = TangshiItem()
            poemer_url = 'http://www.shicimingju.com' + poem.xpath('@href').extract_first()
            poemer = poem.xpath('text()').extract_first()
            poemer_item['poemer_url'] = poemer_url
            poemer_item['poemer'] = poemer
            yield Request(url=poemer_url, callback=self.parse_poem, meta={'item': poemer_item})

    def parse_poem(self, response):
        poemer_item = response.meta['item']
        # Total number of works in this poet's collection, used to build pagination
        zuopins_total = response.xpath('//div[@class="num"]/b/text()').extract_first()
        poemer_item['chaodai'] = '唐朝'  # "Tang dynasty"
        poemer_item['zuopins_total'] = zuopins_total

        yield poemer_item
        # From the total number of works, request every page of the collection
        # (40 works per list page, so this is a ceiling division).
        zuopin_page_base_url = poemer_item['poemer_url'].replace('.html', '') + '_{}.html'
        quotient, remainder = divmod(int(zuopins_total), 40)
        pages = quotient if remainder == 0 else quotient + 1
        for page in range(1, pages + 1):
            zuopin_page_url = zuopin_page_base_url.format(page)
            yield Request(url=zuopin_page_url, callback=self.parse_page_zuopin)

    def parse_page_zuopin(self, response):
        zuopin_pages = response.xpath('//div[@class="shicilist"]/ul/li[1]/a')
        poemer_url = 'http://www.shicimingju.com' + response.xpath('//div[@class="shicilist"]/ul/li[2]/a[2]/@href').extract_first()
        poemer = response.xpath('//div[@class="shicilist"]/ul/li[2]/a[2]/em/text()').extract_first()
        for zuopin_page in zuopin_pages:
            item2 = {}
            zuopin_url = 'http://www.shicimingju.com' + zuopin_page.xpath('@href').extract_first()
            zuopin_name = zuopin_page.xpath('text()').extract_first()
            print('poet: %s, poet url: %s, work ==> %s, work url ==> %s' % (poemer, poemer_url, zuopin_name, zuopin_url))
            item2['poemer_url'] = poemer_url
            item2['poemer'] = poemer
            item2['zuopin_url'] = zuopin_url
            item2['zuopin_name'] = zuopin_name
            # Hand off to the work detail-page parser
            yield Request(url=zuopin_url, callback=self.parse_zuopin_detail, meta={'item2': item2})

    def parse_zuopin_detail(self, response):
        item = response.meta['item2']
        print('parse_zuopin_detail parsing ---> %s' % item['zuopin_url'])
        zuopin_item = PoemZuopin()
        zuopin_item['poemer'] = item['poemer']
        zuopin_item['poemer_url'] = item['poemer_url']
        zuopin_item['zuopin_name'] = item['zuopin_name']
        zuopin_item['name_words'] = len(item['zuopin_name'])
        zuopin_item['zuopin_url'] = item['zuopin_url']
        try:
            zuopin_content = response.xpath('//div[@class="shicineirong"]//text()').extract()
            zuopin_item['zuopin_content'] = ''.join([x.strip() for x in zuopin_content])
            # Word count excludes punctuation (full- and half-width commas, full-width period)
            zuopin_item['zuopin_words'] = len(zuopin_item['zuopin_content'].replace('，', '').replace(',', '').replace('。', ''))
        except Exception:
            zuopin_item['zuopin_content'] = '抓取失败无数据'  # "scrape failed, no data"
            zuopin_item['zuopin_words'] = 0
        print(zuopin_item)
        yield zuopin_item
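
The paging math in parse_poem is a ceiling division: at 40 works per list page (the per-page size the code assumes for the site), n works need ceil(n / 40) pages. A standalone check of the same logic:

def page_count(total_works, per_page=40):
    # Same computation as the divmod-based code in parse_poem
    quotient, remainder = divmod(total_works, per_page)
    return quotient if remainder == 0 else quotient + 1

# Equivalent one-liner: negated floor division is a ceiling division.
assert page_count(80) == -(-80 // 40) == 2
assert page_count(81) == -(-81 // 40) == 3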

2.3 pipelines.py: the data-processing pipeline that writes to the database

# -*- coding: utf-8 -*-
import pymysql
from tangshi.items import TangshiItem, PoemZuopin

class TangshiPipeline(object):
    def __init__(self):
        self.MYSQL_CONFIG = {
            'host': 'localhost',
            'port': 3306,
            'user': 'root',
            'password': '密码',  # placeholder: your MySQL password
            'db': 'local_db',
            'charset': 'utf8'
        }
        self.conn = pymysql.connect(**self.MYSQL_CONFIG)

    def process_item(self, item, spider):
        # Dispatch on the item class: TangshiItem rows go into the poemers
        # table, PoemZuopin rows into the poem_zuopin table.
        if isinstance(item, TangshiItem):
            poemers = ['chaodai', 'poemer', 'zuopins_total', 'poemer_url']
            poemers_base_sql = 'insert into poemers ({}) values(%s,%s,%s,%s)'
            poemers_sql = poemers_base_sql.format(','.join(poemers))
            cursor = self.conn.cursor()
            cursor.execute(poemers_sql, (item['chaodai'], item['poemer'], item['zuopins_total'], item['poemer_url']))
            self.conn.commit()

        elif isinstance(item, PoemZuopin):
            zuopins = ['poemer', 'poemer_url', 'zuopin_name', 'name_words', 'zuopin_content', 'zuopin_words',
                       'zuopin_url']
            zuopin_base_sql = 'insert into poem_zuopin ({}) values(%s,%s,%s,%s,%s,%s,%s)'
            zuopin_sql = zuopin_base_sql.format(','.join(zuopins))
            cursor = self.conn.cursor()
            cursor.execute(zuopin_sql, (item['poemer'], item['poemer_url'], item['zuopin_name'], item['name_words'], item['zuopin_content'], item['zuopin_words'], item['zuopin_url']))
            self.conn.commit()
        # Returning the item lets any later pipeline components receive it.
        return item
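
One thing the pipeline never does is release the connection. Scrapy calls close_spider() on pipeline components when the spider finishes, so a small addition to TangshiPipeline (a sketch, not in the original project) handles cleanup:

    def close_spider(self, spider):
        # Called once by Scrapy when the spider shuts down; release the MySQL connection
        self.conn.close()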

2.4 settings.py

# -*- coding: utf-8 -*-
BOT_NAME = 'tangshi'
SPIDER_MODULES = ['tangshi.spiders']
NEWSPIDER_MODULE = 'tangshi.spiders'
ROBOTSTXT_OBEY = True
ITEM_PIPELINES = {
    # 300 is the pipeline's order; lower numbers run earlier (range 0-1000)
    'tangshi.pipelines.TangshiPipeline': 300,
}
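
Not part of the original settings.py, but if the site starts throttling the crawl, Scrapy's standard rate-limiting settings can be added here; a sketch with assumed values:

# Optional politeness settings (assumed values, not in the original project)
DOWNLOAD_DELAY = 1                    # seconds to wait between requests
CONCURRENT_REQUESTS_PER_DOMAIN = 4    # cap on parallel requests per domain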

2.5 main.py: the script that launches the crawler

from scrapy.cmdline import execute
execute(['scrapy', 'crawl', 'tangshispider'])
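
main.py simply drives Scrapy's own command-line tool, so running this from a terminal in the project root is equivalent:

scrapy crawl tangshispider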