Scrapy爬取全国行政区划并实时插入MySQL数据库

2017-11-09 本文已影响272人 349ff5da91d8

主要爬虫框架：Scrapy

数据库模块：pymysql
Python版本：Python 3.5.3
Windows版本：Win10
爬取心得：利用已有的工具，熟悉需求
爬取步骤：
1、创建爬虫项目：scrapy startproject home_scrapy
2、编写spider代码
3、编写pipelines处理代码（json存储及mysql数据库存储）
4、运行爬虫项目： scrapy crawl spiderone
爬取时间：11:28 - 17:41（共计6小时13分钟）
贴代码：

spiderone.py:

# -*- coding: utf-8 -*-
import scrapy
from home_scrapy.items import HomeScrapyItem
import os


class SpideroneSpider(scrapy.Spider):
    """Crawl the 2016 administrative-division code pages level by level.

    Each callback handles one level (province, city, county, town, village).
    For every table row it emits a ``HomeScrapyItem`` with the keys ``id``,
    ``aname``, ``lv`` and ``pid`` and, when the row links to a child page,
    schedules a request for the next level.

    The ``country_*`` spelling in callback names and meta keys refers to the
    county level; the names are kept unchanged for compatibility.
    """

    name = 'spiderone'
    # Entries here must be bare host names, not URLs with a scheme.
    allowed_domains = ['www.stats.gov.cn']
    baseUrl = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2016/'
    start_urls = ['http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2016/index.html']

    def parse(self, response):
        """Emit one level-1 record per province link and follow each link."""
        for link in response.xpath('//tr[@class="provincetr"]/td/a'):
            label = link.xpath('text()').extract_first()
            href = link.xpath('@href').extract_first()
            if label is None or not href:
                self.logger.warning(
                    'Skipping province link without text or href on %s',
                    response.url)
                continue
            province_data = HomeScrapyItem()
            province_data['aname'] = label
            # The part of the href before the first dot is the province
            # prefix; it is padded with ten zeros to form the record id.
            province_data['id'] = href.split('.')[0] + '0000000000'
            province_data['lv'] = 1
            province_data['pid'] = 100000000000
            yield scrapy.Request(response.urljoin(href),
                                 meta={'province_data': province_data},
                                 callback=self.city_parse,
                                 encoding='utf-8', dont_filter=True)
            yield province_data

    def city_parse(self, response):
        """Emit level-2 records from ``citytr`` rows and follow their links."""
        return self._parse_rows(response, 'citytr', 2,
                                response.meta['province_data'],
                                meta_key='city_data',
                                callback=self.country_parse)

    def country_parse(self, response):
        """Emit level-3 records from ``countytr`` rows.

        Rows without a link still produce a record; only linked rows are
        followed to the town level.
        """
        return self._parse_rows(response, 'countytr', 3,
                                response.meta['city_data'],
                                meta_key='country_data',
                                callback=self.town_parse)

    def town_parse(self, response):
        """Emit level-4 records from ``towntr`` rows and follow their links."""
        return self._parse_rows(response, 'towntr', 4,
                                response.meta['country_data'],
                                meta_key='town_data',
                                callback=self.village_parse)

    def village_parse(self, response):
        """Emit level-5 records from ``villagetr`` rows (name in column 3)."""
        return self._parse_rows(response, 'villagetr', 5,
                                response.meta['town_data'],
                                name_column=3)

    def _parse_rows(self, response, row_class, level, parent,
                    name_column=2, meta_key=None, callback=None):
        """Yield a record, and optionally a follow-up request, per table row.

        :param response: page being parsed.
        :param row_class: ``class`` attribute of the ``<tr>`` rows to read.
        :param level: value stored in the record's ``lv`` field.
        :param parent: parent record; its ``id`` becomes the record's ``pid``.
        :param name_column: 1-based index of the cell holding the name.
        :param meta_key: key under which the record is passed to ``callback``.
        :param callback: callback for the child page, or ``None`` to not follow.
        """
        parent_id = parent['id']
        for row in response.xpath('//tr[@class="%s"]' % row_class):
            code = self._cell_text(row, 1)
            label = self._cell_text(row, name_column)
            if code is None or label is None:
                # Skip the malformed row instead of aborting the whole page.
                self.logger.warning('Skipping incomplete %s row on %s',
                                    row_class, response.url)
                continue
            record = HomeScrapyItem(id=code, aname=label, lv=level,
                                    pid=parent_id)
            href = row.xpath('td[1]/a/@href').extract_first()
            if callback is not None and href:
                # Child hrefs are relative to the current page.
                yield scrapy.Request(response.urljoin(href),
                                     meta={meta_key: record},
                                     callback=callback,
                                     encoding='utf-8', dont_filter=True)
            yield record

    @staticmethod
    def _cell_text(row, index):
        """Return the text of cell ``index`` of ``row``, linked or plain.

        Returns ``None`` when the cell has neither link text nor direct text.
        """
        linked = row.xpath('td[%d]/a/text()' % index).extract_first()
        if linked is not None:
            return linked
        return row.xpath('td[%d]/text()' % index).extract_first()

pipelines.py:

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
import threading
import pymysql
from DBUtils.PooledDB import PooledDB
class HomeScrapyPipeline(object):
    """Item pipeline that dumps every item into ``data.json``.

    Each item is written as one UTF-8 encoded JSON object followed by a
    comma, a space and a newline. The file is opened when the pipeline is
    constructed and closed in ``close_spider``.
    """

    def __init__(self):
        # Binary mode: process_item encodes every record to UTF-8 itself.
        self.f = open("data.json", mode='wb')

    def process_item(self, item, spider):
        """Append ``item`` to the output file and return it unchanged."""
        serialized = json.dumps(dict(item), ensure_ascii=False)
        payload = "".join((serialized, ", \n")).encode('utf-8')
        self.f.write(payload)
        return item

    def close_spider(self, spider):
        """Close the output file."""
        self.f.close()


# Module-level lock that serializes the database writes in process_item.
lock = threading.RLock()


class HomeScrapyMySQLPipeline(object):
    """Item pipeline that inserts every item into the MySQL ``area`` table.

    Connections are taken from a ``PooledDB`` pool built from the project
    settings in ``from_settings``.
    """

    def __init__(self, dbpool):
        """Store the connection pool used by ``process_item``."""
        self.dbpool = dbpool

    @classmethod
    def from_settings(cls, settings):
        """Build the pipeline with a connection pool configured from settings.

        Reads the ``DB_*`` settings and passes them to ``PooledDB`` with
        ``pymysql`` as the connection creator.
        """
        dbpool = PooledDB(creator=pymysql,
                 mincached=settings['DB_MIN_CACHED'], maxcached=settings['DB_MAX_CACHED'],
                 maxshared=settings['DB_MAX_SHARED'], maxconnections=settings['DB_MAX_CONNECTIONS'],
                 blocking=settings['DB_BLOCKING'], maxusage=settings['DB_MAX_USAGE'], setsession=settings['DB_SET_SESSION'],
                 host=settings['DB_HOST'], port=settings['DB_PORT'],
                 user=settings['DB_USER'], passwd=settings['DB_PASSWD'],
                 db=settings['DB_NAME'], charset=settings['DB_CHARSET'], use_unicode=False
                 )
        return cls(dbpool)

    def process_item(self, item, spider):
        """Insert ``item`` into the ``area`` table and return it unchanged.

        The lock is released and the cursor and connection are closed even
        when the insert fails; the exception itself still propagates to the
        caller. A missing item key raises ``KeyError`` before the lock or a
        connection is taken.
        """
        sql = "insert into area(id, aname, lv, pid) values(%s,%s,%s,%s)"
        params = (item["id"], item["aname"], item["lv"], item["pid"])
        with lock:
            conn = self.dbpool.connection()
            try:
                cursor = conn.cursor()
                try:
                    cursor.execute(sql, params)
                    conn.commit()
                finally:
                    cursor.close()
            finally:
                conn.close()
        return item


items.py:
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy

class HomeScrapyItem(scrapy.Item):
    """One administrative-division record produced by the spider."""

    # Administrative division code.
    id = scrapy.Field()
    # Administrative division name.
    aname = scrapy.Field()
    # Administrative division level.
    lv = scrapy.Field()
    # Code of the parent administrative division.
    pid = scrapy.Field()

settings.py:

# -*- coding: utf-8 -*-

# Scrapy settings for home_scrapy project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

# Name of the bot implemented by this Scrapy project.
BOT_NAME = 'home_scrapy'

# Modules where Scrapy looks for spiders, and the module in which
# `scrapy genspider` creates new ones.
SPIDER_MODULES = ['home_scrapy.spiders']
NEWSPIDER_MODULE = 'home_scrapy.spiders'

# TEST database connection info, read by HomeScrapyMySQLPipeline.from_settings.
# NOTE(review): credentials are hard-coded here; consider loading them from
# the environment before using this outside a local test database.
DB_HOST = "localhost"
DB_PORT = 3306
DB_NAME = "test"
DB_USER = "root"
DB_PASSWD = "root111"

# Character set of the database connection.
DB_CHARSET = "utf8"

# mincached: number of idle connections opened at startup
# (default 0 means no connections are created at startup).
DB_MIN_CACHED = 10

# maxcached: maximum number of idle connections kept in the pool
# (default 0 means the pool size is unlimited).
DB_MAX_CACHED = 10

# maxshared: maximum number of shared connections (default 0 means all
# connections are dedicated). Once the maximum is reached, connections
# requested as shareable are shared.
DB_MAX_SHARED = 20

# maxconnections: maximum number of connections the pool may create
# (default 0 means unlimited).
# The pipeline reads this setting as DB_MAX_CONNECTIONS; it was previously
# misspelled here, so the limit was never applied.
DB_MAX_CONNECTIONS = 100
# Deprecated misspelled alias, kept for anything still reading the old name.
DB_MAX_CONNECIONS = DB_MAX_CONNECTIONS

# blocking: behaviour when the pool has reached its maximum size
# (default 0 or False raises an error; any other value blocks until a
# connection becomes available).
DB_BLOCKING = True

# maxusage: maximum number of times a single connection may be reused
# (default 0 or False means unlimited reuse). When the limit is reached the
# connection is automatically closed and reopened.
DB_MAX_USAGE = 0

# setsession: optional list of SQL commands used to prepare each session,
# e.g. ["set datestyle to german", ...].
DB_SET_SESSION = None

# Crawl responsibly by identifying yourself (and your website) on the user-agent
# USER_AGENT = 'home_scrapy (+http://www.yourdomain.com)'

# Obey robots.txt rules
# Disabled for this project: the crawler does not consult robots.txt.
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
# CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN = 16
# CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
# COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
# TELNETCONSOLE_ENABLED = False

# Override the default request headers:
# DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
# }

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
#    'home_scrapy.middlewares.HomeScrapySpiderMiddleware': 543,
# }

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# DOWNLOADER_MIDDLEWARES = {
#    'home_scrapy.middlewares.MyCustomDownloaderMiddleware': 543,
# }

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
# EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
# }

# Configure item pipelines
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
# Items pass through the pipelines from the lowest value to the highest.
# Distinct values make the order explicit: the JSON dump runs first, then
# the MySQL insert.
ITEM_PIPELINES = {
    'home_scrapy.pipelines.HomeScrapyPipeline': 300,
    'home_scrapy.pipelines.HomeScrapyMySQLPipeline': 400,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
# AUTOTHROTTLE_ENABLED = True
# The initial download delay
# AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
# AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
# AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
# HTTPCACHE_ENABLED = True
# HTTPCACHE_EXPIRATION_SECS = 0
# HTTPCACHE_DIR = 'httpcache'
# HTTPCACHE_IGNORE_HTTP_CODES = []
# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

数据库查询示例图：

微信截图_20171109102727.png

微信截图_20171109102858.png