Scrapy爬取全国行政区划并实时插入MySQL数据库
2017-11-09 本文已影响272人
349ff5da91d8
主要爬虫框架:Scrapy
数据库模块:pymysql
python版本:python3.5.3
windows版本:win10
爬取心得:利用已有的工具,熟悉需求
爬取步骤:
1、创建爬虫项目:scrapy startproject home_scrapy
2、编写spider代码
3、编写pipelines处理代码(json存储及mysql数据库存储)
4、运行爬虫项目: scrapy crawl spiderone
爬取时间:11:28 - 17:41(共计6小时13分钟)
贴代码:
spiderone.py:
# -*- coding: utf-8 -*-
import scrapy
from home_scrapy.items import HomeScrapyItem
import os
class SpideroneSpider(scrapy.Spider):
    """Crawl the 2016 administrative-division code tables on stats.gov.cn.

    The site is walked top-down through five page types, identified by the
    CSS class of their table rows: ``provincetr`` (lv 1), ``citytr`` (lv 2),
    ``countytr`` (lv 3), ``towntr`` (lv 4) and ``villagetr`` (lv 5).
    Every row is yielded as a record with the keys ``id``, ``aname``, ``lv``
    and ``pid`` (the ``id`` of the parent row).
    """

    name = 'spiderone'
    # allowed_domains expects bare domain names; the original value was a
    # full URL ('http://www.stats.gov.cn'), which is not a valid entry.
    allowed_domains = ['www.stats.gov.cn']
    # Kept as a public attribute; links are now resolved against the
    # response URL with response.urljoin() instead of string concatenation.
    baseUrl = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2016/'
    start_urls = ['http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2016/index.html']

    def parse(self, response):
        """Level 1: yield one item per province link and follow it."""
        for link in response.xpath('//tr[@class="provincetr"]/td/a'):
            aname = link.xpath('text()').extract_first()
            href = link.xpath('@href').extract_first()
            if not aname or not href:
                self.logger.warning('Skipping incomplete province link on %s', response.url)
                continue
            province_data = HomeScrapyItem()
            province_data['aname'] = aname
            # The href looks like "43.html": the part before the first dot is
            # padded with ten zeros to form the province code.
            province_data['id'] = href.split('.', -1)[0] + '0000000000'
            province_data['lv'] = 1
            province_data['pid'] = 100000000000
            yield scrapy.Request(response.urljoin(href), meta={'province_data': province_data},
                                 callback=self.city_parse, encoding='utf-8', dont_filter=True)
            yield province_data

    def city_parse(self, response):
        """Level 2: rows of a province page; linked rows lead to county pages."""
        yield from self._parse_level(response, 'citytr', response.meta['province_data'],
                                     2, 'city_data', self.country_parse)

    def country_parse(self, response):
        """Level 3: rows of a city page; rows without a link are leaf records."""
        yield from self._parse_level(response, 'countytr', response.meta['city_data'],
                                     3, 'country_data', self.town_parse)

    def town_parse(self, response):
        """Level 4: rows of a county page; linked rows lead to village pages."""
        yield from self._parse_level(response, 'towntr', response.meta['country_data'],
                                     4, 'town_data', self.village_parse)

    def village_parse(self, response):
        """Level 5: rows of a town page (code in td[1], name in td[3])."""
        town = response.meta['town_data']
        for row in response.xpath('//tr[@class="villagetr"]'):
            code = row.xpath('td[1]/text()').extract_first()
            # td[2] is read by nothing downstream, so it is no longer extracted.
            name = row.xpath('td[3]/text()').extract_first()
            if not code or not name:
                self.logger.warning('Skipping incomplete villagetr row on %s', response.url)
                continue
            yield self._child_item(town, code, name, 5)

    def _parse_level(self, response, row_class, parent, level, meta_key, callback):
        """Yield a record (and a follow-up request when linked) for each row.

        :param response: page whose ``<tr class=row_class>`` rows are parsed
        :param row_class: CSS class of the rows belonging to this level
        :param parent: record of the parent division (provides ``pid``)
        :param level: value stored in the ``lv`` field
        :param meta_key: key under which the record is passed to ``callback``
        :param callback: spider method that parses the next level
        """
        for row in response.xpath('//tr[@class="%s"]' % row_class):
            code, name, href = self._row_fields(row)
            if not code or not name:
                # A malformed row used to raise IndexError and abort the
                # whole page; skip just that row and keep going.
                self.logger.warning('Skipping incomplete %s row on %s', row_class, response.url)
                continue
            child = self._child_item(parent, code, name, level)
            if href:
                yield scrapy.Request(response.urljoin(href), meta={meta_key: child},
                                     callback=callback, encoding='utf-8', dont_filter=True)
            yield child

    @staticmethod
    def _row_fields(row):
        """Return ``(code, name, href)`` for a row; ``href`` is None if unlinked."""
        href = row.xpath('td[1]/a/@href').extract_first()
        if href:
            code = row.xpath('td[1]/a/text()').extract_first()
            name = row.xpath('td[2]/a/text()').extract_first()
        else:
            # Rows without a link carry their text directly inside the cells.
            code = row.xpath('td[1]/text()').extract_first()
            name = row.xpath('td[2]/text()').extract_first()
        return code, name, href

    @staticmethod
    def _child_item(parent, code, name, level):
        """Build a child record as a plain dict copied from ``parent``."""
        child = dict(parent)
        child['id'] = code
        child['aname'] = name
        child['lv'] = level
        child['pid'] = parent['id']
        return child
pipelines.py:
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
import threading
import pymysql
from DBUtils.PooledDB import PooledDB
class HomeScrapyPipeline(object):
    """Item pipeline that appends every item to ``data.json`` as JSON text."""

    def __init__(self):
        # Opened in binary mode: each record is UTF-8 encoded before writing.
        self.f = open("data.json", 'wb')

    def process_item(self, item, spider):
        """Write ``item`` as one JSON object followed by a comma, a space and
        a newline, then return the item unchanged."""
        record = json.dumps(dict(item), ensure_ascii=False)
        line = "{}, \n".format(record)
        self.f.write(line.encode('utf-8'))
        return item

    def close_spider(self, spider):
        """Close the output file."""
        self.f.close()
# Module-level re-entrant lock held by process_item around each insert.
lock = threading.RLock()


class HomeScrapyMySQLPipeline(object):
    """Item pipeline that inserts every item into the ``area`` table."""

    def __init__(self, dbpool):
        """Store the connection pool used by :meth:`process_item`."""
        self.dbpool = dbpool

    @classmethod
    def from_settings(cls, settings):
        """Build the pipeline with a ``PooledDB`` pool configured from the
        ``DB_*`` entries of ``settings``."""
        dbpool = PooledDB(
            creator=pymysql,
            mincached=settings['DB_MIN_CACHED'],
            maxcached=settings['DB_MAX_CACHED'],
            maxshared=settings['DB_MAX_SHARED'],
            maxconnections=settings['DB_MAX_CONNECTIONS'],
            blocking=settings['DB_BLOCKING'],
            maxusage=settings['DB_MAX_USAGE'],
            setsession=settings['DB_SET_SESSION'],
            host=settings['DB_HOST'],
            port=settings['DB_PORT'],
            user=settings['DB_USER'],
            passwd=settings['DB_PASSWD'],
            db=settings['DB_NAME'],
            charset=settings['DB_CHARSET'],
            use_unicode=False,
        )
        return cls(dbpool)

    def process_item(self, item, spider):
        """Insert ``item`` (keys ``id``, ``aname``, ``lv``, ``pid``) and return it.

        The statement is parameterized. On any failure the transaction is
        rolled back and the exception is re-raised; the lock, the cursor and
        the pooled connection are released in every case, so one failed
        insert can no longer leak them.
        """
        sql = "insert into area(id, aname, lv, pid) values(%s,%s,%s,%s)"
        params = (item["id"], item["aname"], item["lv"], item["pid"])
        with lock:
            conn = self.dbpool.connection()
            try:
                cursor = conn.cursor()
                try:
                    cursor.execute(sql, params)
                    conn.commit()
                except Exception:
                    conn.rollback()
                    raise
                finally:
                    cursor.close()
            finally:
                # close() on the pooled connection must always run.
                conn.close()
        return item
items.py:
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class HomeScrapyItem(scrapy.Item):
    """One administrative-division record; the same fields serve every level."""

    id = scrapy.Field()     # administrative division code
    aname = scrapy.Field()  # administrative division name
    lv = scrapy.Field()     # administrative division level
    pid = scrapy.Field()    # code of the parent administrative division
settings.py:
# -*- coding: utf-8 -*-
# Scrapy settings for home_scrapy project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'home_scrapy'
SPIDER_MODULES = ['home_scrapy.spiders']
NEWSPIDER_MODULE = 'home_scrapy.spiders'

# Connection details of the TEST database.
# NOTE(review): the credentials are hard-coded; consider loading them from
# the environment before sharing or deploying this file.
DB_HOST = "localhost"
DB_PORT = 3306
DB_NAME = "test"
DB_USER = "root"
DB_PASSWD = "root111"
# Character set of the database connection.
DB_CHARSET = "utf8"

# Connection-pool options, passed to PooledDB by HomeScrapyMySQLPipeline.
# mincached: idle connections opened at start-up (0 creates none).
DB_MIN_CACHED = 10
# maxcached: maximum number of idle connections kept in the pool
# (0 means the pool size is unlimited).
DB_MAX_CACHED = 10
# maxshared: maximum number of shared connections (0 makes every connection
# dedicated); once reached, connections requested as shareable are shared.
DB_MAX_SHARED = 20
# maxconnections: maximum number of connections overall (0 means no limit).
# The name was previously misspelled "DB_MAX_CONNECIONS", so the pipeline's
# lookup of DB_MAX_CONNECTIONS returned None and the limit was ignored.
DB_MAX_CONNECTIONS = 100
# blocking: behaviour when the maximum is reached (0/False reports an error;
# anything else waits until a connection is available).
DB_BLOCKING = True
# maxusage: how many times one connection may be reused (0/False means
# unlimited); when reached, the connection is closed and reopened.
DB_MAX_USAGE = 0
# setsession: optional list of SQL commands that prepare each session,
# e.g. ["set datestyle to german", ...].
DB_SET_SESSION = None
# Crawl responsibly by identifying yourself (and your website) on the user-agent
# USER_AGENT = 'home_scrapy (+http://www.yourdomain.com)'
# Whether to obey robots.txt rules (disabled here: the setting is False)
ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
# CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN = 16
# CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
# COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
# TELNETCONSOLE_ENABLED = False
# Override the default request headers:
# DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
# }
# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
# 'home_scrapy.middlewares.HomeScrapySpiderMiddleware': 543,
# }
# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# DOWNLOADER_MIDDLEWARES = {
# 'home_scrapy.middlewares.MyCustomDownloaderMiddleware': 543,
# }
# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
# EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
# }
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    # Items pass through pipelines from the lowest value to the highest.
    # Both entries used to share 300, which left their order unspecified;
    # distinct values make the JSON dump run before the MySQL insert.
    'home_scrapy.pipelines.HomeScrapyPipeline': 300,
    'home_scrapy.pipelines.HomeScrapyMySQLPipeline': 400,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
# AUTOTHROTTLE_ENABLED = True
# The initial download delay
# AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
# AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
# AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
# HTTPCACHE_ENABLED = True
# HTTPCACHE_EXPIRATION_SECS = 0
# HTTPCACHE_DIR = 'httpcache'
# HTTPCACHE_IGNORE_HTTP_CODES = []
# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
数据库查询示例图:

