城市五级连爬
2019-05-20 本文已影响0人
楚糖的糖
#!/usr/bin/python
# -- coding: utf-8 --
import re
from lxml import etree
import requests
# Browser-like User-Agent string; every requests.get() call in this script
# passes ``headers`` so the requests carry it.
user_agent = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
headers = {'User-Agent': user_agent}
def GET_URL():  # fetch the province level
    """Download the 2018 index page and start the crawl from one province.

    The page is decoded as GBK and parsed with lxml.  For every table with
    class ``provincetable`` the third ``href`` found under any ``tr``
    (index 2) is handed to ``Get_City``, so only one province is crawled
    per matched table.

    Raises:
        requests.RequestException: on timeout, connection failure or a
            non-2xx response.
        IndexError: if the page exposes fewer than three links.
    """
    url = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/index.html"
    # A timeout keeps a stalled connection from hanging the crawl forever.
    respon = requests.get(url, headers=headers, timeout=30)
    respon.raise_for_status()  # fail loudly instead of parsing an error page
    s = etree.HTML(respon.content.decode("gbk"))  # parse so XPath can be used
    for table in s.xpath("//table[@class='provincetable']"):
        # The expression starts with "//", so it is evaluated against the
        # whole document rather than only the nodes below ``table``.
        Provinc_url = table.xpath("//tr//@href")[2]
        Get_City(Provinc_url)
def Get_City(Provinc_url):  # fetch the city level
    """Download one province page and descend into one of its cities.

    Args:
        Provinc_url: link of the province page relative to the 2018
            directory, as taken from the index page.

    The second ``href`` found under any second table cell (index 1) is
    passed to ``Get_qu`` together with its first two characters, which the
    lower levels use when building URLs.

    Raises:
        requests.RequestException: on timeout, connection failure or a
            non-2xx response.
        IndexError: if no ``//table/tbody/tr[2]`` row exists or fewer than
            two links are found.
    """
    province_url = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/" + Provinc_url
    # A timeout keeps a stalled connection from hanging the crawl forever.
    respon = requests.get(province_url, headers=headers, timeout=30)
    respon.raise_for_status()  # fail loudly instead of parsing an error page
    s = etree.HTML(respon.content.decode("gbk"))  # parse so XPath can be used
    row = s.xpath("//table/tbody/tr[2]")[0]
    # Iterating an lxml element yields its direct children.
    # NOTE(review): the call below runs once per child of ``row``; because
    # the XPath is absolute every iteration selects the same link, so if the
    # row has several children the same city is crawled repeatedly - confirm
    # this is intended.
    for cell in row:
        # "//..." is evaluated against the whole document, not ``cell``.
        city_URL = cell.xpath("//td[2]//@href")[1]  # presumably like "13/1302.html"
        city_url_code = city_URL[:2]  # leading two characters, e.g. "13"
        Get_qu(city_URL, city_url_code)
def Get_qu(city_URL, city_url_code):  # fetch the county/district level
    """Download one city page and descend into its county links.

    Args:
        city_URL: link of the city page relative to the 2018 directory.
        city_url_code: first two characters of ``city_URL``; forwarded
            unchanged to ``Get_qustreet``.

    If the page contains at least one link inside a ``countytr`` row, every
    ``href`` found under a second table cell is collected, the first three
    are skipped, and the remainder is passed to ``Get_qustreet`` once.
    Pages without such links are ignored.

    Raises:
        requests.RequestException: on timeout, connection failure or a
            non-2xx response.
    """
    qu_url = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/" + city_URL
    # A timeout keeps a stalled connection from hanging the crawl forever.
    respon = requests.get(qu_url, headers=headers, timeout=30)
    respon.raise_for_status()  # fail loudly instead of parsing an error page
    s = etree.HTML(respon.content.decode("gbk"))  # parse so XPath can be used
    county_links = s.xpath("//table//tr[@class='countytr']//a")
    if not county_links:
        return  # no county rows on this page, nothing to descend into
    # The expression is absolute ("//..."), so one query already returns the
    # links of the whole document; it must not be repeated per matched node.
    qu_URL = county_links[0].xpath("//td[2]//@href")  # e.g. ['02/130202.html', ...]
    # This slices the *list* (drops the first three links); each remaining
    # item keeps its directory prefix, which Get_qustreet relies on.
    qu_url_code = qu_URL[3:]
    Get_qustreet(qu_url_code, city_url_code)
def Get_qustreet(qu_url_code, city_url_code):  # fetch the town/street level
    """Download the first county page and descend into its town links.

    Args:
        qu_url_code: iterable of county links (e.g. ``'02/130205.html'``);
            only the first item is processed.
        city_url_code: path segment placed between the 2018 directory and
            each county link.

    If the page contains a ``towntr`` row, every ``href`` found under a
    second table cell is passed to ``Get_street`` together with the
    directory part of the county URL.

    Raises:
        requests.RequestException: on timeout, connection failure or a
            non-2xx response.
    """
    for county_href in qu_url_code:
        qustreet = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/%s/%s" % (city_url_code, county_href)
        # A timeout keeps a stalled connection from hanging the crawl forever.
        respon = requests.get(qustreet, headers=headers, timeout=30)
        respon.raise_for_status()  # fail loudly instead of parsing an error page
        s = etree.HTML(respon.content.decode("gbk"))  # parse so XPath can be used
        town_rows = s.xpath("//table//tr[@class='towntr']")
        if town_rows:
            # Absolute expression: returns the links of the whole document,
            # so a single evaluation is enough.
            qu_url = town_rows[0].xpath("//td[2]//@href")
            # Directory of the county page, e.g. ".../2018/13/02/".  Splitting
            # at the last "/" does not depend on the file name's length.
            street_url_code = qustreet.rsplit("/", 1)[0] + "/"
            Get_street(street_url_code, qu_url)
        break  # only the first county is crawled
def Get_street(street_url_code, qu_code):  # fetch the village level
    """Download every town page and print the names found on it.

    Args:
        street_url_code: URL prefix ending in "/" that each town link is
            appended to.
        qu_code: iterable of town links (e.g. ``'05/130205001.html'``).

    For each page that contains a ``villagetr`` row, the text of every
    third table cell in the document is collected, the first entry is
    dropped, and the resulting list is printed once.

    Raises:
        requests.RequestException: on timeout, connection failure or a
            non-2xx response.
    """
    for town_href in qu_code:
        joggled_url = street_url_code + town_href
        # A timeout keeps a stalled connection from hanging the crawl forever.
        respon = requests.get(joggled_url, headers=headers, timeout=30)
        respon.raise_for_status()  # fail loudly instead of parsing an error page
        s = etree.HTML(respon.content.decode("gbk"))  # parse so XPath can be used
        village_rows = s.xpath("//table//tr[@class='villagetr']")
        if village_rows:
            # Absolute expression: one evaluation already covers the whole
            # document.  [1:] drops the first text node - presumably the
            # column header; confirm against the page.
            qu_name = village_rows[0].xpath("//td[3]//text()")[1:]
            print(qu_name)
# Script entry point: start the crawl at the province index page.
GET_URL()
目前这份代码还只是初期版本,只能逐级逐一查询;后续会加上去重、代码封装和入库。另外,注释部分还不够完善。