Web scraping: Hangzhou bus start/terminal stations and their coordinates
2019-11-22
诗人藏夜里
Hangzhou bus overview
http://bus.hangzhou.com.cn/all_line.php
First, find the page URL for each bus route: on the overview page, every route is a clickable link.
# Get the page id for each bus route
import requests
from bs4 import BeautifulSoup
import re

url = 'http://bus.hangzhou.com.cn/all_line.php'
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'}
response = requests.get(url, headers=headers, timeout=5)
soup = BeautifulSoup(response.text, 'lxml')  # parse the page HTML
href = soup.find(attrs={'class': 'line_all'}).find_all('a')  # one <a> tag per route
id_ = []  # page id of each route
for i in range(len(href)):
    id_one = re.findall(r'\d+', str(href[i]))[0]  # first number in the tag is the line id
    id_.append(id_one)
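Pulling the id with re.findall on the stringified tag works, but it grabs the first number appearing anywhere in the tag. A slightly more defensive sketch (assuming each anchor's href looks like line.php?line_id=3, which is the pattern the next step relies on) reads the id out of the query string instead; extract_line_id is a hypothetical helper, not part of the original script:

from urllib.parse import urlparse, parse_qs

def extract_line_id(a_tag):
    # e.g. <a href="line.php?line_id=3"> -> '3'; None if the parameter is absent
    params = parse_qs(urlparse(a_tag.get('href', '')).query)
    return params.get('line_id', [None])[0]

# id_ = [extract_line_id(a) for a in href if extract_line_id(a) is not None]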
Pick any route and open its page:
http://bus.hangzhou.com.cn/line.php?line_id=3
'''
Open the page of one route
'''
url = 'http://bus.hangzhou.com.cn/line.php?line_id=' + str(id_[count])
response_ = requests.get(url, headers=headers, timeout=10)
print('url:{} count:{}'.format(url, count))
soup = BeautifulSoup(response_.text, 'lxml')
# locate the start and terminal stations in the page title
start_terminal_ = soup.find(attrs={'class': 'main_title'}).find_all('strong')[-1]
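For reference, the lookup above assumes the route page carries a title block roughly like the snippet below, with the route's "start——terminal" text in the last strong tag (a minimal mock-up with made-up station names; the real page has more markup around it):

from bs4 import BeautifulSoup

html = '''
<div class="main_title">
  <strong>3路</strong>
  <strong>A站——B站</strong>
</div>
'''
soup = BeautifulSoup(html, 'lxml')
print(soup.find(attrs={'class': 'main_title'}).find_all('strong')[-1].text)  # A站——B站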
Full code
import requests
from bs4 import BeautifulSoup
import random
from tqdm import tqdm
import pandas as pd
import numpy as np
import re
'''
Create a DataFrame to hold the start and terminal stations
'''
df = pd.DataFrame(columns=['count', 'start', 'terminal'])
# Get the page id for each bus route
url = 'http://bus.hangzhou.com.cn/all_line.php'
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'}
response = requests.get(url, headers=headers, timeout=5)
soup = BeautifulSoup(response.text, 'lxml')  # parse the page HTML
href = soup.find(attrs={'class': 'line_all'}).find_all('a')  # one <a> tag per route
id_ = []  # page id of each route
for i in range(len(href)):
    id_one = re.findall(r'\d+', str(href[i]))[0]
    id_.append(id_one)
'''
Without rotating IPs, yours may get banned after too many requests. Because count
tracks progress, you can see where the crawl stopped; on the next run, set count
to that value and resume from there.
'''
count = 0
while count != len(id_):
    '''
    Open the page of one route
    '''
    url = 'http://bus.hangzhou.com.cn/line.php?line_id=' + str(id_[count])
    response_ = requests.get(url, headers=headers, timeout=10)
    print('url:{} count:{}'.format(url, count))
    soup = BeautifulSoup(response_.text, 'lxml')
    # locate the start and terminal stations in the page title
    start_terminal_ = soup.find(attrs={'class': 'main_title'}).find_all('strong')[-1]
    '''
    Split the title on the dash and store the two halves in the DataFrame.
    The separator varies between pages, so try each variant in turn
    (the double dash must be checked before the single one).
    '''
    if '——' in start_terminal_.text:
        start = start_terminal_.text.split('——')[0]
        terminal = start_terminal_.text.split('——')[1]
        df.loc[count, 'count'] = count
        df.loc[count, 'start'] = start
        df.loc[count, 'terminal'] = terminal
        count = count + 1
    elif '-' in start_terminal_.text:
        start = start_terminal_.text.split('-')[0]
        terminal = start_terminal_.text.split('-')[1]
        df.loc[count, 'count'] = count
        df.loc[count, 'start'] = start
        df.loc[count, 'terminal'] = terminal
        count = count + 1
    elif '—' in start_terminal_.text:
        start = start_terminal_.text.split('—')[0]
        terminal = start_terminal_.text.split('—')[1]
        df.loc[count, 'count'] = count
        df.loc[count, 'start'] = start
        df.loc[count, 'terminal'] = terminal
        count = count + 1
    else:
        # no separator found: store the whole title as the start station
        df.loc[count, 'count'] = count
        df.loc[count, 'start'] = start_terminal_.text
        count = count + 1
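The three branches above differ only in the separator character. For reference, a single regex split can handle all the variants at once; this is a small sketch assuming the three dash styles seen above are the only ones that occur (split_stations is a hypothetical helper, not part of the original script):

import re

SEPARATOR = re.compile(r'——|—|-')  # try the longest dash variant first

def split_stations(title):
    # 'A站——B站' -> ('A站', 'B站'); no separator -> (title, None)
    parts = SEPARATOR.split(title, maxsplit=1)
    return (parts[0], parts[1]) if len(parts) == 2 else (title, None)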
Results
Looking up coordinates for the scraped stations
Lookup tool: http://api.map.baidu.com/lbsapi/getpoint/index.html
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
import re
'''
The coordinate lookup page is driven by JavaScript, so use webdriver to
simulate the clicks. Selenium controls a real browser, so no request
headers need to be set.
'''
chrome_options = Options()
browser = webdriver.Chrome(chrome_options=chrome_options)
url = 'http://api.map.baidu.com/lbsapi/getpoint/index.html'
browser.get(url)
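# Optional: if you don't want a browser window popping up during the crawl,
# Chrome can usually run headless (assuming a Chrome build recent enough to
# support the flag):
#     chrome_options.add_argument('--headless')
#     browser = webdriver.Chrome(chrome_options=chrome_options)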
for i in range(229, len(df)):
    # the range starts at 229 here because this run resumed an earlier, interrupted one
    # coordinates of the start station
    start = df.loc[i, 'start']
    # clear the search box
    browser.find_element_by_id('localvalue').clear()
    # type the search term into the box
    browser.find_element_by_id('localvalue').send_keys(start + ' 杭州')
    # click search
    browser.find_element_by_id('localsearch').click()
    # wait for the page to finish loading
    time.sleep(1.5)
    try:
        # read the results panel
        soup = browser.find_element_by_id('MapInfo')
        # grab the first line containing 坐标 (coordinates)
        r = re.compile(r'坐标.*')
        text = r.search(soup.text).group()
        # pull out the longitude/latitude pair
        xy = re.findall(r'\d+\.\d+,\d+\.\d+', text)
        # store the first match in the DataFrame
        df.loc[i, 'start_xy'] = xy[0]
    except:
        # no result for this station; leave the cell empty
        pass
    try:
        # coordinates of the terminal station
        terminal = df.loc[i, 'terminal']
        browser.find_element_by_id('localvalue').clear()
        browser.find_element_by_id('localvalue').send_keys(terminal + ' 杭州')
        browser.find_element_by_id('localsearch').click()
        time.sleep(1.5)
        soup = browser.find_element_by_id('MapInfo')
        r = re.compile(r'坐标.*')
        text = r.search(soup.text).group()
        xy = re.findall(r'\d+\.\d+,\d+\.\d+', text)  # the longitude/latitude pair
        df.loc[i, 'terminal_xy'] = xy[0]
    except:
        pass
    print(i)
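Driving a real browser is slow (two searches plus fixed sleeps per route). For reference, Baidu Maps also offers a geocoding web API that returns coordinates directly; the sketch below shows one way to call it, assuming you have registered an API key (the ak parameter) on the Baidu LBS platform. Like the pick-point tool, it returns coordinates in Baidu's BD-09 system.

import requests

def geocode(address, ak):
    # Query the Baidu geocoding API; returns (lng, lat) or None if not found.
    resp = requests.get(
        'http://api.map.baidu.com/geocoding/v3/',
        params={'address': address, 'city': '杭州', 'output': 'json', 'ak': ak},
        timeout=10,
    )
    data = resp.json()
    if data.get('status') == 0:
        loc = data['result']['location']
        return loc['lng'], loc['lat']
    return None

# geocode('A站', ak='your-ak')  ->  (120.x, 30.x)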
'''
Post-process the coordinates: split each "lng,lat" string into float columns
'''
for i in range(len(df)):
    print(i)
    try:
        df.loc[i, 'start_x'] = float(df.loc[i, 'start_xy'].split(',')[0])
        df.loc[i, 'start_y'] = float(df.loc[i, 'start_xy'].split(',')[1])
        df.loc[i, 'terminal_x'] = float(df.loc[i, 'terminal_xy'].split(',')[0])
        df.loc[i, 'terminal_y'] = float(df.loc[i, 'terminal_xy'].split(',')[1])
    except:
        # rows with missing coordinates are left empty
        continue
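The row-by-row loop works; for reference, pandas can do the same split in one vectorized pass. A sketch, assuming the start_xy/terminal_xy columns hold "lng,lat" strings with NaN for the stations that were not found:

coords = df['start_xy'].str.split(',', expand=True).astype(float)
df['start_x'], df['start_y'] = coords[0], coords[1]
coords = df['terminal_xy'].str.split(',', expand=True).astype(float)
df['terminal_x'], df['terminal_y'] = coords[0], coords[1]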
Visualization
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('seaborn-whitegrid')
# plot the start and terminal stations on top of the Hangzhou map image
def plot_on_map(df, BB, map_, s=10, alpha=0.2):
    fig, axs = plt.subplots(1, 2, figsize=(30, 20))
    axs[0].scatter(df.start_x, df.start_y, zorder=1, alpha=alpha, c='r', s=s)
    axs[0].set_xlim((BB[0], BB[1]))
    axs[0].set_ylim((BB[2], BB[3]))
    axs[0].set_title('start locations')
    axs[0].imshow(map_, zorder=0, extent=BB)
    axs[1].scatter(df.terminal_x, df.terminal_y, zorder=1, alpha=alpha, c='r', s=s)
    axs[1].set_xlim((BB[0], BB[1]))
    axs[1].set_ylim((BB[2], BB[3]))
    axs[1].set_title('terminal locations')
    axs[1].imshow(map_, zorder=0, extent=BB)
# load the background map image of Hangzhou
# BB is the (lon_min, lon_max, lat_min, lat_max) bounding box of the image
BB = (119.710941, 120.673801, 29.685506, 30.552774)
map_ = plt.imread('loc.png')
# plot the scraped stations on the map
plot_on_map(df, BB, map_, s=1, alpha=0.3)
plt.savefig('station.png')
Red dots mark the station locations.