爬虫 爬取杭州公交始末站,经纬度

2019-11-22  本文已影响0人  诗人藏夜里

杭州公交总览
http://bus.hangzhou.com.cn/all_line.php

找到每一路公交车对应网址


每一路公交点击按键对应位置
# Fetch the overview page and collect the numeric line_id of every bus line.
url = 'http://bus.hangzhou.com.cn/all_line.php'

# Note: the original value started with a stray "User-Agent:" prefix, which
# would be sent verbatim inside the header value; fixed here.
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'}


response = requests.get(url, headers=headers, timeout=5)

soup = BeautifulSoup(response.text, 'lxml')   # parse the page HTML
# every <a> inside the 'line_all' container links to one bus line
href = soup.find(attrs={'class': 'line_all'}).find_all('a')
# the first run of digits in each anchor tag is that line's id
# (raw string avoids the invalid-escape warning of '\d+')
id_ = [re.findall(r'\d+', str(a))[0] for a in href]

任选一路车进入其页面
http://bus.hangzhou.com.cn/line.php?line_id=3

找到始末站信息
'''
    进入每一辆车对应网页
    '''
    url = 'http://bus.hangzhou.com.cn/line.php?line_id=' + str(id_[count])
    response_ = requests.get(url, headers=headers, timeout=10)
    print('url:{} count:{}'.format(url, count))
    soup = BeautifulSoup(response_.text, 'lxml')
    #找到始发站终点站对应位置
    start_terminal_ = soup.find(attrs={'class': 'main_title'}).find_all('strong')[-1]

完整代码

import requests
from bs4 import BeautifulSoup
import random
import tqdm as tqdm

import pandas as pd
import numpy as np
import re
# DataFrame accumulating, for each bus line: its running index,
# the start stop, and the terminal stop.
df = pd.DataFrame(columns=['count', 'start', 'terminal'])
  
    
# Fetch the overview page and collect the numeric line_id of every bus line.
url = 'http://bus.hangzhou.com.cn/all_line.php'

# Note: the original value started with a stray "User-Agent:" prefix, which
# would be sent verbatim inside the header value; fixed here.
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'}


response = requests.get(url, headers=headers, timeout=5)

soup = BeautifulSoup(response.text, 'lxml')   # parse the page HTML
# every <a> inside the 'line_all' container links to one bus line
href = soup.find(attrs={'class': 'line_all'}).find_all('a')
# the first run of digits in each anchor tag is that line's id
# (raw string avoids the invalid-escape warning of '\d+')
id_ = [re.findall(r'\d+', str(a))[0] for a in href]

'''
If the IP gets banned mid-run, `count` (printed each iteration) records how
far we got; restart by setting `count` to the last printed value.
'''

# Separators observed on the site, tried in the original priority order
# ('——' first so it is never split as a single '—').
_SEPARATORS = ('——', '-', '—')

def _split_start_terminal(title):
    """Split a 'start<sep>terminal' page title into (start, terminal).

    Mirrors the original `str.split(sep)[0] / [1]` behaviour.
    Returns (title, None) when no known separator is present.
    """
    for sep in _SEPARATORS:
        if sep in title:
            parts = title.split(sep)
            return parts[0], parts[1]
    return title, None

count = 0
while count != len(id_):
    # open the page of one bus line
    url = 'http://bus.hangzhou.com.cn/line.php?line_id=' + str(id_[count])
    response_ = requests.get(url, headers=headers, timeout=10)
    print('url:{} count:{}'.format(url, count))
    soup = BeautifulSoup(response_.text, 'lxml')
    # the last <strong> of the title block holds "start<sep>terminal"
    start_terminal_ = soup.find(attrs={'class': 'main_title'}).find_all('strong')[-1]
    start, terminal = _split_start_terminal(start_terminal_.text)
    df.loc[count, 'count'] = count
    df.loc[count, 'start'] = start
    if terminal is not None:
        df.loc[count, 'terminal'] = terminal
    count += 1

结果展示

爬取结果

经纬度查询

查询网址:http://api.map.baidu.com/lbsapi/getpoint/index.html

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import datetime
import time
from bs4 import BeautifulSoup
import re

# NOTE(review): this `headers` dict is never passed to the selenium browser
# below, so it appears to be unused leftover from a requests-based attempt.
# Also note the 'Host' value is a full URL, which is not a valid Host header
# (Host should be just the host name) — confirm before reusing it elsewhere.
headers = {
    'Host': 'http://api.map.baidu.com/lbsapi/getpoint/index.html',
    'Connection': 'keep-alive',
    'X-Requested-With': 'XMLHttpRequest',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
}

'''
The Baidu coordinate-picker page is dynamic, so drive it with a real
browser via selenium instead of plain requests.
'''

def _lookup_xy(browser, place):
    """Search `place` on the Baidu point picker; return the first
    'lng,lat' string found in the result panel, or None on failure.
    """
    # a failed earlier scrape leaves NaN (a float) in the cell — skip it
    if not isinstance(place, str):
        return None
    search_box = browser.find_element_by_id('localvalue')
    search_box.clear()
    search_box.send_keys(place + ' 杭州')
    browser.find_element_by_id('localsearch').click()
    time.sleep(1.5)  # wait for the result panel to load
    try:
        panel = browser.find_element_by_id('MapInfo')
        # first line starting with 坐标 holds the coordinate of the top hit
        text = re.search(r'坐标.*', panel.text).group()
        # dots escaped: the original '\d+.\d+' would match any character
        matches = re.findall(r'\d+\.\d+,\d+\.\d+', text)
        return matches[0] if matches else None
    except Exception:
        # page not loaded / no result panel — treat as "no coordinate"
        return None

chrome_options = Options()
browser = webdriver.Chrome(chrome_options=chrome_options)
url = 'http://api.map.baidu.com/lbsapi/getpoint/index.html'
browser.get(url)
# NOTE(review): the original started at range(229, ...) to resume an aborted
# run; starting from 0 covers the whole table.
for i in range(len(df)):
    # start-stop coordinate: store the plain 'lng,lat' STRING (the original
    # stored the whole findall list, which broke the later .split(',') step)
    xy = _lookup_xy(browser, df.loc[i, 'start'])
    if xy is not None:
        df.loc[i, 'start_xy'] = xy
    # terminal-stop coordinate
    xy = _lookup_xy(browser, df.loc[i, 'terminal'])
    if xy is not None:
        df.loc[i, 'terminal_xy'] = xy
    print(i)

'''
Coordinate post-processing: split each 'lng,lat' string into float columns.
'''
for i in range(len(df)):
    print(i)
    try:
        # split once per cell instead of twice as in the original
        start_x, start_y = df.loc[i, 'start_xy'].split(',')
        terminal_x, terminal_y = df.loc[i, 'terminal_xy'].split(',')
    except (AttributeError, ValueError):
        # AttributeError: lookup failed earlier, cell holds NaN (a float);
        # ValueError: malformed string — skip the row either way
        continue
    df.loc[i, 'start_x'] = float(start_x)
    df.loc[i, 'start_y'] = float(start_y)
    df.loc[i, 'terminal_x'] = float(terminal_x)
    df.loc[i, 'terminal_y'] = float(terminal_y)

可视化

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
plt.style.use('seaborn-whitegrid')

# plot the start/terminal stop locations over the Hangzhou map background
def plot_on_map(df, BB, map_, s=10, alpha=0.2):
    """Scatter start and terminal stop coordinates over a map image.

    df    : DataFrame with start_x/start_y/terminal_x/terminal_y columns
    BB    : (lon_min, lon_max, lat_min, lat_max) extent of `map_`
    map_  : background image array (e.g. from plt.imread)
    s     : marker size; alpha : marker transparency
    """
    fig, axs = plt.subplots(1, 2, figsize=(30, 20))
    # the two panels differ only in which columns they plot — fold the
    # original copy-pasted bodies into one loop
    panels = [
        (axs[0], df.start_x, df.start_y, 'start locations'),
        (axs[1], df.terminal_x, df.terminal_y, 'terminal locations'),
    ]
    for ax, xs, ys, title in panels:
        ax.scatter(xs, ys, zorder=1, alpha=alpha, c='r', s=s)
        ax.set_xlim((BB[0], BB[1]))
        ax.set_ylim((BB[2], BB[3]))
        ax.set_title(title)
        ax.imshow(map_, zorder=0, extent=BB)
    
# Load the background image of the Hangzhou area (the original comment said
# "NYC" — this plotting code was adapted from an NYC-taxi tutorial).
# BB is the (lon_min, lon_max, lat_min, lat_max) extent of the image.
BB = (119.710941, 120.673801, 29.685506, 30.552774)
map_ = plt.imread('loc.png')
# Bug fix: the original called plot_on_map(d, ...) with an undefined name
# `d`; the DataFrame built above is `df`.
plot_on_map(df, BB, map_, s=1, alpha=0.3)
plt.savefig('station.png')
红点为站点位置
上一篇下一篇

猜你喜欢

热点阅读