爬取中国天气网并且渲染

2019-01-29  本文已影响0人  徒手說梦话
import requests
from bs4 import BeautifulSoup
from pyecharts import Bar

# Module-level accumulator shared by the functions below: parse_page appends
# one {'城市': <city name>, '最低气温': <int>} dict per table row, and spider
# sorts and slices the list to build the chart.
all_data: list = []

def parse_page(url):
    """Scrape one regional forecast page and record each city's low temperature.

    Downloads ``url``, parses it with the ``html5lib`` parser, and for every
    data row of every table inside the first ``div.conMidtab`` appends
    ``{'城市': <city name>, '最低气温': <int>}`` to the module-level
    ``all_data`` list.

    Rows that cannot be interpreted (too few cells, empty cells, or a
    temperature that is not an integer) are skipped instead of aborting the
    whole scrape. If the page has no ``div.conMidtab`` nothing is appended.

    Args:
        url: Address of the regional forecast page to fetch.

    Raises:
        requests.RequestException: On connection failure, timeout, or an
            HTTP error status.
    """
    headers = {
        'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
    }
    # Without a timeout an unresponsive server would block the run forever.
    response = requests.get(url, headers=headers, timeout=10)
    # Fail with a clear HTTP error rather than trying to parse an error page.
    response.raise_for_status()
    text = response.content.decode('utf-8')
    # The Hong Kong/Macau/Taiwan page has incomplete HTML; html5lib is more
    # tolerant of that than lxml, at the cost of being slower.
    soup = BeautifulSoup(text, 'html5lib')
    today = soup.find('div', class_="conMidtab")
    if today is None:
        # Layout not recognised: nothing to extract from this page.
        return

    for table in today.find_all('table'):  # one table per province/municipality
        rows = table.find_all('tr')[2:]  # data starts at the third <tr>
        for index, tr in enumerate(rows):
            tds = tr.find_all('td')
            if len(tds) < 2:
                # Both the city cell and tds[-2] need at least two cells.
                continue
            # The first data row of a table carries an extra leading cell,
            # so its city name sits in the second <td> instead of the first.
            city_td = tds[1] if index == 0 else tds[0]
            city = next(city_td.stripped_strings, None)
            # The second-to-last cell holds the low temperature.
            low_weather = next(tds[-2].stripped_strings, None)
            if city is None or low_weather is None:
                continue
            try:
                low = int(low_weather)
            except ValueError:
                # Placeholder or otherwise non-numeric temperature.
                continue
            all_data.append({'城市': city, '最低气温': low})

def spider():
    """Scrape every regional page and chart the ten lowest minimum temperatures.

    Calls ``parse_page`` for each regional forecast URL (which fills the
    module-level ``all_data`` list), sorts ``all_data`` in place by minimum
    temperature in ascending order, and renders a bar chart of the first ten
    entries to ``temperature.html``.
    """
    # A tuple, not a set: the iteration order of a set of strings can change
    # between interpreter runs (hash randomization). Because the sort below
    # is stable, a varying page order would change which cities win ties for
    # the last places in the top ten.
    urls = (
        'http://www.weather.com.cn/textFC/hb.shtml',
        'http://www.weather.com.cn/textFC/db.shtml',
        'http://www.weather.com.cn/textFC/hd.shtml',
        'http://www.weather.com.cn/textFC/hz.shtml',
        'http://www.weather.com.cn/textFC/hn.shtml',
        'http://www.weather.com.cn/textFC/xb.shtml',
        'http://www.weather.com.cn/textFC/xn.shtml',
        'http://www.weather.com.cn/textFC/gat.shtml',
    )
    for url in urls:
        parse_page(url)

    # all_data now holds dicts such as {'城市': '济南', '最低气温': 2};
    # order them coldest first and keep the first ten.
    all_data.sort(key=lambda data: data['最低气温'])
    data = all_data[0:10]

    # Split the records into the parallel axis lists that the chart expects.
    cities = list(map(lambda x: x['城市'], data))
    weather = list(map(lambda x: x['最低气温'], data))

    chart = Bar("中国天气最低温排行榜")  # bar chart
    chart.add('', cities, weather)
    chart.render('temperature.html')


# Script entry point: runs the whole scrape and writes temperature.html.
# NOTE: there is no __main__ guard, so this also runs when the file is imported.
spider()

PS:这次爬虫需要注意的是:lxml 解析器不一定好用(对不完整的 HTML 容错性较差);html5lib 虽然兼容性高,但速度较慢。另外还要掌握 map 函数、sort 方法以及 lambda 表达式的用法。

上一篇下一篇

猜你喜欢

热点阅读