Scraping the Maoyan TOP100 board with requests + BeautifulSoup

import requests
from bs4 import BeautifulSoup
import bs4
import pprint


def get_html(url, headers):
    """Fetch one board page and return its HTML text, or None on failure."""
    try:
        r = requests.get(url, headers=headers)
        r.raise_for_status()
        r.encoding = 'utf-8'
        return r.text
    except requests.RequestException as e:
        print("ERROR fetching {}: {}".format(url, e))
        return None


def fill_list(html, ulist):
    """Parse one board page and append [rank, name, cast, release date, score] rows to ulist."""
    soup = BeautifulSoup(html, 'html.parser')
    for dd in soup('dd'):
        if isinstance(dd, bs4.element.Tag):
            p_tags = dd.find_all('p')
            i_tags = dd.find_all('i')
            rank = i_tags[0].string                      # ranking number
            name = p_tags[0].string                      # movie title
            stat = "".join(p_tags[1].string.split())     # cast, with surrounding whitespace stripped
            releasetime = p_tags[2].string               # release date
            score = i_tags[1].string + i_tags[2].string  # integer and fraction halves of the score
            ulist.append([rank, name, stat, releasetime, score])


if __name__ == '__main__':
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit'
                      '/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari'
                      '/537.36'
    }
    base_url = 'http://maoyan.com/board/4'
    depth = 10  # the board is paged: 10 pages of 10 movies each
    ulist = []
    for j in range(depth):
        url = base_url + '?offset=' + str(j * 10)
        html = get_html(url, headers)
        if html:
            fill_list(html, ulist)
    pprint.pprint(ulist)  # pprint is only for nicer console output; the rows could be written straight to a database instead
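
For reference, the index-based lookups in fill_list assume that each <dd> on the board page carries the rank in its first <i>, the title, cast and release date in its first three <p> tags, and the two halves of the score in its second and third <i>. The snippet below is a trimmed, hand-written illustration of that structure (class names and the sample movie are assumptions, not the exact markup), run through the same extraction steps:

from bs4 import BeautifulSoup

# Simplified stand-in for one <dd> block on the board page (illustrative only).
sample_dd = '''
<dd>
  <i class="board-index">1</i>
  <p class="name"><a href="/films/1203">霸王别姬</a></p>
  <p class="star">
      主演:张国荣,张丰毅,巩俐
  </p>
  <p class="releasetime">上映时间:1993-01-01</p>
  <p class="score"><i class="integer">9.</i><i class="fraction">5</i></p>
</dd>
'''

dd = BeautifulSoup(sample_dd, 'html.parser').find('dd')
p_tags = dd.find_all('p')
i_tags = dd.find_all('i')
print(i_tags[0].string)                     # '1'                      -> rank
print(p_tags[0].string)                     # '霸王别姬'               -> name (text inside the <a>)
print("".join(p_tags[1].string.split()))    # '主演:张国荣,张丰毅,巩俐' -> cast, whitespace stripped
print(p_tags[2].string)                     # '上映时间:1993-01-01'    -> release date
print(i_tags[1].string + i_tags[2].string)  # '9.5'                    -> score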

Scraping result (screenshot of the pprint output omitted): one [rank, name, cast, release date, score] record per movie.
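
As the comment at the end of the script notes, pprint is only for display; the same rows could be written straight to a database instead. A minimal sketch using the standard-library sqlite3 module (the movies.db filename and the maoyan_top100 table schema are assumptions, not part of the original script):

import sqlite3

def save_to_db(ulist, db_path='movies.db'):
    """Persist [rank, name, cast, release date, score] rows to a SQLite table."""
    conn = sqlite3.connect(db_path)
    conn.execute(
        'CREATE TABLE IF NOT EXISTS maoyan_top100 '
        '(movie_rank TEXT, title TEXT, stars TEXT, release_time TEXT, score TEXT)'
    )
    conn.executemany('INSERT INTO maoyan_top100 VALUES (?, ?, ?, ?, ?)', ulist)
    conn.commit()
    conn.close()

# In __main__, replace pprint.pprint(ulist) with save_to_db(ulist).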