requests_html库的一些简单使用

2018-07-08  本文已影响0人  xin激流勇进

requests_html常用方法

from requests_html import HTMLSession

# One shared session; later snippets in this article reuse it.
session = HTMLSession()

# Fetch the cnblogs "recommended news" page and print each headline
# together with the absolute URL(s) it links to.
response = session.get('https://news.cnblogs.com/n/recommend')
for headline in response.html.find('h2.news_entry > a'):
    print(headline.text)
    print(headline.absolute_links)

加载js,下载漫画图片

%%time
# Jupyter cell: time how long it takes to fetch 15 comic pages.
# Requires `session` from an earlier cell and `display`/`Image` from
# IPython.display (imported in a later snippet of this article).
for i in range(1, 16):
    # Each page of the chapter is addressed by the ?p= query parameter.
    r = session.get('http://www.gugu5.com/n/14178/556176.html?p=%s'%i)
    # Render the page's JavaScript so the image element gets its real src.
    r.html.render()
    # '#qTcms_pic' is the comic image element; grab its src attribute.
    src = r.html.find('#qTcms_pic', first=True).attrs['src']
    print(src)
    # Show the image inline in the notebook output.
    display(Image(url=src))

http://html.python-requests.org/

小例子

from requests_html import HTMLSession
from IPython.display import display, Image

session = HTMLSession()

%%time
# Jupyter cell: same comic-download pattern as above for another chapter
# (14 pages). Uses `session`, `display`, and `Image` from the cell above.
for i in range(1, 15):
    r = session.get('http://www.gugu5.com/n/14178/531259.html?p=%s'%i)
    # Execute the page's JavaScript so the lazy-loaded image src is filled in.
    r.html.render()
    src = r.html.find('#qTcms_pic', first=True).attrs['src']
    
    # Render the image inline, then report progress per page.
    display(Image(url=src))
    print('第%s页'%i)

爬取猫眼top100电影

import requests
from bs4 import BeautifulSoup
import csv


def get_page(url):
    """Fetch *url* with browser-like headers and return the body as text.

    The Host/Referer/User-Agent headers mimic a normal browser visit so
    maoyan.com serves the page instead of blocking the scraper.
    """
    response = requests.get(
        url,
        headers={
            'Host': 'maoyan.com',
            'Referer': 'http://maoyan.com/board',
            'User-Agent': 'Mozilla/5.0',
        },
    )
    return response.text


def parse(content):
    """Parse one Maoyan board page into movie rows.

    Returns a list of [rank, title, stars, releasetime, score] lists, one
    per <dd> entry inside the dl.board-wrapper element.
    """
    soup = BeautifulSoup(content, 'lxml')
    board = soup.find('dl', attrs={'class': 'board-wrapper'})

    movies = []
    for entry in board.find_all('dd'):
        # stars / releasetime carry a "label: value" prefix; keep only the
        # part after the last ':' and trim surrounding whitespace.
        movies.append([
            entry.find('i').text,
            entry.find('div', attrs={'class': "movie-item-info"}).find('a').text,
            entry.find('p', attrs={'class': 'star'}).text.strip().split(':')[-1].strip(),
            entry.find('p', attrs={'class': 'releasetime'}).text.strip().split(':')[-1].strip(),
            entry.find('p', attrs={'class': 'score'}).text,
        ])

    return movies


def write(rows, file):
    """Append *rows* (an iterable of row lists) to CSV *file*.

    newline='' stops the csv module from emitting blank lines on Windows.
    encoding='utf-8' is the fix: the original relied on the platform
    default codec, which garbles (or raises on) the Chinese movie titles
    under e.g. cp1252/cp936 locales.
    """
    with open(file, 'a', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerows(rows)


def main():
    """Crawl all 10 pages of the Maoyan top-100 board into 1.csv."""
    base = 'http://maoyan.com/board/4?offset='
    # The board paginates 10 movies per page via the offset parameter.
    for offset in range(0, 100, 10):
        page_url = base + str(offset)
        print(page_url)
        write(parse(get_page(page_url)), '1.csv')


# Run the crawler only when executed as a script, not on import.
if __name__ == '__main__':
    main()
上一篇 下一篇

猜你喜欢

热点阅读