python自学

Python例子之《郁莉为你讲古诗》音频离线

2017-09-26  本文已影响31人  By_syk

https://www.lizhi.fm/1804846/album/28562879269936667

郁莉为你讲故事 - 荔枝FM

Python 脚本代码:

# download audio resource from 荔枝FM
# author: By_syk <By_syk@163.com>
# date: 2017-09-26

import os
import re
from urllib import request

import math

FOLDER_SAVE = r'E:/Download/LizhiFm/'
URL_BASE = r'https://www.lizhi.fm'


def download_page(url):
    """Fetch a page and return its body decoded as UTF-8 text.

    Args:
        url: Address passed to ``urllib.request.urlopen``.

    Returns:
        The complete response body as a ``str``.

    Raises:
        urllib.error.URLError: If the request cannot be completed.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    # The context manager closes the response even when read/decode raises.
    with request.urlopen(url) as res:
        return res.read().decode('utf-8')


def download_audio(folder_save, audio_name, url_audio):
    """Download one audio resource into ``folder_save`` unless already present.

    The file name is ``audio_name`` with surrounding whitespace stripped,
    inner whitespace runs replaced by ``_`` and file-system-unsafe characters
    replaced by ``_``, followed by the suffix that ``get_res_suffix`` derives
    from ``url_audio``.

    Args:
        folder_save: Destination directory; it is concatenated directly with
            the file name, so it must end with a path separator.
        audio_name: Title used to build the file name.
        url_audio: URL of the audio resource.

    Raises:
        urllib.error.URLError: If the download request fails.
        OSError: If the target file cannot be written.
    """
    audio_name = re.sub(r'\s+', ' ', audio_name.strip()).replace(' ', '_')
    # Titles come from scraped HTML: neutralise path separators and the
    # characters Windows forbids in file names so open() cannot fail on them
    # or write outside folder_save.
    audio_name = re.sub(r'[\\/:*?"<>|]', '_', audio_name)
    file_audio = folder_save + audio_name + get_res_suffix(url_audio)

    if os.path.exists(file_audio):
        return

    # Stream into a ".part" file and rename only after success, so an
    # interrupted download is never mistaken for a finished one on re-run.
    file_part = file_audio + '.part'
    with request.urlopen(url_audio) as res, open(file_part, 'wb') as file:
        for chunk in iter(lambda: res.read(64 * 1024), b''):
            file.write(chunk)
    os.replace(file_part, file_audio)


def parse_album_name(url_page, html_content):
    """Extract the display name of an album page or a user page.

    Args:
        url_page: URL the HTML was fetched from; it selects which heading
            pattern is tried.
        html_content: HTML text of that page.

    Returns:
        The captured name, or ``None`` when the URL is neither an album nor
        a user page, or when the expected heading is not found.
    """
    # (URL predicate, heading pattern) pairs, tried in this order.
    extractors = (
        (is_album_page, r'<h1 class="radioName">.+?>(.+?)<'),
        (is_user_page, r'<h1 class="user-info-name">FM\d+ (.+?)<'),
    )
    for applies_to, heading_pattern in extractors:
        if not applies_to(url_page):
            continue
        found = re.search(heading_pattern, html_content)
        if found:
            return found.group(1)
    return None


def parse_all_page_url(url_first, html_content):
    """Collect the URLs of all listing pages reachable from the first page.

    Args:
        url_first: URL of the page whose HTML is given; always the first
            element of the result.
        html_content: HTML text of that page.

    Returns:
        A list starting with ``url_first``, followed by ``URL_BASE`` joined
        with every ``href`` found inside the first ``<div class="page...">``
        block. Only ``[url_first]`` when no such block exists.
    """
    pager = re.search(r'<div class="page.+?</div>', html_content, re.S)
    hrefs = re.findall(r'<a href="([^"]+?)">', pager.group()) if pager else []

    pages = [url_first]
    # The hrefs are prefixed with the site root to form full URLs.
    pages.extend(URL_BASE + href for href in hrefs)
    return pages


def parse_audios(html_content):
    """Find every audio entry in a listing page.

    An entry is an ``<a`` tag that carries, in this order and on one line,
    the attributes ``title``, ``data-duration`` (digits) and ``data-url``.

    Args:
        html_content: HTML text of a listing page.

    Returns:
        A list of ``(title, duration, url)`` string tuples in document order.
    """
    entry = re.compile(r'<a.+?title="(.+?)".+?data-duration="(\d+)".+?data-url="(.+?)"')
    return [hit.groups() for hit in entry.finditer(html_content)]


def get_res_suffix(url_res):
    """Return the file extension (with leading dot) of a resource URL.

    Only the path component is inspected, so a query string or fragment
    never leaks into the suffix, and dots in the host name are ignored.

    Args:
        url_res: URL (or plain file name) of the resource.

    Returns:
        The extension such as ``'.mp3'``, or ``''`` when the last path
        segment has no extension.
    """
    # Imported locally so this function does not depend on additional
    # module-level imports.
    import posixpath
    from urllib.parse import urlsplit

    return posixpath.splitext(urlsplit(url_res).path)[1]


def is_album_page(url_page):
    """Check whether ``url_page`` has the form of a Lizhi FM album URL.

    Returns:
        The ``re`` match object (truthy) when the URL matches
        ``https://www.lizhi.fm/<digits>/album/<digits>``, otherwise ``None``.
    """
    album_url = re.compile(r'^https://www\.lizhi\.fm/\d+/album/\d+$')
    return album_url.match(url_page)


def is_user_page(url_page):
    """Check whether ``url_page`` has the form of a Lizhi FM user URL.

    Returns:
        The ``re`` match object (truthy) when the URL matches
        ``https://www.lizhi.fm/user/<digits>``, otherwise ``None``.
    """
    user_url = re.compile(r'^https://www\.lizhi\.fm/user/\d+$')
    return user_url.match(url_page)


def readable_sec(sec):
    """Format a duration in seconds as ``'<h>h<m>m'``.

    The hour part is omitted for durations shorter than one hour; leftover
    seconds are truncated, not rounded.

    Args:
        sec: Duration in seconds.

    Returns:
        For example ``'11h1m'`` for 39660 and ``'59m'`` for 3599.
    """
    text = ''
    # ">=" so that exactly one hour renders as "1h0m" rather than "60m".
    if sec >= 60 * 60:
        hours, sec = divmod(sec, 60 * 60)
        text += str(hours) + 'h'
    return text + str(sec // 60) + 'm'


def readable_kb(kb):
    """Format a size given in kilobytes using the largest fitting unit.

    Every unit rounds up: KB and MB to a whole number, GB to one decimal.

    Args:
        kb: Size in kilobytes.

    Returns:
        For example ``'512KB'``, ``'659MB'`` or ``'1.5GB'``.
    """
    kb_per_mb = 1024
    kb_per_gb = 1024 * 1024
    # ">=" so an exact unit size is shown in that unit ("1MB", not "1024KB").
    if kb >= kb_per_gb:
        # Take a true ceiling in tenths of a GB. The former "+ 0.05" trick
        # depended on float rounding and showed exactly 1 GB as "1.1GB".
        tenths = math.ceil(kb * 10 / kb_per_gb)
        return '%.1fGB' % (tenths / 10)
    if kb >= kb_per_mb:
        return str(math.ceil(kb / kb_per_mb)) + 'MB'
    return str(math.ceil(kb)) + 'KB'


def download_all(url_html_album):
    """Download every audio listed on a Lizhi FM album or user page.

    Steps: fetch the first page, derive the album name, create
    ``FOLDER_SAVE/<album name>/``, walk all listing pages to collect the
    audio entries, print a summary, then download each audio in turn.
    Progress is reported with ``print``.

    Args:
        url_html_album: URL of the album page or user page.

    Returns:
        ``None``. Returns early after printing an error when no album name
        can be parsed for the URL.
    """
    first_page = download_page(url_html_album)

    album_name = parse_album_name(url_html_album, first_page)
    if album_name is None:
        print('err. invalid url.')
        return
    print('album name:', album_name)

    folder_save_album = FOLDER_SAVE + album_name + '/'
    # exist_ok avoids the check-then-create race of isdir() + makedirs().
    os.makedirs(folder_save_album, exist_ok=True)
    print('save folder:', folder_save_album)

    print('fetching all audios...')
    # Maps audio URL -> title. Keying on the URL drops entries that repeat
    # across pages; the first title seen for a URL wins.
    all_audio_dict = {}
    total_audio_duration = 0
    seen_pages = set()
    for url_page in parse_all_page_url(url_html_album, first_page):
        if url_page in seen_pages:
            continue
        seen_pages.add(url_page)
        # The first page is already in memory; do not fetch it a second time.
        if url_page == url_html_album:
            html = first_page
        else:
            html = download_page(url_page)
        for title, duration, url_audio in parse_audios(html):
            if url_audio in all_audio_dict:
                continue
            all_audio_dict[url_audio] = title
            total_audio_duration += int(duration)
    # NOTE(review): the factor 17 presumably approximates KB per second of
    # audio for the size estimate -- confirm against real file sizes.
    print(len(all_audio_dict), 'audios found, about', readable_sec(total_audio_duration), '/',
          readable_kb(total_audio_duration * 17))

    for index, (url_audio, title) in enumerate(all_audio_dict.items(), start=1):
        print('downloading audio %d: %s' % (index, title))
        download_audio(folder_save_album, title, url_audio)

    print('all done')

# Guarded so that importing this module does not start a download.
if __name__ == '__main__':
    download_all('https://www.lizhi.fm/1804846/album/28562879269936667')

输入输出:

python3 lizhi_fm_offline.py
album name: 郁莉为你讲古诗(1)
save folder: E:/Download/LizhiFm/郁莉为你讲古诗(1)/
fetching all audios...
100 audios found, about 11h1m / 659MB
downloading audio 1: 《月夜》--思念的翅膀==郁莉为你讲古诗
downloading audio 2: 《迢迢牵牛星》--天上人间==郁莉为你讲古诗
...omitted
all done
全部音频文件

拓展

为了方便离线荔枝FM平台的其他音频资源,稍加修改代码作为通用脚本:

# download_all('https://www.lizhi.fm/1804846/album/28562879269936667')
if __name__ == '__main__':
    # Prompt for the album or user page URL; surrounding whitespace is dropped.
    download_all(input('url: ').strip())

离线周建龙的《鬼吹灯全集》,启动脚本后输入其链接即可:

python3 lizhi_fm_offline.py
url: https://www.lizhi.fm/user/2617184632410917420

本文代码仅作学习交流之用,请勿用于其他用途。

上一篇下一篇

猜你喜欢

热点阅读