Python例子之《郁莉为你讲古诗》音频离线
2017-09-26 本文已影响31人
By_syk
https://www.lizhi.fm/1804846/album/28562879269936667
郁莉为你讲故事 - 荔枝FM
Python 脚本代码:
# download audio resource from 荔枝FM
# author: By_syk <By_syk@163.com>
# date: 2017-09-26
import math
import os
import re
from urllib import parse
from urllib import request
# Root folder for downloads. It must end with a path separator, because the
# album folder path is built from it by plain string concatenation.
FOLDER_SAVE = r'E:/Download/LizhiFm/'
# Site origin that is prepended to the hrefs taken from a page's pagination block.
URL_BASE = r'https://www.lizhi.fm'
def download_page(url):
    """Fetch a page and return its body decoded as UTF-8 text.

    Args:
        url: Address of the page to download.

    Returns:
        The response body as a ``str``.

    Raises:
        urllib.error.URLError: If the request cannot be completed.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    # The context manager closes the response even when reading or decoding fails.
    with request.urlopen(url) as res:
        return res.read().decode('utf-8')
def download_audio(folder_save, audio_name, url_audio):
    """Download one audio file into ``folder_save`` unless it is already there.

    Args:
        folder_save: Existing directory that receives the file.
        audio_name: Human-readable title; it becomes the file name after
            whitespace and unsafe characters are replaced with underscores.
        url_audio: Address of the audio resource; its extension is reused
            for the saved file.

    Raises:
        urllib.error.URLError: If the audio cannot be fetched.
        OSError: If the file cannot be written.
    """
    # Collapse every whitespace run, then join the words with underscores.
    audio_name = re.sub(r'\s+', ' ', audio_name.strip()).replace(' ', '_')
    # Path separators and characters Windows rejects in file names would
    # otherwise make open() fail or write outside the target folder.
    audio_name = re.sub(r'[\\/:*?"<>|]', '_', audio_name)
    file_audio = os.path.join(folder_save, audio_name + get_res_suffix(url_audio))
    if os.path.exists(file_audio):
        # Already downloaded on a previous run.
        return
    with request.urlopen(url_audio) as res:
        data = res.read()
    with open(file_audio, 'wb') as out:
        out.write(data)
def parse_album_name(url_page, html_content):
    """Extract the display name of an album page or a user page.

    Args:
        url_page: URL the HTML was downloaded from; it decides which
            heading pattern is searched for.
        html_content: HTML text of that page.

    Returns:
        The name found in the page heading, or ``None`` when the URL is
        neither an album page nor a user page, or when the expected
        heading is missing.
    """
    if is_album_page(url_page):
        # Album pages keep the name in an element nested inside the heading.
        match = re.search(r'<h1 class="radioName">.+?>(.+?)<', html_content)
        if match:
            return match.group(1)
    if is_user_page(url_page):
        # User pages show "FM<digits> <name>"; only the name part is kept.
        match = re.search(r'<h1 class="user-info-name">FM\d+ (.+?)<', html_content)
        if match:
            return match.group(1)
    return None
def parse_all_page_url(url_first, html_content):
url_page_arr = [url_first]
match = re.search(r'<div class="page.+?</div>', html_content, re.S)
if not match:
return url_page_arr
block_page = match.group()
return url_page_arr + [URL_BASE + i for i in re.findall(r'<a href="([^"]+?)">', block_page)]
def parse_audios(html_content):
    """Find every audio entry in a page.

    Args:
        html_content: HTML text of a listing page.

    Returns:
        A list of ``(title, duration, url)`` tuples of strings, one per
        anchor that carries ``title``, ``data-duration`` and ``data-url``
        attributes in that order. ``duration`` is the digit string of
        ``data-duration``. The list is empty when nothing matches.
    """
    return re.findall(
        r'<a.+?title="(.+?)".+?data-duration="(\d+)".+?data-url="(.+?)"',
        html_content)
def get_res_suffix(url_res):
    """Return the file extension of a resource URL, including the dot.

    Only the path component of the URL is inspected, so a query string or
    fragment that contains dots cannot be mistaken for the extension.

    Args:
        url_res: Address of the resource.

    Returns:
        The extension such as ``'.mp3'``, or ``''`` when the last path
        segment has none.
    """
    return os.path.splitext(parse.urlsplit(url_res).path)[1]
def is_album_page(url_page):
    """Check whether a URL has the shape of an album page.

    Args:
        url_page: URL to test.

    Returns:
        A match object (truthy) for ``https://www.lizhi.fm/<digits>/album/<digits>``,
        otherwise ``None``.
    """
    return re.match(r'^https://www\.lizhi\.fm/\d+/album/\d+$', url_page)
def is_user_page(url_page):
    """Check whether a URL has the shape of a user page.

    Args:
        url_page: URL to test.

    Returns:
        A match object (truthy) for ``https://www.lizhi.fm/user/<digits>``,
        otherwise ``None``.
    """
    return re.match(r'^https://www\.lizhi\.fm/user/\d+$', url_page)
def readable_sec(sec):
    """Format a duration in seconds as ``'<h>h<m>m'`` or ``'<m>m'``.

    Leftover seconds are dropped, not rounded. The hour part appears only
    when the duration is at least one hour.

    Args:
        sec: Duration in seconds.

    Returns:
        Text such as ``'11h1m'`` or ``'59m'``.
    """
    text = ''
    # ">=" so that exactly one hour reads "1h0m" rather than "60m".
    if sec >= 60 * 60:
        hours, sec = divmod(sec, 60 * 60)
        text += str(hours) + 'h'
    return text + str(sec // 60) + 'm'
def readable_kb(kb):
    """Format a size given in kilobytes as KB, MB or GB text.

    Every unit rounds up: KB and MB to a whole number, GB to one decimal.

    Args:
        kb: Size in kilobytes.

    Returns:
        Text such as ``'500KB'``, ``'659MB'`` or ``'1.5GB'``.
    """
    if kb >= 1024 * 1024:
        # Scale to tenths of a GB before taking the ceiling, so an exact
        # tenth such as 1.5 GB is not pushed up to the next one.
        return '%.1fGB' % (math.ceil(kb * 10 / (1024 * 1024)) / 10)
    if kb >= 1024:
        return str(math.ceil(kb / 1024)) + 'MB'
    return str(math.ceil(kb)) + 'KB'
def download_all(url_html_album):
    """Download every audio listed on an album or user page.

    The page at ``url_html_album`` names the folder created under
    ``FOLDER_SAVE``. All of its listing pages are scanned for audio entries,
    and each distinct audio URL is downloaded once. Progress is printed
    along the way.

    Args:
        url_html_album: URL of an album page or a user page.

    Returns:
        None. When no album name can be parsed, an error line is printed
        and nothing is downloaded.
    """
    page = download_page(url_html_album)
    album_name = parse_album_name(url_html_album, page)
    if album_name is None:
        print('err. invalid url.')
        return
    print('album name:', album_name)

    folder_save_album = FOLDER_SAVE + album_name + '/'
    # exist_ok avoids the gap between checking for the folder and creating it.
    os.makedirs(folder_save_album, exist_ok=True)
    print('save folder:', folder_save_album)

    print('fetching all audios...')
    all_audios = []
    visited = set()
    for url_page in parse_all_page_url(url_html_album, page):
        if url_page in visited:
            continue
        visited.add(url_page)
        # The first page is already in hand; only the others need a request.
        html = page if url_page == url_html_album else download_page(url_page)
        all_audios += parse_audios(html)

    # Key by audio URL so an entry that appears on several pages counts once.
    all_audio_dict = {}
    total_audio_duration = 0
    for title, duration, url_audio in all_audios:
        if url_audio in all_audio_dict:
            continue
        all_audio_dict[url_audio] = title
        total_audio_duration += int(duration)
    # The size is a rough estimate: 17 KB for each second of audio.
    print(len(all_audio_dict), 'audios found, about', readable_sec(total_audio_duration), '/',
          readable_kb(total_audio_duration * 17))

    for index, (url_audio, title) in enumerate(all_audio_dict.items(), start=1):
        print('downloading audio %d: %s' % (index, title))
        download_audio(folder_save_album, title, url_audio)
    print('all done')
# Download the album this article is about. There is no __main__ guard here,
# so the call runs whenever the file is executed or imported.
download_all('https://www.lizhi.fm/1804846/album/28562879269936667')
输入输出:
python3 lizhi_fm_offline.py
album name: 郁莉为你讲古诗(1)
save folder: E:/Download/LizhiFm/郁莉为你讲古诗(1)/
fetching all audios...
100 audios found, about 11h1m / 659MB
downloading audio 1: 《月夜》--思念的翅膀==郁莉为你讲古诗
downloading audio 2: 《迢迢牵牛星》--天上人间==郁莉为你讲古诗
...omitted
all done
全部音频文件
拓展
为了方便离线荔枝FM平台的其他音频资源,稍加修改代码作为通用脚本:
# The hard-coded call is replaced by a prompt:
# download_all('https://www.lizhi.fm/1804846/album/28562879269936667')
if __name__ == '__main__':
    # Ask for the album or user page URL; surrounding whitespace is dropped.
    url_album = input('url: ').strip()
    download_all(url_album)
离线周建龙的《鬼吹灯全集》,启动脚本后输入其链接即可:
python3 lizhi_fm_offline.py
url: https://www.lizhi.fm/user/2617184632410917420
本文代码仅作学习交流之用,请勿用于其他用途。