2019-下载喜马拉雅音频
2019-08-01 本文已影响0人
berrycam
#coding=utf-8
'''
author : berrycam
time : 2019.8.1
'''
import os
import requests
from lxml import etree
class DownloadMedia(object):
    '''
    Download the audio tracks of a Ximalaya album.

    Typical use is ``get_url_list()`` followed by ``down_media_m4a()``
    called with the two values the first call returned.
    '''

    # Seconds to wait on a single HTTP request when the caller gives no timeout.
    DEFAULT_TIMEOUT = 30
    # Characters replaced in album/track titles before they are used as
    # folder or file names (path separators and Windows-reserved characters).
    _INVALID_NAME_CHARS = '\\/:*?"<>|'

    def __init__(self, target, timeout=DEFAULT_TIMEOUT):
        '''
        :param target: URL of the album page to download from
        :param timeout: seconds to wait for each HTTP request
        '''
        self.__target_url = target
        self.__timeout = timeout
        self.__headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/73.0.3683.75 Safari/537.36'}

    def _get(self, url):
        '''Issue a GET request with the shared headers and timeout.'''
        return requests.get(url, headers=self.__headers, timeout=self.__timeout)

    @classmethod
    def _safe_name(cls, name):
        '''
        Turn a title into a string usable as a file or folder name.

        :param name: raw title text
        :return: the title with reserved/control characters replaced by
                 ``_`` and surrounding blanks / trailing dots removed;
                 ``'untitled'`` when nothing is left
        '''
        invalid = cls._INVALID_NAME_CHARS
        cleaned = ''.join('_' if ch in invalid or ord(ch) < 32 else ch for ch in name)
        cleaned = cleaned.strip().rstrip('. ')
        return cleaned or 'untitled'

    @staticmethod
    def _track_json_url(media_href):
        '''
        Build the JSON metadata URL of a track from its page link.

        :param media_href: href of the track page; its last path segment
                           is used as the track id
        :return: URL of the track's JSON description
        '''
        # rstrip('/') so that a trailing slash does not yield an empty id.
        track_id = media_href.strip().rstrip('/').split('/')[-1]
        return 'http://www.ximalaya.com/tracks/{}.json'.format(track_id)

    def _page_urls(self, page_count):
        '''
        Build the URLs of the album's track-list pages.

        :param page_count: number of list pages
        :return: list of page URLs, ``<album>/p1`` .. ``<album>/p<page_count>``
        '''
        base = self.__target_url
        # Without the separator the page suffix would be glued onto the album id.
        if not base.endswith('/'):
            base += '/'
        return ['{}p{}'.format(base, num) for num in range(1, int(page_count) + 1)]

    @staticmethod
    def _page_count(elements):
        '''
        Read the total number of list pages from the pagination bar.

        :param elements: parsed album page (lxml element tree)
        :return: page count as int; 1 when the pagination entries are missing
        '''
        nav_items = elements.xpath('//*[@id="anchor_sound_list"]/div[2]/div/nav/ul/li')
        # The last page number is read from the second-to-last entry. With
        # fewer than two entries there is nothing to read -- presumably the
        # album fits on one page (TODO confirm against the live markup).
        if len(nav_items) < 2:
            return 1
        return int(nav_items[-2].xpath('./a/span')[0].text)

    def get_url_list(self):
        '''
        Collect the JSON metadata URLs of all tracks in the album and
        create the folder the tracks will be saved into.

        :return: tuple ``(media_url_list, folder_path)``; the folder is
                 named after the album title and created if missing
        '''
        elements = etree.HTML(self._get(self.__target_url).text)
        # Create the download folder, named after the (sanitized) album title.
        album_title = elements.xpath('//*[@id="root"]/main/section/div/div[2]/div[1]/div[1]/div[2]/div[2]/h1')[0].text.strip()
        folder_path = self._safe_name(album_title)
        if not os.path.isdir(folder_path):
            os.makedirs(folder_path)
        # Visit every list page and gather one JSON URL per track entry.
        media_url_list = []
        for url in self._page_urls(self._page_count(elements)):
            page = etree.HTML(self._get(url).text)
            for li in page.xpath('//*[@id="anchor_sound_list"]/div[2]/ul/li'):
                hrefs = li.xpath('./div[2]/a/@href')
                if not hrefs:
                    # List entry without a track link; nothing to download.
                    continue
                media_url_list.append(self._track_json_url(hrefs[0]))
        return media_url_list, folder_path

    def down_media_m4a(self, media_url_list, folder_path):
        '''
        Download every track as an ``.m4a`` file.

        A track that cannot be fetched is reported on stdout and skipped,
        so one bad entry does not abort the remaining downloads.

        :param media_url_list: JSON metadata URLs as returned by ``get_url_list``
        :param folder_path: existing folder to write the audio files into
        '''
        for url in media_url_list:
            resp = self._get(url)
            if resp.status_code != 200:
                print('error :%d' % resp.status_code)
                continue
            json_dict = resp.json()
            play_path = json_dict.get('play_path_64')
            title = json_dict.get('title')
            if not play_path or not title:
                # Without these fields there is no audio URL or file name to use.
                print('error :no play_path_64 or title in %s' % url)
                continue
            file_path = os.path.join(folder_path, self._safe_name(title) + '.m4a')
            resp_data = self._get(play_path)
            if resp_data.status_code == 200:
                with open(file_path, 'wb') as f:
                    f.write(resp_data.content)
                print(' %s finish' % file_path)
            else:
                print('error :%d' % resp_data.status_code)
if __name__ == '__main__':
    # Album page to download; replace the literal with input() to ask the
    # user for the address instead.
    album_url = 'https://www.ximalaya.com/yinyue/16162468/'
    downloader = DownloadMedia(album_url)
    # get_url_list() returns (track URLs, folder), which is exactly the
    # argument pair down_media_m4a() expects.
    downloader.down_media_m4a(*downloader.get_url_list())