利用浏览器插件爬取抖音热门歌曲

2018-08-02  本文已影响0人  no_ones

今天在360浏览器里看到了一个可以在电脑上看抖音的插件,相比手机app,它的页面格式相当简单,正好学习一波如何抓取音频。
直接上代码:

from pyquery import PyQuery as pq
import requests
from requests.exceptions import ConnectionError
import os
from hashlib import md5

def get_html(url, header, timeout=10):
    """Fetch ``url`` and return the page text.

    Args:
        url: Address of the page to download.
        header: Dict of HTTP request headers passed to ``requests.get``.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` can block indefinitely on a stalled
            connection.

    Returns:
        The response body as text when the server answers with status 200;
        ``None`` on any other status or when the request fails.
    """
    try:
        response = requests.get(url, headers=header, timeout=timeout)
    except ConnectionError:
        print('connect error')
        return None
    except requests.exceptions.RequestException as exc:
        # Read timeouts, redirect loops, invalid URLs, ... are request
        # failures too, but they are not ConnectionError subclasses.
        print('request error: {0}'.format(exc))
        return None

    if response.status_code != 200:
        return None
    # Decode with the encoding detected from the body itself.
    response.encoding = response.apparent_encoding
    return response.text

def get_links(html):
    """Yield ``(audio_src, title)`` pairs scraped from a ranking page.

    Args:
        html: Page markup as a string. ``None`` or an empty string (for
            example after a failed download) yields nothing.

    Yields:
        Tuples of the ``data-audio`` attribute of each ``.rankbox ul li``
        element and the text of the matching ``.leftbox-bd .tit a`` element.
        The two selections are paired positionally with ``zip``, so the
        result stops at the shorter of the two.
    """
    if not html:
        # Nothing to parse; avoid handing None to the parser.
        return
    doc = pq(html)
    # .items() can only be iterated once; call it again to iterate anew.
    audio_nodes = doc('.rankbox ul li').items()
    title_nodes = doc('.leftbox-bd .tit a').items()
    for audio_node, title_node in zip(audio_nodes, title_nodes):
        yield (audio_node.attr('data-audio'), title_node.text())

def get_music(url, timeout=30):
    """Download the audio file at ``url`` and return its raw bytes.

    Args:
        url: Address of the audio file.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` can block indefinitely on a stalled
            connection.

    Returns:
        The response body as ``bytes`` when the server answers with status
        200; ``None`` on any other status or when the request fails.
    """
    print("Downing" + url)
    try:
        response = requests.get(url, timeout=timeout)
    except requests.exceptions.RequestException as exc:
        # Covers connection errors as well as timeouts and other request
        # failures, so one bad track does not abort the whole run.
        print('request error: {0}'.format(exc))
        return None

    if response.status_code == 200:
        return response.content
    return None

def save_music(content, name):
    """Write ``content`` to ``<current working directory>/<name>.mp3``.

    Args:
        content: Raw audio bytes, or ``None`` when the download failed.
            ``None`` is skipped so that no empty file is left behind.
        name: Track title used as the file name. Characters that cannot
            appear in a file name (path separators etc.) become ``_``.

    An existing file with the same name is left untouched.
    """
    if content is None:
        print('no content for {0}, skipped'.format(name))
        return

    # A title such as "a/b" would otherwise point into a missing directory.
    safe_name = str(name).translate({ord(ch): '_' for ch in '\\/:*?"<>|'})
    # The current directory is where the script is run from, which is not
    # necessarily the directory that contains the script.
    file_path = os.path.join(os.getcwd(), '{0}.mp3'.format(safe_name))
    print(file_path)
    # Do not overwrite a track that has already been downloaded.
    if not os.path.exists(file_path):
        with open(file_path, 'wb') as f:
            f.write(content)


def main():
    """Download every track listed on ranking pages 1 and 2.

    For each page the HTML is fetched with ``get_html``, the
    ``(audio_src, title)`` pairs are extracted with ``get_links``, and each
    track is downloaded with ``get_music`` and stored with ``save_music``.
    Pages or tracks that fail to download are skipped instead of aborting
    the run.
    """
    base_url = "https://kuaiyinshi.com/hot/music/?source=dou-yin&page="
    for i in range(1, 3):
        header = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
            # Use the previous page number as the referer.
            'referer': base_url + str(i - 1),
            'cookie': 'Hm_lvt_67cfc45e6393b98852546ccd940217ac=1533194039; Hm_lpvt_67cfc45e6393b98852546ccd940217ac=1533201706; Hm_lvt_cdce8cda34e84469b1c8015204129522=1533194039; Hm_lpvt_cdce8cda34e84469b1c8015204129522=1533201706'}
        html = get_html(base_url + str(i), header)
        if html is None:
            # get_html returns None on a failed request or non-200 status.
            print('skip page {0}'.format(i))
            continue
        for (audio_src, title) in get_links(html):
            if not audio_src:
                # Element without a data-audio attribute: nothing to fetch.
                continue
            # Drops the first two characters before prepending the scheme;
            # presumably data-audio is protocol-relative ("//host/path").
            # TODO confirm against the live page.
            link = 'http://' + audio_src[2:]
            content = get_music(link)
            if content is None:
                # Download failed; do not try to write None to disk.
                continue
            save_music(content, title)


# Start the download only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

学到了如何同时遍历两个数组,路还很长啊……

上一篇 | 下一篇

猜你喜欢

热点阅读