Python爬虫爬虫专题python爬虫

QQ音乐爬虫

2017-10-02  本文已影响617人  Evtion

脑海里一直有个想法,想做一个音乐播放的小程序,奈何还只停留在脑海之中。音乐数据的来源是个需要考虑的问题。之前用Nodejs爬取过酷狗音乐的歌曲,不过酷狗音乐歌手页下的单曲展示有数量上的限制(只有20首),所以瞄上了QQ音乐。QQ音乐和网易云音乐都是比较流行的播放平台,不过你懂的,小程序和QQ音乐是同根生,或许音乐格式的兼容性会更好。QQ音乐的歌曲格式是m4a。下面说说爬取音乐URL的逻辑。

请求地址以及请求头 音乐url请求头
getOneSongInfoCallback({
  "code": 0,
  "data": [
      {
          "action": {
              "alert": 100002,
              "icons": 8060,
              "msgdown": 0,
              "msgfav": 0,
              "msgid": 14,
              "msgpay": 6,
              "msgshare": 0,
              "switch": 17413891
          },
          "album": {
              "id": 651899,
              "mid": "003Xj8XB18Mhv2",
              "name": "A Muso Duro",
              "subtitle": "",
              "time_public": "1979-01-01",
              "title": "A Muso Duro"
          },
          "bpm": 92,
          "data_type": 0,
          "file": {
              "media_mid": "002TDCnP31VlKY",
              "size_128mp3": 3156681,
              "size_192aac": 3944375,
              "size_192ogg": 4143166,
              "size_24aac": 567922,
              "size_320mp3": 7904241,
              "size_48aac": 1008429,
              "size_96aac": 1860828,
              "size_ape": 0,
              "size_dts": 0,
              "size_flac": 0,
              "size_try": 0,
              "try_begin": 0,
              "try_end": 0
          },
          "fnote": 4009,
          "genre": 1,
          "id": 7152931,
          "index_album": 3,
          "index_cd": 0,
          "interval": 197,
          "isonly": 1,
          "ksong": {
              "id": 0,
              "mid": ""
          },
          "label": "4611686018427387904",
          "language": 19,
          "mid": "002TDCnP31VlKY",
          "modify_stamp": 0,
          "mv": {
              "id": 0,
              "name": "",
              "title": "",
              "vid": ""
          },
          "name": "Non Finirà",
          "pay": {
              "pay_down": 1,
              "pay_month": 1,
              "pay_play": 0,
              "pay_status": 0,
              "price_album": 0,
              "price_track": 200,
              "time_free": 0
          },
          "singer": [
              {
                  "id": 3038,
                  "mid": "000W9C4H0qCsUx",
                  "name": "Pierangelo Bertoli",
                  "title": "Pierangelo Bertoli",
                  "type": 0,
                  "uin": 0
              }
          ],
          "status": 0,
          "subtitle": "",
          "time_public": "1979-01-01",
          "title": "Non Finirà",
          "trace": "",
          "type": 0,
          "url": "http://stream9.qqmusic.qq.com/19152931.wma",
          "version": 0,
          "volume": {
              "gain": 1.8560,
              "lra": 11.8830,
              "peak": 0.8129999999999999
          }
      }
  ],
  "url": {
      "7152931": "ws.stream.qqmusic.qq.com/C100002TDCnP31VlKY.m4a?fromtag=38"
  },
  "url1": {
      
  },
  "extra_data": [
      
  ],
  "joox": 0,
  "joox_login": 1,
  "msgid": 0
})
   self.dealJson(response,["^getOneSongInfoCallback\(","\)$"])
   def dealJson(self,response,pattern):
      """Strip a JSONP wrapper from ``response.text`` and parse what is left.

      ``pattern`` is a two-element sequence of regular expressions: the first
      is removed from the text, then the second, and the remainder is decoded
      with ``json.loads``. Returns the decoded object.
      """
      prefixPattern=pattern[0]
      suffixPattern=pattern[1]
      # Remove the leading "callback(" first, then the trailing ")".
      body=re.sub(suffixPattern,'',re.sub(prefixPattern,'',response.text))
      return json.loads(body)
  # -*- coding: utf-8 -*-
import scrapy
from scrapy import Request
import logging
import json
import re
import urllib.parse
import urllib.request
from musicCrawl.items import MusiccrawlItem
from . import getsinger
class requestUrlSpider(scrapy.Spider):
    """Scrapy spider that collects QQ Music stream URLs for a list of singers.

    Request flow:
      1. ``start_requests`` reads singer names through ``getsinger`` and sends
         one search request per singer.
      2. ``songCount`` reads ``totalnum`` from that search response and
         schedules every result page for the same singer.
      3. ``parse`` pulls each song ``mid`` out of a result page and requests
         the single-song detail endpoint.
      4. ``parse_music`` yields one ``MusiccrawlItem`` per entry of the detail
         response's ``url`` mapping.
    """
    name = "requestUrl"
    # NOTE(review): every request below targets c.y.qq.com and is sent with
    # dont_filter=True; confirm this domain list is still what is wanted.
    allowed_domains = ["www.y.qq.com"]
    start_urls = ['http://www.y.qq.com/']
    # Headers sent with the search requests.
    # NOTE(review): the ':authority' / ':method' / ':path' / ':scheme' entries
    # look like HTTP/2 pseudo-headers copied from browser dev tools, and the
    # cookie is a captured session value -- confirm both are still needed.
    headers={
        ':authority':'c.y.qq.com',
        ':method':'GET',
        ':path':'/soso/fcgi-bin/client_search_cp?ct=24&qqmusic_ver=1298&new_json=1&remoteplace=txt.yqq.song&searchid=54134794373394557&t=0&aggr=1&cr=1&catZhida=1&lossless=0&flag_qc=0&p=1&n=20&w=%E8%B5%B5%E9%9B%B7&g_tk=5381&jsonpCallback=searchCallbacksong4621&loginUin=0&hostUin=0&format=jsonp&inCharset=utf8&outCharset=utf-8&notice=0&platform=yqq&needNewCode=0',
        ':scheme':'https',
        'accept':'*/*',
        'accept-encoding':'gzip, deflate, sdch, br',
        'accept-language':'zh-CN,zh;q=0.8',
        'cache-control':'no-cache',
        'cookie':'pgv_pvi=2539236352; RK=E/NGWlYOOU; pgv_pvid=4700237117; ptui_loginuin=1482816494; ptcz=0ed94d9b03e410a4a4d523a936e1de8f739a265f19407322ac522dc4402dd9f8; pt2gguin=o1482816494; yq_index=0; pgv_si=s1229479936; ts_last=y.qq.com/portal/search.html; ts_uid=7066888440; yqq_stat=0',
        'pragma':'no-cache',
        'referer':'https://y.qq.com/portal/search.html',
        'user-agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
    }
    # Headers sent with the single-song detail requests (same caveats as above).
    reqHeader={
        ':authority':'c.y.qq.com',
        ':method':'GET',
        ':path':'/v8/fcg-bin/fcg_play_single_song.fcg?songmid=001bhwUC1gE6ep&tpl=yqq_song_detail&format=jsonp&callback=getOneSongInfoCallback&g_tk=5381&jsonpCallback=getOneSongInfoCallback&loginUin=0&hostUin=0&format=jsonp&inCharset=utf8&outCharset=utf-8&notice=0&platform=yqq&needNewCode=0',
        ':scheme':'https',
        'accept':'*/*',
        'accept-encoding':'gzip, deflate, sdch, br',
        'accept-language':'zh-CN,zh;q=0.8',
        'cache-control':'no-cache',
        'cookie':'pgv_pvi=2539236352; RK=E/NGWlYOOU; pgv_pvid=4700237117; ptui_loginuin=1482816494; ptcz=0ed94d9b03e410a4a4d523a936e1de8f739a265f19407322ac522dc4402dd9f8; pt2gguin=o1482816494; pgv_si=s1229479936; yq_playdata=s; yq_playschange=0; yq_index=3; qqmusic_fromtag=66; player_exist=1; yplayer_open=0; ts_last=y.qq.com/n/yqq/song/001bhwUC1gE6ep.html; ts_uid=7066888440; yqq_stat=0',
        'pragma':'no-cache',
        'referer':'https://y.qq.com/n/yqq/song/001bhwUC1gE6ep.html',
        'user-agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
    }
    # First search request per singer; only used to read the total song count.
    url="https://c.y.qq.com/soso/fcgi-bin/client_search_cp?ct=24&qqmusic_ver=1298&new_json=1&remoteplace=txt.yqq.song&searchid=56365046261055832&t=0&aggr=1&cr=1&catZhida=1&lossless=0&flag_qc=0&p=1&n=50&w={singer}&g_tk=5381&jsonpCallback=searchCallbacksong412&loginUin=0&hostUin=0&format=jsonp&inCharset=utf8&outCharset=utf-8&notice=0&platform=yqq&needNewCode=0"
    # Paged search request; ``page`` and ``singer`` are filled in by songCount.
    allUrl="https://c.y.qq.com/soso/fcgi-bin/client_search_cp?ct=24&qqmusic_ver=1298&new_json=1&remoteplace=txt.yqq.song&searchid=63213556368351152&t=0&aggr=1&cr=1&catZhida=1&lossless=0&flag_qc=0&p={page}&n=164&w={singer}&g_tk=5381&jsonpCallback=searchCallbacksong8887&loginUin=0&hostUin=0&format=jsonp&inCharset=utf8&outCharset=utf-8&notice=0&platform=yqq&needNewCode=0"
    # Songs-per-page figure used to turn ``totalnum`` into a page count.
    # NOTE(review): allUrl asks for n=164 per page; 20 is kept as the divisor
    # so no page is skipped if the server returns fewer rows than requested.
    pageSize=20
    singerName=[]

    def start_requests(self):
        """Load singer names from MongoDB and request each singer's song count.

        Yields one ``Request`` per singer whose callback is ``songCount``. The
        URL-quoted singer name travels in ``meta["singer"]`` so the callback
        knows which singer the response belongs to.
        """
        getObj=getsinger.getsinger("mongouri","dbname","user","pass")
        getObj.connect()
        # Assign a fresh list on the instance instead of appending to the
        # class-level list, which would be shared and grow on every call.
        self.singerName=[doc["singerName"] for doc in getObj.finddata()]
        for singerName in self.singerName:
            singer=urllib.parse.quote(singerName)
            yield Request(
                url=self.url.format(singer=singer),
                headers=self.headers,
                callback=self.songCount,
                meta={"singer":singer},
                dont_filter=True,
            )

    def parse(self, response):
        """Extract every song ``mid`` from a search page and request its detail.

        Yields one ``Request`` per song, handled by ``parse_music``.
        """
        jsonData=self.dealJson(response,[r"^searchCallbacksong\d{0,}\(",r"\)$"])
        musicUrl="https://c.y.qq.com/v8/fcg-bin/fcg_play_single_song.fcg?songmid={mid}&tpl=yqq_song_detail&format=jsonp&callback=getOneSongInfoCallback&g_tk=5381&jsonpCallback=getOneSongInfoCallback&loginUin=0&hostUin=0&format=jsonp&inCharset=utf8&outCharset=utf-8&notice=0&platform=yqq&needNewCode=0"
        for song in jsonData["data"]["song"]["list"]:
            yield Request(
                url=musicUrl.format(mid=song["mid"]),
                headers=self.reqHeader,
                callback=self.parse_music,
                dont_filter=True,
            )

    def parse_music(self,response):
        """Turn a single-song detail response into ``MusiccrawlItem`` objects.

        The song name comes from the first entry of ``data``; one item is
        yielded for each value in the response's ``url`` mapping, prefixed
        with ``http://``. Nothing is yielded when ``data`` is empty.
        """
        jsonData=self.dealJson(response,[r"^getOneSongInfoCallback\(",r"\)$"])
        songs=jsonData["data"]
        if not songs:
            # An empty list would otherwise raise IndexError below.
            self.logger.warning("empty song data in response: %s",response.url)
            return
        musicName=songs[0]["name"]
        for val in jsonData["url"].values():
            item=MusiccrawlItem()
            item["name"]=musicName
            item["url"]="http://"+val
            yield item

    def dealJson(self,response,pattern):
        """Strip a JSONP wrapper from ``response.text`` and parse the JSON.

        ``pattern`` is a two-element sequence of regular expressions: the
        first is removed from the text, then the second, and the remainder is
        decoded with ``json.loads``. Returns the decoded object.
        """
        text=response.text
        text=re.sub(pattern[0],'',text)
        text=re.sub(pattern[1],'',text)
        return json.loads(text)

    def songCount(self,response):
        """Schedule every search-result page for the singer of this response.

        Reads ``totalnum`` from the response, converts it to a page count by
        ceiling division with ``pageSize``, and yields one paged search
        ``Request`` (callback ``parse``) per page for the singer stored in
        ``response.meta["singer"]``.
        """
        numData=self.dealJson(response,[r"^searchCallbacksong\d{0,}\(",r"\)$"])
        totalNum=numData["data"]["song"]["totalnum"]
        # Ceiling division: an exact multiple of pageSize needs no extra page.
        pageNum=(totalNum+self.pageSize-1)//self.pageSize
        singer=response.meta["singer"]
        # Pages are numbered from 1, matching the p=1 of the first-page URLs.
        for page in range(1,pageNum+1):
            yield Request(
                url=self.allUrl.format(page=page,singer=singer),
                headers=self.headers,
                callback=self.parse,
                dont_filter=True,
            )

如果想获取详细代码,可以直接访问 QQMusicCrawler,README 里有详细的使用说明。

上一篇 下一篇

猜你喜欢

热点阅读