Python多线程，单线程，协程爬虫某音乐实战对比

2019-05-19 本文已影响6人 望月成三人

单线程，多线程下载某云音乐

import re
import urllib.request
import requests
from bs4 import BeautifulSoup
import os
import time
from Threads import BaseThread

def PATH(p):
    """Return the absolute path of *p* resolved against this file's directory."""
    base_dir = os.path.dirname(__file__)
    return os.path.abspath(os.path.join(base_dir, p))

'''
https://music.163.com/playlist?id=  得到播放列表
http://music.163.com/song/media/outer/url?id= 得到下载链接
urllib.request.urlretrieve  把远程下载的mp3文件下载到本地
'''


class Music163:
    """Scrape a music.163.com playlist page and download its songs as mp3 files.

    Songs are represented as two-item lists ``[song_id, title]``; files are
    written to ``mp3/<title>.mp3`` next to this source file.
    """

    # Pulls (song id, title) out of one rendered <a> tag of the song list.
    # Compiled once here instead of on every loop iteration.
    _SONG_LINK_RE = re.compile('<a .*?id=(.*?)">(.*?)</a>', re.S)

    def __init__(self):
        pass

    def get_music_163(self, id):
        """Fetch the playlist page and return its songs.

        Args:
            id: Playlist id as a string (it is concatenated into the URL).

        Returns:
            A list of ``[song_id, title]`` lists, in page order.

        Raises:
            ValueError: If the page does not contain the ``ul.f-hide`` song
                list (for example when the request was rejected).
        """
        user_agent = 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.82 ' \
                     'Safari/537.36 '
        headers = {'User-Agent': user_agent}
        # ``headers`` must be passed by keyword: the second positional
        # parameter of requests.get is ``params`` (the query string), so the
        # User-Agent would otherwise never be sent as a header.
        data = requests.get(
            "https://music.163.com/playlist?id=" + id,
            headers=headers,
            timeout=10,
        ).text
        soup = BeautifulSoup(data, 'lxml')
        # attrs must be a dict ({"class": "f-hide"}), not a set literal.
        song_list = soup.find("ul", {"class": "f-hide"})
        if song_list is None:
            raise ValueError("playlist %s: song list (ul.f-hide) not found in page" % id)
        temp = []
        for link in song_list.find_all("a"):
            items = self._SONG_LINK_RE.findall(str(link))
            if items:  # skip anchors that are not song links
                temp.append([items[0][0], items[0][1]])
        return temp

    # Batch download
    def download(self, value):
        """Download every song in *value* (a list of ``[song_id, title]``) one by one."""
        for song in value:
            self.get(song)

    # Single download
    def get(self, value):
        """Download one song unless its mp3 file already exists.

        Args:
            value: A ``[song_id, title]`` pair as returned by ``get_music_163``.
        """
        target = PATH("mp3/" + value[1] + ".mp3")
        if os.path.isfile(target):
            print("%s已经被下载了" % value[1])
            return
        # urlretrieve does not create missing directories.
        os.makedirs(PATH("mp3"), exist_ok=True)
        url = 'http://music.163.com/song/media/outer/url?id=' + value[0] + '.mp3'
        urllib.request.urlretrieve(url, target)
        print("%s下载成功" % value[1])


# 多线程
def multi_thread():
    """Download every song of the playlist with one thread per song and print the elapsed time."""
    # Function-scope import so this block does not depend on the file's import list.
    import threading

    playlist_id = "2786226719"  # id of the playlist to download
    start_time = time.time()
    mc = Music163()
    data = mc.get_music_163(playlist_id)
    # Pass the callable and its argument separately. Writing
    # ``SomeThread(mc.get(song))`` would run the download right here in the
    # main thread and hand the thread its return value (None), which makes
    # the whole run sequential.
    threads = [threading.Thread(target=mc.get, args=(song,)) for song in data]
    for worker in threads:
        worker.start()
    for worker in threads:
        worker.join()
    end_time = time.time()
    print("共耗时%.2f" % (end_time - start_time) + "秒")
    # Original author's note: multi-threaded run took 47 seconds
    # (measured before the downloads were actually dispatched to threads).


# 运行单线程
def run():
    """Download the whole playlist sequentially and print the elapsed time."""
    playlist = "2786226719"  # id of the playlist to download
    began = time.time()
    client = Music163()
    client.download(client.get_music_163(playlist))
    elapsed = time.time() - began
    print("共耗时%.2f" % elapsed + "秒")
    # Original author's note: single-threaded run took 43 seconds


# Script entry point: runs the multi-threaded variant by default.
if __name__ == "__main__":
    # Uncomment the next line (and comment out multi_thread()) to time the
    # single-threaded variant instead.
    # run()
    multi_thread()

单线程共下载100首歌,耗时9.09秒

dan.png

多线程共下载100首歌,耗时9.60秒

image.png

协程下载的代码

import time
from multiprocessing import Process
from gevent import monkey
import urllib.request
import BaseMusic163
monkey.patch_all()
import gevent
import os

def PATH(p):
    """Resolve *p* relative to the directory containing this file; return an absolute path."""
    here = os.path.dirname(__file__)
    joined = os.path.join(here, p)
    return os.path.abspath(joined)
'''
协程发请求,
'''
class Producer(object):
    """Download every song of a playlist using one gevent greenlet per song.

    Constructing an instance immediately runs the whole download and prints
    the elapsed time, which lets the class itself be used as a process target.
    """

    def __init__(self):
        self._rungevent()

    def _rungevent(self):
        """Fetch the playlist, spawn one greenlet per song, wait for all, print timing."""
        playlist_id = "2786226719"  # id of the playlist to download
        start_time = time.time()
        mc = BaseMusic163.Music163()
        data = mc.get_music_163(playlist_id)
        # Original author's note: Windows has a limit of 1024 ports.
        # Pass the callable and its argument separately:
        # ``gevent.spawn(self.produce(song))`` would run the download
        # synchronously here and spawn a greenlet on its return value (None).
        jobs = [gevent.spawn(self.produce, song) for song in data]
        gevent.joinall(jobs)
        end_time = time.time()
        print("共耗时%.2f" % (end_time - start_time) + "秒")

    def produce(self, value):
        """Download one song unless its mp3 file already exists.

        Args:
            value: A ``[song_id, title]`` pair.
        """
        target = PATH("mp3/" + value[1] + ".mp3")
        if os.path.isfile(target):
            print("%s已经被下载了" % value[1])
            return
        # urlretrieve does not create missing directories.
        os.makedirs(PATH("mp3"), exist_ok=True)
        url = 'http://music.163.com/song/media/outer/url?id=' + value[0] + '.mp3'
        urllib.request.urlretrieve(url, target)
        print("%s下载成功" % value[1])


def main():
    """Start a child process whose target is ``Producer`` (constructing it runs the download)."""
    worker = Process(target=Producer, args=())
    worker.start()


# Script entry point: only start the child process when run directly,
# not when this module is imported.
if __name__ == '__main__':
    main()

下载时间

image.png

结论

昨天测试，发现是协程>多线程>单线程
今天测试却是：多线程>协程>单线程
当然也会出现单线程耗时反而比多线程耗时短的情况
一直流传“多进程+协程”可以解决 Python 的 GIL 问题。由于本次测试的数据不多，使用的也是单进程+协程的方式；后续有机会，将采用多进程+协程的方式对协程进行大量数据的测试。
源码获取

Python多线程，单线程，协程爬虫某音乐实战对比

结论

猜你喜欢

热点阅读