Python采集美拍视频

2020-08-19  本文已影响0人  乂尤先生

首先我们利用开发者工具进行抓包分析:

页面抓包.png
通过开发者工具可以看到,左侧列出了本次页面加载时动态请求的资源,其中这个链接是以 GET 方式请求的。接下来可以查看并分析它的响应内容。
分析.png
对返回的 JSON 进行格式化后,基本可以断定该链接就是美拍的视频列表接口,下面对这段 JSON 进行分析处理:
{
    "medias":[
        {
            "id":1224285358,
            "client_id":1089857302,
            "caption":"当老爸变成了灯神.....",
            "entry_info":null,
            "weibo_share_caption":"#美拍#当老爸变成了灯神.....",
            "facebook_share_caption":"",
            "weixin_share_caption":"分享@祝晓晗🌻 的美拍",
            "weixin_friendfeed_share_caption":"当老爸变成了灯神.....",
            "qzone_share_caption":"当老爸变成了灯神.....",
            "qq_share_caption":"当老爸变成了灯神.....",
            "instagram_share_caption":"分享祝晓晗🌻的美拍“当老爸变成了灯神.....”,快来看看!",
            "weixin_share_sub_caption":"来跟我一起玩美拍吧~",
            "weixin_friendfeed_share_sub_caption":"来跟我一起玩美拍吧~",
            "qzone_share_sub_caption":"来跟我一起玩美拍吧~",
            "qq_share_sub_caption":"来跟我一起玩美拍吧~",
            "geo":null,
            "video":"0c02aHR0cHM635RLy9tdnZpZGVvMTEubWVpdHVkYXRhLmNvbS81ZjI1NzY0MmE1MmJkMTV1bThhOGFwODk0MV9IMjY0XzRfMjJlNTFmNTM0MGRm2mSmMjcubXA0",
            "url":"https://www.meipai.com/media/1224285358?client_id=1089857306&utm_media_id=1224285358&utm_source=meipai_share&gid=",
            "cover_pic":"https://mvimg11.meitudata.com/5f257642c12ec6a1eqs8673689.jpg",
            "pic_size":"720*1280",
            "category":3,
            "time":49,
            "is_long":true,
            "show_controls":false,
            "created_at":"08-01 22:05",
            "comments_count":114,
            "likes_count":3100,
            "reposts_count":3,
            "user":Object{...},
            "cur_lives_id":"",
            "cur_lives_type":0,
            "cur_lives_stream_type":0,
            "cur_lives_scheme":"",
            "cur_yy_actid":"",
            "feed_id":"6856016371960585390",
            "locked":false,
            "type":5,
            "caption_url_params":[

            ],
            "privacy_config":Object{...},
            "has_watermark":0,
            "refuse_gift":true,
            "refuse_gift_reason":"送礼功能已下线",
            "hide_gift_btn":true,
            "new_music":Object{...},
            "convert_cover_pic":"",
            "ar_magic_info":null,
            "aside_info":null,
            "convert_pic_size":"",
            "cover_pic_resize":null,
            "m_plan":false,
            "ad_level":"N",
            "first_frame_pic":"http://mvimg10.meitudata.com/5f257aaf2bc6c3332.jpg",
            "first_frame_pic_size":"720*1280",
            "is_safe":1,
            "is_safe2":1,
            "miniprogram_data":Array[0],
            "cover_title":"",
            "dangerous_action":false,
            "category_tag_id":0,
            "recommend_caption":null,
            "recommend_cover_pic":null,
            "recommend_cover_pic_size":null,
            "user_recommend_cover_pic":null,
            "user_recommend_cover_pic_size":null,
            "is_prefer":0,
            "is_ad":0,
            "left_bottom_tip":Array[0],
            "display_source":125536,
            "trace_id":"ke0yujgv-30xvb4-4chv",
            "item_info":"{\"id\":1224285358,\"code\":125536,\"codeDetail\":\"125536,125541\",\"trace_id\":\"ke0yujgv-30xvb4-4chv\"}",
            "caption_origin":"当老爸变成了灯神.....",
            "campaign":"",
            "created_at_origin":1596290704,
            "caption_complete":"当老爸变成了灯神.....",
            "caption_all":"当老爸变成了灯神....."
        },
        Object{...},
        Object{...},
        Object{...},
        Object{...},
        Object{...},
        Object{...},
        Object{...},
        Object{...}
    ],
    "total":1000,
    "current_page":2
}

使用找到的接口地址:

    def main(self,i):
        """Fetch page ``i`` of the timeline API and pass the body to ``get_video``.

        The request URL is printed before the request is made. When
        ``get_req`` returns a falsy value nothing further happens. Any
        exception raised by ``get_video`` is caught and printed, so this
        method does not propagate parsing or download errors.

        Args:
            i: Value substituted into the ``page`` query parameter.
        """
        # Timeline endpoint discovered through packet capture.
        page_url = "https://www.meipai.com/squares/new_timeline?page={i}&count=24&tid=13".format(i=i)
        print(page_url)
        body = self.get_req(page_url)
        if not body:
            return
        try:
            self.get_video(body)
        except Exception as err:
            print('获取视频出错,错误代码:',err)

解析接口返回的 JSON,获取视频信息

    def get_video(self,response):
        """Parse one timeline API response and download every video it lists.

        For each entry in ``medias`` the caption (falling back to
        ``weibo_share_caption`` when the caption is empty) is turned into a
        file name, the obfuscated ``video`` field is decoded with
        ``self.video_decode`` and the result is passed to ``self.download``.
        Decode and download failures are printed and appended to
        ``meipai/spider.txt``; processing then continues with the next entry.

        Args:
            response: JSON text returned by the timeline endpoint.

        Raises:
            ValueError: If ``response`` is not valid JSON (from ``json.loads``).
            KeyError: If ``medias`` or one of the per-entry keys is missing.
        """
        import os  # local import: this snippet cannot rely on the file's import block

        def log_failure(line):
            # Create the log directory on demand so the error handler cannot
            # itself fail with FileNotFoundError.
            os.makedirs('meipai', exist_ok=True)
            with open(r'meipai/spider.txt', 'a+', encoding='utf-8') as f:
                f.write(line)

        reqs = json.loads(response)['medias']
        for req in reqs:
            video_name = req['caption'] or req['weibo_share_caption']
            video_name = video_name.replace(' ', '')
            video_name = re.sub(r'[\|\/\<\>\:\*\?\\\"]', "_", video_name)  # replace characters that are illegal in file names
            print(video_name)
            video_url = req['video']

            try:
                videourl = self.video_decode(video_url).decode('utf8')  # decode the obfuscated video address
            except Exception as e:
                print(r'视频地址解密出错,错误代码:',e)
                log_failure('视频解密出错,错误代码:{e}---采集{video_url}|{video_name}内容失败\n'.format(e=e,video_url=video_url,video_name=video_name))
                continue
            print(videourl)

            try:
                self.download(video_name, videourl)
            except Exception as e:
                print('视频下载出错,错误代码:',e)
                # The template references {videourl}; the original passed only
                # video_url=, which raised KeyError inside this handler and made
                # the failure show up as a decode error instead.
                log_failure('视频下载出错,错误代码:{e}---采集{videourl}|{video_name}内容失败\n'.format(e=e,videourl=videourl,video_name=video_name))
                

解密视频地址(接口返回的 video 字段经过混淆,需要先去掉插入的干扰字符,再进行 Base64 解码)

    def video_decode(self,encoded_string):
        """Decode the obfuscated ``video`` field into the raw video URL bytes.

        Steps performed:

        1. The first four characters, reversed, are parsed as a hexadecimal
           number; the rest of the string is the payload.
        2. The decimal digits of that number are split into two
           ``(offset, length)`` pairs: the first two digits and the digits
           after them.
        3. The first pair marks a run of filler characters counted from the
           start of the payload. The second pair marks a run positioned
           relative to the end of the payload (after the first run is gone).
        4. Both runs are removed and the remainder is Base64-decoded.

        Args:
            encoded_string: The obfuscated value as found in the API response.

        Returns:
            bytes: The Base64-decoded result.

        Raises:
            ValueError: If the four-character header is not hexadecimal.
            IndexError: If the header yields fewer than four decimal digits.
            binascii.Error: If the cleaned payload is not valid Base64.
        """
        def split_header(text):
            return {
                'str': text[4:],
                'hex': text[:4][::-1],
            }

        def header_digits(hex_text):
            digits = str(int(hex_text, 16))
            return {
                'pre': list(digits[:2]),
                'tail': list(digits[2:]),
            }

        def drop_filler(text, span):
            start = int(span[0])
            length = int(span[1])
            filler = text[start: start + length]
            # count=1 removes only the filler run located at `start`. Without
            # it, every later occurrence of the same characters would also be
            # deleted, corrupting the Base64 text.
            return text[:start] + text[start:].replace(filler, "", 1)

        def from_end(text, span):
            # Convert the end-relative offset into a start-relative one.
            span[0] = len(text) - int(span[0]) - int(span[1])
            return span

        parts = split_header(encoded_string)
        spans = header_digits(parts['hex'])
        payload = drop_filler(parts['str'], spans['pre'])
        return base64.b64decode(drop_filler(payload, from_end(payload, spans['tail'])))

下载视频,附带进度显示

    def download(self,name,videourl):
        """Stream ``videourl`` into ``meipai/<name>.mp4`` while printing progress.

        The output directory is created if needed. A non-success HTTP status
        raises instead of saving the error body as a video. When the server
        sends no usable ``content-length`` header, the byte count is shown
        without a percentage. The method sleeps two seconds after a
        completed download.

        Args:
            name: File name without extension.
            videourl: Direct URL of the video file.

        Raises:
            requests.HTTPError: If the server answers with a 4xx/5xx status.
            OSError: If the target file cannot be written.
        """
        import os  # local import: this snippet cannot rely on the file's import block

        print("准备下载!")
        file_path = 'meipai/{name}.mp4'.format(name=name)
        os.makedirs(os.path.dirname(file_path), exist_ok=True)
        with closing(requests.get(videourl,stream=True)) as response:
            chunk_size = 1024  # maximum number of bytes read per chunk
            print(response.status_code)
            response.raise_for_status()
            # The header may be absent (e.g. chunked transfer); 0 means "unknown".
            content_size = int(response.headers.get('content-length', 0))  # total body size
            print(content_size)
            data_count = 0
            with open(file_path, "wb") as file:
                for data in response.iter_content(chunk_size=chunk_size):
                    file.write(data)
                    data_count = data_count + len(data)
                    if content_size > 0:
                        now_jd = (data_count / content_size) * 100
                        print("\r 文件下载进度:%d%%(%d/%d) - %s" % (now_jd, data_count, content_size, file_path), end=" ")
                    else:
                        # Total size unknown: avoid dividing by zero.
                        print("\r 文件下载进度:%d bytes - %s" % (data_count, file_path), end=" ")
                print("\n>>> 获取视频成功了!")
        time.sleep(2)

完整代码

import requests
from fake_useragent import UserAgent
import base64
import json
import re
import time
from contextlib import closing
import threading

class MP():
    """Scraper for the Meipai square timeline.

    ``main`` fetches one page of the timeline API, ``get_video`` walks the
    entries of that page, ``video_decode`` recovers each real video URL and
    ``download`` streams the file into the ``meipai/`` directory. Failures are
    appended to ``meipai/spider.txt``.
    """

    # Decode the obfuscated video address
    def video_decode(self,encoded_string):
        """Decode the obfuscated ``video`` field into the raw video URL bytes.

        Steps performed:

        1. The first four characters, reversed, are parsed as a hexadecimal
           number; the rest of the string is the payload.
        2. The decimal digits of that number are split into two
           ``(offset, length)`` pairs: the first two digits and the digits
           after them.
        3. The first pair marks a run of filler characters counted from the
           start of the payload. The second pair marks a run positioned
           relative to the end of the payload (after the first run is gone).
        4. Both runs are removed and the remainder is Base64-decoded.

        Args:
            encoded_string: The obfuscated value as found in the API response.

        Returns:
            bytes: The Base64-decoded result.

        Raises:
            ValueError: If the four-character header is not hexadecimal.
            IndexError: If the header yields fewer than four decimal digits.
            binascii.Error: If the cleaned payload is not valid Base64.
        """
        def split_header(text):
            return {
                'str': text[4:],
                'hex': text[:4][::-1],
            }

        def header_digits(hex_text):
            digits = str(int(hex_text, 16))
            return {
                'pre': list(digits[:2]),
                'tail': list(digits[2:]),
            }

        def drop_filler(text, span):
            start = int(span[0])
            length = int(span[1])
            filler = text[start: start + length]
            # count=1 removes only the filler run located at `start`. Without
            # it, every later occurrence of the same characters would also be
            # deleted, corrupting the Base64 text.
            return text[:start] + text[start:].replace(filler, "", 1)

        def from_end(text, span):
            # Convert the end-relative offset into a start-relative one.
            span[0] = len(text) - int(span[0]) - int(span[1])
            return span

        parts = split_header(encoded_string)
        spans = header_digits(parts['hex'])
        payload = drop_filler(parts['str'], spans['pre'])
        return base64.b64decode(drop_filler(payload, from_end(payload, spans['tail'])))

    # Request headers
    def ua(self):
        """Return the fixed HTTP headers sent with every API request.

        A static User-Agent is used. The original built a ``UserAgent()``
        object on every call without ever reading it; that unused
        construction has been removed.
        """
        headers = {
            'Cookie': 'MUSID=kdhd1o131g536r6shisl5rdcg7; MP_WEB_GID=266934702254632; virtual_device_id=433ced9ee7d2b137b89ae37d40df50e9; pvid=UdlW5diAfeJPUaHLK1j3vMC7xVBOnB9c; sid=kdhd1o131g536r6shisl5rdcg7; UM_distinctid=174006923bd1a2-0ad676043abc8a-581b3318-1fa400-174006923beff; CNZZDATA1256786412=1978692496-1597731334-%7C1597731334',
            'Host': 'www.meipai.com',
            'Referer': 'https://www.meipai.com/square/13',
            'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36'

        }
        return headers

    # Fetch a page
    def get_req(self,url):
        """GET ``url`` with the headers from ``ua`` and return the body text.

        Returns:
            The response body decoded as UTF-8 when the status code is 200,
            otherwise ``None``.
        """
        response = requests.get(url, headers=self.ua())
        if response.status_code != 200:
            return None
        return response.content.decode('utf-8')

    def _log_failure(self, line):
        """Append ``line`` to ``meipai/spider.txt``, creating the directory first."""
        import os  # local import keeps the class independent of the file's import block

        os.makedirs('meipai', exist_ok=True)
        with open(r'meipai/spider.txt', 'a+', encoding='utf-8') as f:
            f.write(line)

    # Parse the API response
    def get_video(self,response):
        """Parse one timeline API response and download every video it lists.

        For each entry in ``medias`` the caption (falling back to
        ``weibo_share_caption`` when the caption is empty) is turned into a
        file name, the obfuscated ``video`` field is decoded with
        ``video_decode`` and the result is passed to ``download``. Decode and
        download failures are printed and appended to ``meipai/spider.txt``;
        processing then continues with the next entry.

        Args:
            response: JSON text returned by the timeline endpoint.

        Raises:
            ValueError: If ``response`` is not valid JSON (from ``json.loads``).
            KeyError: If ``medias`` or one of the per-entry keys is missing.
        """
        reqs = json.loads(response)['medias']
        for req in reqs:
            video_name = req['caption'] or req['weibo_share_caption']
            video_name = video_name.replace(' ', '')
            video_name = re.sub(r'[\|\/\<\>\:\*\?\\\"]', "_", video_name)  # replace characters that are illegal in file names
            print(video_name)
            video_url = req['video']

            try:
                videourl = self.video_decode(video_url).decode('utf8')  # decode the obfuscated video address
            except Exception as e:
                print(r'视频地址解密出错,错误代码:',e)
                self._log_failure('视频解密出错,错误代码:{e}---采集{video_url}|{video_name}内容失败\n'.format(e=e,video_url=video_url,video_name=video_name))
                continue
            print(videourl)

            try:
                self.download(video_name, videourl)
            except Exception as e:
                print('视频下载出错,错误代码:',e)
                # The template references {videourl}; the original passed only
                # video_url=, which raised KeyError inside this handler and made
                # the failure show up as a decode error instead.
                self._log_failure('视频下载出错,错误代码:{e}---采集{videourl}|{video_name}内容失败\n'.format(e=e,videourl=videourl,video_name=video_name))

    def download(self,name,videourl):
        """Stream ``videourl`` into ``meipai/<name>.mp4`` while printing progress.

        The output directory is created if needed. A non-success HTTP status
        raises instead of saving the error body as a video. When the server
        sends no usable ``content-length`` header, the byte count is shown
        without a percentage. The method sleeps two seconds after a
        completed download.

        Args:
            name: File name without extension.
            videourl: Direct URL of the video file.

        Raises:
            requests.HTTPError: If the server answers with a 4xx/5xx status.
            OSError: If the target file cannot be written.
        """
        import os  # local import keeps the class independent of the file's import block

        print("准备下载!")
        file_path = 'meipai/{name}.mp4'.format(name=name)
        os.makedirs(os.path.dirname(file_path), exist_ok=True)
        # NOTE(review): the download is routed through a hard-coded local proxy
        # at 127.0.0.1:1080 — confirm this is intended outside the author's machine.
        with closing(requests.get(videourl, proxies={'https': 'https://127.0.0.1:1080', 'http': 'http://127.0.0.1:1080'},stream=True)) as response:
            chunk_size = 1024  # maximum number of bytes read per chunk
            print(response.status_code)
            response.raise_for_status()
            # The header may be absent (e.g. chunked transfer); 0 means "unknown".
            content_size = int(response.headers.get('content-length', 0))  # total body size
            print(content_size)
            data_count = 0
            with open(file_path, "wb") as file:
                for data in response.iter_content(chunk_size=chunk_size):
                    file.write(data)
                    data_count = data_count + len(data)
                    if content_size > 0:
                        now_jd = (data_count / content_size) * 100
                        print("\r 文件下载进度:%d%%(%d/%d) - %s" % (now_jd, data_count, content_size, file_path), end=" ")
                    else:
                        # Total size unknown: avoid dividing by zero.
                        print("\r 文件下载进度:%d bytes - %s" % (data_count, file_path), end=" ")
                print("\n>>> 获取视频成功了!")
        time.sleep(2)

    def main(self,i):
        """Fetch page ``i`` of the timeline API and pass the body to ``get_video``.

        The request URL is printed before the request is made. When
        ``get_req`` returns ``None`` nothing further happens. Any exception
        raised by ``get_video`` is caught and printed.

        Args:
            i: Value substituted into the ``page`` query parameter.
        """
        # Timeline endpoint discovered through packet capture.
        url = "https://www.meipai.com/squares/new_timeline?page={i}&count=24&tid=13".format(i=i)
        print(url)
        response = self.get_req(url)
        if not response:
            return
        try:
            self.get_video(response)
        except Exception as e:
            print('获取视频出错,错误代码:',e)


if __name__=="__main__":
    # Script entry point: one shared MP instance, one thread per page index
    # 0..99. All threads are started immediately and are not joined here.
    video_download = MP()
    workers = [
        threading.Thread(target=video_download.main, kwargs={"i": page})
        for page in range(100)
    ]
    for worker in workers:
        worker.start()
上一篇 下一篇

猜你喜欢

热点阅读