Python 爬取某一视频网站的视频

2020-03-27  本文已影响0人  刘年

关键在于分析 Ajax 请求
灵活运用正则表达式和 XPath 来获取数据
文件命名时替换掉不合法的字符

from urllib import parse
import requests
from lxml import etree
import re
import time
# The listing data was located by analysing the site's AJAX requests; the
# `start` query parameter controls which batch of entries is returned.
# That listing is what yields the pages the videos live on.

# Prefix prepended to the relative links found in the listing.
base_url = 'https://www.pearvideo.com/'

# Headers sent with every request made by this script.
headers = {
    'Referer': 'https://www.pearvideo.com/category_8',
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 6.1; WOW64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/70.0.3538.25 Safari/537.36 '
        'Core/1.70.3754.400 QQBrowser/10.5.4034.400'
    ),
}
def list_url(n, timeout=10):
    """Return the video-page URLs found in batch ``n`` of the category listing.

    The listing is fetched from ``category_loading.jsp`` with a ``start``
    offset of ``12 * n``.

    Args:
        n: Batch index; multiplied by 12 to form the ``start`` parameter.
        timeout: Seconds ``requests`` waits on the server before giving up,
            so a stalled connection cannot hang the crawl indefinitely.

    Returns:
        A list of URLs, each ``base_url`` followed by an ``href`` taken from
        a link inside ``<div class="vervideo-bd">``. Empty when the response
        body is blank or nothing matches.
    """
    query = parse.urlencode({
        'reqType': 5,
        'categoryId': 8,
        'start': 12 * n,
    })
    url = 'https://www.pearvideo.com/category_loading.jsp?{0}'.format(query)
    list_page = requests.get(url, headers=headers, timeout=timeout).text
    # A blank body cannot be parsed into a document tree; treat it as
    # "no more entries" instead of failing on the xpath call below.
    if not list_page.strip():
        return []
    html = etree.HTML(list_page)
    hrefs = html.xpath('//div[@class="vervideo-bd"]/a/@href')
    return [base_url + href for href in hrefs]
def get_video_page(url, timeout=10):
    """Fetch a video page and extract its title and media source URL.

    Args:
        url: Address of the video page to download.
        timeout: Seconds ``requests`` waits on the server before giving up.

    Returns:
        A dict with two keys: ``'title'`` (the text of
        ``<h1 class="video-tt">`` with the characters ``/ \\ : * ? " | < >``
        removed so it can be used in a file name) and ``'scr'`` (the value
        captured between ``srcUrl="`` and ``",vdoUrl=`` in the page text).
        The key is spelled ``'scr'`` because existing callers read it
        under that name.

    Raises:
        IndexError: If the page has no matching title element or no
            ``srcUrl`` assignment.
    """
    page_text = requests.get(url, headers=headers, timeout=timeout).text
    page_html = etree.HTML(page_text)
    title = page_html.xpath('//h1[@class="video-tt"]/text()')[0]
    # Drop characters that are not allowed in file names.
    title = re.sub(r'[\/\\\:\*\?\"\|\<\>]', '', title)
    # The media URL is read from the raw page text with a regex rather
    # than from the parsed element tree.
    source = re.findall('srcUrl="(.*?)",vdoUrl=', page_text)[0]
    return {'title': title, 'scr': source}
def get_vedio(url, timeout=30):
    """Download ``url`` and return the response body.

    Args:
        url: Address of the resource to download.
        timeout: Seconds ``requests`` waits on the server before giving up.

    Returns:
        The raw response body as ``bytes``.
    """
    # Previously the body was bound to a local and thrown away, so the
    # function always returned None; hand the bytes back to the caller.
    return requests.get(url, headers=headers, timeout=timeout).content

if __name__ == '__main__':
    # Script entry point: walk the first 10 listing batches and save every
    # video found into the download directory, pausing 1 s between files.
    import os

    save_dir = '梨视频下载'
    # open() does not create missing directories, so make sure the target
    # exists before the first write.
    os.makedirs(save_dir, exist_ok=True)

    for x in range(10):
        for page_url in list_url(x):
            info = get_video_page(page_url)
            print(info)
            video_bytes = requests.get(
                info['scr'], headers=headers, timeout=30
            ).content
            # os.path.join picks the right separator for the platform; on
            # Windows this is the same path the hard-coded backslash gave.
            path = os.path.join(save_dir, '{0}.mp4'.format(info['title']))
            with open(path, 'wb') as ff:
                ff.write(video_bytes)
            # Report and throttle only after the file has been closed.
            print('{0}下载完成'.format(path))
            time.sleep(1)

上一篇下一篇

猜你喜欢

热点阅读