Python Crawler Series 1: Downloading 沙沙野 (ssyer) Videos with Multiprocessing
Task requirements:
- Target site: https://www.ssyer.com/
- Libraries used: requests, multiprocessing
- Techniques covered:
  - Multi-process downloading
  - Handling expired cookies
- Tool used: https://curl.trillworks.com/, which quickly generates the headers and cookies dictionaries (a sketch of its output follows this list)
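The tool converts a request copied "as cURL" from the browser's DevTools into a ready-to-run requests snippet, headers and cookies included. A minimal sketch of the kind of code it produces is shown below; every value here is a placeholder, and the real dictionaries appear in ssyer.py further down.

```python
import requests

# Illustrative output of https://curl.trillworks.com/ for a request copied as cURL.
# All values are placeholders; the real headers/cookies are in the script below.
headers = {
    'User-Agent': 'Mozilla/5.0 ...',
    'Content-Type': 'application/json',
}
cookies = {
    'SESSION': '<session id copied from the browser>',
}
data = '{"cateId":2,"order":2,"recommendType":1,"page":{"showCount":20,"currentPage":1}}'

response = requests.post('https://www.ssyer.com/apis/20001',
                         headers=headers, cookies=cookies, data=data)
print(response.status_code)
```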
# -*- coding: utf-8 -*-
# @Time : 2020/7/29 6:05 下午
# @Author : livein80
# @Email : 12985594@qq.com
# @File : ssyer.py
# @Software : PyCharm
import requests
import os
# Pool for multi-process downloading
from multiprocessing import Pool
json_dir='./json_dir/'
headers = {
    'Connection': 'keep-alive',
    'Pragma': 'no-cache',
    'Cache-Control': 'no-cache',
    'Accept': 'application/json',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36',
    'Content-Type': 'application/json',
    'Origin': 'https://www.ssyer.com',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-Mode': 'cors',
    'Sec-Fetch-Dest': 'empty',
    'Referer': 'https://www.ssyer.com/cate/2',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,ko;q=0.7,und;q=0.6',
}
data = '{"cateId":2,"order":2,"recommendType":1,"page":{"showCount":20,"currentPage":1}}'
session = requests.session()
def get_data():
    # Cookies copied from the browser session; they expire, which is why
    # start_spider() retries the request when the status code is not 200.
    cookies = {
        'UM_distinctid': '17398ee35fb8eb-0679366a6e0d54-31627304-232800-17398ee35fc744',
        'CNZZDATA1278764889': '613028250-1595997139-https%253A%252F%252Fwww.google.com%252F%7C1595997139',
        '_dg_playback.7b6028a56aac520d.ce42': '1',
        '_dg_abtestInfo.7b6028a56aac520d.ce42': '1',
        '_dg_check.7b6028a56aac520d.ce42': '1',
        '_dg_antiBotFlag.7b6028a56aac520d.ce42': '1',
        '_dg_antiBotInfo.7b6028a56aac520d.ce42': '10%7C%7C%7C3600',
        'SESSION': 'ZTg3OGVjMGUtZjA0Ni00NmVkLTg2MjctMTY0ZWJhODRmYTc2',
        'Hm_lvt_8f50334c83664955c1a1a866dd168053': '1595998616,1595998662',
        'Hm_lpvt_8f50334c83664955c1a1a866dd168053': '1595998662',
        '_dg_id.7b6028a56aac520d.ce42': 'fc0bc167b752f00b%7C%7C%7C1595998616%7C%7C%7C0%7C%7C%7C1595998662%7C%7C%7C1595998616%7C%7C%7C%7C%7C%7Ce809b4e64783781d%7C%7C%7Chttps%3A%2F%2Fwww.google.com%2F%7C%7C%7Chttps%3A%2F%2Fwww.google.com%2F%7C%7C%7C1%7C%7C%7Cundefined',
    }
    # POST the JSON query to the list API; the shared session keeps any cookies the server sets
    response = session.post('https://www.ssyer.com/apis/20001', headers=headers, cookies=cookies, data=data)
    return response
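# For reference, the list API response is assumed to look roughly like the JSON below.
# The field names ("data", "title", "zip") are taken from the parsing code further down;
# the values are illustrative only.
# {
#   "data": [
#     {"title": "<video name>", "zip": "<direct download URL>", ...},
#     ...
#   ]
# }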
# Download a single video and save it under ./vid/
def start_load_vid(vid_name, vid_url):
    # verify=False skips TLS certificate verification for the download link
    res = requests.get(vid_url, verify=False).content
    # exist_ok avoids a race when several worker processes create the folder at once
    os.makedirs('./vid/', exist_ok=True)
    with open('./vid/{}.mp4'.format(vid_name), 'wb') as file:
        file.write(res)
    print('%s downloaded' % vid_name)
# Build the list of videos (name + download link) from the API data
def get_vid_lis(items):
    vid_list = []
    for item in items:
        # start_load_vid(item['title'], item['zip'])  # single-process alternative
        vid_list.append({'name': item['title'], 'link': item['zip']})
    return vid_list
# === Start the crawler ===
def start_spider(callback):
    count = 0

    def check():
        nonlocal count
        count += 1
        print('check-->', count)
        response = get_data()
        print(response.status_code, session.cookies)
        if response.status_code == 200:
            json_obj = response.json()
            callback(json_obj)
        else:
            # Cookies were rejected: retry up to 5 times so the session can pick up fresh cookies
            json_obj = None
            if count < 5:
                json_obj = check()
            else:
                print('Failed to refresh cookies!')
        return json_obj

    return check()
def start_download(json_obj):
    items = json_obj['data']
    # === Build the video list ===
    vid_list = get_vid_lis(items)
    # Multi-process download with a pool of 15 worker processes
    pool = Pool(15)
    for obj in vid_list:
        pool.apply_async(start_load_vid, args=(obj['name'], obj['link']))
    # Close the pool and wait for every download to finish
    pool.close()
    pool.join()
    print('All videos downloaded!')
if __name__ == '__main__':
    # ===== Start the crawler =====
    start_spider(start_download)
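Since the downloads are I/O-bound rather than CPU-bound, a thread pool works just as well as the process pool used above. multiprocessing.dummy exposes a thread-backed Pool with the same interface, so the swap is essentially a one-line import change. A minimal sketch follows; start_download_threaded is a hypothetical variant, not part of the original script.

```python
# Thread-based alternative: multiprocessing.dummy.Pool mirrors multiprocessing.Pool
# but runs the workers as threads, which is sufficient for I/O-bound downloads.
from multiprocessing.dummy import Pool as ThreadPool

def start_download_threaded(json_obj):
    vid_list = get_vid_lis(json_obj['data'])
    pool = ThreadPool(15)  # 15 worker threads
    for obj in vid_list:
        pool.apply_async(start_load_vid, args=(obj['name'], obj['link']))
    pool.close()
    pool.join()
    print('All videos downloaded!')
```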