# 线程池抓取
# 2022-12-14 本文已影响0人
# 垃圾桶边的狗
import datetime
import json
import warnings
from jsonpath import jsonpath
from sqlalchemy.exc import ResourceClosedError
warnings.filterwarnings('ignore')
import random
import time
import requests
import pandas as pd
import records
from urllib import parse
import traceback
from pymysql.converters import escape_string
from sqlalchemy import create_engine
from fake_useragent import UserAgent
from concurrent.futures import ThreadPoolExecutor, as_completed
from collections import deque
# from my_proxies import proxy
"""
1.到douyin_account_all表 获取main_page字段内容
2.根据main_page 内容拼接url
3.发请求
4.解析数据
5.存储数据
"""
# Proxy pool: one entry is picked at random for every crawl request.
# requests looks a proxy up by URL scheme ("http" / "https") or by
# "scheme://hostname"; keys such as "http://" never match, which makes
# requests silently bypass the proxy altogether.
# NOTE(review): the "https" entry points at an https:// proxy URL, i.e. TLS is
# spoken to the proxy itself -- confirm the proxy supports that, otherwise use
# an http:// proxy URL for both schemes.
proxy = [
    {
        'http': 'http://192.168.241.62:9976',
        'https': 'https://192.168.241.62:9976',
    },
]
class CrawlAima:
    """Crawl the post list of Douyin accounts and upsert video statistics into MySQL.

    Typical use: create an instance, call :meth:`get_main_page` to load the
    accounts to crawl, then call :meth:`crawl_handler` once per ``sec_uid``.
    Accounts whose crawl failed are pushed onto ``self.dq`` and their failure
    count is tracked in ``self.counter`` so the caller can decide on retries.
    """

    # (connect, read) timeout in seconds used for every outbound HTTP request.
    REQUEST_TIMEOUT = (100, 100)
    # Format used for every post_time string produced by this class.
    TIME_FORMAT = '%Y-%m-%d %H:%M:%S'

    def __init__(self):
        """Open the database handle and preload the SQL text and known post times."""
        self.ua = UserAgent()
        self.db = records.Database(
            f"mysql+pymysql://user:{parse.quote_plus('abc')}@0.0.0.0:3306/db?charset=utf8mb4")
        # Second engine on the same URL; not used inside this class but kept
        # because external code may rely on the attribute.
        self.conn = create_engine(self.db.db_url)
        self.dq = deque()  # sec_uids waiting for a retry
        self.counter = {}  # sec_uid -> number of failed attempts
        self.url = 'https://www.iesdouyin.com/post/'
        self.get_time_by_vid_list = []
        # Filled by get_main_page(); initialised here so the other methods do
        # not hit AttributeError when the account table is empty or not loaded.
        self.main_page_list = []
        self.main_page_dict = {}
        self.userid_list = []
        self.userid_dict = {}
        self.uid_oid_map = {}
        self.video_sql = self.get_video_sql()
        self.vid_dict = self.get_all_video()

    def get_all_video(self):
        """Return ``{video_id: post_time_string}`` for every stored video.

        Rows whose ``post_time`` is NULL are skipped: stringifying them would
        yield the truthy text ``'None'``, which convert_time() would then hand
        back as if it were a real timestamp.
        """
        rows = self.db.query("select video_id,post_time from douyin_video_extend_main").as_dict()
        return {
            row['video_id']: str(row['post_time'])
            for row in rows
            if row['post_time'] is not None
        }

    def get_video_sql(self):
        """Return the parameterised upsert statement for ``douyin_video_extend_main``.

        On a duplicate key every column except ``post_time`` is refreshed.
        """
        video_sql = """
            INSERT INTO douyin_video_extend_main (
                video_id,
                userid,
                comment_num,
                like_num,
                collect_num,
                share_num,
                url,
                content,
                cover_pic,
                post_time
            )
            VALUES
            (
                :video_id,
                :userid,
                :comment_num,
                :like_num,
                :collect_num,
                :share_num,
                :url,
                :content,
                :cover_pic,
                :post_time
            )
            ON DUPLICATE KEY UPDATE
                video_id = :video_id,
                userid = :userid,
                comment_num = :comment_num,
                like_num = :like_num,
                collect_num = :collect_num,
                share_num = :share_num,
                url = :url,
                content = :content,
                cover_pic = :cover_pic
        """
        return video_sql

    def get_main_page(self):
        """Load the accounts to crawl from ``douyin_account_all``.

        Populates:
          * ``main_page_list`` -- last path segment of each ``main_page`` value
          * ``main_page_dict`` -- that segment -> 1-based position (progress output)
          * ``userid_list`` / ``userid_dict`` -- known user ids (dict keys are str)
          * ``uid_oid_map`` -- str(userid) -> open_id

        Rows without ``main_page`` are ignored. All attributes are reset to
        empty containers first, so an empty table leaves a consistent state.
        """
        rows = self.db.query("select userid, main_page,open_id from douyin_account_all;")
        df = rows.export("df")
        df = df.dropna(subset=['main_page'])

        self.main_page_list = []
        self.main_page_dict = {}
        self.userid_list = []
        self.userid_dict = {}
        self.uid_oid_map = {}
        if df.empty:
            return

        # Keep only what follows the final "/" of the stored value.
        df.main_page = df.main_page.map(lambda x: x.rsplit('/', 1)[-1])
        self.main_page_list = df.main_page.to_list()
        self.main_page_dict = {
            sec_uid: position
            for position, sec_uid in enumerate(self.main_page_list, start=1)
        }
        # Known user ids, used to filter crawled videos.
        self.userid_list = df.userid.to_list()
        self.userid_dict = {str(u): 1 for u in self.userid_list}
        # {userid: open_id}
        df['userid'] = df['userid'].astype('str')
        self.uid_oid_map = dict(zip(df['userid'].to_list(), df['open_id'].to_list()))

    def counter_sec_uid(self, sec_uid):
        """Increment the failure counter of ``sec_uid`` (starting from zero)."""
        self.counter[sec_uid] = self.counter.get(sec_uid, 0) + 1

    def _requeue(self, sec_uid):
        """Count one more failed attempt for ``sec_uid`` and queue it for a retry."""
        self.counter_sec_uid(sec_uid)
        self.dq.appendleft(sec_uid)

    def _remaining(self, sec_uid):
        """Return how many accounts come after ``sec_uid`` in the loaded list."""
        return len(self.main_page_dict) - self.main_page_dict.get(sec_uid, 0)

    def crawl_handler(self, sec_uid, max_cursor=0):
        """Fetch one page of posts for ``sec_uid`` and store the videos.

        Any failure (non-200 status, empty body, unexpected payload, empty
        ``aweme_list`` or an exception) re-queues ``sec_uid`` via ``self.dq``
        and bumps ``self.counter``. The only exception is the explicit
        "no access permission" answer (code 100), which is just printed.

        Args:
            sec_uid: account identifier sent as the ``sec_uid`` query parameter.
            max_cursor: value sent as the ``max_cursor`` query parameter.
        """
        params = {
            "sec_uid": sec_uid,
            "max_cursor": max_cursor,
            "count": 21,
            "key": "188"
        }
        headers = {
            "user-agent": self.ua.random,
            "Connection": "keep-alive",
        }
        try:
            # NOTE(review): verify=False disables TLS certificate checks.
            response = requests.get(self.url,
                                    headers=headers,
                                    params=params,
                                    timeout=self.REQUEST_TIMEOUT,
                                    verify=False,
                                    proxies=random.choice(proxy))
            if response.status_code != 200:
                self._requeue(sec_uid)
                return
            if not response.text:
                self._requeue(sec_uid)
                print(sec_uid, '没有获取到数据')
                return

            data = response.json()
            if not isinstance(data, dict):
                # Valid JSON but not the expected object: treat as a failure.
                self._requeue(sec_uid)
                return

            aweme_list = data.get("aweme_list")
            if isinstance(aweme_list, list):
                print(f"还剩:{self._remaining(sec_uid)} {sec_uid} success")
                if aweme_list:
                    self.data_process(aweme_list)
                else:
                    self._requeue(sec_uid)
            elif data.get('code') == 100 and data.get('msg') == '没有访问权限':
                # Permission error: retrying would not help, so only report it.
                print("报错:", data)
            else:
                self._requeue(sec_uid)
        except Exception as e:
            self._requeue(sec_uid)
            print(f"还剩:{self._remaining(sec_uid)} {sec_uid} fail")
            # Timeouts are expected noise; print a traceback for anything else.
            if 'timed out' not in str(e):
                traceback.print_exc()
            print(sec_uid, '报错:', e, '\n')
            time.sleep(random.choice([1, 0.8]))

    def get_header(self, vids):
        """Return the request headers used for the item-info lookup of ``vids``."""
        headers = {
            'authority': 'www.douyin.com',
            'accept': 'application/json',
            'accept-language': 'zh-CN,zh;q=0.9',
            'cache-control': 'max-age=0',
            'sec-ch-ua': '".Not/A)Brand";v="99", "Google Chrome";v="103", "Chromium";v="103"',
            'path': f'web/api/v2/aweme/iteminfo/?item_ids={vids}',
            'sec-ch-ua-mobile': '?0',
            'sec-ch-ua-platform': '"Windows"',
            'sec-fetch-dest': 'document',
            'sec-fetch-mode': 'navigate',
            'sec-fetch-site': 'none',
            'sec-fetch-user': '?1',
            'upgrade-insecure-requests': '1',
            'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36',
        }
        return headers

    def convert_time(self, t, vid):
        """Return the post time of video ``vid`` as a ``%Y-%m-%d %H:%M:%S`` string.

        Resolution order:
          1. the value already known in ``self.vid_dict``;
          2. the trailing ``_``-separated segment of ``t`` when it is all
             digits, interpreted as a unix timestamp in local time;
          3. the ``create_time`` returned by the item-info web API.

        Args:
            t: string whose last ``_``-separated segment may be a unix
                timestamp; may be ``None`` or empty.
            vid: video id.

        Raises:
            ValueError: the item-info API returned an empty body or no items.
        """
        known = self.vid_dict.get(vid)
        if known:
            return known

        suffix = t.rsplit('_')[-1] if t else ''
        if suffix.isdigit():
            return time.strftime(self.TIME_FORMAT, time.localtime(int(suffix)))

        get_post_time_url = "https://www.douyin.com/web/api/v2/aweme/iteminfo/?item_ids={}"
        # A timeout is required here: without one a stalled connection would
        # block the worker thread forever.
        res = requests.get(get_post_time_url.format(vid),
                           headers=self.get_header(vid),
                           timeout=self.REQUEST_TIMEOUT,
                           verify=False)
        if not res.text:
            raise ValueError(f"empty item-info response for video {vid}")
        item_list = res.json().get('item_list')
        if not item_list:
            raise ValueError(f"item-info response has no items for video {vid}")
        create_time = item_list[0]["create_time"]
        post_time = time.strftime(self.TIME_FORMAT, time.localtime(int(create_time)))
        print(f"post_time:{post_time} vid:{vid}")
        return post_time

    def data_process(self, aweme_list):
        """Convert the items of ``aweme_list`` to rows and upsert them.

        Items whose author uid is not in ``self.userid_dict`` are reported and
        skipped. All rows are built before the first write, so an exception
        raised while building (e.g. by convert_time) stores nothing.

        Args:
            aweme_list: list of item dicts taken from the crawl response.
        """
        save_list = []
        for item in aweme_list:
            uid = (item.get('author') or {}).get('uid')
            if not self.userid_dict.get(str(uid)):
                print("用户uid:", uid, '不存在--------------')
                continue

            stats = item.get('statistics') or {}
            video = item.get('video') or {}
            vid = stats.get('aweme_id', 0)
            # Fall back to '' when the cover list is missing, None or empty.
            cover_urls = (video.get('cover') or {}).get('url_list') or ['']
            open_id = self.uid_oid_map.get(str(uid))
            save_list.append({
                "userid": uid,
                "cover_pic": cover_urls[0],
                # Passed as a bound parameter, so it must NOT be pre-escaped:
                # escaping here would store literal backslashes.
                "content": item.get('desc', ''),
                "video_id": vid,
                "comment_num": stats.get('comment_count', 0),
                "like_num": stats.get('digg_count', 0),
                "share_num": stats.get('share_count', 0),
                "play_num": stats.get('play_count', 0),
                "collect_num": stats.get('collect_count', 0),
                "post_time": self.convert_time(
                    (video.get('dynamic_cover') or {}).get('uri'), vid),
                "url": "https://www.douyin.com/video/" + str(vid),
                # play_num and open_id are not referenced by video_sql; they
                # are kept in the row exactly as before.
                "open_id": f"'{open_id}'" if open_id else 'null',
            })

        # Write to the main table.
        for row in save_list:
            self.db.query(self.video_sql, **row)
def loop(max_workers=5, max_attempts=2):
    """Run one full crawl: a threaded first pass, then sequential retries.

    Every account returned by ``CrawlAima.get_main_page`` is crawled once in
    a thread pool. Accounts that failed are then taken from the crawler's
    retry queue and crawled again one at a time until each of them either
    succeeds or has failed ``max_attempts`` times.

    Args:
        max_workers: size of the thread pool used for the first pass.
        max_attempts: number of failed attempts after which an account is
            given up. The default of 2 matches the previous hard-coded
            ``counter > 1`` check (the old log text wrongly said 3).
    """
    start_time = time.time()
    print("开始时间:", time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(start_time)))
    res = CrawlAima()
    # Load every account page to crawl.
    res.get_main_page()

    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        futures = [pool.submit(res.crawl_handler, page) for page in res.main_page_list]
        for future in as_completed(futures):
            # Surfaces any exception that escaped crawl_handler.
            future.result()

    crawl_fail_list = []
    while res.dq:
        page = res.dq.pop()
        attempts = res.counter.get(page, 0)
        print(f"剩余总数:{len(res.dq)} 当前page:{page}, 第 {attempts} 抓取")
        if attempts >= max_attempts:
            print(page, f"请求次数达到{max_attempts}次 放弃抓取", attempts)
            crawl_fail_list.append(page)
            continue
        # A failing retry re-queues the page itself, so the loop sees it again.
        res.crawl_handler(page)

    print(f'------------{max_attempts}次 抓取失败----------------')
    print('抓取失败:', json.dumps(crawl_fail_list))
    print('----------------------------')
    end_time = time.time()
    print('finish', int(end_time - start_time) / 60, ' min',
          time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))
if __name__ == "__main__":
    # Script entry point: perform exactly one crawl pass per invocation.
    # (Periodic re-runs are left to whatever schedules this script.)
    loop()