Python 爬虫 - Instagram异步协程!
首先分析网站的图片加载流程,taeri__taeri 这个网红应该有人认识。ins 的照片一次只加载一定数量,往下翻才会继续加载,所以毫无疑问要去看 XHR 请求
在预览栏里可以看到json数据, display_url 就是照片的链接,只要获取到这个就行了
回到 headers 看看请求用了哪些参数:只有两个,query_hash 和 variables
variables 是一个 json,里面有 id、first、after 这三项;为了省事,我直接说明这三项分别是什么,有兴趣的可以自己分析
<pre spellcheck="false" style="box-sizing: border-box; margin: 5px 0px; padding: 5px 10px; border: 0px; font-style: normal; font-variant-ligatures: normal; font-variant-caps: normal; font-variant-numeric: inherit; font-variant-east-asian: inherit; font-weight: 400; font-stretch: inherit; font-size: 16px; line-height: inherit; font-family: inherit; vertical-align: baseline; cursor: text; counter-reset: list-1 0 list-2 0 list-3 0 list-4 0 list-5 0 list-6 0 list-7 0 list-8 0 list-9 0; background-color: rgb(240, 240, 240); border-radius: 3px; white-space: pre-wrap; color: rgb(34, 34, 34); letter-spacing: normal; orphans: 2; text-align: left; text-indent: 0px; text-transform: none; widows: 2; word-spacing: 0px; -webkit-text-stroke-width: 0px; text-decoration-style: initial; text-decoration-color: initial;">id:user id 即用户id
first:一次请求要获取的照片数量
after:end cursor,用来标记上一页的结束位置;不带这个参数就会一直加载第一页,而每一页返回的数据里都带有一个 end cursor,用它来请求下一页
<pre spellcheck="false" style="box-sizing: border-box; margin: 5px 0px; padding: 5px 10px; border: 0px; font-style: normal; font-variant-ligatures: normal; font-variant-caps: normal; font-variant-numeric: inherit; font-variant-east-asian: inherit; font-weight: 400; font-stretch: inherit; font-size: 16px; line-height: inherit; font-family: inherit; vertical-align: baseline; cursor: text; counter-reset: list-1 0 list-2 0 list-3 0 list-4 0 list-5 0 list-6 0 list-7 0 list-8 0 list-9 0; background-color: rgb(240, 240, 240); border-radius: 3px; white-space: pre-wrap; color: rgb(34, 34, 34); letter-spacing: normal; orphans: 2; text-align: left; text-indent: 0px; text-transform: none; widows: 2; word-spacing: 0px; -webkit-text-stroke-width: 0px; text-decoration-style: initial; text-decoration-color: initial;">import json
import multiprocessing
import sys
from urllib.parse import urljoin
import aiohttp
import asyncio
import os
import re
from pathlib import Path
import requests
def __init__(self, username, maxtasks=200):
    """Set up the crawler state for one Instagram account.

    NOTE(review): the article's formatting stripped the double underscores,
    leaving this constructor named ``init`` -- the same name as the
    coroutine that looks up the user id. That later definition would shadow
    this one and ``Instagram(USERNAME)`` could not accept a username, so the
    dunder name is restored here. Presumably this is a method of the
    ``Instagram`` class whose header is not part of the listing.

    :param username: account name; appended to ROOT_URL to build the
        profile URL.
    :param maxtasks: number of concurrent download workers; the URL queue
        holds at most twice this many entries.
    """
    self.username = username
    self.maxtasks = maxtasks  # maximum number of download tasks
    self.queue = asyncio.Queue(maxsize=maxtasks * 2)
    # Configure the proxy: per the author, Instagram is unreachable without
    # one. The session is created with trust_env=True so that it reads
    # these environment variables.
    os.environ['http_proxy'] = PROXY
    os.environ['https_proxy'] = PROXY
    self.session = aiohttp.ClientSession(trust_env=True, headers=HEADERS)
首先获取user id:
async def get_shared_data(self):
    """Fetch the profile page and return its embedded ``window._sharedData``.

    NOTE(review): this listing lost several lines in extraction (the
    ``try:`` header, the delimiter of the second ``split`` call and the
    bodies of the failure branches). They are reconstructed here: every
    failure terminates the program, as the original comment on the
    empty-data check states.

    :return: the decoded shared-data JSON object.
    :raises SystemExit: if the page cannot be fetched, or carries no
        decodable shared data.
    """
    marker = "window._sharedData = "
    try:
        async with self.session.get(ROOT_URL + self.username) as resp:
            html = await resp.text()
    except (aiohttp.ClientError, asyncio.TimeoutError) as exc:
        sys.exit('Failed to fetch the profile page: %r' % exc)

    _, found, tail = html.partition(marker)
    if not found:
        sys.exit('No shared data found in the profile page.')
    try:
        # raw_decode parses the first JSON value and ignores whatever markup
        # follows it, so no closing delimiter has to be guessed.
        shared_data, _ = json.JSONDecoder().raw_decode(tail.lstrip())
    except ValueError as exc:
        sys.exit('Shared data is not valid JSON: %r' % exc)
    if not shared_data:
        # Without shared data nothing else can work; stop right here.
        sys.exit('Shared data is empty.')
    return shared_data
async def init(self):
    """Load the profile's shared data and record the user id and photo count.

    Sets ``self.user_id`` and ``self.count`` from the ``user`` record found
    under ``entry_data -> ProfilePage[0] -> graphql`` in the shared data.

    :raises SystemExit: (exit code 1) when the user record is missing or
        empty. The original printed the message but then carried on and
        subscripted the empty value.
    """
    shared_data = await self.get_shared_data()
    try:
        user = shared_data['entry_data']['ProfilePage'][0]['graphql']['user']
    except (KeyError, IndexError, TypeError):
        # A page without the expected structure is treated like a missing user.
        user = None
    if not user:
        print('user is none.')
        sys.exit(1)
    self.user_id = user['id']  # user id
    self.count = user['edge_owner_to_timeline_media']['count']  # number of photos
async def produce_download_urls(self, max=50):
    """Page through the account's timeline and queue every photo URL.

    NOTE(review): the listing lost lines in extraction (the dictionary keys
    of the request parameters, the end of the JSON lookup and the loop
    exit). They are reconstructed from the article's description of the
    request: two parameters, ``query_hash`` and ``variables``, the latter a
    JSON object holding ``id``, ``first`` and ``after``.

    :param max: number of photos to request per page.
    """
    pic_url = ROOT_URL + 'graphql/query/'
    end_cursor = ''  # an empty cursor loads the first page
    while True:
        variables = {'id': self.user_id, 'first': max, 'after': end_cursor}
        pic_params = {
            # query_hash can stay fixed at this value
            'query_hash': 'f2405b236d85e8296cf30347c9f08c2a',
            'variables': json.dumps(variables, separators=(',', ':')),
        }
        async with self.session.get(pic_url, params=pic_params) as resp:
            # Named "payload" so the json module is not shadowed.
            payload = await resp.json()
        edge_media = payload['data']['user']['edge_owner_to_timeline_media']
        for edge in edge_media['edges']:
            # Hand the photo URL to the download workers through the queue.
            await self.queue.put(edge['node']['display_url'])
        # page_info.has_next_page is true/false and tells whether another
        # page exists; end_cursor is the cursor to request it with.
        page_info = edge_media['page_info']
        if not page_info['has_next_page']:
            break
        end_cursor = page_info['end_cursor']
async def download(self):
    """Worker loop: take photo URLs off the queue and save each one to disk.

    Runs until the producer task has finished and the queue is empty. The
    file name is the last path segment of the URL with the query string
    removed, placed under PATH.

    NOTE(review): the ``open`` call and the chunk write were garbled in the
    listing and are reconstructed here; PATH is assumed to be a
    ``pathlib.Path`` (it is combined with ``/`` in the original).
    """
    while not (self.producer.done() and self.queue.empty()):
        try:
            # A plain ``await self.queue.get()`` can block forever: a worker
            # that passed the loop check while the producer was still
            # running would never wake up once the queue is drained. The
            # timeout sends it back to re-check the loop condition.
            url = await asyncio.wait_for(self.queue.get(), timeout=1)
        except asyncio.TimeoutError:
            continue
        try:
            filename = PATH / url.split('?')[0].split('/')[-1]
            async with self.session.get(url) as resp:
                with filename.open('wb') as f:
                    async for chunk in resp.content.iter_any():
                        f.write(chunk)
        finally:
            # Mark the URL taken with get() as processed, even on failure.
            self.queue.task_done()
        print('.', end='', flush=True)
async def run(self):
    """Run the whole crawl: look up the user, then produce and download.

    Starts ``produce_download_urls`` as a background task stored in
    ``self.producer`` and runs ``self.maxtasks`` ``download`` workers
    concurrently.

    NOTE(review): the argument of ``asyncio.gather`` was garbled in the
    listing; it is reconstructed as one ``download()`` coroutine per worker.
    """
    await self.init()
    print('User id: %r.' % self.user_id)
    print('Total %r photos.' % self.count)
    self.producer = asyncio.create_task(self.produce_download_urls())
    print('Downloading...', end='', flush=True)
    await asyncio.gather(*(self.download() for _ in range(self.maxtasks)))
    # Surface any exception raised inside the producer; otherwise it would
    # only show up as a "Task exception was never retrieved" warning.
    await self.producer
def check(_):
    """Poll the download directory until it holds every photo of the profile.

    Reads the photo count from the profile page with a regular expression,
    then repeatedly counts the entries in PATH until the count is reached.
    It is the ``target`` of the ``multiprocessing.Process`` created at
    start-up.

    NOTE(review): the end of the polling loop was lost in the listing; the
    exit and the one-second pause between polls are reconstructed.

    :param _: unused placeholder argument.
    """
    import time  # local import: only this checker needs it

    print('Start check...')
    # The original "http" entry used port 80001, which is not a valid TCP
    # port; both entries now use 8001 like the "https" one.
    proxies = {'http': 'http://localhost:8001', 'https': 'https://localhost:8001'}
    with requests.get(urljoin(ROOT_URL, USERNAME), headers=HEADERS,
                      proxies=proxies) as resp:
        page = resp.text
    pattern = '"edge_owner_to_timeline_media":.?{"count":(.*?),"page_info"'
    match = re.search(pattern, page)
    if match is None:
        print('Check aborted: photo count not found in the profile page.')
        return
    count = int(match.group(1))
    while True:
        files = len(os.listdir(PATH))
        print('Check files:%r' % files)
        # ">=" rather than "==": an extra file in the directory must not
        # keep the checker polling forever.
        if files >= count:
            print('\nProduce done, Total %r photos, plz wait save done :)' % count)
            break
        time.sleep(1)
async def main():
    """Create the crawler for USERNAME, run it, and always close it afterwards.

    NOTE(review): the listing went straight from construction to
    ``close()``; the ``run()`` call is reconstructed. ``close()`` is not
    shown in the article -- presumably it closes the aiohttp session.
    """
    ins = Instagram(USERNAME)
    try:
        await ins.run()
    finally:
        # Close even when the crawl fails or is cancelled.
        await ins.close()
最后是程序入口(if __name__ == '__main__':):
# Script entry point: start the file-count checker in a separate process,
# then run the crawler.
# NOTE(review): the listing lost the double underscores of __name__ /
# __main__, the process start, the ``try:`` header and the call that runs
# the event loop; they are reconstructed here. asyncio.run() replaces the
# ensure_future()/get_event_loop() pair, which needs an event loop to exist
# before main() is scheduled.
if __name__ == '__main__':
    p = multiprocessing.Process(target=check, args=(0,))
    p.start()
    try:
        asyncio.run(main())
        # Give the checker a moment to report the final count.
        p.join(timeout=5)
    except KeyboardInterrupt:
        print('\nInterrupted.')
    finally:
        # Never leave the checker process running after the crawler stops.
        if p.is_alive():
            p.terminate()
        p.join()
