Batch-Downloading Sina Weibo Albums with Python

2016-07-16  平仄_pingze

I recently wrote a Python program that batch-downloads Sina Weibo albums: pick a user, run it, and every photo in their albums is saved locally. It keeps a download history, so you can stop partway through and resume later.

Browsing albums on Sina Weibo requires being logged in. After experimenting with simulated login and wrestling with the login mechanism and captcha for a long time, I settled on logging in directly with a cookie, which turns out to be much simpler.
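For example, a quick way to verify that a copied cookie string still logs you in is to send it as a Cookie header and look for a logged-in marker in the response. This is a minimal sketch: the cookie value is a placeholder, and checking for $CONFIG['islogin'] is an assumption about Weibo's page markup at the time.

# coding=utf-8
import requests

cookies = 'SINAGLOBAL=...; SUB=...'  # placeholder: paste the Cookie header copied from your browser
resp = requests.get('http://www.weibo.com', headers={'Cookie': cookies})
# assumption: a logged-in Weibo page embeds $CONFIG['islogin']='1' in its source
print('cookie works' if "$CONFIG['islogin']='1'" in resp.text else 'cookie invalid or expired')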

The code targets Python 2.7 and uses just one third-party library, requests, whose API is much nicer to work with:

pip install requests

To use it:
1. Open the user's Weibo page, press F12 or view the page source, find their page_id, and fill it into uid in the program (see the sketch after this list).
2. Use F12 or another network-monitoring tool to find your cookies and fill them into cookies in the program.
3. Fill the local directory where you want to save the photos into dirpath in the program.
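For step 1, the page_id is embedded in the page source as part of the $CONFIG object; here is a hedged sketch of extracting it with a regex (the profile URL is a placeholder, and the exact $CONFIG key is an assumption about Weibo's markup at the time):

import re
import requests

cookies = '...'  # placeholder: your Cookie header
html = requests.get('http://www.weibo.com/SOME_USER', headers={'Cookie': cookies}).text  # placeholder profile URL
m = re.search(r"\$CONFIG\['page_id'\]='(\d+)'", html)  # assumption: page embeds $CONFIG['page_id']
print(m.group(1) if m else 'page_id not found')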

Here is the complete code:

# coding=utf-8
# Author: 平仄_pingze (Jianshu)

"功能"
'''
获取新浪微博用户相册照片到本地
'''

"使用方法"
'''
1.填写储存目录
2.指定微博用户id
3.填写cookie
4.运行
'''

# ---|| Initial parameters, fill these in before running ||---
dirpath = ''  # storage directory
uid = ''      # the user's page_id
cookies = ''  # cookies


import os
import re
import time
import pickle
import urllib
import traceback
import requests


def list_find(alist, ele):
    'list.index() that returns -1 instead of raising'
    try:
        return alist.index(ele)
    except ValueError:
        return -1

def get_response(url, headers=None, params=None):
    'Fetch a URL robustly, retrying on failure'
    max_try_times = 20   # maximum number of attempts
    wait_time = 0.75     # timeout per attempt (seconds)
    sleep_time = 0.25    # delay after a failed attempt
    for times in range(1, max_try_times + 1):
        try:
            response = requests.get(url, timeout=wait_time, headers=headers, params=params)
            break
        except requests.RequestException:
            if times < max_try_times:
                time.sleep(sleep_time)
                continue
            else:
                print('[%s][ERROR] The last try failed, exiting ...' % time.asctime()[11:19])
                traceback.print_exc()
                exit()
    return response

def retrieve(imgurl, imgpath):
    'Download an image robustly (skip it after repeated failures)'
    max_try_times = 5   # maximum number of attempts
    wait_time = 15      # timeout per attempt (seconds)
    sleep_time = 3      # delay after a failed attempt
    import socket
    socket.setdefaulttimeout(wait_time)
    for times in range(1, max_try_times + 1):
        try:
            urllib.urlretrieve(imgurl, imgpath)
            break
        except IOError:
            if times < max_try_times:
                time.sleep(sleep_time)
                continue
            else:
                print('[%s][ERROR] The last try failed, skipping ...' % time.asctime()[11:19])
                break

def secp(string, pattern1, pattern2=''):
    'Replace every occurrence of pattern1 in string with pattern2'
    return string.replace(pattern1, pattern2)

def url_deal(url):
    'Clean an image URL: strip JSON escaping and switch thumbnails to full size'
    urld = secp(url, '\\')
    urld = secp(urld, 'thumb300', 'large')
    return urld

def get_imgurl(html):
    'Parse html and return the list of image URLs'
    imgurl_list = []
    extlist = ['jpg', 'gif', 'png']
    for ext in extlist:
        pattern = r'class=\\\"photo_pict\\\" src=\\\"(http:\S+thumb300\S+\.' + ext + ')'
        result = re.findall(pattern, html, re.S)
        for url in result:
            imgurl_list.append(url_deal(url))
    return imgurl_list

def save_img(imgurl, savepath, imgname):
    'Save an image into the local directory'
    imgext = imgurl[-4:]
    imgname = imgname + imgext
    retrieve(imgurl, savepath + os.sep + imgname)

def save_log(dic, path):
    'Store a dict at the target path as a pickle file'
    try:
        with open(path, 'wb') as out_file:
            pickle.dump(dic, out_file)
        return path
    except Exception:
        traceback.print_exc()
        return None

def load_log(path):
    'Read a pickle file from the given path back into a dict'
    try:
        with open(path, 'rb') as in_file:
            return pickle.load(in_file)
    except Exception:
        traceback.print_exc()
        return None

def main():
    url = 'http://www.weibo.com/p/' + str(uid) + '/photos'
    headers = {
        'Cookie': cookies
    }
    # Fetch the album page
    response = get_response(url, headers=headers)
    print('[%s][INFO] Pro starting at %s ...' % (time.asctime()[11:19], response.url))
    html = response.text
    # Check that the html is valid; if not, report the error and abort
    if len(re.findall('thumb300', html, re.S)) < 1 and len(re.findall('oid', html, re.S)) < 1 and len(re.findall(u'的微博', html, re.S)) < 1:
        print('[%s][ERROR] Invalid cookies or page_id, please check!' % (time.asctime()[11:19]))
        exit()
    # Parse the document for the user name and image URLs
    uname = re.findall(u'content="(.+?),', html, re.S)[0]
    imgurl_list = get_imgurl(html)
    imgurl_list = get_imgurl(html)
    # Loop over the dynamically loaded pages
    while True:
        # Extract since_id so the next lazily loaded page can be requested
        result = re.findall(r'since_id=(\S+)">', html, re.S)
        if len(result) > 0:
            since_id = result[0][:-1]
        else:
            break
        payload = {
            'since_id': since_id,
            'page_id': uid,
            'ajax_call': 1
        }
        url = 'http://weibo.com/p/aj/album/loading'
        response = get_response(url, params=payload, headers=headers)
        html = response.text
        print('[%s][INFO] Got new page of %s !' % (time.asctime()[11:19], response.url))
        # Parse the new fragment for more image URLs
        imgurl_list = imgurl_list + get_imgurl(html)
    savepath = dirpath + os.sep + uname
    if not os.path.isdir(savepath):
        os.mkdir(savepath)
    imgurl_list.reverse()
    global total_num
    total_num = len(imgurl_list)
    # Check whether a log file already exists (enables resuming)
    logpath = savepath + os.sep + 'log.pkl'
    if os.path.isfile(logpath):
        print('[%s][INFO] Found log.pkl, loading...' % (time.asctime()[11:19]))
        logdic = load_log(logpath)
        log_last_num = logdic.get('last_num')
        log_imgurl_list = logdic.get('imgurl_list')
        index = log_last_num + 1
    else:
        print('[%s][INFO] Not found log.pkl, creating a new one ...' % (time.asctime()[11:19]))
        log_imgurl_list = []
        index = 1
    # Start downloading images
    num = 1
    for imgurl in imgurl_list:
        if list_find(log_imgurl_list, imgurl) < 0:
            imgname = '{:0>5}'.format(index)
            save_img(imgurl, savepath, imgname)
            index = index + 1
            last_num = index - 1
            log_imgurl_list.append(imgurl)
            logdic = {
                'last_num': last_num,
                'imgurl_list': log_imgurl_list
            }
            print('[%s][INFO] Writing log ... (%d/%d) !' % (time.asctime()[11:19], num, total_num))
            save_log(logdic, logpath)
            print('[%s][INFO] Successfully saved image as %s (%d/%d) !' % (time.asctime()[11:19], imgname, num, total_num))
        else:
            print('[%s][INFO] Skipping this image, already downloaded (%d/%d) !' % (time.asctime()[11:19], num, total_num))
        num = num + 1

if __name__ == '__main__':
    main()

For example, my initial parameters were:

dirpath = 'images'  # an images folder next to the script
uid = 1035051191258123  # Han Han
cookies = 'SINAGLOBAL=221.237.83.131_146556……'  # very long, not shown in full
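The log.pkl written alongside the images is what makes stopping and resuming work: it is a pickled dict holding the number of the last saved image and the list of URLs already downloaded. A quick sketch for peeking at it (the directory name below is a placeholder; the script actually names the folder after the user's screen name):

import pickle
# 'images/some_user' is a placeholder path; adjust it to your own save directory
with open('images/some_user/log.pkl', 'rb') as f:
    log = pickle.load(f)
print(log['last_num'], len(log['imgurl_list']))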

That's the general approach. If you have any thoughts or ideas, feel free to share them...
