python3--有cookie下载网页上的文件

2020-11-26  本文已影响0人  w_dll

en....
总的来说,就是先从网页的 HTML 里过滤出我想要的文件链接,再逐个下载。
该页面需要登录后的 cookie 才能访问,不带 cookie 的普通 GET 请求拿不到内容;
脚本如下:

#!/usr/bin/python3
import sys, io, re, os
from urllib import request


def get_download_url(sub_url):
  """Build the absolute download URL from one line of Redmine HTML.

  Args:
    sub_url: A line of HTML (converted with ``str``) expected to contain an
      ``href="..."`` attribute whose value includes a ``/redmine`` path.

  Returns:
    ``http://redmine.springgroup.cn`` followed by the href value, taken from
    its first ``/redmine`` onwards.

  Raises:
    ValueError: If no href attribute on the line contains ``/redmine``.
  """
  # Capture each href value up to its own closing quote and work on the raw
  # text. Going through str(list) would repr-escape apostrophes/backslashes
  # in the URL, and a greedy match would run on into later attributes.
  for href in re.findall(r'href="([^"]*)"', str(sub_url)):
    path_start = href.find("/redmine")
    if path_start != -1:
      return "http://redmine.springgroup.cn" + href[path_start:]
  raise ValueError("no href containing '/redmine' found in: %r" % (sub_url,))

def get_file_name(sub_url):
  """Return the link text that follows the first href on an HTML line.

  The text between the first ``>`` after ``href="`` and the next ``<`` on the
  same line is used as the file name. HTML character references in it
  (e.g. ``&amp;``) are decoded.

  Args:
    sub_url: A line of HTML (converted with ``str``) containing an anchor.

  Returns:
    The anchor's link text as a string (may be empty if the tag is
    immediately followed by another tag).

  Raises:
    ValueError: If the line has no ``href="`` followed by ``>...<``.
  """
  import html  # local import: only this function needs it

  # A capture group takes exactly the text between ">" and "<". Stripping
  # with strip('>,<') would also eat leading/trailing commas of the name.
  match = re.search(r'href=".*?>(.*?)<', str(sub_url))
  if match is None:
    raise ValueError("no link text found in: %r" % (sub_url,))
  return html.unescape(match.group(1))


def pre_fun(save_dir, redmine_number):
  #浏览器登录后得到的cookie,也就是刚才复制的字符串
  #cookie_str = r'JSESSIONID=xxxxxxxxxxxxxxxxxxxxxx; iPlanetDirectoryPro=xxxxxxxxxxxxxxxxxx'
  cookie_str = r'_redmine_session=ajhuOC9xbG9NaWlyUjJ4RTBzcDF4cjl1SVVzUlF4V1dURitCQ2x1U0FpQ1kva1ZrM1ppZ3FDTjVXbnNkdlNHSld3WCt4UjVIYlFBcFhMd29mTVdTc290ZGk5WGRERzl0RmR6V3VubFMxQkF1VGQvQlVGcHdEZWhkMTJFMzNGbVdQSlhYcnJldG8'
  os.chdir(save_dir)
  if not os.path.isdir(redmine_number):
    os.makedirs(redmine_number)
  os.chdir(redmine_number)
  #登录后才能访问的网页
  redmine_url = 'http://redmine.springgroup.cn/redmine/issues/' + redmine_number

  return redmine_url, cookie_str


def _open_with_cookie(url, cookie_str):
  """Open ``url`` sending the given cookie and a browser-like User-Agent."""
  req = request.Request(url)
  req.add_header('cookie', cookie_str)
  req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36')
  return request.urlopen(req)


def start_download(redmine_url, cookie_str):
  """Download every archive attachment linked from a Redmine issue page.

  The page is read line by line until ``</html>``. A line is treated as an
  attachment link when it contains ``href`` and ``download``, mentions one of
  the archive markers (rar/zip/tgz/tar) and none of the excluded words. Each
  matching file is saved into the current working directory under its link
  text.

  Args:
    redmine_url: URL of the issue page (requires a logged-in session).
    cookie_str: Cookie header value sent with every request.

  Raises:
    urllib.error.URLError: If a request fails (propagated from ``urlopen``).
    ValueError: If a matching line cannot be parsed by the link helpers.
  """
  import shutil  # local import: only needed for streaming the download

  # Print file names (often Chinese) as UTF-8 regardless of the locale.
  # Reconfigure the existing stream where possible instead of stacking a new
  # wrapper around sys.stdout.buffer on every call.
  if hasattr(sys.stdout, 'reconfigure'):
    sys.stdout.reconfigure(encoding='utf8')
  else:
    sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf8')

  excluded_words = ("数据库", "已添加", "手册", ".htm")
  archive_markers = ("rar", "zip", "tgz", "tar")

  with _open_with_cookie(redmine_url, cookie_str) as resp:
    # Iterating stops at EOF, so a page without a closing </html> cannot
    # spin forever on empty reads.
    for raw_line in resp:
      li = raw_line.decode('utf-8')
      if "</html>" in li:
        break

      # Match attachment links only: skip excluded content, require an
      # archive marker and a download href.
      if any(word in li for word in excluded_words):
        continue
      if not any(marker in li for marker in archive_markers):
        continue
      if "href" not in li or "download" not in li:
        continue

      file_name = get_file_name(li)
      download_url = get_download_url(li)

      print("开始下载 %s ..." % (file_name))
      # The name comes from the server; keep only its final path component
      # so it cannot point outside the current directory.
      local_name = os.path.basename(file_name)
      with _open_with_cookie(download_url, cookie_str) as this_file, \
           open(local_name, "wb") as code:
        # Stream in chunks rather than holding the whole archive in memory.
        shutil.copyfileobj(this_file, code)
      print("ok\n")


if __name__ == '__main__':
  # Redmine issue number whose attachments should be fetched.
  redmine_number = '346683'
  # Directory under which the per-issue download folder is created.
  save_dir = "/home/xxwdll/soft/nginx/download/tmp"

  # Prepare the target directory, then download using the returned
  # (issue URL, cookie) pair.
  start_download(*pre_fun(save_dir, redmine_number))

结果如下


上一篇 下一篇

猜你喜欢

热点阅读