2020-06-28 下载cvpr2020的文章
2020-06-28 本文已影响0人
Joyner2018
下载 CVPR 2020 的论文
运行环境
ubuntu 16.04
安装的包（在 Python 3 环境下，代码还会用到 requests，需另行安装）
pip install bs4 --user
pip install youtube-dl --user
爬虫下载的代码
#encoding=utf-8
import sys
import os
from bs4 import BeautifulSoup
def get_content(url):
    """Fetch *url* over HTTP and return the response body.

    On Python 2 the request goes through ``urllib2`` and the raw result of
    ``read()`` is returned; on Python 3 it goes through ``requests`` and the
    decoded ``text`` is returned. Both branches send the same User-Agent
    header and use the same timeout.

    Args:
        url: Address of the page to fetch.

    Returns:
        The page content as returned by the underlying library.

    Raises:
        RuntimeError: If the interpreter's major version is neither 2 nor 3.
        Exception: HTTP and network errors raised by ``urllib2`` or
            ``requests`` propagate unchanged.
    """
    header = {"User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:48.0) Gecko/20100101 Firefox/48.0"}
    # Without a timeout either library may block forever on a stalled server.
    timeout_seconds = 30
    major = sys.version_info[0]

    if major == 2:  # Python 2
        import urllib2
        request = urllib2.Request(url=url, headers=header)
        response = urllib2.urlopen(request, timeout=timeout_seconds)
        try:
            return response.read()
        finally:
            response.close()

    if major == 3:  # Python 3
        import requests
        response = requests.get(url=url, headers=header, timeout=timeout_seconds)
        # urllib2 raises on 4xx/5xx responses; make this branch fail the same
        # way instead of silently returning an error page.
        response.raise_for_status()
        return response.text

    # Unknown interpreter version. Raising a bare string is itself a
    # TypeError, so raise a real exception carrying the same message.
    raise RuntimeError("python info not found.")
def get_a_flags(html):
    """Parse *html* with the "html.parser" backend and return all ``<a>`` tags."""
    return BeautifulSoup(html, "html.parser").find_all("a")
def get_a_hrefs(a_falgs):
    """Return the ``href`` value of every tag in *a_falgs* that has one.

    Tags whose ``href`` is missing or falsy (for example an empty string)
    are skipped; the order of the remaining values is preserved.
    """
    return [tag["href"] for tag in a_falgs if tag.get("href")]
def download(a_hrefs):
    """Download every URL in *a_hrefs* by running the ``youtube-dl`` command.

    Args:
        a_hrefs: A list of URLs. Any non-list value is treated as a single
            URL and wrapped in a one-element list.

    Returns:
        None. On systems other than Windows and Linux nothing is downloaded
        and a notice is printed instead.
    """
    if not isinstance(a_hrefs, list):
        a_hrefs = [a_hrefs]

    import platform
    import subprocess

    if platform.system() not in ("Windows", "Linux"):
        print("Other System tasks")
        return

    for href in a_hrefs:
        # The hrefs are scraped from a web page, so never splice them into a
        # shell command line. An argument list bypasses the shell entirely,
        # and "--" stops a URL starting with "-" being parsed as an option.
        try:
            subprocess.call(["youtube-dl", "--", href])
        except OSError as err:
            # The executable is missing or cannot be started; every further
            # URL would fail the same way, so report once and stop.
            print("youtube-dl could not be started: %s" % err)
            return
def main(url):
    """Fetch *url*, collect the PDF links on the page and download them.

    Every href containing ".pdf" is prefixed with
    "http://openaccess.thecvf.com/" before being passed to ``download``.
    Prints "end!" once ``download`` has returned.
    """
    page = get_content(url)
    anchors = get_a_flags(page)

    pdf_links = []
    for href in get_a_hrefs(anchors):
        if ".pdf" in href:
            pdf_links.append("http://openaccess.thecvf.com/" + href)

    download(pdf_links)
    print("end!")
if __name__ == "__main__":
    # Script entry point: run the scraper against the hard-coded index URL.
    index_url = "http://openaccess.thecvf.com/CVPR2020.py"
    main(index_url)