Scraping images from wall.alphacoders.com
result:
requests is actually quite fast.
issue: I planned to download with scrapy, but there were no errors and no results either; adding the cookie to the headers in settings did not help, even though requesting the same pages with the Requests library succeeds.
solution: probably because the link contains a "?" that I did not escape in the LinkExtractor regex (fixed in the spider code below)...
takeaway: global variable declaration; keep learning BS4.
1 Fetching the images with requests
Using requests takes quite a lot of code; a framework really is more comfortable.
This walkthrough uses the Natsume-related image page 'https://wall.alphacoders.com/tags.php?tid=45523&page=1' as the demo.
issue: not every image was downloaded.
solution: it turned out I had hard-coded jpg into the search regex, while some images use the png suffix (see the pattern sketch after these notes).
issue: requests fetches the images one by one, in order.
solution: learn to use multiprocessing and the multiprocess package.
issue: only the images the page itself loads are downloaded; the higher-resolution version needs a click on the download button.
solution: the file obtained by clicking download manually has the same size as the one requests downloads. scrapy is tried next.
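For reference, a pattern that names both suffixes explicitly could look like the single line below (only a sketch; html is the page text returned by get_html in the script that follows, which instead matches any suffix):
srcs = re.findall(r'data-src="https://images\d?\.alphacoders\.com/.+?\.(?:jpg|png)"', html)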
#coding: utf-8
#usage: python script
import requests, re
HEADERS = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Language': 'en',
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36',
'Cookie': '__cfduid=d863900ab9b493ee3cefa677797c43c121589890737; cookieconsent_status=allow; wa_session=so9sub6dc51vsemee397th2ncttu15sq59vt3f59skc84a810os4d5cv0dk76q4649s8at2mlhalrrc24cbhpjqlhqk8923hntjmh20'
}
def get_all_page_url(): #collect the url of every listing page
    first_page = input("please input the first page url, as below \n https://wall.alphacoders.com/tags.php?tid=45523&page=1 \n : ")
    pages_num = input("please input the max page num shown on the page, or any smaller number you like \n: ")
    main_url = '='.join(first_page.split("=")[:-1]) #the url contains two '=' signs, so rejoin everything except the trailing page number
    all_pages_url = [main_url + "=" + str(p + 1) for p in range(int(pages_num))] #build the per-page urls
    #print(all_pages_url)
    return all_pages_url
def get_html(url): #request a listing page
    try:
        res = requests.get(url, headers=HEADERS)
        print(res.status_code)
        res.raise_for_status()
        res.encoding = res.apparent_encoding #I had misspelled "apparent" and lost a lot of time to it
        return res.text
    except Exception:
        print("cant get html")
        return ""
def parse_page(ilt, html): #search the page for image links with a regex
    try:
        srcs = re.findall(r'data-src="https://images[\d]?\.alphacoders\.com/.*\..+?"', html)
        for i in srcs:
            src = i.split("=")[1].strip("\"") #thumbnail url
            src = src.replace('thumb-350-', '') #strip the prefix to get the full-size url
            #print(src)
            ilt.append(src)
    except Exception:
        print("failed to extract the urls")
    else:
        print("process success")
def down_pic(ilt):
    j = 0
    for i in ilt:
        print('requesting', i)
        html = requests.get(i, headers=HEADERS)
        title = i.split('/')[-1]
        print("downloading image", str(j + 1), ":", title)
        with open(path + title, 'wb') as f:
            f.write(html.content)
        j += 1
    print('total images downloaded:', j)
def main(): #driver
    path1 = input('please input the save path, e.g. \n "C:\\Users\\Acer\\Desktop\\xxmu" \n on Windows you can copy the path straight from the folder window \n: ')
    global path
    path = path1.strip("\"") + "\\" #strip surrounding quotes and make sure the path ends with a backslash
    print("files will be saved to", str(path), '\n')
    ilt = []
    all_pages_url = get_all_page_url()
    all_pages_url_num = len(all_pages_url)
    for single_page_url in all_pages_url:
        page_number = single_page_url.split("=")[-1]
        html = get_html(single_page_url)
        parse_page(ilt, html)
    all_pics_number = len(ilt)
    print(str(all_pics_number), "pics are found")
    down_pic(ilt) #comment this line out first to check that the link count looks right
if __name__ == '__main__':
    import time
    st = time.time()
    main()
    time.sleep(1)
    et = time.time()
    print("elapsed", str(et - st), "s")
2 Using Requests + multiprocessing
The second argument to pool.map must be an iterable.
87 images, about 423 MB, took roughly 155 s.
issue: declaring the save path as a global variable failed.
solution: no idea at the time (one possible workaround is sketched right below).
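A likely explanation (not stated in the original notes): on Windows, multiprocessing starts workers with the spawn method, so a global assigned in the parent process is never visible inside the pool workers. One workaround is to pass the path to each worker explicitly, for example with functools.partial. A minimal sketch, reusing requests and HEADERS from the full script below; the save_dir parameter is illustrative and not part of the original code:
import os
from functools import partial
import multiprocessing as mp

def down_single_pic(single_pic_url, save_dir):
    html = requests.get(single_pic_url, headers=HEADERS)
    title = single_pic_url.split('/')[-1]
    with open(os.path.join(save_dir, title), 'wb') as f:
        f.write(html.content)

def down_pic(ilt, save_dir):
    with mp.Pool() as pool:
        #partial pins save_dir, so pool.map still only iterates over the url list
        pool.map(partial(down_single_pic, save_dir=save_dir), ilt)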
#coding: utf-8
#usage: python script
import requests, re
import multiprocessing as mp
HEADERS = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Language': 'en',
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36',
'Cookie': '__cfduid=d863900ab9b493ee3cefa677797c43c121589890737; cookieconsent_status=allow; wa_session=so9sub6dc51vsemee397th2ncttu15sq59vt3f59skc84a810os4d5cv0dk76q4649s8at2mlhalrrc24cbhpjqlhqk8923hntjmh20'
}
#get the folder path (disabled in this version; files are saved to the working directory)
def get_all_page_url(): #collect the url of every listing page
    first_page = input("please input the first page url, as below \n https://wall.alphacoders.com/tags.php?tid=45523&page=1 \n : ")
    pages_num = input("please input the max page num shown on the page, or any smaller number you like \n: ")
    main_url = '='.join(first_page.split("=")[:-1]) #the url contains two '=' signs, so rejoin everything except the trailing page number
    all_pages_url = [main_url + "=" + str(p + 1) for p in range(int(pages_num))] #build the per-page urls
    #print(all_pages_url)
    return all_pages_url
def get_html(url): #request a listing page
    try:
        res = requests.get(url, headers=HEADERS)
        print(res.status_code)
        res.raise_for_status()
        res.encoding = res.apparent_encoding #I had misspelled "apparent" and lost a lot of time to it
        return res.text
    except Exception:
        print("cant get html")
        return ""
def parse_page(ilt, html): #search the page for image links with a regex
    try:
        srcs = re.findall(r'data-src="https://images[\d]?\.alphacoders\.com/.*\..+?"', html)
        for i in srcs:
            src = i.split("=")[1].strip("\"") #thumbnail url
            src = src.replace('thumb-350-', '') #strip the prefix to get the full-size url
            #print(src)
            ilt.append(src)
    except Exception:
        print("failed to extract the urls")
    else:
        print("all pics url got")
def down_single_pic(single_pic_url):
    print('requesting download:', single_pic_url)
    html = requests.get(single_pic_url, headers=HEADERS)
    title = single_pic_url.split('/')[-1]
    with open(title, 'wb') as f: #saved to the current working directory
        f.write(html.content)
def down_pic(ilt):
    pool = mp.Pool()
    pool.map(down_single_pic, ilt) #map blocks until every url has been handled
    pool.close()
    pool.join()
#print("正在下载第",str(j+1),"张 ",(title))
# with open(path+title,'wb') as f:
# f.write(html.content)
#j+=1
#print('共下载图片数量',j)
def main(): #driver
    ilt = []
    all_pages_url = get_all_page_url()
    all_pages_url_num = len(all_pages_url)
    for single_page_url in all_pages_url:
        page_number = single_page_url.split("=")[-1]
        html = get_html(single_page_url)
        parse_page(ilt, html)
    all_pics_number = len(ilt)
    print(str(all_pics_number), "pics are found")
    down_pic(ilt) #comment this line out first to check that the link count looks right
if __name__ == '__main__':
    # path1 = input('please input the save path, e.g. \n "C:\\Users\\Acer\\Desktop\\xxmu" \n on Windows you can copy the path straight from the folder window \n: ')
    # global path
    # path = path1.strip("\"") + "\\" #disabled: see the global-variable issue above
    # print("files will be saved to", str(path), '\n')
    import time
    st = time.time()
    main()
    time.sleep(1)
    et = time.time()
    print("elapsed", str(et - st), "s")
3 Downloading with scrapy
Although I could follow along with a tutorial video successfully before, writing it myself I still hit quite a few problems. 121 images, 64 MB, finished in under 2 minutes.
issue: extracting the url attribute from the img tags returned an empty list.
solution: that attribute is simply not in the response; inspect the tag first to see which attributes it actually carries and whether the value appears in the page source (a quick scrapy shell check is sketched below).
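A quick way to do that check is scrapy shell; a sketch (data-src is the attribute the requests version above matched, so that is what to look for):
scrapy shell 'https://wall.alphacoders.com/tags.php?tid=45523&page=1'
>>> response.xpath('//div[@class="thumb-container"]//a/img').get()   #print one raw <img> tag and read off its attributes
>>> response.xpath('//div[@class="thumb-container"]//a/img/@data-src').getall()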
The CrawlSpider workflow and the pipeline are described below.
CrawlSpider workflow
1. Create the project and files
scrapy startproject xxmu
cd into the project folder and generate the spider:
scrapy genspider --list #list the available templates
scrapy genspider -t crawl xxmu_spider wall.alphacoders.com
2. Spider configuration
Running it at first produced no results; the requests need to carry the logged-in cookies (a sketch of one way to do that follows the spider code).
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class XxmuSpiderSpider(CrawlSpider):
    name = 'xxmu_spider'
    allowed_domains = ['wall.alphacoders.com']
    start_urls = ['https://wall.alphacoders.com/by_sub_category.php?id=266516&name=Natsume%27s+Book+of+Friends+Wallpapers']

    rules = (
        #escape the regex metacharacters ('?', '.', '+') so the allow pattern matches the literal url;
        #the unescaped '?' was the suspected reason the spider extracted nothing
        Rule(LinkExtractor(allow=r'https://wall\.alphacoders\.com/by_sub_category\.php\?id=266516&name=Natsume%27s\+Book\+of\+Friends\+Wallpapers.*'),
             callback='parse_xxmu',
             follow=True),
    )

    def parse_xxmu(self, response):
        #the thumbnails keep the url in data-src rather than src (see the empty-list issue above)
        srcs = response.xpath('//div[@class="thumb-container"]//a/img/@data-src').getall()
        for src in srcs:
            print(src)
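Since putting a Cookie line into the settings headers did not work (a known Scrapy gotcha: the cookies middleware does not read a manually set Cookie header), one way to attach the login cookies is to override start_requests in the spider above and hand them to the first request; the rule-driven requests that follow reuse them from the cookiejar. A sketch; the wa_session value is a placeholder for the real session string:
    def start_requests(self):
        cookies = {
            'cookieconsent_status': 'allow',
            'wa_session': '<your wa_session value>',   #placeholder, not the real value
        }
        for url in self.start_urls:
            yield scrapy.Request(url, cookies=cookies)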
3. Settings configuration
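The original notes stop here, so the following is only a sketch of what a working settings.py for this crawl might contain; the pipeline class name and the IMAGES_STORE path are illustrative, not taken from the post:
# settings.py (sketch)
ROBOTSTXT_OBEY = False
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36',
    #a 'Cookie' entry here is ignored while the cookies middleware is enabled,
    #which is why the settings-header approach mentioned at the top did not work
}
ITEM_PIPELINES = {
    'xxmu.pipelines.XxmuImagesPipeline': 300,   #illustrative class name
}
IMAGES_STORE = r'C:\Users\Acer\Desktop\xxmu'    #illustrative path, reused from the requests example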
4. Pipeline
The pipeline builds the image request objects and returns them (a sketch follows).
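The note above is only a fragment, so this is a guess at the intended shape: a custom ImagesPipeline whose get_media_requests builds one request object per image URL and returns them. The class name and the image_urls item field are illustrative:
# pipelines.py (sketch)
import scrapy
from scrapy.pipelines.images import ImagesPipeline


class XxmuImagesPipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        #one download request per image url; scrapy's images pipeline saves the responses under IMAGES_STORE
        request_objcs = [scrapy.Request(url) for url in item['image_urls']]
        return request_objcs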