Scraping images from wall.alphacoders.com

2020-05-26  EZ

Result:

requests turned out to be quite fast.

issue: I planned to download with scrapy, but there was no error and no result; adding the cookie to the headers in settings didn't help either, while requesting the same pages directly with the requests library succeeded.
solution: probably because the URL contains a '?' that I hadn't escaped...

takeaway: global variable declaration; keep learning BS4
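That escaping issue in one line, as a minimal sketch (plain re here, but the allow pattern given to scrapy's LinkExtractor is the same kind of regular expression):

import re

url = 'https://wall.alphacoders.com/by_sub_category.php?id=266516'
# '?' is a regex metacharacter ("zero or one of the previous token"),
# so the unescaped pattern never matches the literal "php?id=".
print(re.search(r'by_sub_category.php?id=\d+', url))                  # None
print(re.search(re.escape('by_sub_category.php?id=') + r'\d+', url))  # a match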

1. Fetching images with requests

Plain requests needs quite a bit of code; a framework really is more comfortable.
The demo below uses the Natsume-related image page 'https://wall.alphacoders.com/tags.php?tid=45523&page=1'.

issue: not all images were downloaded
solution: I had put 'jpg' into the regex search string, but some images have a png extension

issue: requests fetches the images one after another, sequentially
solution: learn to use multiprocessing

issue: only the images the page itself loads are downloaded; the higher-resolution version seems to need a click on the download button
solution: the file obtained by clicking download manually has the same size as the file downloaded via requests, so the full-resolution file is already being fetched. Next, try downloading with scrapy.
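A rough way to do that size check in code (the image url below is hypothetical; substitute one printed by the script, and note that not every server reports Content-Length for a HEAD request):

import os, requests

url = 'https://images2.alphacoders.com/605/605592.png'      # hypothetical full-resolution url
remote = requests.head(url, headers={'User-Agent': 'Mozilla/5.0'})
remote_size = int(remote.headers.get('Content-Length', 0))  # size reported by the server
local_size = os.path.getsize(url.split('/')[-1])            # size of the manually downloaded copy
print(remote_size, local_size, remote_size == local_size)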

# coding: utf-8
# usage: python script

import requests, re, os
from lxml import etree            # not used in this version
from multiprocessing import Pool  # not used in this version
from bs4 import BeautifulSoup     # not used in this version

HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36',
    'Cookie': '__cfduid=d863900ab9b493ee3cefa677797c43c121589890737; cookieconsent_status=allow; wa_session=so9sub6dc51vsemee397th2ncttu15sq59vt3f59skc84a810os4d5cv0dk76q4649s8at2mlhalrrc24cbhpjqlhqk8923hntjmh20'
}

def get_all_page_url():  # collect the url of every gallery page
    first_page = input("please input the first page url, as below \n https://wall.alphacoders.com/tags.php?tid=45523&page=1 \n : ")
    pages_num = input("please input the max page num shown on the page, or any smaller number \n: ")

    main_url = '='.join(first_page.split("=")[:-1])  # split() gives a list; the url contains two '=' signs
    all_pages_url = [main_url + "=" + str(p+1) for p in range(int(pages_num))]  # build the page urls
    #print(all_pages_url)
    return all_pages_url

def get_html(url):  # request a gallery page
    try:
        res = requests.get(url, headers=HEADERS)
        print(res.status_code)
        res.raise_for_status()
        res.encoding = res.apparent_encoding  # had misspelled 'apparent' at first, which took a while to track down
        return res.text
    except:
        print("cant get html")
        return ""
        
def parse_page(ilt, html):  # search the page for image links with a regex
    try:
        srcs = re.findall(r'data-src="https://images[\d]?\.alphacoders.com/.*\..+?"', html)
        for i in srcs:
            src = i.split("=")[1].strip("\"")    # thumbnail url
            src = src.replace('thumb-350-', '')  # strip the prefix to get the full-resolution url
            #print(src)
            ilt.append(src)
    except:
        print("failed to extract urls")
    else:
        print("process success")
            

def down_pic(ilt):
    j = 0
    for i in ilt:
        print('requesting', i)
        html = requests.get(i, headers=HEADERS)
        title = i.split('/')[-1]
        print("downloading image", str(j+1), ":", title)
        with open(path + title, 'wb') as f:
            f.write(html.content)
        j += 1
    print('total images downloaded:', j)

def main():    # driver function
    path1 = input('please input the save path, e.g. \n "C:\\Users\\Acer\\Desktop\\xxmu" \n (on Windows you can copy it from the folder window) \n: ')
    global path
    path = path1.strip("\"") + "\\"  # strip surrounding quotes and make sure the path ends with a backslash
    print("files will be downloaded to", str(path), '\n')
    ilt = []
    all_pages_url = get_all_page_url()
    all_pages_url_num = len(all_pages_url)
    
    for single_page_url in all_pages_url:
        page_number = single_page_url.split("=")[-1]
        html = get_html(single_page_url)
        parse_page(ilt,html)
    all_pics_number = len(ilt)
    print(str(all_pics_number),"pics are found")
    down_pic(ilt)  # comment this line out first to check that the number of links found looks right
        
if __name__ == '__main__':
    import time
    st = time.time()
    main()
    time.sleep(1)
    et = time.time()
    print("用时",str(et-st),"s")

2. requests + multiprocessing

The second argument to pool.map must be an iterable.
87 images, about 423 MB, took roughly 155 s.
issue: declaring the global variable failed
solution: not figured out yet
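One way around it, sketched below with Pool's initializer (an assumption on my part, not what the script that follows does): on Windows the workers are spawned as fresh processes, so a global assigned only in the parent is not visible inside them; an initializer runs once in every worker and can set the global there.

import multiprocessing as mp

def init_worker(download_dir):
    # runs once in each worker process and sets a module-level global there
    global path
    path = download_dir

def save_to(url):
    title = url.split('/')[-1]
    print('would save to', path + title)  # 'path' now exists inside every worker

if __name__ == '__main__':
    urls = ['https://images2.alphacoders.com/605/605592.png']  # hypothetical
    with mp.Pool(initializer=init_worker, initargs=('C:\\Users\\Acer\\Desktop\\xxmu\\',)) as pool:
        pool.map(save_to, urls)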

#coding: utf-8
#usage: python script

import requests, re, os
from bs4 import BeautifulSoup  # not used in this version
import multiprocessing as mp

HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36',
    'Cookie': '__cfduid=d863900ab9b493ee3cefa677797c43c121589890737; cookieconsent_status=allow; wa_session=so9sub6dc51vsemee397th2ncttu15sq59vt3f59skc84a810os4d5cv0dk76q4649s8at2mlhalrrc24cbhpjqlhqk8923hntjmh20'
}
# getting the folder path is left in the commented-out block at the bottom

def get_all_page_url():  # collect the url of every gallery page
    first_page = input("please input the first page url, as below \n https://wall.alphacoders.com/tags.php?tid=45523&page=1 \n : ")
    pages_num = input("please input the max page num shown on the page, or any smaller number \n: ")

    main_url = '='.join(first_page.split("=")[:-1])  # split() gives a list; the url contains two '=' signs
    all_pages_url = [main_url + "=" + str(p+1) for p in range(int(pages_num))]  # build the page urls
    #print(all_pages_url)
    return all_pages_url

def get_html(url):  # request a gallery page
    try:
        res = requests.get(url, headers=HEADERS)
        print(res.status_code)
        res.raise_for_status()
        res.encoding = res.apparent_encoding  # had misspelled 'apparent' at first, which took a while to track down
        return res.text
    except:
        print("cant get html")
        return ""
        
def parse_page(ilt, html):  # search the page for image links with a regex
    try:
        srcs = re.findall(r'data-src="https://images[\d]?\.alphacoders.com/.*\..+?"', html)
        for i in srcs:
            src = i.split("=")[1].strip("\"")    # thumbnail url
            src = src.replace('thumb-350-', '')  # strip the prefix to get the full-resolution url
            #print(src)
            ilt.append(src)
    except:
        print("failed to extract urls")
    else:
        print("all pic urls collected")
            

def down_single_pic(single_pic_url):
    print('requesting download:', single_pic_url)
    html = requests.get(single_pic_url, headers=HEADERS)
    title = single_pic_url.split('/')[-1]

    with open(title, 'wb') as f:  # saved into the current working directory
        f.write(html.content)

    
            
def down_pic(ilt):
    pool = mp.Pool()                # one worker process per CPU core by default
    pool.map(down_single_pic, ilt)  # download the urls in parallel

def main():    # driver function
    ilt = []
    all_pages_url = get_all_page_url()
    all_pages_url_num = len(all_pages_url)
    
    for single_page_url in all_pages_url:
        page_number = single_page_url.split("=")[-1]
        html = get_html(single_page_url)
        parse_page(ilt,html)
    all_pics_number = len(ilt)
    print(str(all_pics_number),"pics are found")
    down_pic(ilt)  # comment this line out to check that the number of links found looks right
        
if __name__ == '__main__':
    # path1 = input('please input the save path, e.g. \n "C:\\Users\\Acer\\Desktop\\xxmu" \n: ')
    # global path
    # path = path1.strip("\"") + "\\"  # save path (declaring the global here failed, see the issue above)
    # print("files will be downloaded to", str(path), '\n')

    import time
    st = time.time()
    main()
    time.sleep(1)
    et = time.time()
    print("用时",str(et-st),"s")

3. Downloading with scrapy

Although I managed to follow along with the video tutorial, writing it myself still hit quite a few problems. 121 images, 64 MB, in under 2 minutes.
issue: extracting the url attribute from the img tags returned an empty list
solution: the attribute wasn't in the response; inspect the tag first to see whether it actually carries that attribute and whether it appears in the page source.
The CrawlSpider workflow and the pipeline are shown below.


[Figure: CrawlSpider workflow]
1. Create the project and files

scrapy startproject xxmu
cd into the project folder, then generate the spider:
scrapy genspider --list  # list the available spider templates
scrapy genspider -t crawl xxmu_spider wall.alphacoders.com

2. Spider configuration

Running it at first produced no results; the requests need to carry the login cookies (see the settings in section 3).

# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class XxmuSpiderSpider(CrawlSpider):
    name = 'xxmu_spider'
    allowed_domains = ['wall.alphacoders.com']
    start_urls = ['https://wall.alphacoders.com/by_sub_category.php?id=266516&name=Natsume%27s+Book+of+Friends+Wallpapers']

    rules = (
        # '?' and '+' are regex metacharacters, so they have to be escaped in the allow pattern
        Rule(LinkExtractor(allow=r'https://wall\.alphacoders\.com/by_sub_category\.php\?id=266516&name=Natsume%27s\+Book\+of\+Friends\+Wallpapers.*'),
             callback='parse_xxmu',
             follow=True),

    )


    def parse_xxmu(self, response):
        srcs = response.xpath('//div[@class="thumb-container"]//a/img/@src').getall()
        for src in srcs:
            print(src)
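Related to the empty-attribute issue above: the thumbnails on this site are lazy-loaded, so on some pages the url sits in data-src rather than src (the regex in the requests version already searched for data-src). A hedged variant of parse_xxmu that tries both attributes and yields items for an images pipeline (the yield format is an assumption, matching the pipeline sketched in section 4):

    def parse_xxmu(self, response):
        # prefer src, fall back to data-src for lazy-loaded thumbnails
        srcs = response.xpath('//div[@class="thumb-container"]//a/img/@src').getall()
        if not srcs:
            srcs = response.xpath('//div[@class="thumb-container"]//a/img/@data-src').getall()
        for src in srcs:
            yield {'image_urls': [src.replace('thumb-350-', '')]}  # full-resolution url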

3. Settings configuration
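A minimal sketch of the relevant settings.py entries, under these assumptions: the same browser headers and login cookie as in the requests scripts; COOKIES_ENABLED is turned off so that Scrapy's cookie middleware does not overwrite the hand-written Cookie header (my guess at why the earlier attempt failed); the pipeline entry refers to the class sketched in section 4:

# settings.py (only the entries that matter here)
ROBOTSTXT_OBEY = False   # skip robots.txt if it blocks the crawl
COOKIES_ENABLED = False  # keep the Cookie header below untouched
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36'
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'Cookie': 'wa_session=...copied from the browser...',
}
ITEM_PIPELINES = {'xxmu.pipelines.XxmuImagesPipeline': 300}
IMAGES_STORE = 'C:\\Users\\Acer\\Desktop\\xxmu'  # where the images pipeline saves the files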

4. Pipeline


In the pipeline, get_media_requests builds a list of request objects (request_objcs) and returns it; returning request_objcs makes Scrapy issue the requests in it one by one.

In file_path, the save path and the file name are computed from the request url.
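A sketch of what those notes describe (the module path xxmu.pipelines and the class name are my guesses, not copied from the original project; the item is the dict with image_urls yielded by the spider variant above):

# pipelines.py
from scrapy import Request
from scrapy.pipelines.images import ImagesPipeline


class XxmuImagesPipeline(ImagesPipeline):

    def get_media_requests(self, item, info):
        # build one Request per image url and return them; Scrapy then issues
        # the requests in request_objcs one by one
        request_objcs = [Request(url) for url in item['image_urls']]
        return request_objcs

    def file_path(self, request, response=None, info=None, *, item=None):
        # the file name is computed from the request url, e.g. '605592.png'
        return request.url.split('/')[-1]

Note that ImagesPipeline needs Pillow installed, and the yielded urls must be absolute.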