Scraping wdsz with Scrapy
2021-05-23
Noza_ea8f
Structure

(screenshot: project layout)

Spider
import scrapy
from wdsz.items import WdszItem

DOWN_FLODER = 'download'  # (unused)


class WdszspiderSpider(scrapy.Spider):
    name = 'wdszSpider'
    allowed_domains = ['www.wdsz.org']
    start_urls = ['http://www.wdsz.org/thread.php?fid-438-page-1.html']

    # The board requires login, so the first request has to carry cookies
    def start_requests(self):
        cookies = '***'
        # The raw cookie string has to be converted into a dict
        cookies = {i.split('=')[0]: i.split('=')[1] for i in cookies.split('; ')}
        yield scrapy.Request(
            self.start_urls[0],
            callback=self.parse,
            cookies=cookies
        )
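    # For illustration (hypothetical cookie string): the comprehension above turns
    # 'sid=abc123; uid=42' into {'sid': 'abc123', 'uid': '42'}. If a value can
    # itself contain '=', use i.split('=', 1) instead.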

    def parse(self, response):
        # Collect the links to the individual threads
        urls = response.xpath(
            '//tbody[@id="threadlist"]//td[@class="subject"]/a[contains(@name,"readlink")]/@href').getall()
        for url in urls:
            url = 'http://www.wdsz.org/' + url
            yield scrapy.Request(url=url, callback=self.get_data)
        # Pagination: follow the "next page" link until there is none
        next_url = response.xpath('//a[@class="pages_next"]/@href').extract_first()
        if next_url:
            next_url = 'http://www.wdsz.org/' + next_url
            yield scrapy.Request(next_url, callback=self.parse)
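    # (response.urljoin(next_url) would resolve the relative href just as well,
    # without hard-coding the domain.)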

    # Strip characters that are illegal in a new directory name
    def correct_title(self, title):
        error_set = ['/', '\\', ':', '*', '?', '"', '|', '<', '>', ' ']
        for c in title:
            if c in error_set:
                title = title.replace(c, '')
        return title
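    # e.g. correct_title('a/b: c?') returns 'abc' -- every listed character is stripped.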

    def get_data(self, response):
        # Instantiate the item
        item = WdszItem()
        # Collect the image links inside the post body
        item['img_urls'] = response.xpath(
            '//div[@class="tpc_content"]//span[contains(@class,"J_attImg")]/img/@src').getall()
        # The thread title becomes the download directory name
        img_dir_name = response.xpath('//h1[@id="subject_tpc"]/text()').extract_first()
        item['img_paths'] = self.correct_title(img_dir_name)
        item['referer'] = response.url
        yield item
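The XPaths above are easiest to verify interactively with scrapy shell before running the full crawl (since the board requires login, a logged-out response may not contain the thread list, so treat this as a quick sanity check):

scrapy shell "http://www.wdsz.org/thread.php?fid-438-page-1.html"
>>> response.xpath('//a[@class="pages_next"]/@href').get()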
Items
import scrapy


class WdszItem(scrapy.Item):
    img_urls = scrapy.Field()
    referer = scrapy.Field()  # used to get past the hotlink protection
    img_paths = scrapy.Field()
Pipeline
from scrapy.pipelines.images import ImagesPipeline
from scrapy import Request


class WdszPipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        # Iterate over every image URL; if a single URL (not a list) were passed
        # in, it could be yielded directly without the loop
        for img_url in item['img_urls']:
            # meta carries data straight from the spider down to file_path below
            yield Request(img_url, meta={'name': item['img_paths']}, dont_filter=False,
                          headers={'referer': item['referer']})

    # Rename the files; without overriding file_path, images are saved under
    # their hash, i.e. a pile of meaningless names
    def file_path(self, request, response=None, info=None, *, item=None):
        # The last path segment of the image URL
        img_guid = request.url.split('/')[-1]
        # That segment has the form '***.jpg?123'. The trailing number is random
        # and can repeat, so it must not be used as the filename (an old file
        # would get overwritten); keep only the non-repeating middle slice.
        img_guid = img_guid[12:27]
        # The directory name passed through meta above
        name = request.meta['name']
        # Build the final path: <thread title>/<slice>.jpg
        filename = f'{name}/{img_guid}.jpg'
        print(filename)
        return filename
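A quick standalone check of the slicing logic (the URL below is made up; the 12:27 indices match this board's attachment names and would need adjusting for any other format):

url = 'http://www.wdsz.org/attachment/Mon_2105_abcdef0123456789.jpg?42'  # hypothetical
segment = url.split('/')[-1]  # 'Mon_2105_abcdef0123456789.jpg?42'
print(segment[12:27])         # 'def0123456789.j'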
Basic settings
LOG_LEVEL = 'WARNING'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
# USER_AGENT = 'wdsz (+http://www.yourdomain.com)'
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:85.0) Gecko/20100101 Firefox/85.0'
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
Pipeline settings
ITEM_PIPELINES = {
    'wdsz.pipelines.WdszPipeline': 300,
}
IMAGES_STORE = r"E:\wdsz"
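Scrapy's ImagesPipeline needs Pillow installed (pip install Pillow). With that in place, the crawl is started from the project root using the spider name defined above:

scrapy crawl wdszSpider

Each thread's images then end up under E:\wdsz\<cleaned thread title>\, named by the sliced segment from file_path.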