Crawler in practice: scraping Ximi Comics (simicomic) images with Scrapy
2020-04-07
后山小鲨鱼
The comics on this site are tiered: some chapters can only be read with a paid membership, since they are restricted-rating comics.
I paid for a membership and started crawling.
The difference from my earlier crawlers is that this one has to send cookies with every request; otherwise the member-only chapters cannot be fetched.
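Before wiring the cookie into the spider, it is worth checking that the session cookie copied from the logged-in browser actually unlocks member content. A minimal sketch using the requests library; the chapter URL is a placeholder and PHPSESSID is assumed to be the cookie that carries the login state:

import requests

# Cookie value copied from the browser's developer tools after logging in
# (assumption: PHPSESSID is the session cookie that holds membership state).
cookies = {'PHPSESSID': 'ti662eu77k3t781gpo5vos67jj'}
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36',
    'Referer': 'http://simicomic.com/booklist',
}

# Placeholder URL of a member-only chapter page, just for a quick manual check.
resp = requests.get('http://simicomic.com/chapter/12345', headers=headers, cookies=cookies)
print(resp.status_code, len(resp.text))  # a full-length page suggests the cookie works

If the response is the member-only page rather than a login prompt, the same cookie can be passed to Scrapy requests below.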
getsimicomic.py
# -*- coding: utf-8 -*-
import scrapy

from simicomic.items import SimicomicItem


class GetsimicomicSpider(scrapy.Spider):
    name = 'getsimicomic'
    allowed_domains = ['simicomic.com']
    start_urls = ['http://simicomic.com/booklist/']

    def parse(self, response):
        # Book list page: titles, detail-page links and the "latest chapter" tip.
        bookNames = response.xpath('//ul[@class="manga-list-2"]/li/p[@class="manga-list-2-title"]/a/text()').extract()
        bookHrefs = response.xpath('//ul[@class="manga-list-2"]/li/p[@class="manga-list-2-title"]/a/@href').extract()
        bookChapters = response.xpath('//ul[@class="manga-list-2"]/li/p[@class="manga-list-2-tip"]/a/text()').extract()
        for i, bookHref in enumerate(bookHrefs):
            bookHref = 'http://simicomic.com' + bookHref
            bookName = bookNames[i]
            # Index the chapter tip per book so each item carries its own latest-chapter text.
            bookChapter = bookChapters[i]
            # Browser headers
            headers = {
                'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36',
                'Referer': 'http://simicomic.com/booklist'
            }
            # Cookies from the logged-in member session
            cookies = {
                'book_referer': 'http%3A%2F%2Fsimicomic.com%2Fbooklist',
                'PHPSESSID': 'ti662eu77k3t781gpo5vos67jj',
                'nav_switch': 'booklist',
            }
            yield scrapy.Request(url=bookHref,
                                 callback=lambda response, bookName=bookName, bookChapters=bookChapter: self.parse2(response, bookName, bookChapters),
                                 dont_filter=True, headers=headers, cookies=cookies)
        # nowPage = int(response.xpath('//ul[@class="pagination"]/li[@class="active"]/span/text()').extract()[0])
        # if nowPage <= 9:
        #     nextPage = nowPage + 1
        #     nextPageUrl = 'http://simicomic.com/booklist/?tag=%E5%85%A8%E9%83%A8&area=-1&end=-1&page=' + str(nextPage)
        #     yield scrapy.Request(url=nextPageUrl, callback=self.parse)

    def parse2(self, response, bookName, bookChapters):
        # Book detail page: chapter links, author, description and chapter titles.
        chapters = response.xpath('//ul[@id="detail-list-select"]/li/a/@href').extract()
        author = response.xpath('//p[@class="detail-main-info-author"]/a/text()').extract()[0]
        describe = response.xpath('//p[@class="detail-main-info-author"]/text()').extract()[2]
        pages = response.xpath('//ul[@id="detail-list-select"]/li/a/@title').extract()
        for i, chapter in enumerate(chapters):
            nowPage = pages[i]
            chapterHref = 'http://simicomic.com' + chapter
            # Browser headers
            headers = {
                'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36',
                'Referer': 'http://simicomic.com'
            }
            # Cookies from the logged-in member session
            cookies = {
                'PHPSESSID': 'ti662eu77k3t781gpo5vos67jj',
                'nav_switch': 'booklist',
            }
            yield scrapy.Request(url=chapterHref,
                                 callback=lambda response, bookName=bookName, bookChapters=bookChapters, author=author, describe=describe, nowPage=nowPage: self.parse3(response, bookName, bookChapters, author, describe, nowPage),
                                 dont_filter=True, headers=headers, cookies=cookies)

    def parse3(self, response, bookName, bookChapters, author, describe, nowPage):
        # Chapter page: the real image URLs sit in the lazy-load attribute data-original.
        img_urls = response.xpath('//div[@id="cp_img"]/div/img/@data-original').extract()
        for img_url in img_urls:
            item = SimicomicItem()
            item["bookName"] = bookName
            item["author"] = author
            item["myDescribe"] = describe
            item["bookChapters"] = bookChapters
            item["img_url"] = img_url
            item["nowPage"] = nowPage
            yield item
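The lambda callbacks above work, but every extra value has to be re-bound as a default argument, which is easy to get wrong. If you are on Scrapy 1.7 or newer, cb_kwargs passes the same values more cleanly. A sketch of the yield inside the loop in parse, as a drop-in replacement for the lambda version (not the author's original code):

            # Equivalent request using cb_kwargs instead of a lambda (Scrapy >= 1.7);
            # the dict keys become keyword arguments of parse2.
            yield scrapy.Request(
                url=bookHref,
                callback=self.parse2,
                cb_kwargs={'bookName': bookName, 'bookChapters': bookChapter},
                dont_filter=True, headers=headers, cookies=cookies,
            )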
items.py
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class SimicomicItem(scrapy.Item):
    # Target table name, read by MysqlPipeline when building the INSERT statement.
    table = 'simicomic'
    id = scrapy.Field()
    bookName = scrapy.Field()
    author = scrapy.Field()
    myDescribe = scrapy.Field()
    bookChapters = scrapy.Field()
    img_url = scrapy.Field()
    nowPage = scrapy.Field()
pipelines.py
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

import os
import shutil

import pymysql
from scrapy import Request
from scrapy.exceptions import DropItem
from scrapy.pipelines.images import ImagesPipeline
from scrapy.utils.project import get_project_settings


class SimicomicPipeline(object):
    def process_item(self, item, spider):
        return item


# Image download pipeline
class ImagePipeline(ImagesPipeline):
    # Read the download directory configured in settings.py
    IMAGES_STORE = get_project_settings().get("IMAGES_STORE")

    def item_completed(self, results, item, info):
        image_path = [x['path'] for ok, x in results if ok]
        if not image_path:
            raise DropItem('Image Download Failed')
        # Per-book / per-chapter directory used for the final file layout
        img_path = "%s/%s/%s" % (self.IMAGES_STORE, item['bookName'], '第' + item['nowPage'] + '章')
        # Create the directory if it does not exist yet
        if not os.path.exists(img_path):
            os.makedirs(img_path)
        # Move the file from the default download path to the per-chapter path
        shutil.move(self.IMAGES_STORE + "/" + image_path[0],
                    img_path + "/" + item["img_url"].split('/')[-1])
        item['img_url'] = img_path + "/" + item["img_url"].split('/')[-1]
        return item

    def get_media_requests(self, item, info):
        # Browser headers
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36',
            'Referer': 'http://simicomic.com'
        }
        # Cookies from the logged-in member session
        cookies = {
            'PHPSESSID': 'ti662eu77k3t781gpo5vos67jj',
            'nav_switch': 'booklist',
        }
        yield Request(item['img_url'], headers=headers, cookies=cookies)


# MySQL pipeline
class MysqlPipeline():
    def __init__(self, host, database, user, password, port):
        self.host = host
        self.database = database
        self.user = user
        self.password = password
        self.port = port

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            host=crawler.settings.get('MYSQL_HOST'),
            database=crawler.settings.get('MYSQL_DATABASE'),
            user=crawler.settings.get('MYSQL_USER'),
            password=crawler.settings.get('MYSQL_PASSWORD'),
            port=crawler.settings.get('MYSQL_PORT'),
        )

    def open_spider(self, spider):
        self.db = pymysql.connect(host=self.host, user=self.user, password=self.password,
                                  database=self.database, charset='utf8', port=self.port)
        self.cursor = self.db.cursor()

    def close_spider(self, spider):
        self.db.close()

    def process_item(self, item, spider):
        data = dict(item)
        keys = ','.join(data.keys())
        values = ','.join(['%s'] * len(data))
        # selectSql = 'select * from simicomic where img_url like (%s)' % (item["img_url"].split('/')[-1])
        # print(selectSql)
        # hasUrl = self.cursor.execute(selectSql)
        # print("hasUrl==>" + str(hasUrl))
        sql = 'insert into %s (%s) values (%s)' % (item.table, keys, values)
        self.cursor.execute(sql, tuple(data.values()))
        self.db.commit()
        return item
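MysqlPipeline assumes a simicomic table already exists whose columns match the item fields. The post does not show the schema, so the following is only a sketch of a plausible one, created with pymysql using the same connection settings as below; the column types are my assumption, not the author's:

import pymysql

# Assumed schema: one text-ish column per item field plus an auto-increment id.
db = pymysql.connect(host='localhost', user='root', password='fuhong888',
                     database='scrapy_db', charset='utf8', port=3306)
cursor = db.cursor()
cursor.execute("""
CREATE TABLE IF NOT EXISTS simicomic (
    id INT AUTO_INCREMENT PRIMARY KEY,
    bookName VARCHAR(255),
    author VARCHAR(255),
    myDescribe TEXT,
    bookChapters VARCHAR(255),
    img_url VARCHAR(512),
    nowPage VARCHAR(64)
)
""")
db.commit()
db.close()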
settings.py
# -*- coding: utf-8 -*-
# Scrapy settings for simicomic project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'simicomic'
SPIDER_MODULES = ['simicomic.spiders']
NEWSPIDER_MODULE = 'simicomic.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'simicomic (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
DEFAULT_REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36'
}
# Image download directory
IMAGES_STORE = './images'
# Delay between requests (seconds)
DOWNLOAD_DELAY = 2
ITEM_PIPELINES = {
    'simicomic.pipelines.ImagePipeline': 300,
    'simicomic.pipelines.MysqlPipeline': 302,
}
# MySQL configuration
MYSQL_HOST = 'localhost'
MYSQL_DATABASE = 'scrapy_db'
MYSQL_PORT = 3306
MYSQL_USER = 'root'
MYSQL_PASSWORD = 'fuhong888'
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
COOKIES_ENABLED = True
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'simicomic.middlewares.SimicomicSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'simicomic.middlewares.SimicomicDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# 'simicomic.pipelines.SimicomicPipeline': 300,
#}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
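With the settings above in place, the crawl is started from the project root with the standard Scrapy command:

scrapy crawl getsimicomic

Images end up under ./images/, sorted into one folder per book and chapter by ImagePipeline, and MysqlPipeline writes one row per image into the simicomic table.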
Result:
![](https://img.haomeiwen.com/i19597329/53a9fafafbff2328.png)