Web Scraping in Practice: Crawling Images from 西米漫画 (simicomic.com) with Scrapy

2020-04-07  后山小鲨鱼

The comics on this site are tiered: some chapters are members-only (it is an adult-rated comic site, after all).
So I paid for a membership and started crawling.
The difference from my previous spiders is that this one has to carry cookies with every request; otherwise the members-only chapters cannot be fetched.
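Before wiring the cookie into the spider, it is worth checking that the session cookie copied from the browser really grants member access. A minimal standalone sketch using the requests library (not part of the Scrapy project; the cookie value is a placeholder, substitute the PHPSESSID from your own logged-in browser session):

import requests

# PHPSESSID copied from the browser's developer tools after logging in (placeholder value)
cookies = {'PHPSESSID': 'your-phpsessid-here'}
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36'
}

# Fetch the book list (or a members-only chapter) and make sure no login/paywall page comes back
resp = requests.get('http://simicomic.com/booklist/', headers=headers, cookies=cookies)
print(resp.status_code, len(resp.text))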
getsimicomic.py

# -*- coding: utf-8 -*-
import scrapy
from simicomic.items import SimicomicItem

class GetsimicomicSpider(scrapy.Spider):
    name = 'getsimicomic'
    allowed_domains = ['simicomic.com']
    start_urls = ['http://simicomic.com/booklist/']

    def parse(self, response):
        bookNames = response.xpath('//ul[@class="manga-list-2"]/li/p[@class="manga-list-2-title"]/a/text()').extract()
        bookHrefs = response.xpath('//ul[@class="manga-list-2"]/li/p[@class="manga-list-2-title"]/a/@href').extract()
        bookChapterList = response.xpath('//ul[@class="manga-list-2"]/li/p[@class="manga-list-2-tip"]/a/text()').extract()

        for i, bookHref in enumerate(bookHrefs):
            bookHref = 'http://simicomic.com' + bookHref
            bookName = bookNames[i]
            # Latest-chapter text for this book (indexes line up with bookNames and bookHrefs)
            bookChapters = bookChapterList[i]
            # Browser-like request headers
            headers = {
                'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36',
                'Referer': 'http://simicomic.com/booklist'
            }
            # Member session cookies
            cookies = {
                'book_referer':'http%3A%2F%2Fsimicomic.com%2Fbooklist',
                'PHPSESSID':'ti662eu77k3t781gpo5vos67jj',
                'nav_switch':'booklist',
            }
            yield scrapy.Request(url=bookHref, callback=lambda response, bookName=bookName, bookChapters=bookChapters: self.parse2(response, bookName, bookChapters), dont_filter=True, headers=headers, cookies=cookies)

        # nowPage = int(response.xpath('//ul[@class = "pagination"]/li[@class="active"]/span/text()').extract()[0])
        # if nowPage <= 9:
        #     nextPage = nowPage + 1
        #     nextPageUrl = 'http://simicomic.com/booklist/?tag=%E5%85%A8%E9%83%A8&area=-1&end=-1&page='+str(nextPage)
        #     yield scrapy.Request(url=nextPageUrl, callback=self.parse)

    def parse2(self,response,bookName,bookChapters):
        chapters = response.xpath('//ul[@id="detail-list-select"]/li/a/@href').extract()
        author = response.xpath('//p[@class="detail-main-info-author"]/a/text()').extract()[0]
        describe = response.xpath('//p[@class="detail-main-info-author"]/text()').extract()[2]
        pages = response.xpath('//ul[@id="detail-list-select"]/li/a/@title').extract()

        for i, chapter in enumerate(chapters):
            nowPage = pages[i]
            chapterHref = 'http://simicomic.com' + chapter
            # Browser-like request headers
            headers = {
                'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36',
                'Referer': 'http://simicomic.com'
            }
            # Member session cookies
            cookies = {
                'PHPSESSID': 'ti662eu77k3t781gpo5vos67jj',
                'nav_switch': 'booklist',
            }
            yield scrapy.Request(url=chapterHref, callback=lambda response, bookName=bookName, bookChapters=bookChapters, author=author, describe=describe, nowPage=nowPage: self.parse3(response, bookName, bookChapters, author, describe, nowPage), dont_filter=True, headers=headers, cookies=cookies)

    def parse3(self,response,bookName,bookChapters,author,describe,nowPage):
        img_urls = response.xpath('//div[@id="cp_img"]/div/img/@data-original').extract()
        for img_url in img_urls:
            item = SimicomicItem()
            item["bookName"] = bookName
            item["author"] = author
            item["myDescribe"] = describe
            item["bookChapters"] = bookChapters
            item["img_url"] = img_url
            item["nowPage"] = nowPage
            yield item
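
The spider above passes extra values to each callback by binding them as lambda default arguments. That works, but Scrapy 1.7+ also provides cb_kwargs for exactly this purpose; a minimal sketch of the first request written that way (behaviour is equivalent, just without the lambda):

            # Equivalent to the lambda version: cb_kwargs entries are passed to parse2 as keyword arguments
            yield scrapy.Request(
                url=bookHref,
                callback=self.parse2,
                cb_kwargs={'bookName': bookName, 'bookChapters': bookChapters},
                dont_filter=True,
                headers=headers,
                cookies=cookies,
            )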

items.py

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy

class SimicomicItem(scrapy.Item):
    table = 'simicomic'
    id = scrapy.Field()
    bookName = scrapy.Field()
    author = scrapy.Field()
    myDescribe = scrapy.Field()
    bookChapters = scrapy.Field()
    img_url = scrapy.Field()
    nowPage = scrapy.Field()

pipelines.py

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
from scrapy.pipelines.images import ImagesPipeline
from scrapy.exceptions import DropItem
from scrapy import Request
from scrapy.utils.project import get_project_settings
import time
import random
import os
import shutil
import pymysql

class SimicomicPipeline(object):
    def process_item(self, item, spider):
        return item


# Image download pipeline
class ImagePipeline(ImagesPipeline):
    # Read the image store path configured in settings
    IMAGES_STORE = get_project_settings().get("IMAGES_STORE")


    def item_completed(self, results, item, info):
        image_path = [x['path'] for ok, x in results if ok]
        # Drop the item if the download failed (must be checked before using image_path[0])
        if not image_path:
            raise DropItem('Image Download Failed')

        # Build the per-book / per-chapter target directory
        img_path = "%s/%s/%s" % (self.IMAGES_STORE, item['bookName'], '第' + item['nowPage'] + '章')
        # Create the directory if it does not exist yet
        if not os.path.exists(img_path):
            os.makedirs(img_path)

        # Move the file from the default download location into the target directory
        shutil.move(self.IMAGES_STORE + "/" + image_path[0],
                    img_path + "/" + item["img_url"].split('/')[-1])
        item['img_url'] = img_path + "/" + item["img_url"].split('/')[-1]
        return item



    def get_media_requests(self, item, info):
        # Browser-like request headers
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36',
            'Referer': 'http://simicomic.com'
        }
        # Member session cookies
        cookies = {
            'PHPSESSID': 'ti662eu77k3t781gpo5vos67jj',
            'nav_switch': 'booklist',
        }
        yield Request(item['img_url'], headers=headers, cookies=cookies)

# MySQL pipeline
class MysqlPipeline():
    def __init__(self,host,database,user,password,port):
        self.host = host
        self.database = database
        self.user = user
        self.password = password
        self.port = port

    @classmethod
    def from_crawler(cls,crawler):
        return cls(
            host=crawler.settings.get('MYSQL_HOST'),
            database = crawler.settings.get('MYSQL_DATABASE'),
            user = crawler.settings.get('MYSQL_USER'),
            password = crawler.settings.get('MYSQL_PASSWORD'),
            port = crawler.settings.get('MYSQL_PORT'),
        )

    def open_spider(self, spider):
        self.db = pymysql.connect(host=self.host, user=self.user, password=self.password,
                                  database=self.database, charset='utf8', port=self.port)
        self.cursor = self.db.cursor()

    def close_spider(self,spider):
        self.db.close()

    def process_item(self,item,spider):
        data = dict(item)
        keys = ','.join(data.keys())
        values = ','.join(['%s']*len(data))
        # selectSql = 'select * from simicomic where img_url like (%s)' % (item["img_url"].split('/')[-1])
        # print(selectSql)
        # hasUrl = self.cursor.execute(selectSql)
        # print("hasUrl==>"+str(hasUrl))
        sql = 'insert into %s (%s) values (%s)' % (item.table,keys,values)
        self.cursor.execute(sql,tuple(data.values()))
        self.db.commit()
        return item
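
MysqlPipeline assumes that the scrapy_db database already contains a simicomic table whose columns match the item fields. The original post does not show the schema, so here is a one-off setup sketch (column types and lengths are assumptions; adjust them to your data):

# create_table.py -- one-off setup script (assumed schema, not part of the original project)
import pymysql

db = pymysql.connect(host='localhost', user='root', password='fuhong888',
                     database='scrapy_db', charset='utf8', port=3306)
cursor = db.cursor()
cursor.execute("""
    CREATE TABLE IF NOT EXISTS simicomic (
        id INT AUTO_INCREMENT PRIMARY KEY,
        bookName VARCHAR(255),
        author VARCHAR(255),
        myDescribe TEXT,
        bookChapters VARCHAR(255),
        img_url VARCHAR(512),
        nowPage VARCHAR(64)
    )
""")
db.commit()
db.close()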

settings.py

# -*- coding: utf-8 -*-

# Scrapy settings for simicomic project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'simicomic'

SPIDER_MODULES = ['simicomic.spiders']
NEWSPIDER_MODULE = 'simicomic.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'simicomic (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

DEFAULT_REQUEST_HEADERS = {
  'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36'
}


# Image download directory
IMAGES_STORE = './images'
# Delay between requests (seconds)
DOWNLOAD_DELAY = 2
ITEM_PIPELINES = {
    'simicomic.pipelines.ImagePipeline':300,
    'simicomic.pipelines.MysqlPipeline': 302,
}



# MySQL configuration
MYSQL_HOST = 'localhost'
MYSQL_DATABASE = 'scrapy_db'
MYSQL_PORT = 3306
MYSQL_USER = 'root'
MYSQL_PASSWORD = 'fuhong888'

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Keep cookies enabled (the member session PHPSESSID must be sent with every request)
COOKIES_ENABLED = True

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'simicomic.middlewares.SimicomicSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'simicomic.middlewares.SimicomicDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
#    'simicomic.pipelines.SimicomicPipeline': 300,
#}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
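
With all of the files above in place, the crawl is started from the project root with Scrapy's command-line tool:

scrapy crawl getsimicomic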

Result:
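
Following the path logic in ImagePipeline, the downloaded images end up grouped by book and chapter under IMAGES_STORE, roughly like this (names are illustrative; each file name is the last segment of its image URL):

images/
└── <bookName>/
    └── 第<nowPage>章/
        ├── 0001.jpg
        ├── 0002.jpg
        └── ...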

