Web scraping in practice: crawling Qiushibaike (糗事百科) with Scrapy

2020-04-07 · 后山小鲨鱼

Target site: https://www.qiushibaike.com/imgrank/ (screenshot omitted)

We scrape the titles and images from the hot-images section, download the images, and save the paths, titles, and related information to the database.
1. Create the project

scrapy startproject mySpider
Generate the spider:
cd mySpider
scrapy genspider qiushibaike "www.qiushibaike.com/imgrank/"

The directory structure is as follows:
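(The original screenshot is not preserved; the listing below is the standard layout that scrapy startproject and scrapy genspider generate.)

mySpider/
├── scrapy.cfg
└── mySpider/
    ├── __init__.py
    ├── items.py
    ├── middlewares.py
    ├── pipelines.py
    ├── settings.py
    └── spiders/
        ├── __init__.py
        └── qiushibaike.py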


qiushibaike.py

# -*- coding: utf-8 -*-
import scrapy
from mySpider.items import QiushibaikeItem
import time


class QiushibaikeSpider(scrapy.Spider):
    name = 'qiushibaike'
    allowed_domains = ['www.qiushibaike.com']

    start_urls = ['https://www.qiushibaike.com/imgrank/']



    def parse(self, response):
        # Post text and image URLs on the current page; on /imgrank/ every post
        # is assumed to have an image, so the two lists line up one-to-one
        texts = response.xpath('//div[@class="content"]/span/text()').extract()
        imgurls = response.xpath('//div[@class="thumb"]/a/img/@src').extract()
        # href of the "next page" link (empty list on the last page)
        nextUrl = response.xpath('//span[@class="next"]/../@href').extract()

        for index in range(len(imgurls)):
            # Skip video entries and keep only real images
            if imgurls[index].find("video") == -1:
                item = QiushibaikeItem()
                item['title'] = texts[index]
                item['content'] = texts[index]
                # The src attribute is protocol-relative, so prepend the scheme
                item['img_prefix'] = 'https:' + imgurls[index]
                file_name = imgurls[index].split('/')[-1]
                item['img_name'] = file_name
                # Millisecond timestamp
                item['create_time'] = round(time.time() * 1000)
                yield item

        # Crude throttle; note that time.sleep blocks the whole Scrapy reactor,
        # DOWNLOAD_DELAY would be the non-blocking alternative
        time.sleep(2)

        if nextUrl != []:
            # Follow the next page
            url = 'https://www.qiushibaike.com' + nextUrl[0]
            yield scrapy.Request(url=url, callback=self.parse, dont_filter=True)
        else:
            # Last page reached: wait, then start over from the first page.
            # dont_filter=True is needed so already-visited URLs are not dropped
            # by Scrapy's duplicate request filter on the next pass
            time.sleep(550)
            url = 'https://www.qiushibaike.com/imgrank/'
            yield scrapy.Request(url=url, callback=self.parse, dont_filter=True)
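Before running the full crawl, the XPath expressions used in parse can be checked interactively with scrapy shell (the selectors are the same ones as above; if the site rejects the default user agent, the user-agent from settings.py can be passed with -s USER_AGENT=...):

scrapy shell "https://www.qiushibaike.com/imgrank/"
>>> response.xpath('//div[@class="content"]/span/text()').extract()[:3]
>>> response.xpath('//div[@class="thumb"]/a/img/@src').extract()[:3]
>>> response.xpath('//span[@class="next"]/../@href').extract()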


items.py

import scrapy


class QiushibaikeItem(scrapy.Item):
    # Target table used by MysqlPipeline
    table = 'joke_list'
    id = scrapy.Field()
    title = scrapy.Field()
    content = scrapy.Field()
    img_prefix = scrapy.Field()
    img_name = scrapy.Field()
    create_time = scrapy.Field()
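The MysqlPipeline defined below inserts rows into the joke_list table named here, so the table has to exist before the crawl starts. A minimal creation script, sketched with pymysql and the connection values from settings.py; the column types are assumptions, not taken from the original article:

import pymysql

# Assumed schema matching the item fields; adjust column types as needed
DDL = """
CREATE TABLE IF NOT EXISTS joke_list (
    id INT PRIMARY KEY AUTO_INCREMENT,
    title TEXT,
    content TEXT,
    img_prefix VARCHAR(512),
    img_name VARCHAR(255),
    create_time BIGINT
) DEFAULT CHARSET = utf8mb4
"""

db = pymysql.connect(host='localhost', user='root', password='fuhong888',
                     database='jy_read_db', port=3306, charset='utf8mb4')
try:
    with db.cursor() as cursor:
        cursor.execute(DDL)  # create the table if it does not exist yet
    db.commit()
finally:
    db.close()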

pipelines.py

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
from scrapy.pipelines.images import ImagesPipeline
from scrapy.exceptions import DropItem
from scrapy import Request
import pymysql
import random
import time

class MyspiderPipeline(object):
    def process_item(self, item, spider):
        return item
# Image download pipeline
class ImagePipeline(ImagesPipeline):
    def file_path(self, request, response=None, info=None):
        # Use the last segment of the image URL as the stored file name
        url = request.url
        file_name = url.split('/')[-1]
        return file_name

    def item_completed(self, results, item, info):
        # Drop the item if its image failed to download
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem('Image Download Failed')
        return item

    def get_media_requests(self, item, info):
        # Random pause before each image request (note: this blocks the reactor)
        time.sleep(random.randint(1, 3))
        yield Request(item['img_prefix'])

# MySQL pipeline
class MysqlPipeline(object):
    def __init__(self, host, database, user, password, port):
        self.host = host
        self.database = database
        self.user = user
        self.password = password
        self.port = port

    @classmethod
    def from_crawler(cls, crawler):
        # Read the connection parameters from settings.py
        return cls(
            host=crawler.settings.get('MYSQL_HOST'),
            database=crawler.settings.get('MYSQL_DATABASE'),
            user=crawler.settings.get('MYSQL_USER'),
            password=crawler.settings.get('MYSQL_PASSWORD'),
            port=crawler.settings.get('MYSQL_PORT'),
        )

    def open_spider(self, spider):
        # Keyword arguments work across pymysql versions (1.0+ no longer
        # accepts positional connection arguments)
        self.db = pymysql.connect(host=self.host, user=self.user,
                                  password=self.password, database=self.database,
                                  charset='utf8', port=self.port)
        self.cursor = self.db.cursor()

    def close_spider(self, spider):
        self.db.close()

    def process_item(self, item, spider):
        # Build an INSERT statement dynamically from the item's fields
        data = dict(item)
        keys = ','.join(data.keys())
        values = ','.join(['%s'] * len(data))
        sql = 'insert into %s (%s) values (%s)' % (item.table, keys, values)
        self.cursor.execute(sql, tuple(data.values()))
        self.db.commit()
        return item
# Deduplication pipeline
class DuplicatesPipeline(object):
    """Drop items whose image file name has already been seen in this run."""

    def __init__(self):
        self.imgName_set = set()

    def process_item(self, item, spider):
        name = item['img_name']
        if name in self.imgName_set:
            raise DropItem("Duplicate img_name found: %s" % item)

        self.imgName_set.add(name)
        return item


settings.py

# -*- coding: utf-8 -*-

# Scrapy settings for mySpider project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'mySpider'

SPIDER_MODULES = ['mySpider.spiders']
NEWSPIDER_MODULE = 'mySpider.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'mySpider (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

DEFAULT_REQUEST_HEADERS = {
    'referer':'https://www.qiushibaike.com/',
    'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.87 Safari/537.36',
}


# Export feeds as UTF-8
FEED_EXPORT_ENCODING = 'utf-8'

# Image download
IMAGES_STORE = './images'
# Lower numbers run first: dedupe, then image download, then MySQL insert
ITEM_PIPELINES = {
    'mySpider.pipelines.DuplicatesPipeline': 299,
    'mySpider.pipelines.ImagePipeline': 300,
    'mySpider.pipelines.MysqlPipeline': 302,
}

# MySQL configuration
MYSQL_HOST = 'localhost'
MYSQL_DATABASE = 'jy_read_db'
MYSQL_PORT = 3306
MYSQL_USER = 'root'
MYSQL_PASSWORD = 'fuhong888'


# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'mySpider.middlewares.MyspiderSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'mySpider.middlewares.MyspiderDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
#    'mySpider.pipelines.MyspiderPipeline': 300,
#}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
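With everything configured, the crawl is started from the project root (the directory that contains scrapy.cfg). Note that the built-in ImagesPipeline needs Pillow installed:

pip install pillow
scrapy crawl qiushibaike

Downloaded images end up under ./images (IMAGES_STORE), and one row per item is inserted into the joke_list table.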

The downloaded images (screenshot omitted)

The data stored in MySQL (screenshot omitted)
