Scrapy in Practice: Scraping Qiushibaike (糗事百科)
2020-04-07
后山小鲨鱼
The site to scrape:
![](https://img.haomeiwen.com/i19597329/e26a150d6e23469c.png)
We will scrape the titles and images from the hot-image ranking page, download the images, and save the file paths, titles, and related information to a database.
1. Create the project
scrapy startproject mySpider
Generate the spider:
cd mySpider
scrapy genspider qiushibaike "www.qiushibaike.com/imgrank/"
The project directory now looks like this:
![](https://img.haomeiwen.com/i19597329/f66b4c92253cfe91.png)
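If the screenshot does not load: this is the standard layout that startproject and genspider produce, with the spider file placed under spiders/.

mySpider/
    scrapy.cfg
    mySpider/
        __init__.py
        items.py
        middlewares.py
        pipelines.py
        settings.py
        spiders/
            __init__.py
            qiushibaike.py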
qiushibaike.py
# -*- coding: utf-8 -*-
import scrapy
from mySpider.items import QiushibaikeItem
import time


class QiushibaikeSpider(scrapy.Spider):
    name = 'qiushibaike'
    allowed_domains = ['www.qiushibaike.com']
    start_urls = ['https://www.qiushibaike.com/imgrank/']

    def parse(self, response):
        # Joke texts, image URLs, and the "next page" link on the current page
        texts = response.xpath('//div[@class="content"]/span/text()').extract()
        imgurls = response.xpath('//div[@class="thumb"]/a/img/@src').extract()
        nextUrl = response.xpath('//span[@class="next"]/../@href').extract()
        for index in range(len(imgurls)):
            # Skip video thumbnails; keep only plain images
            if imgurls[index].find("video") == -1:
                item = QiushibaikeItem()
                item['title'] = texts[index]
                item['content'] = texts[index]
                item['img_prefix'] = 'https:' + imgurls[index]
                file_name = imgurls[index].split('/')[-1]
                item['img_name'] = file_name
                item['create_time'] = round(time.time() * 1000)  # millisecond timestamp
                yield item
        time.sleep(2)
        if nextUrl != []:
            # Follow the next page
            url = 'https://www.qiushibaike.com' + nextUrl[0]
            yield scrapy.Request(url=url, callback=self.parse)
        else:
            # No next page: wait a while, then start over from the first page.
            # dont_filter=True keeps the duplicate filter from dropping the re-crawl.
            time.sleep(550)
            url = 'https://www.qiushibaike.com/imgrank/'
            yield scrapy.Request(url=url, callback=self.parse, dont_filter=True)
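With the spider in place, it is started from the project root with the usual Scrapy command:

scrapy crawl qiushibaike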
items.py
import scrapy


class QiushibaikeItem(scrapy.Item):
    table = 'joke_list'  # name of the MySQL table written by MysqlPipeline
    id = scrapy.Field()
    title = scrapy.Field()
    content = scrapy.Field()
    img_prefix = scrapy.Field()
    img_name = scrapy.Field()
    create_time = scrapy.Field()
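The article does not show the joke_list table itself. Below is a one-off sketch that creates a database and table matching the fields the MySQL pipeline inserts; the column types are my assumptions, not taken from the original project.

# Hypothetical helper script (not part of the Scrapy project): creates the
# jy_read_db database and joke_list table that MysqlPipeline writes into.
# Column types are assumptions inferred from the item fields.
import pymysql

db = pymysql.connect(host='localhost', user='root', password='fuhong888',
                     port=3306, charset='utf8')
cursor = db.cursor()
cursor.execute("CREATE DATABASE IF NOT EXISTS jy_read_db DEFAULT CHARACTER SET utf8")
cursor.execute("USE jy_read_db")
cursor.execute("""
    CREATE TABLE IF NOT EXISTS joke_list (
        id INT AUTO_INCREMENT PRIMARY KEY,
        title TEXT,
        content TEXT,
        img_prefix VARCHAR(512),
        img_name VARCHAR(255),
        create_time BIGINT
    ) DEFAULT CHARSET = utf8
""")
db.close()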
pipelines.py
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
from scrapy.pipelines.images import ImagesPipeline
from scrapy.exceptions import DropItem
from scrapy import Request
import pymysql
import random
import time


class MyspiderPipeline(object):
    def process_item(self, item, spider):
        return item


# Image download pipeline
class ImagePipeline(ImagesPipeline):
    def file_path(self, request, response=None, info=None):
        # Use the last segment of the image URL as the file name
        url = request.url
        file_name = url.split('/')[-1]
        return file_name

    def item_completed(self, results, item, info):
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem('Image Download Failed')
        return item

    def get_media_requests(self, item, info):
        # Random pause to avoid hammering the image server
        time.sleep(random.randint(1, 3))
        yield Request(item['img_prefix'])


# MySQL pipeline
class MysqlPipeline():
    def __init__(self, host, database, user, password, port):
        self.host = host
        self.database = database
        self.user = user
        self.password = password
        self.port = port

    @classmethod
    def from_crawler(cls, crawler):
        # Read the connection parameters from settings.py
        return cls(
            host=crawler.settings.get('MYSQL_HOST'),
            database=crawler.settings.get('MYSQL_DATABASE'),
            user=crawler.settings.get('MYSQL_USER'),
            password=crawler.settings.get('MYSQL_PASSWORD'),
            port=crawler.settings.get('MYSQL_PORT'),
        )

    def open_spider(self, spider):
        self.db = pymysql.connect(host=self.host, user=self.user, password=self.password,
                                  database=self.database, charset='utf8', port=self.port)
        self.cursor = self.db.cursor()

    def close_spider(self, spider):
        self.db.close()

    def process_item(self, item, spider):
        # Build the INSERT statement dynamically from the item's fields
        data = dict(item)
        keys = ','.join(data.keys())
        values = ','.join(['%s'] * len(data))
        sql = 'insert into %s (%s) values (%s)' % (item.table, keys, values)
        self.cursor.execute(sql, tuple(data.values()))
        self.db.commit()
        return item


# De-duplication pipeline
class DuplicatesPipeline(object):
    """
    Drop items whose image name has already been seen.
    """
    def __init__(self):
        self.imgName_set = set()

    def process_item(self, item, spider):
        name = item['img_name']
        if name in self.imgName_set:
            raise DropItem("Duplicate img_name found: %s" % item)
        self.imgName_set.add(name)
        return item
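For reference, the results argument that item_completed receives is a list of (success, info) tuples, one per request yielded by get_media_requests. A sketch of its shape for one successfully downloaded image, with made-up values:

# Illustrative only - the values are invented, the keys are what ImagesPipeline provides.
results = [
    (True, {
        'url': 'https://pic.qiushibaike.com/system/pictures/.../example.jpg',   # requested URL
        'path': 'example.jpg',   # value returned by file_path(), relative to IMAGES_STORE
        'checksum': 'd41d8cd98f00b204e9800998ecf8427e',   # MD5 of the downloaded file
    }),
]
# image_paths = [x['path'] for ok, x in results if ok]  ->  ['example.jpg']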
settings.py
# -*- coding: utf-8 -*-
# Scrapy settings for mySpider project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'mySpider'
SPIDER_MODULES = ['mySpider.spiders']
NEWSPIDER_MODULE = 'mySpider.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'mySpider (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
DEFAULT_REQUEST_HEADERS = {
    'referer': 'https://www.qiushibaike.com/',
    'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.87 Safari/537.36',
}
# Export feeds as UTF-8
FEED_EXPORT_ENCODING = 'utf-8'
# Directory where downloaded images are stored
IMAGES_STORE = './images'
# Lower numbers run first: de-duplication -> image download -> MySQL
ITEM_PIPELINES = {
    'mySpider.pipelines.DuplicatesPipeline': 299,
    'mySpider.pipelines.ImagePipeline': 300,
    'mySpider.pipelines.MysqlPipeline': 302,
}
# MySQL configuration
MYSQL_HOST = 'localhost'
MYSQL_DATABASE = 'jy_read_db'
MYSQL_PORT = 3306
MYSQL_USER = 'root'
MYSQL_PASSWORD = 'fuhong888'
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'mySpider.middlewares.MyspiderSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'mySpider.middlewares.MyspiderDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# 'mySpider.pipelines.MyspiderPipeline': 300,
#}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
The downloaded images:
The data stored in MySQL: