Scrapy Summary


I. Pipeline usage

1. Writing to a file directly:
import json

def __init__(self):
    self.fp = open('name.txt',"w",encoding='utf-8')
def process_item(self,item,spider):
    detail = item['key']            # write a string field directly
    # or serialize the whole item as JSON:
    # detail = json.dumps(dict(item),ensure_ascii=False)
    # item is an Item object; dict() converts it so it can be dumped as JSON
    self.fp.write(detail)
    return item
def close_spider(self,spider):
    # override close_spider to close the file handle
    self.fp.close()
The two exporters below serialize items to JSON and write the file at the same time: JsonItemExporter buffers every item and writes a single JSON list when exporting finishes, while JsonLinesItemExporter writes one JSON object per line as items arrive.
2. JsonItemExporter form:
from scrapy.exporters import JsonItemExporter
def __init__(self):
    self.fp = open('name.txt',"wb")
    self.exporter = JsonItemExporter(self.fp, ensure_ascii=False, encoding='utf-8')
    self.exporter.start_exporting()
def process_item(self,item,spider):
    self.exporter.export_item(item)
    return item
def close_spider(self,spider):
    self.exporter.finish_exporting()
    self.fp.close()
3. JsonLinesItemExporter form:
from scrapy.exporters import JsonLinesItemExporter
def __init__(self):
    self.fp = open('name.txt',"wb")
    self.exporter = JsonLinesItemExporter(self.fp, ensure_ascii=False, encoding='utf-8')
def process_item(self,item,spider):
    self.exporter.export_item(item)
    return item
def close_spider(self,spider):
    self.fp.close()
4. Connecting to a database:
import pymysql

class MysqlPipeline(object):
    def __init__(self):
        dbparams = {
            'host':'localhost',
            'port':3306,
            'user':'root',
            'password':'123456',
            'database':'jianshu',
            'charset':'utf8'
        }
        self.conn = pymysql.connect(**dbparams)
        self.cursor = self.conn.cursor()
        

    def process_item(self,item,spider):
        item_execute = 'insert into article(id,title,content) values(null,%s,%s)'
        self.cursor.execute(item_execute,(item['title'],item['content']))
        self.conn.commit()
        return item
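
The settings in section IV register a JianshuTwistedPipeline, whose code is not shown here. Below is a minimal sketch of what such an asynchronous MySQL pipeline could look like, using Twisted's adbapi so inserts do not block the crawl (the class body, table, and column names are assumptions mirroring the synchronous example above):

from twisted.enterprise import adbapi

class JianshuTwistedPipeline(object):
    def __init__(self):
        dbparams = {
            'host': 'localhost',
            'port': 3306,
            'user': 'root',
            'password': '123456',
            'database': 'jianshu',
            'charset': 'utf8'
        }
        # adbapi runs the blocking pymysql calls in a thread pool
        self.dbpool = adbapi.ConnectionPool("pymysql", **dbparams)

    def process_item(self, item, spider):
        # schedule the insert and log failures instead of stopping the spider
        query = self.dbpool.runInteraction(self.insert_item, item)
        query.addErrback(self.handle_error, item, spider)
        return item

    def insert_item(self, cursor, item):
        sql = 'insert into article(id,title,content) values(null,%s,%s)'
        cursor.execute(sql, (item['title'], item['content']))

    def handle_error(self, failure, item, spider):
        spider.logger.error("insert failed: %s", failure)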

II. Middleware (three uses so far)

The middleware hook used here is mainly process_request.
1. process_request
Proxy IP rotation:
import random

class IPProxyMiddleware(object):
    # random.choice needs a sequence, so keep the proxies in a list
    proxies = [
        "125.116.174.172:808",
        "117.88.176.196:3000",
    ]

    def process_request(self, request, spider):
        proxy = random.choice(self.proxies)
        # prefix the proxy with the same scheme as the request
        request.meta['proxy'] = request.url.split("://")[0] + "://" + proxy

A proxy address must be prefixed with http:// or https:// to work; here the scheme is taken from the request's own URL and prepended automatically.

User-Agent rotation:
import random

class UserAgentMiddleware(object):
    # keep the user agents in a list so random.choice works
    UserAgents = [
        'Mozilla/5.0 (Macintosh; U; PPC Mac OS X Mach-O; XH; rv:8.578.498) fr,Gecko/20121021 Camino/8.723+ (Firefox compatible)',
        'Mozilla/5.0 (X11; U; Linux i686; nl; rv:1.8.1b2) Gecko/20060821 BonEcho/2.0b2 (Debian-1.99+2.0b2+dfsg-1)',
        'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; Avant Browser; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0)',
    ]

    def process_request(self, request, spider):
        UserAgent = random.choice(self.UserAgents)
        request.headers['User-Agent'] = UserAgent

This middleware is used to do further processing on each outgoing request.

Selenium middleware:

Used to fetch content that a normal crawl cannot reach, such as JS-rendered pages. Scrapy still sends the request, but the middleware loads the JS-rendered page with a webdriver, grabs the full page source, and wraps it in a Response that is returned to the spider.

from selenium import webdriver
import time
from scrapy.http.response.html import HtmlResponse

class SeleniumMiddleware(object):

    def __init__(self):
        self.driver = webdriver.Chrome()

    def process_request(self, request, spider):
        self.driver.get(request.url)
        time.sleep(1)   # allow time for the page to load
        try:
            # some pages have no show-more button; the try/except keeps the crawler robust
            while True:
                show_more = self.driver.find_element_by_class_name('show-more')
                show_more.click()
        except:
            pass
        source = self.driver.page_source
        # body is a str, so HtmlResponse needs an explicit encoding
        response = HtmlResponse(self.driver.current_url, body=source, encoding='utf-8', request=request)
        # returning a Response from process_request skips the normal download
        # and hands this page straight to the spider
        return response

III. CrawlSpider

Crawls a batch of pages by requesting every link that matches a set of rules.

Generate the spider:
scrapy genspider -t crawl "spider_name" "allowed_domain"
Writing the spider file:
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

class SpiderName(CrawlSpider):
    ...  # name, allowed_domains, start_urls omitted

    rules = (
        # rules must be an ordered sequence; a tuple is the usual choice
        Rule(LinkExtractor(allow=r"regex"), callback="parse_item", follow=True),  # follow=False to stop expanding matched pages
    )
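
A fuller sketch of the same idea with the callback filled in (the spider name, domain, regex, and field names below are placeholders, not taken from the original):

from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

class ArticleSpider(CrawlSpider):
    name = "article"                        # placeholder spider name
    allowed_domains = ["example.com"]       # placeholder domain
    start_urls = ["https://example.com/"]

    rules = (
        # callback runs on every response whose URL matches the regex;
        # follow=True keeps extracting links from those pages as well
        Rule(LinkExtractor(allow=r"/p/\w+"), callback="parse_item", follow=True),
    )

    def parse_item(self, response):
        # parse the detail page; field names are placeholders
        yield {
            "title": response.xpath("//h1/text()").get(),
            "url": response.url,
        }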

IV. Settings

    # Obey robots.txt rules
    ROBOTSTXT_OBEY = False

    # Configure a delay for requests for the same website (default: 0)
    DOWNLOAD_DELAY = 1
    # Override the default request headers:

    DEFAULT_REQUEST_HEADERS = {
      'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
      'Accept-Language': 'en',
      "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36"
     }

    #Enable or disable spider middlewares
    SPIDER_MIDDLEWARES = {
        'jianshu.middlewares.JianshuSpiderMiddleware': 543,
    }
    # Configure item pipelines
    ITEM_PIPELINES = {
        'jianshu.pipelines.JianshuTwistedPipeline': 300,
    }
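
The proxy, user-agent, and Selenium classes from section II are downloader middlewares, so they only take effect once registered under DOWNLOADER_MIDDLEWARES. A minimal sketch, assuming they live in jianshu/middlewares.py (the module path and priority numbers are assumptions):

    # Enable or disable downloader middlewares
    DOWNLOADER_MIDDLEWARES = {
        'jianshu.middlewares.IPProxyMiddleware': 543,
        'jianshu.middlewares.UserAgentMiddleware': 544,
        'jianshu.middlewares.SeleniumMiddleware': 545,
    }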