Scrapy Summary
I. Pipeline usage
1. Writing items to a file by hand:
def __init__(self):
    self.fp = open('name.txt', "w", encoding='utf-8')

def process_item(self, item, spider):
    detail = item['key']  # write one string field directly
    # or: detail = json.dumps(dict(item), ensure_ascii=False)
    # item is an Item object; dict() turns it into a plain dict that can be dumped as JSON
    self.fp.write(detail)
    return item

def close_spider(self, spider):
    # override close_spider so the file handle is closed when the spider finishes
    self.fp.close()
Two functions convert between JSON and Python types:
- json.dumps(): dict -> JSON string
- json.loads(): JSON string -> dict
Two more do the same conversion while reading from / writing to a file object (a short example follows the list):
- json.dump()
- json.load()
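For instance, a minimal sketch of the file-based pair (the file name and dict contents here are made up for illustration):

import json

data = {'title': 'demo', 'content': '...'}

# dict -> JSON, written straight into a file object
with open('demo.json', 'w', encoding='utf-8') as fp:
    json.dump(data, fp, ensure_ascii=False)

# file object -> dict, read back in one call
with open('demo.json', 'r', encoding='utf-8') as fp:
    data = json.load(fp)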
2. JsonItemExporter approach:
from scrapy.exporters import JsonItemExporter

def __init__(self):
    self.fp = open('name.txt', "wb")
    self.exporter = JsonItemExporter(self.fp, ensure_ascii=False, encoding='utf-8')
    self.exporter.start_exporting()

def process_item(self, item, spider):
    self.exporter.export_item(item)
    return item

def close_spider(self, spider):
    self.exporter.finish_exporting()
    self.fp.close()
- Drawback: items are not written out as they arrive; you must call start_exporting() and finish_exporting().
- Advantage: the finished file is one valid JSON document.
3. JsonLinesItemExporter approach:
from scrapy.exporters import JsonLinesItemExporter

def __init__(self):
    self.fp = open('name.txt', "wb")
    self.exporter = JsonLinesItemExporter(self.fp, ensure_ascii=False, encoding='utf-8')

def process_item(self, item, spider):
    self.exporter.export_item(item)
    return item

def close_spider(self, spider):
    self.fp.close()
- Advantage: each item is written the moment it arrives, so no start/finish exporting calls are needed (a complete class sketch follows below).
- Drawback: items are written one JSON object per line, so the file as a whole is not a single JSON document (there is no enclosing bracket pair).
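For reference, a minimal sketch of how these method fragments fit together as a complete class in pipelines.py (the class and file names are placeholders, not from the original post):

from scrapy.exporters import JsonLinesItemExporter

class JsonLinesWriterPipeline(object):
    def __init__(self):
        self.fp = open('items.json', 'wb')
        self.exporter = JsonLinesItemExporter(self.fp, ensure_ascii=False, encoding='utf-8')

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.fp.close()

The class still has to be registered in ITEM_PIPELINES (see section IV) before Scrapy will call it.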
4. Connecting to a database (MySQL):
import pymysql

class MysqlPipeline(object):
    def __init__(self):
        dbparams = {
            'host': 'localhost',
            'port': 3306,
            'user': 'root',
            'password': '123456',
            'database': 'jianshu',
            'charset': 'utf8'
        }
        self.conn = pymysql.connect(**dbparams)
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        item_execute = 'insert into article(id,title,content) values(null,%s,%s)'
        self.cursor.execute(item_execute, (item['title'], item['content']))
        self.conn.commit()
        return item
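The ITEM_PIPELINES example in section IV references a JianshuTwistedPipeline. One common way to build such an asynchronous variant is twisted.enterprise.adbapi, so inserts run in worker threads and do not block the crawl; the sketch below follows that pattern and reuses the table and fields from the synchronous example, but it is an assumption, not the original author's code:

from twisted.enterprise import adbapi

class JianshuTwistedPipeline(object):
    def __init__(self):
        dbparams = {
            'host': 'localhost',
            'port': 3306,
            'user': 'root',
            'password': '123456',
            'database': 'jianshu',
            'charset': 'utf8'
        }
        # connection pool that runs queries in threads instead of blocking the reactor
        self.dbpool = adbapi.ConnectionPool('pymysql', **dbparams)

    def process_item(self, item, spider):
        # run the insert asynchronously and log failures instead of raising
        defer = self.dbpool.runInteraction(self.insert_item, item)
        defer.addErrback(self.handle_error, item, spider)
        return item

    def insert_item(self, cursor, item):
        sql = 'insert into article(id,title,content) values(null,%s,%s)'
        cursor.execute(sql, (item['title'], item['content']))

    def handle_error(self, error, item, spider):
        print(error)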
II. Middleware (the three uses covered so far):
A downloader middleware mainly implements the two hooks below; a minimal skeleton showing their return-value semantics is sketched right after the list.
- process_request(self, request, spider)
- process_response(self, request, response, spider)
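A hypothetical skeleton (not from the original post) showing how Scrapy treats the return values of the two hooks:

class ExampleDownloaderMiddleware(object):
    def process_request(self, request, spider):
        # return None to let the request continue through later middlewares and the downloader,
        # return a Response to skip the download entirely,
        # or return a new Request to reschedule it
        return None

    def process_response(self, request, response, spider):
        # must return a Response (possibly modified) or a new Request
        return response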
1. process_request
IP proxies:
import random

class IPProxyMiddleware(object):
    proxies = [
        "125.116.174.172:808",
        "117.88.176.196:3000",
    ]

    def process_request(self, request, spider):
        proxy = random.choice(self.proxies)
        # prefix the proxy with the scheme of the outgoing request (http or https)
        request.meta['proxy'] = request.url.split("://")[0] + "://" + proxy
A proxy address only works when it carries an http:// or https:// prefix; here the scheme is taken from the request's own URL automatically.
Rotating the User-Agent:
import random

class UserAgentMiddleware(object):
    UserAgents = [
        'Mozilla/5.0 (Macintosh; U; PPC Mac OS X Mach-O; XH; rv:8.578.498) fr,Gecko/20121021 Camino/8.723+ (Firefox compatible)',
        'Mozilla/5.0 (X11; U; Linux i686; nl; rv:1.8.1b2) Gecko/20060821 BonEcho/2.0b2 (Debian-1.99+2.0b2+dfsg-1)',
        'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; Avant Browser; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0)',
    ]

    def process_request(self, request, spider):
        user_agent = random.choice(self.UserAgents)
        request.headers['User-Agent'] = user_agent
These middlewares let you modify every outgoing request before it is downloaded.
Selenium middleware
Used to fetch content that an ordinary request cannot reach, such as pages rendered by JavaScript. Scrapy still issues the request, but the middleware loads the page in a real browser through webdriver, grabs the fully rendered page source, and wraps it in a Response that is handed back to the spider.
from selenium import webdriver
import time
from scrapy.http.response.html import HtmlResponse

class SeleniumMiddleware(object):
    def __init__(self):
        self.driver = webdriver.Chrome()

    def process_request(self, request, spider):
        self.driver.get(request.url)
        time.sleep(1)  # give the page time to load
        try:
            # some pages have no "show more" button; the try block keeps the crawler robust
            while True:
                show_more = self.driver.find_element_by_class_name('show-more')
                show_more.click()
        except:
            pass
        source = self.driver.page_source
        # returning a Response here skips the normal download and hands the rendered HTML to the spider
        response = HtmlResponse(self.driver.current_url, body=source, request=request, encoding='utf-8')
        return response
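All three classes above are downloader middlewares, so they are enabled through DOWNLOADER_MIDDLEWARES in settings.py rather than SPIDER_MIDDLEWARES. Assuming they live in the project's middlewares.py (the module path and priority numbers below are placeholders):

DOWNLOADER_MIDDLEWARES = {
    'jianshu.middlewares.IPProxyMiddleware': 543,
    'jianshu.middlewares.UserAgentMiddleware': 544,
    'jianshu.middlewares.SeleniumMiddleware': 545,
}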
III. CrawlSpider
Crawls a site in bulk by following every link that matches a set of rules.
Generate the spider:
scrapy genspider -t crawl "spider_name" "allowed_domain"
In the spider file:
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

class SpiderName(CrawlSpider):
    ...  # name, allowed_domains, start_urls omitted
    rules = (
        Rule(LinkExtractor(allow=r"regex"), callback="callback_name", follow=True),  # follow=False to stop descending from matched pages
    )
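A slightly fuller sketch, assuming a hypothetical spider that follows Jianshu article links (the domain, URL pattern, and extracted fields are placeholders):

from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

class JianshuSpider(CrawlSpider):
    name = "jianshu"
    allowed_domains = ["jianshu.com"]
    start_urls = ["https://www.jianshu.com/"]

    rules = (
        # follow every link that looks like an article page and hand it to parse_detail
        Rule(LinkExtractor(allow=r"/p/[0-9a-z]{12}"), callback="parse_detail", follow=True),
    )

    def parse_detail(self, response):
        yield {
            "title": response.xpath("//h1/text()").get(),
            "url": response.url,
        }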
IV. Settings
- ROBOTSTXT_OBEY: whether to obey robots.txt (disabled below)
- DOWNLOAD_DELAY: delay between requests to the same site
- DEFAULT_REQUEST_HEADERS: default request headers
- SPIDER_MIDDLEWARES: spider middlewares to enable
- ITEM_PIPELINES: item pipelines to enable
# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure a delay for requests for the same website (default: 0)
DOWNLOAD_DELAY = 1

# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36',
}

# Enable or disable spider middlewares
SPIDER_MIDDLEWARES = {
    'jianshu.middlewares.JianshuSpiderMiddleware': 543,
}

# Configure item pipelines
ITEM_PIPELINES = {
    'jianshu.pipelines.JianshuTwistedPipeline': 300,
}