Writing a Crawler Framework in Python: Day 1
2018-03-12
The whole project is written in an object-oriented style, is compatible with both Python 2 and Python 3, uses MySQL and Redis, and applies asynchronous execution, decorators, logging, and other techniques to build a simple crawler framework.
Implementation approach:
[Image: Scrapy architecture flow diagram]
Use the Scrapy framework's flow diagram as a reference and implement a mini_scrapy framework that follows the same flow.
Main crawl flow:
- Build the HTTP request info (URL, method, headers, ...)
- Send the HTTP request and get the HTTP response
- Parse the response and extract the data
- Store the data (these four steps are sketched end-to-end right after this list)
- Steps 1 and 3 correspond to the spider module, step 2 to the downloader, and step 4 to the pipeline
- The scheduler is essentially a request queue: it stores request objects for the downloader to consume.
- The engine is the coordination center that drives the other modules.
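Before building the framework itself, the four steps can be walked through end-to-end with the requests library alone. This is a minimal sketch, not part of the framework; the target URL and the title-extracting regex are just placeholders:

```python
import re
import requests

# 1. build the request info
url = 'http://www.baidu.com'
headers = {'User-Agent': 'mini_scrapy demo'}

# 2. send the HTTP request and get the HTTP response
response = requests.get(url, headers=headers)

# 3. parse the response and extract some data (here: the page title)
match = re.search(br'<title>(.*?)</title>', response.content)

# 4. store the data (here: simply print it)
print(match.group(1) if match else b'no title found')
```

The framework below splits exactly these four steps across the spider, downloader, and pipeline modules, with the scheduler and engine in between.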
Concrete implementation plan
- Downloader:
  - implemented with the requests module
- Scheduler:
  - implemented with a FIFO queue (Queue) or a Redis list (Redis data types: str/list/set/hash/zset); see the Redis-backed sketch after this list
  - request deduplication scheme (sha1/md5/hash/sha256/Bloom filter), for example:
>>> from hashlib import sha1
>>> s1 = sha1()
>>> s1.update('python'.encode('ascii'))
>>> s1.hexdigest()
'4235227b51436ad86d07c7cf5d69bda2644984de'
- Pipeline module:
  - design the MySQL database schema
  - use the Django ORM or Flask-SQLAlchemy for storage
- Spider module:
  - class methods for providing start requests and parsing responses
- Engine module:
  - the main driver that coordinates everything
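The in-memory Queue version of the scheduler is implemented below; the Redis-list alternative mentioned above could look roughly like this (a sketch only, assuming the redis-py client and pickle for serializing request objects; the host and key name are made up):

```python
import pickle
import redis

class RedisScheduler(object):
    '''Scheduler variant backed by a Redis list (FIFO via lpush + brpop).'''
    def __init__(self, host='127.0.0.1', key='mini_scrapy:requests'):
        self.server = redis.StrictRedis(host)
        self.key = key

    def add_request(self, request):
        # serialize the request object before pushing it onto the list
        self.server.lpush(self.key, pickle.dumps(request))

    def get_request(self):
        # brpop blocks until a request is available and returns (key, value)
        _, data = self.server.brpop(self.key)
        return pickle.loads(data)
```

Because the queue lives in Redis, several crawler processes could share it, which is the usual reason for choosing this backend over an in-process Queue.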
Code implementation
Spider module: spiders.py
#!/usr/bin/env python
# encoding=utf-8
'''
@author: Francis
'''
'''
At program start, provide the initial request object.
Provide a method that parses a response and returns either extracted data or new request objects.
'''
class Request(object):
    def __init__(self, url, method='GET', headers=None, params=None, data=None):
        self._url = url
        self._method = method
        self.headers = headers
        self.params = params
        self.data = data

    @property
    def url(self):
        return self._url

    @property
    def method(self):
        return self._method
class Spider(object):
    start_url = None

    def start_requests(self):
        '''Provide the initial request object.'''
        # the request info could also be passed around as a plain list or dict,
        # but a Request object with url/method attributes is cleaner
        return Request(self.start_url)

    def parse(self, response):
        '''Parse the response.'''
        return response.content

class BaiduSpider(Spider):
    start_url = 'http://www.baidu.com'
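A quick sanity check of the spider classes (a usage sketch; the mini_scrapy.core package path is the one used by the engine and main.py further down):

```python
from mini_scrapy.core.spiders import BaiduSpider

spider = BaiduSpider()
request = spider.start_requests()
print(request.url, request.method)   # -> http://www.baidu.com GET
```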
Scheduler: scheduler.py
- Compatible with both Python 2 and Python 3
- The Queue module import can be handled with try..except
- or with six (six.PY2 / six.PY3)
- or simply with from six.moves.queue import Queue
#!/usr/bin/env python
# encoding=utf-8
'''
@author: Francis
'''
'''
Stores batches of request objects.
Checks incoming requests for duplicates; duplicates are not stored.
Pops request objects and hands them to the downloader to issue HTTP requests.
'''
# Python 3:
#     from queue import Queue
# Python 2:
#     from Queue import Queue
# Option 1: try/except around the import
# try:
#     from Queue import Queue
# except ImportError:
#     from queue import Queue
# Option 2: six exposes PY2/PY3 flags
# import six
# six.PY2
# six.PY3
# Option 3: six.moves maps the renamed module for both versions
import six
from six.moves.queue import Queue
class Scheduler(object):
    '''In-memory FIFO request queue.'''
    def __init__(self):
        self.queue = Queue()

    def add_request(self, request):
        '''
        Store a request object.
        :return: None
        '''
        self.queue.put(request)

    def get_request(self):
        '''
        Pop a request object.
        :return: request object
        '''
        return self.queue.get()

    def fp_filter(self):
        '''Deduplication, not implemented yet.'''
        pass
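fp_filter is still a stub. One possible implementation, based on the sha1 fingerprints shown earlier, keeps the fingerprints in an in-memory set and drops duplicates in add_request (a sketch, not part of the original code; the fingerprint ignores params for brevity):

```python
from hashlib import sha1

class DedupScheduler(Scheduler):
    '''Scheduler with sha1-fingerprint deduplication.'''
    def __init__(self):
        super(DedupScheduler, self).__init__()
        self._fingerprints = set()

    def add_request(self, request):
        # only store the request if it has not been seen before
        if not self.fp_filter(request):
            self.queue.put(request)

    def fp_filter(self, request):
        '''Return True if the request is a duplicate.'''
        fp = sha1((request.method + request.url).encode('utf-8')).hexdigest()
        if fp in self._fingerprints:
            return True
        self._fingerprints.add(fp)
        return False
```

For very large crawls the set could be replaced by a Redis set or a Bloom filter, as listed in the dedup options above.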
Downloader: downloader.py
The downloader dispatches on the request method (GET or POST).
#!/usr/bin/env python
# encoding=utf-8
'''
@author: Francis
'''
'''
Build an HTTP request from the request info, send it, and obtain the HTTP response.
Return an HTTP response object.
'''
import requests
class Response(object):
    def __init__(self, url, headers, content, status_code):
        self._url = url                   # response URL
        self.headers = headers            # response headers
        self.content = content            # response body
        self.status_code = status_code    # status code

class Downloader(object):
    def get_response(self, request):
        '''Send an HTTP request built from the request info and return the HTTP response.'''
        # 1. dispatch on the request method
        response = None
        if request.method == "GET":
            response = requests.get(request.url, headers=request.headers, params=request.params)
        elif request.method == "POST":
            response = requests.post(request.url, headers=request.headers, params=request.params, data=request.data)
        return Response(response.url, response.headers, response.content, response.status_code)
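Wiring the spider and the downloader together by hand (a usage sketch; it performs a real GET request, and the package path is again the one used by main.py below):

```python
from mini_scrapy.core.spiders import BaiduSpider
from mini_scrapy.core.downloader import Downloader

spider = BaiduSpider()
downloader = Downloader()

request = spider.start_requests()
response = downloader.get_response(request)
print(response.status_code, len(response.content))
```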
Pipeline: pipeline.py
#!/usr/bin/env python
# encoding=utf-8
'''
@author: Francis
'''
'''Process the data extracted by the spider.'''
class Pipeline(object):
    def process_item(self, item):
        '''Process the data extracted by the spider.'''
        print('pipeline:')
        print("item: ", item)
Engine: engine.py
#!/usr/bin/env python
# encoding=utf-8
'''
@author: Francis
'''
from .spiders import BaiduSpider, Request
from .scheduler import Scheduler
from .downloader import Downloader
from .pipeline import Pipeline
class Engine(object):
    def __init__(self, spider_mids=[], downloader_mids=[]):
        self.spider = BaiduSpider()
        self.scheduler = Scheduler()
        self.downloader = Downloader()
        self.pipeline = Pipeline()
        self.spider_mids = spider_mids
        self.downloader_mids = downloader_mids

    def main(self):
        '''Entry point for running the program.'''
        # 1. call the spider's start_requests method
        request = self.spider.start_requests()
        # pass the request through the spider middlewares
        for spider_mid in self.spider_mids:
            request = spider_mid.process_request(request)
        # 2. hand the request object to the scheduler
        self.scheduler.add_request(request)
        # 3. take a request from the scheduler and give it to the downloader
        request = self.scheduler.get_request()
        # pass the request through the downloader middlewares
        for downloader_mid in self.downloader_mids:
            request = downloader_mid.process_request(request)
        response = self.downloader.get_response(request)
        # pass the response through the downloader middlewares
        for downloader_mid in self.downloader_mids:
            response = downloader_mid.process_response(response)
        # 4. hand the response to the spider for parsing
        result = self.spider.parse(response)
        # 5. if the result is a new request, send it back to the scheduler;
        #    otherwise send it to the pipeline
        if isinstance(result, Request):
            for spider_mid in self.spider_mids:
                result = spider_mid.process_request(result)
            self.scheduler.add_request(result)
        else:
            for spider_mid in self.spider_mids:
                result = spider_mid.process_item(result)
            self.pipeline.process_item(result)
Concurrency: a single CPU core switches between tasks in turn.
Parallelism: multiple CPU cores run tasks at the same time.
Synchronous: the next step only starts after the previous one has finished.
Asynchronous: a task is started without waiting for the previous one to finish.
Blocking: task B has to wait for task A's result before it can continue (like trying to drink from a cup that has not been filled yet).
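To make the asynchronous idea concrete, here is a minimal illustration of downloading several pages concurrently with concurrent.futures (standard library in Python 3, available to Python 2 via the futures backport). It is only a sketch of the idea, not the framework's eventual async design:

```python
import requests
from concurrent.futures import ThreadPoolExecutor

urls = ['http://www.baidu.com'] * 3

def fetch(url):
    return requests.get(url).status_code

# submit() returns immediately (non-blocking); result() blocks until the download is done
with ThreadPoolExecutor(max_workers=3) as pool:
    futures = [pool.submit(fetch, url) for url in urls]
    print([f.result() for f in futures])
```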
Test code: main.py
- Packaging the custom module so it can be installed:
  - requirements.txt: the dependencies
  - VERSION: the version number
  - setup.py: run python setup.py install
  - the install step creates its working folders (build, egg-info) automatically; a minimal setup.py sketch follows
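A minimal setup.py for this layout could look like the following (a sketch; the package name, and reading VERSION and requirements.txt from the project root, are assumptions):

```python
from setuptools import setup, find_packages

setup(
    name='mini_scrapy',                                      # assumed package name
    version=open('VERSION').read().strip(),                  # version number from the VERSION file
    packages=find_packages(),                                # picks up mini_scrapy and its subpackages
    install_requires=open('requirements.txt').read().splitlines(),
)
```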
#!/usr/bin/env python
# encoding=utf-8
'''
@author: Francis
'''
import redis

questions = '''
crawler / web
1. asdfasd
2. asdfasdf
3. sdfsadf
'''

# quick Redis smoke test: store the questions string in a hash
rd = redis.StrictRedis("192.168.108.29")
rd.hset("questions", 'lst_', questions)
Main driver: main.py
#!/usr/bin/env python
# encoding=utf-8
'''
@author: Francis
'''
from mini_scrapy.core.engine import Engine
from middlewares import *
engine = Engine(spider_mids=[SpiderMiddleware()], downloader_mids=[DownloaderMiddleware()])
engine.main()
# still to cover: logging / singleton / decorators / async
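The note above (logging, singleton, decorators, async) lists topics left for the following days. As a taste of combining decorators with logging, a call-logging decorator that could later wrap the engine's methods might look like this (a sketch, not part of the original code):

```python
import functools
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger('mini_scrapy')

def log_call(func):
    '''Log each call to the decorated function and the type of its result.'''
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        logger.info('calling %s', func.__name__)
        result = func(*args, **kwargs)
        logger.info('%s returned %s', func.__name__, type(result).__name__)
        return result
    return wrapper
```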
Writing middlewares
For example: a middleware that fills in missing request info, an anti-anti-crawling middleware, crawl-error logging, a data-cleaning middleware, and so on (a concrete example follows the base classes below).
#!/usr/bin/env python
# encoding=utf-8
'''
@author: Francis
'''
class SpiderMiddleware(object):
    '''Hooks around the spider: called on outgoing requests and extracted items.'''
    def process_request(self, request):
        return request

    def process_item(self, item):
        return item

class DownloaderMiddleware(object):
    '''Hooks around the downloader: called before the request is sent and after the response arrives.'''
    def process_request(self, request):
        return request

    def process_response(self, response):
        return response
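As an example of the "fill in missing request info" idea, a downloader middleware that sets a default User-Agent could be built on the base class above (a sketch; the header value is a placeholder):

```python
class UserAgentMiddleware(DownloaderMiddleware):
    '''Fill in a default User-Agent header before the request is downloaded.'''
    def process_request(self, request):
        if request.headers is None:
            request.headers = {}
        request.headers.setdefault(
            'User-Agent',
            'Mozilla/5.0 (compatible; mini_scrapy/0.1)',  # placeholder UA string
        )
        return request
```

It would be enabled by passing it to the engine, e.g. Engine(spider_mids=[SpiderMiddleware()], downloader_mids=[UserAgentMiddleware()]).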