
Writing a Python Crawler Framework: Day 1

2018-03-12  大猪厂

layout: "post"
title: "项目第一天"
date: "2018-03-12 23:09"


The whole project is built in an object-oriented style, works on both Python 2 and Python 3, uses MySQL and Redis, and applies asynchrony, decorators, logging, and similar techniques to implement a simple crawler framework.


Approach:

Follow the Scrapy framework's architecture diagram and implement a mini_scrapy framework based on that flow.

Main crawler workflow (a minimal end-to-end sketch follows the list):

  1. Build the HTTP request info (URL, method, headers, ...)
  2. Send the HTTP request and obtain the HTTP response
  3. Parse the response and extract the data
  4. Store the data
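
A minimal sketch of those four steps using the requests library; the URL and the crude title extraction are placeholders for illustration only:

import requests

url = 'http://www.baidu.com'                                     # 1. build the request info
response = requests.get(url)                                     # 2. send the request, get the response
title = response.text.split('<title>')[1].split('</title>')[0]   # 3. parse: naive title extraction
with open('result.txt', 'w') as f:                               # 4. store the data
    f.write(title)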

Implementation plan

  1. Downloader:
    • implemented with the requests module
  2. Scheduler:
    • implemented with a FIFO Queue, or with a Redis list (Redis data types: str/list/set/hash/zset)
    • request deduplication via hash digests (sha1/md5/sha256) or a Bloom filter (see the sketch after this list); fingerprinting a string with sha1 looks like:
>>> from hashlib import sha1
>>> s1 = sha1()
>>> s1.update('python'.encode('ascii'))
>>> s1.hexdigest()
'4235227b51436ad86d07c7cf5d69bda2644984de'
  3. Pipeline module:
    • design the MySQL schema
    • use the Django ORM or Flask-SQLAlchemy
  4. Spider module:
    • class-based spiders
  5. Engine module:
    • the main driver that coordinates all the other components
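
The list above also names the Bloom filter as a deduplication option. Here is a minimal pure-Python sketch of the idea (illustrative only, with a fixed bit-array size rather than one derived from the expected item count and error rate): k hash functions set k bits per item, so lookups may give false positives but never false negatives.

from hashlib import md5, sha1

class BloomFilter(object):

    def __init__(self, size=2 ** 20):
        self.size = size                      # number of bits
        self.bits = bytearray(size // 8)      # bit array backed by a bytearray
        self.hash_funcs = [sha1, md5]         # k = 2 hash functions

    def _positions(self, value):
        '''Yield one bit position per hash function'''
        for func in self.hash_funcs:
            digest = int(func(value.encode('utf-8')).hexdigest(), 16)
            yield digest % self.size

    def add(self, value):
        for pos in self._positions(value):
            self.bits[pos // 8] |= 1 << (pos % 8)

    def __contains__(self, value):
        return all(self.bits[pos // 8] & (1 << (pos % 8))
                   for pos in self._positions(value))

Usage: after bf.add('http://www.baidu.com'), the test 'http://www.baidu.com' in bf returns True.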

Implementation

Spider module: spiders.py

#!/usr/bin/env python
# encoding=utf-8
'''
@author: Francis
'''
'''
At startup, supplies the initial request object.
Provides a method to parse responses, returning either extracted data or new request objects.
'''
class Request(object):

    def __init__(self, url, method='GET', headers=None, params=None, data=None):
        self._url = url
        self._method = method
        self.headers = headers
        self.params = params
        self.data = data

    @property
    def url(self):
        return self._url

    @property
    def method(self):
        return self._method


class Spider(object):

    start_url = None

    def start_requests(self):
        '''Supply the initial request object'''
        # A plain list or dict could carry the same info, but a Request
        # object gives named, read-only access to url and method.
        return Request(self.start_url)

    def parse(self, response):
        '''Parse the response'''
        return response.content

class BaiduSpider(Spider):
    start_url = 'http://www.baidu.com'
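
A quick interactive check of the spider classes above (expected output shown, assuming they are importable):

>>> spider = BaiduSpider()
>>> request = spider.start_requests()
>>> request.url
'http://www.baidu.com'
>>> request.method
'GET'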


Scheduler: scheduler.py

#!/usr/bin/env python
# encoding=utf-8
'''
@author: Francis
'''
'''
Stores batches of request objects.
Checks each incoming request for duplicates; duplicates are not stored.
Pops request objects out for the downloader, which issues the HTTP request.
'''
# Python 3:
# from queue import Queue
# Python 2:
# from Queue import Queue

# Option 1: try/except
# try:
#     from Queue import Queue
# except ImportError:
#     from queue import Queue

# Option 2: six exists precisely for PY2/PY3 compatibility checks
# import six
# six.PY2
# six.PY3

# Option 3: six.moves resolves the right module on either version
from hashlib import sha1

from six.moves.queue import Queue


class Scheduler(object):
    '''FIFO queue plus a fingerprint set for deduplication'''

    def __init__(self):
        self.queue = Queue()
        self._fingerprints = set()   # fingerprints of every request seen so far

    def add_request(self, request):
        '''
        Store a request object, unless it is a duplicate.
        :return: None
        '''
        if self.fp_filter(request):
            self.queue.put(request)

    def get_request(self):
        '''
        Pop a request object for the downloader.
        :return: request object
        '''
        return self.queue.get()

    def fp_filter(self, request):
        '''Deduplicate: return True for a new request, False for one seen before'''
        fp = sha1((request.method + request.url).encode('utf-8')).hexdigest()
        if fp in self._fingerprints:
            return False
        self._fingerprints.add(fp)
        return True
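
With the fingerprint filter in place, adding the same request twice stores it only once (assuming Request is imported from spiders.py):

>>> scheduler = Scheduler()
>>> scheduler.add_request(Request('http://www.baidu.com'))
>>> scheduler.add_request(Request('http://www.baidu.com'))  # duplicate, filtered out
>>> scheduler.queue.qsize()
1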

Downloader: downloader.py

Dispatching on the request method:

#!/usr/bin/env python
# encoding=utf-8
'''
@author: Francis
'''
'''
Issues an HTTP request based on the request info and obtains the HTTP response.
Returns an HTTP response object.
'''
import requests

class Response(object):

    def __init__(self, url, headers, content, status_code):
        self._url = url                  # response URL
        self.headers = headers           # response headers
        self.content = content           # response body
        self.status_code = status_code   # status code


class Downloader(object):

    def get_response(self, request):
        '''Issue an HTTP request based on the request info and return the HTTP response'''
        # 1. Dispatch on the request method:
        if request.method == "GET":
            response = requests.get(request.url, headers=request.headers, params=request.params)
        elif request.method == "POST":
            response = requests.post(request.url, headers=request.headers, params=request.params, data=request.data)
        else:
            raise ValueError('unsupported HTTP method: %s' % request.method)

        # 2. Wrap the library's response in our own Response object:
        return Response(response.url, response.headers, response.content, response.status_code)
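
A quick check (this needs network access and an importable Request; 200 is the expected status for a successful fetch):

>>> downloader = Downloader()
>>> response = downloader.get_response(Request('http://www.baidu.com'))
>>> response.status_code
200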



Pipeline: pipeline.py

#!/usr/bin/env python
# encoding=utf-8
'''
@author: Francis
'''
'''Processes scraped data'''


class Pipeline(object):

    def process_item(self, item):
        '''Process the scraped data'''
        print('pipeline:')
        print("item: ", item)

Engine: engine.py

#!/usr/bin/env python
# encoding=utf-8
'''
@author: Francis
'''
from .spiders import BaiduSpider, Request
from .scheduler import Scheduler
from .downloader import Downloader
from .pipeline import Pipeline


class Engine(object):

    def __init__(self, spider_mids=None, downloader_mids=None):
        self.spider = BaiduSpider()
        self.scheduler = Scheduler()
        self.downloader = Downloader()
        self.pipeline = Pipeline()
        # avoid mutable default arguments; fall back to empty lists
        self.spider_mids = spider_mids or []
        self.downloader_mids = downloader_mids or []

    def main(self):
        '''Entry point for running the framework'''
        # 1. Call the spider's start_requests method (A)
        request = self.spider.start_requests()

        # Run the request through the spider middlewares
        for spider_mid in self.spider_mids:
            request = spider_mid.process_request(request)

        # 2. Hand the request object to the scheduler (B)
        self.scheduler.add_request(request)

        # 3. Take a request from the scheduler and give it to the downloader (C)
        request = self.scheduler.get_request()

        # Run the request through the downloader middlewares
        for downloader_mid in self.downloader_mids:
            request = downloader_mid.process_request(request)
        response = self.downloader.get_response(request)

        # Run the response through the downloader middlewares
        for downloader_mid in self.downloader_mids:
            response = downloader_mid.process_response(response)

        # 4. Hand the response to the spider for parsing (D)
        result = self.spider.parse(response)

        # 5. Check whether the result is a Request object (E)
        if isinstance(result, Request):
            # a new request: pass it through the spider middlewares, then reschedule it
            for spider_mid in self.spider_mids:
                result = spider_mid.process_request(result)
            self.scheduler.add_request(result)
        else:
            # an extracted item: pass it through the spider middlewares, then the pipeline
            for spider_mid in self.spider_mids:
                result = spider_mid.process_item(result)
            self.pipeline.process_item(result)
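
As written, main processes exactly one request and exits. A natural next step, sketched here as a hypothetical variant (not in the original code; middlewares omitted for brevity), is to loop until the scheduler's queue is drained:

    def main(self):
        '''Looped variant (sketch): keep crawling until no requests remain'''
        self.scheduler.add_request(self.spider.start_requests())
        while not self.scheduler.queue.empty():
            request = self.scheduler.get_request()
            response = self.downloader.get_response(request)
            result = self.spider.parse(response)
            if isinstance(result, Request):
                self.scheduler.add_request(result)   # new request goes back to the scheduler
            else:
                self.pipeline.process_item(result)   # extracted item goes to the pipeline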




Concurrency: tasks take turns on a single CPU core.
Parallelism: tasks run simultaneously on multiple CPU cores.
Synchronous: the next step starts only after the previous one has finished.
Asynchronous: steps are started without waiting for earlier ones to finish.
Blocking: task B cannot make progress until task A returns a result (like trying to drink from an empty cup: nothing happens until the cup is filled).
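
Asynchrony is one of the goals listed at the top. One minimal sketch of concurrent downloading uses a thread pool (concurrent.futures is standard in Python 3 and available for Python 2 via the futures backport; the URL list is a placeholder):

from concurrent.futures import ThreadPoolExecutor

import requests

urls = ['http://www.baidu.com'] * 5   # placeholder URL list

def fetch(url):
    return requests.get(url).status_code

with ThreadPoolExecutor(max_workers=4) as pool:
    # map() dispatches the downloads across worker threads and yields results in order
    for status in pool.map(fetch, urls):
        print(status)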

Test code: main.py

#!/usr/bin/env python
# encoding=utf-8
'''
@author: Francis
'''


questions = '''
爬虫 web
1. asdfasd
2. asdfasdf
3. sdfsadf
'''


import redis

# Store the notes above in a hash on the Redis test server
rd = redis.StrictRedis("192.168.108.29")
rd.hset("questions", 'lst_', questions)


# Main driver
#!/usr/bin/env python
# encoding=utf-8
'''
@author: Francis
'''
from mini_scrapy.core.engine import Engine

from middlewares import *

engine = Engine(spider_mids=[SpiderMiddleware()], downloader_mids=[DownloaderMiddleware()])
engine.main()


# Still to do:
# logging
# singleton
# decorators
# async
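
Logging and decorators are both on the to-do list above. One hedged sketch of how they could combine (the names here are illustrative, not the framework's final API) is a decorator that logs every call to the function it wraps:

import logging
from functools import wraps

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger('mini_scrapy')

def log_call(func):
    '''Decorator: log each call to the wrapped function, including failures'''
    @wraps(func)
    def wrapper(*args, **kwargs):
        logger.info('calling %s', func.__name__)
        try:
            return func(*args, **kwargs)
        except Exception:
            logger.exception('%s raised an exception', func.__name__)
            raise
    return wrapper

Applied, for example, by decorating Downloader.get_response with @log_call.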




Writing middlewares

Examples include middlewares for filling in missing request info, countering anti-scraping measures, logging crawl errors, cleaning data, and so on. The two classes below are pass-through defaults; a concrete example follows them.

#!/usr/bin/env python
# encoding=utf-8
'''
@author: Francis
'''

class SpiderMiddleware(object):
    '''Pass-through hooks around the spider: requests going out, items coming back'''

    def process_request(self, request):
        return request

    def process_item(self, item):
        return item



class DownloaderMiddleware(object):
    '''Pass-through hooks around the downloader: requests in, responses out'''

    def process_request(self, request):
        return request

    def process_response(self, response):
        return response
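
As one concrete example of the request-completion idea, a downloader middleware might inject a default User-Agent header (the class and the UA string are illustrative samples, not part of the original code):

class UserAgentMiddleware(object):
    '''Fill in a default User-Agent header when the request has none'''

    def process_request(self, request):
        if request.headers is None:
            request.headers = {}
        # setdefault keeps any User-Agent the spider already supplied
        request.headers.setdefault('User-Agent',
                                   'Mozilla/5.0 (Windows NT 10.0; Win64; x64)')
        return request

    def process_response(self, response):
        return response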
