爬虫高级爬虫技术爬虫技术

Scrapy框架学习---Request/Response(七)

2018-06-15  本文已影响114人  8f3a71b379c1

Request

Request 部分源码:

# Partial code (excerpt, not the complete class)
class Request(object_ref):
    """Excerpt of the request class: holds the URL, method, body, headers,
    cookies, callbacks and scheduling options of a single request.

    ``_set_url`` and ``_set_body`` are called here but are not part of
    this excerpt.
    """

    def __init__(self, url, callback=None, method='GET', headers=None, body=None, 
                 cookies=None, meta=None, encoding='utf-8', priority=0,
                 dont_filter=False, errback=None):
        """Store the request attributes.

        Args:
            url: passed to ``self._set_url``.
            callback: stored as ``self.callback``.
            method: converted with ``str(method).upper()``.
            headers: wrapped in ``Headers`` together with ``encoding``;
                a falsy value becomes an empty dict first.
            body: passed to ``self._set_body``.
            cookies: stored as given, or ``{}`` when falsy.
            meta: copied into a new dict when truthy, otherwise kept as None.
            encoding: stored as ``self._encoding`` before anything else.
            priority: must be an ``int``.
            dont_filter: stored as ``self.dont_filter``.
            errback: stored as ``self.errback``; requires ``callback``.

        Raises:
            AssertionError: if ``priority`` is not an int, or if ``errback``
                is given without ``callback``.
        """

        self._encoding = encoding  # this one has to be set first
        self.method = str(method).upper()
        self._set_url(url)
        self._set_body(body)
        assert isinstance(priority, int), "Request priority not an integer: %r" % priority
        self.priority = priority

        # An errback is only accepted together with a callback.
        assert callback or not errback, "Cannot use errback without a callback"
        self.callback = callback
        self.errback = errback

        self.cookies = cookies or {}
        self.headers = Headers(headers or {}, encoding=encoding)
        self.dont_filter = dont_filter

        # Copy the caller's mapping so later changes to it do not leak in.
        self._meta = dict(meta) if meta else None

    @property
    def meta(self):
        """Return the meta dict, creating an empty one on first access."""
        if self._meta is None:
            self._meta = {}
        return self._meta

其中,比较常用的参数:

url: 就是需要请求,并进行下一步处理的url

callback: 指定该请求返回的Response,由哪个函数来处理。

method: 一般不需要指定,默认为GET方法,可设置为"GET", "POST", "PUT"等;构造函数会通过 str(method).upper() 自动将其转换为大写字符串

headers: 请求时携带的请求头(HTTP headers)。一般不需要指定。内容一般如下:
# 自己写过爬虫的肯定知道
Host: media.readthedocs.org
User-Agent: Mozilla/5.0 (Windows NT 6.2; WOW64; rv:33.0) Gecko/20100101 Firefox/33.0
Accept: text/css,*/*;q=0.1
Accept-Language: zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3
Accept-Encoding: gzip, deflate
Referer: http://scrapy-chs.readthedocs.org/zh_CN/0.24/
Cookie: _ga=GA1.2.1612165614.1415584110;
Connection: keep-alive
If-Modified-Since: Mon, 25 Aug 2014 21:59:35 GMT
Cache-Control: max-age=0

meta: 比较常用,在不同的请求之间传递数据使用的。字典dict型

   # Example: build a Request with a cookies dict and a meta dict;
   # meta here carries the 'dont_merge_cookies' key set to True.
   request_with_cookies = Request(
       url="http://www.example.com",
       cookies={'currency': 'USD', 'country': 'UY'},
       meta={'dont_merge_cookies': True}
  )

encoding: 使用默认的 'utf-8' 就行。

dont_filter: 表明该请求不由调度器过滤。当你想多次执行相同的请求、忽略重复请求过滤器时使用。默认为False。

errback: 指定错误处理函数

Response

Response部分源码:

# Partial code (excerpt, not the complete class)
class Response(object_ref):
    """Excerpt of the response class: holds the status, headers, body, URL,
    flags and the request this response is tied to.

    ``_set_body`` and ``_set_url`` are called here but are not part of
    this excerpt.
    """

    def __init__(self, url, status=200, headers=None, body='', flags=None, request=None):
        """Store the response attributes.

        Args:
            url: passed to ``self._set_url``.
            status: converted with ``int(status)``.
            headers: wrapped in ``Headers``; a falsy value becomes ``{}``.
            body: passed to ``self._set_body``.
            flags: copied into a new list, or ``[]`` when None.
            request: stored as ``self.request``; may be None.
        """
        self.headers = Headers(headers or {})
        self.status = int(status)
        self._set_body(body)
        self._set_url(url)
        self.request = request
        self.flags = [] if flags is None else list(flags)

    @property
    def meta(self):
        """Return ``self.request.meta``.

        Raises:
            AttributeError: with an explanatory message when the lookup
                fails, e.g. because ``self.request`` is None.
        """
        try:
            return self.request.meta
        except AttributeError:
            raise AttributeError("Response.meta not available, this response " \
                "is not tied to any request")

大部分参数和上面的差不多:

status: 响应码
_set_body(body): 响应体
_set_url(url):响应url
request: 产生该响应的 Request 对象(即 self.request = request);Response.meta 就是通过它返回 request.meta

发送POST请求

class mySpider(scrapy.Spider):
    """Example spider that starts by sending a POST request with form data."""

    # start_urls is left commented out: the first request is produced by
    # start_requests() below instead.
    # start_urls = ["http://www.example.com/"]

    def start_requests(self):
        """Yield the initial login request as a form POST."""
        url = 'http://www.renren.com/PLogin.do'

        # FormRequest is how Scrapy sends a POST request
        yield scrapy.FormRequest(
            url=url,
            formdata={"email": "smartliu_it@163.com", "password": "axxxxxxxe"},
            callback=self.parse_page,
        )

    def parse_page(self, response):
        """Handle the response of the login request (placeholder)."""
        # do something
        # A comment alone is not a valid function body; ``pass`` keeps the
        # example importable until real parsing code is added.
        pass

模拟登录

使用FormRequest.from_response()方法模拟用户登录

通常网站通过 `<input type="hidden">` 元素实现对某些表单字段(如数据或是登录界面中的认证令牌等)的预填充。

使用Scrapy抓取网页时,如果想要预填充或重写像用户名、用户密码这些表单字段, 可以使用 FormRequest.from_response() 方法实现。

下面是使用这种方法的爬虫例子:

import scrapy

class LoginSpider(scrapy.Spider):
    """Log in by submitting the login page's own form, then keep scraping."""

    name = 'example.com'
    start_urls = ['http://www.example.com/users/login.php']

    def parse(self, response):
        """Return a form request built from the form found in *response*.

        The given ``formdata`` values override the matching form fields;
        the result is handled by ``after_login``.
        """
        return scrapy.FormRequest.from_response(
            response,
            formdata={'username': 'kongshan', 'password': 'cool'},
            callback=self.after_login,
        )

    def after_login(self, response):
        """Stop with an error log entry if the login was rejected."""
        # check login succeed before going on
        # response.body is bytes on Python 3, so the marker must be bytes
        # too; a str operand would raise TypeError.
        if b"authentication failed" in response.body:
            # Use the spider's own logger: the name ``log`` was never
            # imported here, so ``level=log.ERROR`` raised NameError.
            self.logger.error("Login failed")
            return

        # continue scraping with authenticated session...

豆瓣爬虫案例参考:

Item类设置

import scrapy

class DoubanItem(scrapy.Item):
    """Item with the two fields filled in by the Douban spider."""
    # User name
    user_name = scrapy.Field()
    # Publish time
    publishtime = scrapy.Field()

douban.py爬虫代码

# -*- coding: utf-8 -*-
import scrapy
import urllib.request
from PIL import Image
from douBan.items import DoubanItem


class DoubanSpider(scrapy.Spider):
    """Log in to Douban, asking the user to solve a captcha when one is
    shown, then collect user names and publish times page by page."""

    name = 'douban'
    allowed_domains = ['douban.com']
    start_urls = ['https://accounts.douban.com/login']
    login_url = "https://accounts.douban.com/login"
    base_url = "https://www.douban.com/"

    def parse(self, response):
        """Build the login form data from the login page and submit it.

        When the page contains a captcha image, the image is downloaded,
        shown to the user, and the typed answer is sent along with the
        captcha id. The login response is handled by ``login_after``.
        """
        img_link = response.xpath('//img[@id="captcha_image"]/@src').extract_first()
        captcha_id = response.xpath('//input[@name="captcha-id"]/@value').extract_first()

        # NOTE(review): the account e-mail and password are hard-coded
        # below; they should be read from settings or the environment.
        if img_link is None:
            print("登陆时没有遇到验证码...")
            formdata = {
                "source": "index_nav",
                "form_email": "smartliu_it@163.com",
                "form_password": "kongshan123.0",
            }

        else:
            print("登陆时遇到验证码...")
            img_path = "/home/python/Desktop/douBan-豆瓣登录/douBan/spiders/douban.jpg"
            urllib.request.urlretrieve(img_link, img_path)
            try:
                im = Image.open(img_path)
                im.show()
            except Exception:
                # Showing the image is best effort: the user can still open
                # the saved file by hand. ``Exception`` (instead of a bare
                # ``except``) lets Ctrl-C and SystemExit propagate.
                print("打开图片失败...")

            captcha_solution = input("请输入验证码:")

            formdata = {
                "source": "index_nav",
                "redir": "https://www.douban.com",
                "form_email": "smartliu_it@163.com",
                "form_password": "kongshan123.0",
                "captcha-solution": captcha_solution,
                "captcha-id": captcha_id,
                "login": "登录",
            }

        print("正在登陆中...")
        return scrapy.FormRequest(self.login_url, formdata=formdata, callback=self.login_after)

    def login_after(self, response):
        """Yield one item per (user name, publish time) pair on the page,
        then follow the "next" link if the page has one."""
        next_url = response.xpath('//span[@class="next"]/a/@href').extract_first()
        title_list = response.xpath('//a[@class="lnk-people"]/text()').extract()
        publish_time_list = response.xpath('//span[@class="created_at"]/@title').extract()
        for title, publish_time in zip(title_list, publish_time_list):
            # Create a fresh item per row: re-using a single instance would
            # hand the same mutable object to the pipeline repeatedly, and
            # later rows would overwrite earlier ones still being processed.
            item = DoubanItem()
            item['user_name'] = title
            item['publishtime'] = publish_time
            yield item

        # No "next" link (extract_first() returned None): stop here instead
        # of failing on ``str + None``.
        if next_url is None:
            return

        # Build the next page link
        next_page = self.base_url + next_url
        yield scrapy.Request(url=next_page, callback=self.login_after)

pipelines.py管道代码

import json

class DoubanPipeline(object):
    """Write every item to ``douban.json`` as one JSON object per line."""

    # Called when the spider is opened
    def open_spider(self, spider):
        """Open the output file for writing."""
        # Explicit UTF-8: process_item() writes non-ASCII characters
        # unescaped (ensure_ascii=False), so relying on the platform's
        # default encoding could fail or produce a non-UTF-8 file.
        self.file = open("douban.json", "w", encoding="utf-8")

    # Required pipeline method
    def process_item(self, item, spider):
        """Append *item* to the file as a JSON line and return it unchanged."""
        content = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.file.write(content)
        return item

    # Called when the spider is closed
    def close_spider(self, spider):
        """Close the output file."""
        self.file.close()

settings.py设置代码

# User-Agent request header
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3192.0 Safari/537.36'
# Set the download delay to 250 ms
DOWNLOAD_DELAY = 0.25   
# Enable cookie handling
COOKIES_ENABLED = True
# Register the item pipeline
ITEM_PIPELINES = {
    'douBan.pipelines.DoubanPipeline': 300,
}

最后执行爬虫文件:

scrapy crawl douban

上一篇下一篇

猜你喜欢

热点阅读