scrapy处理HttpDigestAuth认证
2018-06-06 本文已影响37人
tenlee
由于requests
类库自带了Http Digest Auth,故搬砖到scrapy。
加入如下MiddleWare
,并在settings.py
文件中配置HTTP_DIGEST_USERNAME
和HTTP_DIGEST_PASSWORD
即可
import hashlib
import os
import re
import time
import threading
from scrapy import signals
from requests.compat import urlparse
from requests.utils import parse_dict_header
class HTTPDigestAuthMiddleWare(object):
def __init__(self, username, password):
self.username = username
self.password = password
self._thread_local = threading.local()
@classmethod
def from_crawler(cls, crawler):
s = cls(crawler.settings.get('HTTP_DIGEST_USERNAME'),
crawler.settings.getlist('HTTP_DIGEST_PASSWORD'))
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def spider_opened(self, spider):
spider.logger.info('Spider opened: %s' % spider.name)
def process_request(self, request, spider):
if hasattr(self._thread_local, 'chal') and self._thread_local.chal:
auth = self.build_digest_header(request.method, request.url)
request.headers['Authorization'] = auth
return None
def process_response(self, request, response, spider):
if not response.status == 401:
self._thread_local.num_401_calls = 1
else:
self.handle_401(response)
return request
return response
def init_per_thread_state(self):
if not hasattr(self._thread_local, 'init'):
self._thread_local.init = True
self._thread_local.last_nonce = ''
self._thread_local.nonce_count = 0
self._thread_local.chal = {}
self._thread_local.num_401_calls = 1
def build_digest_header(self, method, url):
"""
:rtype: str
"""
realm = self._thread_local.chal['realm']
nonce = self._thread_local.chal['nonce']
qop = self._thread_local.chal.get('qop')
algorithm = self._thread_local.chal.get('algorithm')
opaque = self._thread_local.chal.get('opaque')
hash_utf8 = None
if algorithm is None:
_algorithm = 'MD5'
else:
_algorithm = algorithm.upper()
# lambdas assume digest modules are imported at the top level
if _algorithm == 'MD5' or _algorithm == 'MD5-SESS':
def md5_utf8(x):
if isinstance(x, str):
x = x.encode('utf-8')
return hashlib.md5(x).hexdigest()
hash_utf8 = md5_utf8
elif _algorithm == 'SHA':
def sha_utf8(x):
if isinstance(x, str):
x = x.encode('utf-8')
return hashlib.sha1(x).hexdigest()
hash_utf8 = sha_utf8
KD = lambda s, d: hash_utf8("%s:%s" % (s, d))
if hash_utf8 is None:
return None
# XXX not implemented yet
entdig = None
p_parsed = urlparse(url)
#: path is request-uri defined in RFC 2616 which should not be empty
path = p_parsed.path or "/"
if p_parsed.query:
path += '?' + p_parsed.query
A1 = '%s:%s:%s' % (self.username, realm, self.password)
A2 = '%s:%s' % (method, path)
HA1 = hash_utf8(A1)
HA2 = hash_utf8(A2)
if nonce == self._thread_local.last_nonce:
self._thread_local.nonce_count += 1
else:
self._thread_local.nonce_count = 1
ncvalue = '%08x' % self._thread_local.nonce_count
s = str(self._thread_local.nonce_count).encode('utf-8')
s += nonce.encode('utf-8')
s += time.ctime().encode('utf-8')
s += os.urandom(8)
cnonce = (hashlib.sha1(s).hexdigest()[:16])
if _algorithm == 'MD5-SESS':
HA1 = hash_utf8('%s:%s:%s' % (HA1, nonce, cnonce))
if not qop:
respdig = KD(HA1, "%s:%s" % (nonce, HA2))
elif qop == 'auth' or 'auth' in qop.split(','):
noncebit = "%s:%s:%s:%s:%s" % (
nonce, ncvalue, cnonce, 'auth', HA2
)
respdig = KD(HA1, noncebit)
else:
# XXX handle auth-int.
return None
self._thread_local.last_nonce = nonce
# XXX should the partial digests be encoded too?
base = 'username="%s", realm="%s", nonce="%s", uri="%s", ' \
'response="%s"' % (self.username, realm, nonce, path, respdig)
if opaque:
base += ', opaque="%s"' % opaque
if algorithm:
base += ', algorithm="%s"' % algorithm
if entdig:
base += ', digest="%s"' % entdig
if qop:
base += ', qop="auth", nc=%s, cnonce="%s"' % (ncvalue, cnonce)
return 'Digest %s' % (base)
def handle_401(self, resp, **kwargs):
self.init_per_thread_state()
s_auth = resp.headers.get('www-authenticate', b'').decode()
if 'digest' in s_auth.lower() and self._thread_local.num_401_calls < 2:
self._thread_local.num_401_calls += 1
pat = re.compile(r'digest ', flags=re.IGNORECASE)
self._thread_local.chal = parse_dict_header(pat.sub('', s_auth, count=1))
self._thread_local.num_401_calls = 1