socket 搭建一个简单的爬虫客户端
2018-07-18 本文已影响123人
两点半的杂货铺
[danger] 编写client
1.拆分一个做协议判断的方法socket_by_protocol
2.一个循环获得服务器返回所有信息的方法response_by_socket
3.一个解析 header、body 和请求状态码的方法,巧妙利用元组解包:返回的元组可以同时被多个变量接收
4.在整体的大方法中,我们也可以用列表的成员判断(in)代替多个 if 判断
import socket,ssl
def socket_by_protocol(protocol, host=None):
    """
    Create an unconnected socket suited to the given URL scheme.

    Args:
        protocol: URL scheme. "http" yields a plain TCP socket; any other
            value is treated as HTTPS and yields a TLS-wrapped socket.
        host: Optional server hostname. When given it is sent as SNI and
            the server certificate is verified against it. When omitted,
            certificate verification is disabled, which matches what the
            legacy ``ssl.wrap_socket`` call used here previously did.

    Returns:
        A ``socket.socket`` (http) or ``ssl.SSLSocket`` (otherwise) that
        has not been connected yet.
    """
    raw = socket.socket()
    if protocol == "http":
        return raw

    # ssl.wrap_socket() was deprecated in Python 3.7 and removed in 3.12,
    # so the wrapping has to go through an SSLContext.
    if host is not None:
        context = ssl.create_default_context()
        return context.wrap_socket(raw, server_hostname=host)

    # NOTE(security): without a hostname there is nothing to verify the
    # certificate against, so this path keeps the legacy unverified
    # behaviour. Pass ``host`` to get a verified connection.
    context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
    context.check_hostname = False  # must be cleared before CERT_NONE is allowed
    context.verify_mode = ssl.CERT_NONE
    return context.wrap_socket(raw)
def response_by_socket(s):
    """
    Drain a socket until the peer closes the connection.

    Args:
        s: a connected socket-like object exposing ``recv``.

    Returns:
        Everything that was read, as a single bytes object.
    """
    chunk_size = 1024
    chunks = []
    chunk = s.recv(chunk_size)
    # An empty read means the other side has closed the connection.
    while len(chunk) != 0:
        chunks.append(chunk)
        chunk = s.recv(chunk_size)
    return b''.join(chunks)
def parsed_response(r):
    """
    Split a raw HTTP response into status code, headers and body.

    Args:
        r: the complete response as text: status line, header lines, a
            blank line, then the body.

    Returns:
        A ``(status_code, headers, body)`` tuple. ``status_code`` is an
        int, ``headers`` is a dict mapping header names (spelled as sent)
        to their values, and ``body`` is a str. When the blank separator
        line is missing, ``body`` is ``''``.

    Raises:
        ValueError: if the status line carries no numeric status code.
    """
    # Only the first blank line separates headers from body; the body
    # itself may contain further blank lines.
    head, _, body = r.partition('\r\n\r\n')
    status_line, *header_lines = head.split('\r\n')

    parts = status_line.split()
    if len(parts) < 2:
        raise ValueError('malformed status line: {!r}'.format(status_line))
    status_code = int(parts[1])

    headers = {}
    for line in header_lines:
        # Split on the first colon only: values may themselves contain
        # ": ", and the space after the colon is optional.
        name, sep, value = line.partition(':')
        if not sep:
            # Not a "name: value" line; skip it instead of failing.
            continue
        headers[name.strip()] = value.strip()
    return status_code, headers, body
def get(url):
    """
    Fetch ``url`` with a hand-written HTTP/1.1 GET request over a socket.

    A 301 or 302 response that carries a Location header is followed by
    requesting that location in turn.

    Args:
        url: the address to request; it is split into protocol, host,
            port and path by ``parsed_url``.

    Returns:
        The ``(status_code, headers, body)`` tuple produced by
        ``parsed_response`` for the final response.
    """
    protocol, host, port, path = parsed_url(url)
    s = socket_by_protocol(protocol)
    try:
        s.connect((host, port))
        # "Connection: close" asks for a non-persistent connection, so the
        # read loop in response_by_socket ends when the server hangs up.
        request = 'GET {} HTTP/1.1\r\nHost: {}\r\nConnection: close\r\n\r\n'.format(path, host)
        # sendall keeps writing until the whole request is out; a bare
        # send may transmit only part of it.
        s.sendall(request.encode("utf-8"))
        response = response_by_socket(s)
    finally:
        # Release the socket whether or not the exchange succeeded.
        s.close()

    status_code, headers, body = parsed_response(response.decode("utf-8"))
    if status_code in [301, 302]:
        # Header names are case-insensitive, so match "location" in any
        # casing rather than indexing with one fixed spelling.
        location = next(
            (value for name, value in headers.items() if name.lower() == 'location'),
            None,
        )
        if location is not None:
            return get(location)
    return status_code, headers, body