网络爬虫:urllib模块应用9——urllib.parse包
2018-12-23 本文已影响0人
牛耀
# urllib.parse: URL parsing, combining, encoding and decoding.
from urllib import parse

# urlparse splits a URL into its components
# (scheme, netloc, path, params, query, fragment).
url = 'https://www.1217B.com/daxuesheng?name=zhangsan#123'
"""
urlparse(url, scheme='', allow_fragments=True)
    url: the URL to parse and split
    scheme: default protocol; only applied when the URL itself carries none
    allow_fragments: when True (the default) the fragment is parsed, not ignored
Example result for the URL above:
    scheme='https'          (protocol)
    netloc='www.1217B.com'  (network location / domain)
    path='/daxuesheng'      (path)
    params=''               (path parameters, rarely used)
    query='name=zhangsan'   (query string)
    fragment='123'          (anchor)
"""
# Split a real-world URL into its six components.
url = 'https://fe-api.zhaopin.com/c/i/sou?&pageSize=90&cityId=489&workExperience=-1&education=-1&companyType=-1&employmentType=-1&jobWelfareTag=-1&kw=技术&kt=3&_v=0.92285166&x-zp-page-request-id=93cd5308082743a794c9d50e646f8e08-1545304784592-499104'
result = parse.urlparse(url)
# NOTE: removed the original `str = 'start={}'.format()` line — calling
# .format() with no argument for the `{}` placeholder raises IndexError,
# and the assignment shadowed the built-in `str`. The value was never used.
print('a',result)
# Access a single component of the parse result by attribute.
print(result.scheme)
# Rebuild a URL: urlunparse needs an iterable of exactly 6 parts.
data = list(result)
print('---',data)
full_url = parse.urlunparse(data)
print('urlunparse',full_url)
# urljoin: complete a partial URL against a base URL.
sub_url = '/p/123456'
# base URL
base_url = 'https://www.1217B.com/daxuesheng?name=zhangsan#123'
full_url = parse.urljoin(base_url,sub_url)
print('urljoin:',full_url)
# Serialize a dict of parameters into a URL-encoded query string.
params = {
    'name': '扎昂三',
    'class': '1712B',
}
result = parse.urlencode(params)
print('urlencode:',result)
# Deserialize: parse a URL-encoded query string back into a dict
# (parse_qs maps each key to a list of values).
result = parse.parse_qs(result)
print('parse.parse.qs:',result)
# quote: percent-encode non-ASCII (e.g. Chinese) characters for use in URLs.
kw = '某某某'
result = parse.quote(kw)
print('parse.quote',result)
# unquote: decode a percent-encoded string back to text.
result = parse.unquote(result)
print('parse.unquote:',result)
# urlencode and urljoin are the two most commonly used helpers.