6. Scraping Lagou Job Listings

2020-07-18  M_小七

First, let's try scraping Python-related job listings for Shenyang.

# Scrape Python job listings from Lagou
def spider_lagou():
    from urllib import request, parse
    import ssl
    # The site's certificate fails verification, so skip it
    ssl._create_default_https_context = ssl._create_unverified_context
    url = "https://www.lagou.com/jobs/list_python/p-city_44?&cl=false&fromSearch=true&labelWords=&suginput="
    headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36"}
    req = request.Request(url, headers=headers)
    resp = request.urlopen(req).read().decode("utf-8")
    print(resp)

if __name__ == '__main__':
    spider_lagou()

This returns a page, but the job listings are nowhere to be found in it. Watching the network requests in the browser's developer tools shows that the listings are not served by https://www.lagou.com/jobs/list_python/p-city_44?&cl=false&fromSearch=true&labelWords=&suginput= at all; they come from a separate request to https://www.lagou.com/jobs/positionAjax.json?city=%E6%B2%88%E9%98%B3&needAddtionalResult=false. Modify the code accordingly:

# Scrape Python job listings from Lagou
def spider_lagou():
    from urllib import request, parse
    import ssl
    ssl._create_default_https_context = ssl._create_unverified_context
    url = "https://www.lagou.com/jobs/positionAjax.json?city=%E6%B2%88%E9%98%B3&needAddtionalResult=false"
    headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36"}
    # POST form data: first page of results, keyword "python"
    data = {"first": "true", "pn": 1, "kd": "python"}
    req = request.Request(url, headers=headers, data=parse.urlencode(data).encode("utf-8"), method="POST")
    resp = request.urlopen(req).read().decode("utf-8")
    print(resp)

if __name__ == '__main__':
    spider_lagou()

Running this only returns: {"status":false,"msg":"您操作太频繁,请稍后再访问","clientIp":"182.200.18.29","state":2402} ("you are operating too frequently, please visit again later"). Passing more detailed request parameters didn't help either, so after some searching (https://blog.csdn.net/m0_43400362/article/details/88396490) I followed the advice there and switched to requests.
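Since that block reply is itself JSON, a scraper can guard against it before assuming job data is present. A minimal sketch, checking the status and msg fields shown in the error above:

import json

def check_blocked(resp_text):
    # Lagou's anti-crawler reply is JSON with "status": false and a "msg" string
    body = json.loads(resp_text)
    if body.get("status") is False:
        raise RuntimeError(body.get("msg", "request was blocked"))
    return body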
A Lagou page actually loads in two parts: one URL returns everything except the job listings, and the listings themselves arrive through a separate AJAX request; that request's response is what we want to scrape. So the plan is to request the main page first through a session created with requests.Session(). The session captures the cookies set by that first response, and once it holds them they can be sent along with the AJAX request directly.

import requests
s = requests.Session()
# A requests Session persists parameters across requests. In plain terms: once you
# log in to a site through a session, every later request made through that same
# session object automatically reuses its cookies and other saved state.
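A minimal sketch of that cookie handoff, using the same two URLs as the full scraper below (the fuller header set used there matters in practice):

import requests

headers = {"User-Agent": "Mozilla/5.0"}  # abbreviated; see the full headers below
s = requests.Session()
# Step 1: hit the main page so the anti-crawler cookies land in the session
s.get("https://www.lagou.com/jobs/list_python/p-city_44?&cl=false&fromSearch=true&labelWords=&suginput=",
      headers=headers, timeout=3)
print(s.cookies.get_dict())  # the session is now carrying those cookies
# Step 2: the AJAX POST made through the same session reuses them automatically
resp = s.post("https://www.lagou.com/jobs/positionAjax.json?city=%E6%B2%88%E9%98%B3&needAddtionalResult=false",
              data={"first": "true", "pn": 1, "kd": "python"},
              headers=headers, timeout=3)
print(resp.text[:200])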

With that change the job listings can be scraped normally. The full code follows (the results are saved to Excel):

# Scrape Python job listings from Lagou
def spider_lagou():
    import requests
    import time
    import json

    url_start = "https://www.lagou.com/jobs/list_python/p-city_44?&cl=false&fromSearch=true&labelWords=&suginput="
    url_parse = "https://www.lagou.com/jobs/positionAjax.json?city=%E6%B2%88%E9%98%B3&needAddtionalResult=false"
    headers = {'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
               "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36",
               "Referer": "https://www.lagou.com/jobs/list_python/p-city_44?&cl=false&fromSearch=true&labelWords=&suginput="
              }
    info_list = []
    for x in range(1, 3):  # pages 1 and 2
        data = {"first": "true", "pn": str(x), "kd": "python"}
        s = requests.Session()
        s.get(url_start, headers=headers, timeout=3)  # request the main page to obtain cookies
        cookie = s.cookies  # the cookies captured by this session
        resp = s.post(url_parse, data=data, headers=headers, cookies=cookie, timeout=3)  # fetch the listings JSON
        time.sleep(5)  # pause between pages to avoid the frequency block
        resp.encoding = resp.apparent_encoding
        text = json.loads(resp.text)
        info = text["content"]["positionResult"]["result"]  # the list of job records
        for i in info:
            info_d = {}
            info_d['公司名称'] = i["companyFullName"]
            info_d['职位名称'] = i["positionName"]
            info_d['工资'] = i["salary"]
            info_d['公司规模'] = i["companySize"]
            # skillLables is a list of strings; join it so it fits in one Excel cell
            info_d['技能要求'] = ",".join(i["skillLables"])
            info_d['公司位置'] = i["district"]
            # print(info_d)
            info_list.append(info_d)
    print(info_list)
    return info_list




def save_excel(info_list):
    import xlwt
    workbook = xlwt.Workbook()
    ws = workbook.add_sheet("information")
    style = xlwt.XFStyle()

    # Header row: every record has the same keys, so take them from the first one
    keys = list(info_list[0].keys())
    for i in range(len(keys)):
        ws.write(0, i, keys[i], style=style)

    # One row per job record, columns in the same order as the header
    for li in range(len(info_list)):
        for key, value in info_list[li].items():
            ws.write(li + 1, keys.index(key), value, style=style)

    # Save once, after all rows have been written
    workbook.save('职位信息.xls')

if __name__ == '__main__':
    info_list = spider_lagou()
    save_excel(info_list)
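As an aside, xlwt only writes the legacy .xls format; the same list of per-row dicts can also be dumped with the standard library's csv module. A sketch (the .csv file name here is just for illustration):

def save_csv(info_list, path='职位信息.csv'):
    import csv
    # utf-8-sig adds a BOM so Excel opens the Chinese headers correctly
    with open(path, 'w', newline='', encoding='utf-8-sig') as f:
        writer = csv.DictWriter(f, fieldnames=list(info_list[0].keys()))
        writer.writeheader()
        writer.writerows(info_list)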
Screenshot: Python-related job listings in Shenyang