Scraping Python Job Listings from Lagou with Python + Selenium

2020-01-15  Mr_戋戋

I came across a list of 100 introductory web-scraping examples on Zhihu, but most of them no longer run once you pull them down. So I decided to treat those 100 examples as exercises and write them out by hand myself, to keep from overestimating my own skills. Now, on to the main topic.

Choosing an approach

1. Open the Lagou site in your browser: https://www.lagou.com

2. From the browser menu in the top-right corner: Settings --> More Tools --> Developer Tools


(Screenshot: opening Developer Tools)

3. In Developer Tools, select the Network tab, then the XHR filter.


(Screenshot: selecting XHR under the Network tab in Developer Tools)

4. Type Python into the search box and click the search button; a batch of requests appears on the right. After filtering we end up with https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false — the response of this URL contains the data we need. Its parameters ("pn", "kd", "sid") are the page number, the search keyword, and an id that can simply be copied over from the browser request.

5. Use Python to reproduce the browser's network request

import requests
import json
import pymysql.cursors


def get_position_info(pageNo, keyWords):
    """Return the position JSON for the given page number and search keyword."""
    url = "https://www.lagou.com/jobs/positionAjax.json"

    headers = {
        "Accept":"application/json, text/javascript, */*; q=0.01",
        "Accept-Encoding":"gzip, deflate, br",
        "Accept-Language":"zh-CN,zh;q=0.9,en;q=0.8,und;q=0.7",
        "Connection":"keep-alive",
        "Content-Length":"63",
        "Content-Type":"application/x-www-form-urlencoded; charset=UTF-8",
        "Cookie":"JSESSIONID=ABAAAECABGFABFF2355DDD810AA4B7200888D1AD6748FCE; user_trace_token=20200114140751-a42b5275-8421-48d3-b9e7-68dfd54a7268; WEBTJ-ID=20200114140806-16fa2aa95b2747-088a3c550e67ef-1136685a-1764000-16fa2aa95b37cb; _ga=GA1.2.1962027842.1578982086; _gid=GA1.2.552871320.1578982086; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1578982086; LGUID=20200114140752-33060498-3694-11ea-b2bf-525400f775ce; index_location_city=%E5%85%A8%E5%9B%BD; X_MIDDLE_TOKEN=52a24466865c3ceb89c3746da8f4044e; lagou_utm_source=A; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%2216fa2aa96bf214-0b00a41712c003-1136685a-1764000-16fa2aa96c0e6c%22%2C%22%24device_id%22%3A%2216fa2aa96bf214-0b00a41712c003-1136685a-1764000-16fa2aa96c0e6c%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_referrer%22%3A%22%22%2C%22%24latest_referrer_host%22%3A%22%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%7D%7D; X_HTTP_TOKEN=c2100305a94fa1bd24375097511424ef3d7cabe162; _gat=1; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1579057355; LGSID=20200115110222-735d7e62-3743-11ea-b2de-525400f775ce; PRE_UTM=; PRE_HOST=; PRE_SITE=https%3A%2F%2Fwww.lagou.com%2Futrack%2FtrackMid.html%3Ff%3Dhttps%253A%252F%252Fwww.lagou.com%252Fjobs%252Flist%255Fpython%253FlabelWords%253D%2526fromSearch%253Dtrue%2526suginput%253D%26t%3D1579057335%26_ti%3D1; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2Fjobs%2Flist_python%3FlabelWords%3D%26fromSearch%3Dtrue%26suginput%3D; LGRID=20200115110222-735d8073-3743-11ea-b2de-525400f775ce; SEARCH_ID=72019fd8d0824c75bf82cd2e145a7d13",
        "Host":"www.lagou.com",
        "Origin":"https://www.lagou.com",
        "Referer":"https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput=",
        "User-Agent":"Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.92 Mobile Safari/537.36",
        "X-Anit-Forge-Code":"0",
        "X-Anit-Forge-Token":"None",
        "X-Requested-With":"XMLHttpRequest"
    }
    postData = {"needAddtionalResult":"false","pn":pageNo,"kd":keyWords,"sid":"098c18fdd37945f7a5781f7d465e0bd8"}
    
    response = requests.post(url, data=postData, headers=headers)
    return response.json()
6. Pretty-print the returned JSON and look for the fields we need.

(Screenshot: the formatted response data)

(PS: the fields we need are positionName, companyFullName, companySize, financeStage, industryField, city, salary, workYear)
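
Before writing any parsing code, it helps to pretty-print the payload once and confirm the path down to the result list; a minimal sketch, reusing get_position_info() from step 5 (the exact surrounding fields vary, but the content -> positionResult -> result path is what the next step relies on):

import json

data = get_position_info(1, "python")
print(json.dumps(data, ensure_ascii=False, indent=2))        # inspect the whole payload
positions = data["content"]["positionResult"]["result"]      # the list of position dicts
print(positions[0]["positionName"], positions[0]["salary"])  # spot-check two of the fields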

7. Extract the fields of each position and collect all positions into a list

def get_positions_list(positions_dic):
    """Flatten the JSON into a list of rows, one row per position."""
    list_positions = positions_dic['content']['positionResult']['result']

    result = []
    for position in list_positions:
        info = []
        info.append(position["positionName"])
        info.append(position["companyFullName"])
        info.append(position["companySize"])
        info.append(position["financeStage"])
        info.append(position["industryField"])
        info.append(position["city"])
        info.append(position["salary"])
        info.append(position["workYear"])
        result.append(info)

    return result

positions = get_positions_list(get_position_info(1, "python"))
print(positions)

8. Store the scraped data in a database (MySQL needs to be installed first, with the pythondb database and the position_info_detail table created). The code below connects to the database and inserts rows into the position_info_detail table of pythondb.
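
The article assumes pythondb and position_info_detail already exist. For reference, here is a minimal sketch of one possible table definition — the column names match the fields collected above, but the types and sizes are my own assumption, not taken from the article:

import pymysql

conn = pymysql.connect(host='localhost', user='username', password='password',
                       db='pythondb', charset='utf8mb4')
with conn.cursor() as cursor:
    # Create the target table if it does not exist yet (schema is a guess)
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS position_info_detail (
            id INT AUTO_INCREMENT PRIMARY KEY,
            positionName VARCHAR(255),
            companyFullName VARCHAR(255),
            companySize VARCHAR(64),
            financeStage VARCHAR(64),
            industryField VARCHAR(255),
            city VARCHAR(64),
            salary VARCHAR(64),
            workYear VARCHAR(64)
        ) DEFAULT CHARSET = utf8mb4
    """)
conn.commit()
conn.close()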

def get_conn():
    '''Open a connection to the database.'''
    conn = pymysql.connect(host='localhost',
                           user='username',
                           password='password',
                           db='pythondb',
                           charset='utf8mb4',
                           cursorclass=pymysql.cursors.DictCursor)
    return conn


def insert(conn, info):
    '''Write one row of position data to the database.'''
#    print(info)
    with conn.cursor() as cursor:
        sql = "INSERT INTO `position_info_detail` (`positionName`, `companyFullName`, `companySize`, `financeStage`, `industryField`, `city`, `salary`, `workYear`) VALUES (%s, %s, %s, %s, %s, %s, %s, %s)"
        cursor.execute(sql, info)
    conn.commit()

9. Full code

#!/usr/bin/env python3

# -*- encoding=utf-8 -*-

import requests

import pymysql.cursors


def get_position_info(pageNo, keyWords):
    """Return the position JSON for the given page number and search keyword."""
    url = "https://www.lagou.com/jobs/positionAjax.json"

    headers = {
        "Accept":"application/json, text/javascript, */*; q=0.01",
        "Accept-Encoding":"gzip, deflate, br",
        "Accept-Language":"zh-CN,zh;q=0.9,en;q=0.8,und;q=0.7",
        "Connection":"keep-alive",
        "Content-Length":"63",
        "Content-Type":"application/x-www-form-urlencoded; charset=UTF-8",
        "Cookie":"JSESSIONID=ABAAAECABGFABFF2355DDD810AA4B7200888D1AD6748FCE; user_trace_token=20200114140751-a42b5275-8421-48d3-b9e7-68dfd54a7268; WEBTJ-ID=20200114140806-16fa2aa95b2747-088a3c550e67ef-1136685a-1764000-16fa2aa95b37cb; _ga=GA1.2.1962027842.1578982086; _gid=GA1.2.552871320.1578982086; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1578982086; LGUID=20200114140752-33060498-3694-11ea-b2bf-525400f775ce; index_location_city=%E5%85%A8%E5%9B%BD; X_MIDDLE_TOKEN=52a24466865c3ceb89c3746da8f4044e; lagou_utm_source=A; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%2216fa2aa96bf214-0b00a41712c003-1136685a-1764000-16fa2aa96c0e6c%22%2C%22%24device_id%22%3A%2216fa2aa96bf214-0b00a41712c003-1136685a-1764000-16fa2aa96c0e6c%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_referrer%22%3A%22%22%2C%22%24latest_referrer_host%22%3A%22%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%7D%7D; X_HTTP_TOKEN=c2100305a94fa1bd24375097511424ef3d7cabe162; _gat=1; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1579057355; LGSID=20200115110222-735d7e62-3743-11ea-b2de-525400f775ce; PRE_UTM=; PRE_HOST=; PRE_SITE=https%3A%2F%2Fwww.lagou.com%2Futrack%2FtrackMid.html%3Ff%3Dhttps%253A%252F%252Fwww.lagou.com%252Fjobs%252Flist%255Fpython%253FlabelWords%253D%2526fromSearch%253Dtrue%2526suginput%253D%26t%3D1579057335%26_ti%3D1; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2Fjobs%2Flist_python%3FlabelWords%3D%26fromSearch%3Dtrue%26suginput%3D; LGRID=20200115110222-735d8073-3743-11ea-b2de-525400f775ce; SEARCH_ID=72019fd8d0824c75bf82cd2e145a7d13",
        "Host":"www.lagou.com",
        "Origin":"https://www.lagou.com",
        "Referer":"https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput=",
        "User-Agent":"Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.92 Mobile Safari/537.36",
        "X-Anit-Forge-Code":"0",
        "X-Anit-Forge-Token":"None",
        "X-Requested-With":"XMLHttpRequest"
    }
    postData = {"needAddtionalResult":"false","pn":pageNo,"kd":keyWords,"sid":"098c18fdd37945f7a5781f7d465e0bd8"}
    
    response = requests.post(url, data=postData, headers=headers)
    return response.json()




def get_positions_list(positions_dic):
    """Flatten the JSON into a list of rows, one row per position."""
    list_positions = positions_dic['content']['positionResult']['result']

    result = []
    for position in list_positions:
        info = []
        info.append(position["positionName"])
        info.append(position["companyFullName"])
        info.append(position["companySize"])
        info.append(position["financeStage"])
        info.append(position["industryField"])
        info.append(position["city"])
        info.append(position["salary"])
        info.append(position["workYear"])
        result.append(info)

    return result
    


def get_conn():
    '''Open a connection to the database.'''
    conn = pymysql.connect(host='localhost',
                           user='root',
                           password='password',
                           db='pythondb',
                           charset='utf8mb4',
                           cursorclass=pymysql.cursors.DictCursor)
    return conn


def insert(conn, info):
    '''Write one row of position data to the database.'''
#    print(info)
    with conn.cursor() as cursor:
        sql = "INSERT INTO `position_info_detail` (`positionName`, `companyFullName`, `companySize`, `financeStage`, `industryField`, `city`, `salary`, `workYear`) VALUES (%s, %s, %s, %s, %s, %s, %s, %s)"
        cursor.execute(sql, info)
    conn.commit()



def main():
    try:
        conn = get_conn()  # open the database connection; comment this out if you don't want to store to MySQL
        for i in range(2, 12):  # pages 2 through 11
            positions_dic = get_position_info(i, "python")
            print(positions_dic)
            array = get_positions_list(positions_dic)
            for position in array:
                insert(conn, tuple(position))

        conn.close()  # close the database connection; comment this out if you don't want to store to MySQL
    except Exception as e:
        print(e)

if __name__ == '__main__':
    main()
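
After running the script you can sanity-check that the rows actually made it into MySQL; a small sketch reusing get_conn() from above (the table name is the one created earlier):

conn = get_conn()
with conn.cursor() as cursor:
    cursor.execute("SELECT COUNT(*) AS cnt FROM position_info_detail")
    print(cursor.fetchone()["cnt"], "rows stored")   # DictCursor returns dicts
    cursor.execute("SELECT positionName, city, salary FROM position_info_detail LIMIT 5")
    for row in cursor.fetchall():
        print(row)
conn.close()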

The Selenium approach

1. Install Selenium. Briefly, run the following command in the terminal:

pip3 install selenium
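
Besides the pip package, Selenium also needs a browser driver; the code below uses webdriver.Chrome(), so a chromedriver matching your Chrome version has to be on the PATH (this is my note, the article doesn't mention it). A quick sanity check:

from selenium import webdriver

driver = webdriver.Chrome()              # fails here if chromedriver is missing or mismatched
driver.get("https://www.lagou.com/")
print(driver.title)                      # prints the Lagou homepage title if everything works
driver.quit()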

2. Next, use Selenium to grab the HTML page content we need

def get_page_content():
    """Open Lagou in Chrome, search for 'python' and return the HTML of each result page."""
    driver = webdriver.Chrome()
    driver.get("https://www.lagou.com/")
    htmls = []

    try:
        # Close the city-selection popup that appears on the first visit
        chooseLocationBox = driver.find_element_by_id("cboxWrapper")  # the popup wrapper
        cboxCloseBtn = driver.find_element_by_id("cboxClose")
        cboxCloseBtn.click()
        time.sleep(2)

        # Type the keyword into the search box and run the search
        searchInput = driver.find_element_by_id("search_input")
        searchInput.clear()
        searchInput.send_keys("python")
        searchBtn = driver.find_element_by_id("search_button")
        searchBtn.click()
        time.sleep(2)

        # Close the advertisement banner on the result page
        closeADBtn = driver.find_element_by_class_name("body-btn")
        closeADBtn.click()

        htmls.append(driver.page_source)
        try:
            i = 1
            while i < 2:  # grab one more page; raise the limit to collect more pages
                nextPageBtn = driver.find_element_by_class_name("pager_next")
                ActionChains(driver).move_to_element(nextPageBtn).perform()
                nextPageBtn.click()
                time.sleep(2)
                htmls.append(driver.page_source)
                i += 1
        except Exception as e:
            print(e)
            print("Already on the last page; the next-page button can't be clicked")

        return htmls

    except Exception as e:
        print("----------error-----------")
        print(e)
        print("--------error end---------")
        return htmls

    finally:
        driver.quit()  # always shut the browser down

3. This step needs BeautifulSoup (pip3 install beautifulsoup4); if you haven't used it, the official documentation is a good reference.
We install it to turn the HTML grabbed in the previous step into the position information we need. The code is as follows:

def get_position_info(htmlContents):
    """Parse each result page and pull the position data out of the <li> attributes."""
    res = []
    for html in htmlContents:
        soup = BeautifulSoup(html, 'html.parser')
        positions = soup.find_all("li", class_="con_list_item default_list")
        for position in positions:
            posi = []
            posi.append(position.get("data-company"))
            posi.append(position.get("data-salary"))
            posi.append(position.get("data-positionname"))
            res.append(posi)

    return res
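
The parsing above relies on each job <li> carrying data-company, data-salary and data-positionname attributes. Here is a tiny self-contained illustration of that lookup — the markup fragment is invented for the example, not copied from Lagou:

from bs4 import BeautifulSoup

# Made-up <li> snippet in the same shape the parser expects
sample = ('<li class="con_list_item default_list" data-company="示例公司" '
          'data-salary="15k-25k" data-positionname="Python工程师"></li>')
soup = BeautifulSoup(sample, "html.parser")
li = soup.find("li", class_="con_list_item default_list")
print(li.get("data-company"), li.get("data-salary"), li.get("data-positionname"))
# -> 示例公司 15k-25k Python工程师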

4. Finally, write the scraped data to the database and to an Excel spreadsheet (openpyxl, installed with pip3 install openpyxl, handles the spreadsheet):

def main():
    info = get_position_info(get_page_content())
    lang_name = 'python'
    wb = Workbook()  # create an Excel workbook

    ws1 = wb.active
    ws1.title = lang_name

    conn = get_conn()

    for row in info:
        ws1.append(row)           # one row per position in the worksheet
        insert(conn, tuple(row))  # and the same row into MySQL

    wb.save('{}职位信息.xlsx'.format(lang_name))
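
As with the first approach, the position_info table has to exist before main() runs. The article doesn't show its definition, so the following is only a guess based on the three columns in the INSERT statement (reusing the get_conn() defined in the full code below):

conn = get_conn()
with conn.cursor() as cursor:
    # Create the target table if it does not exist yet (column types are assumptions)
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS position_info (
            id INT AUTO_INCREMENT PRIMARY KEY,
            company_name VARCHAR(255),
            salary VARCHAR(64),
            position_name VARCHAR(255)
        ) DEFAULT CHARSET = utf8mb4
    """)
conn.commit()
conn.close()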

Full code:

#!/usr/bin/env python3

# -*-encoding:utf-8-*-

from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains  # mouse actions, used to hover over the next-page button
import time
from bs4 import BeautifulSoup
from openpyxl import Workbook

import pymysql.cursors


def get_conn():
    """Open a connection to the database."""
    conn = pymysql.connect(host='localhost',
                           user='root',
                           password='jj1234567',
                           db='pythondb',
                           charset='utf8',
                           cursorclass=pymysql.cursors.DictCursor)
    return conn

def insert(conn, info):
    """Write one row (company_name, salary, position_name) to the database."""
    print(info)
    with conn.cursor() as cursor:
        sql = "INSERT INTO `position_info` (`company_name`,`salary`,`position_name`) VALUES (%s, %s, %s)"
        cursor.execute(sql, info)
    conn.commit()


def get_page_content():
    """Open Lagou in Chrome, search for 'python' and return the HTML of each result page."""
    driver = webdriver.Chrome()
    driver.get("https://www.lagou.com/")
    htmls = []

    try:
        # Close the city-selection popup that appears on the first visit
        chooseLocationBox = driver.find_element_by_id("cboxWrapper")  # the popup wrapper
        cboxCloseBtn = driver.find_element_by_id("cboxClose")
        cboxCloseBtn.click()
        time.sleep(2)

        # Type the keyword into the search box and run the search
        searchInput = driver.find_element_by_id("search_input")
        searchInput.clear()
        searchInput.send_keys("python")
        searchBtn = driver.find_element_by_id("search_button")
        searchBtn.click()
        time.sleep(2)

        # Close the advertisement banner on the result page
        closeADBtn = driver.find_element_by_class_name("body-btn")
        closeADBtn.click()

        htmls.append(driver.page_source)
        try:
            i = 1
            while i < 2:  # grab one more page; raise the limit to collect more pages
                nextPageBtn = driver.find_element_by_class_name("pager_next")
                ActionChains(driver).move_to_element(nextPageBtn).perform()
                nextPageBtn.click()
                time.sleep(2)
                htmls.append(driver.page_source)
                i += 1
        except Exception as e:
            print(e)
            print("Already on the last page; the next-page button can't be clicked")

        return htmls

    except Exception as e:
        print("----------error-----------")
        print(e)
        print("--------error end---------")
        return htmls

    finally:
        driver.quit()  # always shut the browser down
    
def get_position_info(htmlContents):
    """Parse each result page and pull the position data out of the <li> attributes."""
    res = []
    for html in htmlContents:
        soup = BeautifulSoup(html, 'html.parser')
        positions = soup.find_all("li", class_="con_list_item default_list")
        for position in positions:
            posi = []
            posi.append(position.get("data-company"))
            posi.append(position.get("data-salary"))
            posi.append(position.get("data-positionname"))
            res.append(posi)

    return res



def main():
    info = get_position_info(get_page_content())
    lang_name = 'python'
    wb = Workbook()  # create an Excel workbook

    ws1 = wb.active
    ws1.title = lang_name

    conn = get_conn()

    for row in info:
        ws1.append(row)           # one row per position in the worksheet
        insert(conn, tuple(row))  # and the same row into MySQL

    wb.save('{}职位信息.xlsx'.format(lang_name))

if __name__ == '__main__':
    main()
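
Once the script has run, a quick way to check the generated spreadsheet is to read it back with openpyxl (the file name matches the wb.save() call above):

from openpyxl import load_workbook

wb = load_workbook('python职位信息.xlsx')
ws = wb['python']
for row in ws.iter_rows(max_row=5, values_only=True):
    print(row)   # (company, salary, position name)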
