Scraping Zhaopin Job Listings

2018-07-17 · by 幻想无极

Platform: Mac
Python version: Python 3.7
IDE: Sublime Text
Other tools: Chrome browser

1. Analyze the request URL

Search page URL:

https://sou.zhaopin.com/jobs/searchresult.ashx?jl=成都&kw=iOS&sm=0&p=1

Parameters:

jl=成都  # work location
kw=iOS   # job keyword
sm=0     # district
p=1      # page number
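
To see how these parameters become the query string above, here is a minimal sketch using urlencode from the standard library, with the values taken from the example URL:

from urllib.parse import urlencode

# the parameters listed above; urlencode percent-encodes '成都' automatically
paras = {'jl': '成都', 'kw': 'iOS', 'sm': 0, 'p': 1}
url = 'https://sou.zhaopin.com/jobs/searchresult.ashx?' + urlencode(paras)
print(url)  # ...searchresult.ashx?jl=%E6%88%90%E9%83%BD&kw=iOS&sm=0&p=1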

2. Simulate the request and fetch the page


def get_one_page(city, keyword, region, page):
    paras = {
        'jl': city,         # city to search in
        'kw': keyword,      # search keyword
        'isadv': 0,         # whether to enable the advanced search options
        'isfilter': 1,      # whether to filter the results
        'p': page,          # page number
        # 're': region      # short for region; 2005 means Haidian
    }

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
        'Host': 'sou.zhaopin.com',
        'Referer': 'https://www.zhaopin.com/',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9'
    }

    url = 'https://sou.zhaopin.com/jobs/searchresult.ashx?' + urlencode(paras)
    try:
        # fetch the page and return the HTML text
        response = requests.get(url, headers=headers)
        # use the status code to decide whether the request succeeded
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None
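
A quick sanity check, assuming the function above is already defined; the one-second pause is my own addition to avoid hammering the server, not part of the original code:

import time

# fetch page 1 of iOS jobs in Chengdu; the region argument is unused
# because the 're' parameter is commented out above
html = get_one_page('成都', 'iOS', 2005, 1)
print(html is not None)
time.sleep(1)  # arbitrary politeness delay before the next request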

3. Analyze the page markup and write the regex

pattern = re.compile('td class="zwmc".*?href="(.*?)" target="_blank">(.*?)</a>.*?'
                     '<td class="gsmc"><a href="(.*?)".*?target="_blank">(.*?)</a>.*?'
                     '<td class="zwyx">(.*?)</td>.*?'
                     '<td class="gxsj"><span>(.*?)</span>.*?'
                     '<li class="newlist_deatil_two"><span>(.*?)</span><span>(.*?)</span><span>(.*?)</span><span>(.*?)</span><span>(.*?)</span>', re.S)
# match every listing on the page that fits the pattern
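
Because the pattern is long, it is worth testing against a small hand-made fragment before running it on live pages. The fragment below is hypothetical, modeled on the 2018 result-row markup (note that 'newlist_deatil_two' is the site's own spelling); reusing the pattern compiled above, it should print one eleven-element tuple:

import re

# hypothetical fragment modeled on one search-result row; on the real page
# extra markup sits between the cells, which .*? with re.S skips over
sample = (
    '<td class="zwmc"><a href="https://jobs.example.com/1" target="_blank">iOS工程师</a></td>'
    '<td class="gsmc"><a href="https://company.example.com" target="_blank">某科技公司</a></td>'
    '<td class="zwyx">10000-15000</td>'
    '<td class="gxsj"><span>07-17</span></td>'
    '<li class="newlist_deatil_two"><span>地点:成都</span><span>公司性质:民营</span>'
    '<span>公司规模:100-499人</span><span>经验:3-5年</span><span>学历:本科</span></li>'
)

for item in re.findall(pattern, sample):
    # (job url, title, company url, company, salary, date,
    #  location, nature, size, experience, education)
    print(item)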

4. Extract the data and write it to a CSV file

def main(city, keyword, region, pages):

    filename = '智联招聘测试_' + city + '_' + keyword + '.csv'
    headers = ['网站', '岗位', '公司网站', '公司', '薪水', '首发日', '地点', '公司性质', '规模', '经验']
    write_csv_headers(filename, headers)

    # page numbers on the site start at 1, so iterate from 1 to pages
    for i in tqdm(range(1, pages + 1)):

        jobs = []

        html = get_one_page(city, keyword, region, i)
        items = parse_one_page(html)
        for item in items:
            jobs.append(item)
        write_csv_rows(filename, headers, jobs)
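
The CSV helpers (defined in the full source below) build on csv.DictWriter, whose fieldnames are exactly the Chinese header list, so every dict yielded by parse_one_page maps straight onto one row. A minimal self-contained illustration, with demo.csv as a made-up path:

import csv

# fieldnames double as the column headers; each dict's keys must match them
headers = ['岗位', '薪水']
rows = [{'岗位': 'iOS工程师', '薪水': '10000-15000'}]
with open('demo.csv', 'w', encoding='gb18030', newline='') as f:
    f_csv = csv.DictWriter(f, headers)
    f_csv.writeheader()
    f_csv.writerows(rows)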

5. Full source

# -*- coding: utf-8 -*-
import requests
import re
import csv
from tqdm import tqdm
from urllib.parse import urlencode
from requests.exceptions import RequestException


def write_csv_file(path, headers, rows):
    '''
    Write both the header row and the data rows to a CSV file
    (combined helper; main below uses the two separate functions)
    '''
    # the encoding keeps Chinese text from raising write errors
    # newline='' keeps each written row from being followed by a blank line
    with open(path, 'a', encoding='gb18030', newline='') as f:
        f_csv = csv.DictWriter(f, headers)
        f_csv.writeheader()
        f_csv.writerows(rows)

def write_csv_headers(path, headers):
    '''
    Write the header row
    '''
    with open(path, 'a', encoding='gb18030', newline='') as f:
        f_csv = csv.DictWriter(f, headers)
        f_csv.writeheader()

def write_csv_rows(path, headers, rows):
    '''
    Write the data rows
    '''
    with open(path, 'a', encoding='gb18030', newline='') as f:
        f_csv = csv.DictWriter(f, headers)
        f_csv.writerows(rows)


def get_one_page(city, keyword, region, page):
    paras = {
        'jl': city,         # city to search in
        'kw': keyword,      # search keyword
        'isadv': 0,         # whether to enable the advanced search options
        'isfilter': 1,      # whether to filter the results
        'p': page,          # page number
        # 're': region      # short for region; 2005 means Haidian
    }

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
        'Host': 'sou.zhaopin.com',
        'Referer': 'https://www.zhaopin.com/',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9'
    }

    url = 'https://sou.zhaopin.com/jobs/searchresult.ashx?' + urlencode(paras)
    try:
        # fetch the page and return the HTML text
        response = requests.get(url, headers=headers)
        # use the status code to decide whether the request succeeded
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None

def parse_one_page(html):
    # bail out if the request failed and no HTML came back
    if not html:
        return
    pattern = re.compile('td class="zwmc".*?href="(.*?)" target="_blank">(.*?)</a>.*?'
                         '<td class="gsmc"><a href="(.*?)".*?target="_blank">(.*?)</a>.*?'
                         '<td class="zwyx">(.*?)</td>.*?'
                         '<td class="gxsj"><span>(.*?)</span>.*?'
                         '<li class="newlist_deatil_two"><span>(.*?)</span><span>(.*?)</span><span>(.*?)</span><span>(.*?)</span><span>(.*?)</span>', re.S)
    # match every listing on the page that fits the pattern
    items = re.findall(pattern, html)

    for item in items:
        # strip the <b> highlight tags the site wraps around the keyword
        job_name = item[1].replace('<b>', '')
        job_name = job_name.replace('</b>', '')
        city = item[6]
        city = city.replace('地点:', '')
        nature = item[7].replace('公司性质:', '')
        size = item[8].replace('公司规模:', '')
        experience = item[9].replace('经验:', '')
        experience = experience.replace('学历:', '')
        # some rows omit the company size, which shifts the detail fields left
        if '学历:' in size:
            experience = size
            experience = experience.replace('学历:', '')
            size = ''
        if len(experience) > 10:
            experience = ''
        yield {
            '网站': item[0],
            '岗位': job_name,
            '公司网站': item[2],
            '公司': item[3],
            '薪水': item[4],
            '首发日': item[5],
            '地点': city,
            '公司性质': nature,
            '规模': size,
            '经验': experience,
        }

def main(city, keyword, region, pages):

    filename = '智联招聘测试_' + city + '_' + keyword + '.csv'
    headers = ['网站', '岗位', '公司网站', '公司', '薪水', '首发日', '地点', '公司性质', '规模', '经验']
    write_csv_headers(filename, headers)

    # page numbers on the site start at 1, so iterate from 1 to pages
    for i in tqdm(range(1, pages + 1)):

        jobs = []

        html = get_one_page(city, keyword, region, i)
        items = parse_one_page(html)
        for item in items:
            jobs.append(item)
        write_csv_rows(filename, headers, jobs)

if __name__ == '__main__':
    main('成都', 'ios', 2005, 10)


6. Result

(Screenshot of the scraped listings saved to the CSV file.)