分析了上千个谷歌职位，发现...

2018-07-22 本文已影响117人 Gaius_Yao

0 引言

Google 通过其招聘网站发布了大量的职位，这些数据有助于我们了解作为当今互联网巨头之一的 Google 需要什么样的人才，也从一个侧面体现了当今互联网行业最流行哪些编程语言或工具。感谢 Niyamat Ullah，他爬取了 1250 条 Google 招聘信息，并发布在 Kaggle 上。本文正是使用他提供的数据集，对 Google 的招聘信息进行简单分析。

1 导入相关包

# Jupyter 魔法函数，在当前页面输出图像
%matplotlib inline

# 科学计算
import numpy as np
# 数据处理及导入导出
import pandas as pd

# Data visualization
import matplotlib.pyplot as plt
plt.style.use('ggplot')  # use the ggplot theme
plt.rcParams['font.sans-serif'] = ['SimHei']  # default font that can render Chinese labels
plt.rcParams['axes.unicode_minus'] = False  # keep the minus sign '-' from showing as a box in saved figures
# Nicer-looking statistical plots
import seaborn as sns
# Configure seaborn in ONE call.  sns.set() re-applies seaborn's default
# "darkgrid" style unless a style is passed, so a separate
# sns.set_style("whitegrid") issued before it is silently discarded.
# font='SimHei' lets seaborn render Chinese text.
sns.set(style="whitegrid", font='SimHei')

# 词云
from wordcloud import WordCloud, STOPWORDS
from imageio import imread

# 正则表达式
import re
# 默认字典
from collections import defaultdict

2 准备数据

导入数据集，并查看数据集信息。

# Load the data set
df = pd.read_csv('job_skills.csv')

# Print column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1250 entries, 0 to 1249
Data columns (total 7 columns):
Company 1250 non-null object
Title 1250 non-null object
Category 1250 non-null object
Location 1250 non-null object
Responsibilities 1235 non-null object
Minimum Qualifications 1236 non-null object
Preferred Qualifications 1236 non-null object
dtypes: object(7)
memory usage: 68.4+ KB

查看 DataFrame 中 5 行数据。

# Show 5 randomly sampled rows
df.sample(5)

修改列名。

# Rename the columns by NAME rather than by position, so the mapping stays
# correct even if the column order of the CSV ever changes.
df = df.rename(columns={
    'Company': '公司名称',
    'Title': '职位名称',
    'Category': '职位类别',
    'Location': '工作地点',
    'Responsibilities': '岗位职责',
    'Minimum Qualifications': '最低资格',
    'Preferred Qualifications': '首选资格',
})

检查是否含有 NaN 值，若有，则将其删除。

# Count the missing (NaN) values in each column
df.isna().sum()

公司名称 0
职位名称 0
职位类别 0
工作地点 0
岗位职责 15
最低资格 14
首选资格 14
dtype: int64

# Drop every ROW that contains at least one NaN value (axis='rows')
df = df.dropna(how='any', axis='rows')

检查是否含有其他公司的记录，若有，则将其删除。
# YouTube also belongs to Google, but this analysis focuses on Google's own
# job postings, so the YouTube records are removed below.

# Number of postings per company
df['公司名称'].value_counts()

Google 1212
YouTube 23
Name: 公司名称, dtype: int64

# Remove the YouTube postings.  The explicit .copy() makes df an independent
# frame, so adding new columns to it afterwards does not trigger pandas'
# SettingWithCopyWarning.
df = df[df['公司名称'] != 'YouTube'].copy()

在之前查看 DataFrame 中 5 条记录的时候，不难发现，工作地点那列其实是包含了工作城市和工作国家的，将其分成新的两列。

# Split the location on commas: the first piece is the city, the last piece is
# the country.  The pieces are stripped because text that follows a comma
# normally begins with a space (assumed from the usual "City, Country"
# formatting -- verify against the data), which would otherwise end up inside
# the country names.
location_parts = df['工作地点'].str.split(',')
df['工作城市'] = location_parts.str[0].str.strip()
df['工作国家'] = location_parts.str[-1].str.strip()

3 统计分析

首先对 DataFrame 进行快速统计汇总。

# Quick summary statistics for every column
df.describe()

接下来，查看一下这些职位都分布在哪些国家、城市和类别。

def top15_bar(df, series):
    """
    Draw a horizontal bar chart of the 15 most frequent values of a column.

    Args:
        df: DataFrame holding the data
        series: name of the column whose values are counted, string
    """
    f, ax = plt.subplots(figsize=(12, 6))

    # value_counts() is already sorted by descending frequency
    counts = df[series].value_counts()[:15]

    ax.set(title=series + ' TOP 15')
    sns.set_color_codes("muted")
    # Frequencies on the x axis, the counted values as y-axis labels
    sns.barplot(x=counts.values, y=counts.index, color='b')

# Distribution of the postings by country, city and job category
for column in ('工作国家', '工作城市', '职位类别'):
    top15_bar(df, column)

很明显，一半的职位都集中在美国，而销售、市场两种类别占据了最多的职位。
再看看在 Google 内最受欢迎的编程语言是什么，这需要先对 DataFrame 进行一些处理。

# Programming languages to look for (lower case, because the text they are
# matched against is lower-cased first).
# NOTE(review): the name is misspelled ("programing"); it is kept as is because
# other statements in this file reference it.
programing_language_list = ['java', 'c', 'c++', 'python', 'c#', 'php', 'javascript', 'sql', 'objective-c', 
                            'swift', 'ruby', 'r', 'matlab', 'perl', 'go', 'kotlin']

def tostr(df, series):
    """
    Concatenate every value of a column into one lower-cased string.

    The values are separated by a newline.  Joining them with the empty
    string glues the last word of one row to the first word of the next
    (e.g. "...java" + "ba/bs..." -> "javaba/bs"), so neither word can be
    found by a later word count.

    Args:
        df: DataFrame holding the data
        series: name of the column to flatten, string

    Returns:
        string: all values of the column, lower-cased and newline-separated
    """
    values = df[series].tolist()  # plain Python list of the column's values
    # str() so that non-string values (numbers, NaN) do not break the join
    return "\n".join(str(v) for v in values).lower()

# All minimum-qualification texts flattened into one lower-cased string
minimum_qualifications = tostr(df, '最低资格')

def word_count(countstr, wordlist):
    """
    Count how often each term of a list occurs in a string.

    Matching is case-sensitive and on whole terms.  Terms may contain
    punctuation ("c++", "c#", "objective-c"): a plain word tokenizer splits
    those apart, so they would never be counted and their pieces would be
    credited to shorter terms such as "c".  Each term is therefore searched
    literally, longest term first, and a word boundary is only required on a
    side where the term itself begins/ends with a word character.

    Args:
        countstr: text to search, string
        wordlist: terms to count, list of strings

    Returns:
        DataFrame with the columns 'Item' (capitalized term) and 'Frequency',
        sorted by descending frequency (ties keep the order of wordlist)
    """
    wordcount = dict.fromkeys(wordlist, 0)

    word_char = re.compile(r"[\w']")
    alternatives = []
    # Longest terms first, so "c++" wins over "c" at the same position.
    # Empty strings are skipped: they would match everywhere.
    for term in sorted((w for w in wordcount if w), key=len, reverse=True):
        pattern = re.escape(term)
        if word_char.match(term[0]):
            pattern = r"(?<![\w'])" + pattern  # not inside a longer word
        if word_char.match(term[-1]):
            pattern = pattern + r"(?![\w'])"  # not a prefix of a longer word
        alternatives.append(pattern)

    if alternatives:
        # One left-to-right scan; matches do not overlap, so the "c" inside a
        # counted "objective-c" is not counted a second time.
        for match in re.finditer("|".join(alternatives), countstr):
            wordcount[match.group(0)] += 1

    # sorted() is stable, so equal counts keep their wordlist order
    result = sorted(wordcount.items(), key=lambda kv: kv[1], reverse=True)
    df_temp = pd.DataFrame(result, columns=['Item', 'Frequency'])
    df_temp['Item'] = df_temp['Item'].str.capitalize()

    return df_temp

# Count how often each programming language is mentioned in the minimum qualifications
programming_lang_count = word_count(minimum_qualifications, programing_language_list)

# Horizontal bar chart of the counts (the trailing ';' suppresses the notebook's text output)
f, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x="Frequency",  y="Item", data=programming_lang_count, palette="Blues_d");

在 Google，最受欢迎的编程语言前 5 名分别是 Python、C、JavaScript、Java、SQL。接下来再看看 Google 对学位和工作年限的最低要求是怎样的。

# Degree keywords to count (lower case, matching the lower-cased text)
degree_list = ["ba", "bs", "bachelor", "mba", "master", "phd"]

# Count how often each degree keyword is mentioned in the minimum qualifications
degree_count = word_count(minimum_qualifications, degree_list)

# Horizontal bar chart of the counts
f, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x="Frequency",  y="Item", data=degree_count, palette="Blues_d");

在 Google 对学位的最低要求中，理学学士和文学学士占了绝大部分。

# Tally every "<number> year" requirement found in the minimum qualifications
years_exp = defaultdict(int)
for years in re.findall(r'([0-9]+) year', minimum_qualifications):
    years_exp[years] += 1

# One row per distinct number of years; both columns as integers, ordered by years
df_years_exp = pd.DataFrame.from_dict(years_exp, orient='index').reset_index()
df_years_exp.columns = ['Years', 'Frequency']
df_years_exp = df_years_exp.astype('int').sort_values(by='Years')

f, ax = plt.subplots(figsize=(12, 6))
with sns.color_palette("husl", 8):
    sns.barplot(x="Years", y="Frequency", data=df_years_exp)

Google 对于工作年限的最低要求，主要集中于 3-5 年。再来看看对工作年限的要求与职位类别有没有关系。

# Required years of experience per posting: the first number that is followed
# by " year" in the minimum qualifications (kept as text)
df['工作年限'] = df['最低资格'].str.extract(r'([0-9]+) year')

# Only the columns needed for the category / experience comparison
df_cate = df[['职位类别', '工作年限', '工作国家']]

# Postings per job category, split by the 3 most common experience requirements
f, ax = plt.subplots(figsize=(12, 12))
with sns.color_palette("husl", 3):
    sns.countplot(y='职位类别', hue='工作年限', data=df_cate, 
                  hue_order=df_cate['工作年限'].value_counts().iloc[:3].index)
plt.yticks(fontsize=16)
plt.show()

4 词云

将 Google 对这些职位的最低要求和首选要求生成词云。

# Stop words: wordcloud's built-in list plus a few terms that carry no
# information here.  A new set is built instead of calling STOPWORDS.add(),
# which would modify the set shared by everything that uses the wordcloud
# package in this session.
stopwords = set(STOPWORDS) | {'degree', 'name', 'dtype', 'location'}

def word_cloud(df, series, mask_path='img/google.jpg'):
    """
    Build a word cloud from a text column and display it.

    The image is only shown; nothing is written to disk.

    Args:
        df: DataFrame holding the data
        series: name of the text column, string
        mask_path: path of the image used as the shape mask of the cloud,
            string.  Forward slashes work on every platform; the previous
            literal 'img\\google.jpg' relied on an invalid escape sequence
            and only resolved on Windows.
    """
    mask = imread(mask_path)

    # Use the COMPLETE text of the column.  str(df[series]) would only give
    # the printed preview of the Series: row labels, a limited number of rows,
    # shortened cell text and a "Name: ..., dtype: ..." footer.
    text = ' '.join(df[series].dropna().astype(str))

    wordcloud = WordCloud(
        background_color='white',
        stopwords=stopwords,
        mask=mask,
        max_font_size=128,
        min_font_size=8,
        # Fixed integer seed for a reproducible layout (the earlier value
        # False presumably acted as the seed 0 as well -- bool is an int)
        random_state=0,
    ).generate(text)

    plt.figure(figsize=(12, 12))
    plt.imshow(wordcloud)
    plt.axis('off')
    plt.show()

# Word clouds of the minimum and the preferred qualifications
for column in ('最低资格', '首选资格'):
    word_cloud(df, column)

5 分析技术解决方案类别

最后，选择技术解决方案（Technical Solutions）这一职位类别进行分析，首先将该职位类别下的数据筛选出来。之后查看该类别下 TOP 15 的工作国家和工作城市，以及最受欢迎的编程语言，最后将该类别的最低任职资格、首选任职资格和岗位职责生成对应词云。

# Keep only the postings of the "Technical Solutions" category
df_TS = df[df['职位类别'] == 'Technical Solutions']

# Summary statistics of that subset
df_TS.describe()

# Show 3 randomly sampled rows of the subset
df_TS.sample(3)

# Top 15 countries for this category
top15_bar(df_TS, '工作国家')

# Top 15 cities for this category
top15_bar(df_TS, '工作城市')

# Minimum-qualification texts of this category as one lower-cased string
TS_minimum_qualifications = tostr(df_TS, '最低资格')

# Mentions of each programming language within this category
TS_programming_lang_count = word_count(TS_minimum_qualifications, programing_language_list)

# Horizontal bar chart of the counts
f, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x="Frequency",  y="Item", data=TS_programming_lang_count, palette="Blues_d");

# Word clouds of minimum qualifications, preferred qualifications and responsibilities
word_cloud(df_TS, '最低资格')

word_cloud(df_TS, '首选资格')

word_cloud(df_TS, '岗位职责')