Principal Component Analysis: Employee Turnover Analysis

2021-03-29  a_big_cat

Introduction

Using the principles of principal component analysis (PCA), the original variables are recombined into a small set of mutually uncorrelated composite variables that retain as much of the original information as possible, thereby achieving dimensionality reduction.
Low-dimensional data are easier to visualize, which makes it easier to see how the data are structured and distributed.
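Concretely, after standardization PCA eigendecomposes the covariance matrix of the data and projects the samples onto the leading eigenvectors, Y = X·W. Below is a minimal sketch of that idea on made-up two-dimensional toy data (the array toy is purely illustrative and not part of the HR dataset used later):

import numpy as np

rng = np.random.RandomState(0)
x = rng.normal(size=200)
toy = np.column_stack([x, 0.8 * x + 0.2 * rng.normal(size=200)])  # two correlated columns

toy_std = (toy - toy.mean(axis=0)) / toy.std(axis=0)   # standardize each column
cov = np.cov(toy_std, rowvar=False)                    # covariance matrix of the columns
vals, vecs = np.linalg.eigh(cov)                       # eigh: routine for symmetric matrices
top = vecs[:, np.argmax(vals)]                         # eigenvector with the largest eigenvalue
projected = toy_std @ top                              # 1-D projection keeping most of the variance
print(vals / vals.sum())                               # share of variance carried by each component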

# satisfaction_level: the employee's satisfaction level (0-1)
# last_evaluation: score of the employee's last performance evaluation (0-1)
# number_project: number of projects the employee has worked on
# average_montly_hours: average monthly working hours
# time_spend_company: number of years the employee has spent at the company
# Work_accident: whether the employee had a workplace accident
# promotion_last_5years: whether the employee was promoted in the last five years
# left: whether the employee left the company (the target of the turnover analysis)
# sales: the department the employee works in
# salary: salary level (low / medium / high)

1. Load the data

import numpy as np
import pandas as pd
HR_comma_sep=pd.read_csv('./HR_comma_sep.csv')
HR_comma_sep.head()
satisfaction_level last_evaluation number_project average_montly_hours time_spend_company Work_accident left promotion_last_5years sales salary
0 0.38 0.53 2 157 3 0 1 0 sales low
1 0.80 0.86 5 262 6 0 1 0 sales medium
2 0.11 0.88 7 272 4 0 1 0 sales medium
3 0.72 0.87 5 223 5 0 1 0 sales low
4 0.37 0.52 2 159 3 0 1 0 sales low

2. Data exploration



HR_comma_sep.shape


(14999, 10)


HR_comma_sep.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14999 entries, 0 to 14998
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   satisfaction_level     14999 non-null  float64
 1   last_evaluation        14999 non-null  float64
 2   number_project         14999 non-null  int64  
 3   average_montly_hours   14999 non-null  int64  
 4   time_spend_company     14999 non-null  int64  
 5   Work_accident          14999 non-null  int64  
 6   left                   14999 non-null  int64  
 7   promotion_last_5years  14999 non-null  int64  
 8   sales                  14999 non-null  object 
 9   salary                 14999 non-null  object 
dtypes: float64(2), int64(6), object(2)
memory usage: 1.1+ MB


HR_comma_sep.describe().T


count mean std min 25% 50% 75% max
satisfaction_level 14999.0 0.612834 0.248631 0.09 0.44 0.64 0.82 1.0
last_evaluation 14999.0 0.716102 0.171169 0.36 0.56 0.72 0.87 1.0
number_project 14999.0 3.803054 1.232592 2.00 3.00 4.00 5.00 7.0
average_montly_hours 14999.0 201.050337 49.943099 96.00 156.00 200.00 245.00 310.0
time_spend_company 14999.0 3.498233 1.460136 2.00 3.00 3.00 4.00 10.0
Work_accident 14999.0 0.144610 0.351719 0.00 0.00 0.00 0.00 1.0
left 14999.0 0.238083 0.425924 0.00 0.00 0.00 0.00 1.0
promotion_last_5years 14999.0 0.021268 0.144281 0.00 0.00 0.00 0.00 1.0


import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Correlation between the numeric columns (the object columns sales and salary are excluded).
correlation = HR_comma_sep.corr(numeric_only=True)
plt.figure(figsize=(10, 10))
sns.heatmap(correlation, vmax=1,
            square=True, annot=True, cmap='cubehelix')
plt.title('Correlation between different features')


Text(0.5, 1, 'Correlation between different features')
[Figure: correlation heatmap of the numeric features (output_9_1.png)]

3. Model development



HR_comma_sep_X=HR_comma_sep.drop(labels=['sales','salary','left'],axis=1)
HR_comma_sep_X.head()


satisfaction_level last_evaluation number_project average_montly_hours time_spend_company Work_accident promotion_last_5years
0 0.38 0.53 2 157 3 0 0
1 0.80 0.86 5 262 6 0 0
2 0.11 0.88 7 272 4 0 0
3 0.72 0.87 5 223 5 0 0
4 0.37 0.52 2 159 3 0 0


from sklearn.preprocessing import StandardScaler
HR_comma_sep_X_std = StandardScaler().fit_transform(HR_comma_sep_X)
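StandardScaler rescales each column to mean 0 and unit variance, so that features with large raw ranges such as average_montly_hours do not dominate the covariance matrix. A quick equivalence check (a sketch, not part of the original notebook; note that StandardScaler divides by the population standard deviation, hence ddof=0):

# z = (x - mean) / std, computed column-wise; should match StandardScaler's output.
manual_std = (HR_comma_sep_X - HR_comma_sep_X.mean()) / HR_comma_sep_X.std(ddof=0)
print(np.allclose(manual_std.values, HR_comma_sep_X_std))   # expected: True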




mean_vec = np.mean(HR_comma_sep_X_std, axis=0)
cov_mat = (HR_comma_sep_X_std - mean_vec).T.dot((HR_comma_sep_X_std - mean_vec)) / (HR_comma_sep_X_std.shape[0]-1)
print('Covariance matrix \n%s' %cov_mat)


Covariance matrix 
[[ 1.00006668  0.10502822 -0.14297912 -0.02004945 -0.1008728   0.05870115
   0.02560689]
 [ 0.10502822  1.00006668  0.34935588  0.33976445  0.1315995  -0.00710476
  -0.00868435]
 [-0.14297912  0.34935588  1.00006668  0.41723845  0.19679901 -0.00474086
  -0.00606436]
 [-0.02004945  0.33976445  0.41723845  1.00006668  0.12776343 -0.01014356
  -0.00354465]
 [-0.1008728   0.1315995   0.19679901  0.12776343  1.00006668  0.00212056
   0.06743742]
 [ 0.05870115 -0.00710476 -0.00474086 -0.01014356  0.00212056  1.00006668
   0.03924805]
 [ 0.02560689 -0.00868435 -0.00606436 -0.00354465  0.06743742  0.03924805
   1.00006668]]
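The diagonal entries are 14999/14998 ≈ 1.00007 rather than exactly 1 because StandardScaler divides by n while the covariance estimate above divides by n-1. The explicit formula is equivalent to calling np.cov on the standardized matrix, which a quick check confirms (a sketch):

# np.cov treats rows as variables by default, so pass rowvar=False.
print(np.allclose(cov_mat, np.cov(HR_comma_sep_X_std, rowvar=False)))   # expected: True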


eig_vals, eig_vecs = np.linalg.eig(cov_mat)
print('Eigenvectors \n%s' %eig_vecs)
print('\nEigenvalues \n%s' %eig_vals)


Eigenvectors 
[[-0.08797699 -0.29189921  0.27784886  0.33637135  0.79752505  0.26786864
  -0.09438973]
 [ 0.50695734  0.30996609 -0.70780994  0.07393548  0.33180877  0.1101505
  -0.13499526]
 [ 0.5788351  -0.77736008 -0.00657105 -0.19677589 -0.10338032 -0.10336241
  -0.02293518]
 [ 0.54901653  0.45787675  0.63497294 -0.25170987  0.10388959 -0.01034922
  -0.10714981]
 [ 0.31354922  0.05287224  0.12200054  0.78782241 -0.28404472  0.04036861
   0.42547869]
 [-0.01930249  0.04433104 -0.03622859 -0.05762997  0.37489883 -0.8048393
   0.45245222]
 [ 0.00996933  0.00391698 -0.04873036 -0.39411153  0.10557298  0.50589173
   0.75836313]]

Eigenvalues 
[1.83017431 0.54823098 0.63363587 0.84548166 1.12659606 0.95598647
 1.06036136]
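Since the covariance matrix is symmetric, np.linalg.eigh is the more natural routine: it guarantees real eigenvalues and returns them in ascending order, which makes the manual sort in the next step unnecessary. A sketch:

# eigh is specialized for symmetric/Hermitian matrices; eigenvalues come back sorted ascending.
eig_vals_h, eig_vecs_h = np.linalg.eigh(cov_mat)
print(eig_vals_h[::-1])   # descending order; should match the sorted list below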


# Pair each eigenvalue with its corresponding eigenvector
eig_pairs = [(np.abs(eig_vals[i]), eig_vecs[:,i]) for i in range(len(eig_vals))]
# Sort the pairs by eigenvalue, from largest to smallest
eig_pairs.sort(key=lambda x: x[0], reverse=True)
print('Eigenvalues in descending order:')
for i in eig_pairs:
    print(i[0])


Eigenvalues in descending order:
1.830174313875499
1.1265960639915473
1.0603613622840846
0.9559864740066265
0.8454816637143464
0.633635874483021
0.5482309765420602


tot = sum(eig_vals)
var_exp = [(i / tot)*100 for i in sorted(eig_vals, reverse=True)]
plt.figure(figsize=(6, 4))
plt.bar(range(7), var_exp, alpha=0.5, align='center', label='individual explained variance')
plt.ylabel('Explained variance ratio')
plt.xlabel('Principal components')
plt.legend(loc='best')
plt.tight_layout()


[Figure: bar chart of individual explained variance per principal component (output_16_0.png)]
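Reading the cumulative share numerically is easier than eyeballing the bar chart; a short check (a sketch):

cum_var_exp = np.cumsum(var_exp)
print(cum_var_exp)   # the 6th entry should already exceed 90%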


matrix_w = np.hstack((eig_pairs[0][1].reshape(7,1), 
                      eig_pairs[1][1].reshape(7,1)
                    ))
print('Matrix W:\n', matrix_w)


Matrix W:
 [[-0.08797699  0.79752505]
 [ 0.50695734  0.33180877]
 [ 0.5788351  -0.10338032]
 [ 0.54901653  0.10388959]
 [ 0.31354922 -0.28404472]
 [-0.01930249  0.37489883]
 [ 0.00996933  0.10557298]]


Y = HR_comma_sep_X_std.dot(matrix_w)
Y


array([[-1.90035018, -1.12083103],
       [ 2.1358322 ,  0.2493369 ],
       [ 3.05891625, -1.68312693],
       ...,
       [-2.0507165 , -1.182032  ],
       [ 2.91418496, -1.42752606],
       [-1.91543672, -1.17021407]])
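With only two dimensions, the projection can be plotted directly. The sketch below (not part of the original notebook) colors each employee by the left column to see whether leavers and stayers separate in the PCA plane:

plt.figure(figsize=(8, 6))
plt.scatter(Y[:, 0], Y[:, 1], c=HR_comma_sep['left'], cmap='coolwarm', s=5, alpha=0.3)
plt.xlabel('Principal component 1')
plt.ylabel('Principal component 2')
plt.title('Employees projected onto the first two principal components (colored by left)')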


from sklearn.decomposition import PCA
pca = PCA().fit(HR_comma_sep_X_std)
# Cumulative share of variance explained as components are added.
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlim(0, 7)
plt.xlabel('Number of components')
plt.ylabel('Cumulative explained variance')


Text(0, 0.5, 'Cumulative explained variance')
[Figure: cumulative explained variance versus number of components (output_19_1.png)]
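As a cross-check, the fitted sklearn PCA exposes the same quantities that were computed by hand above (a sketch):

print(pca.explained_variance_)                    # eigenvalues of the covariance matrix, largest first
print(np.cumsum(pca.explained_variance_ratio_))   # numeric version of the curve above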


from sklearn.decomposition import PCA 
sklearn_pca = PCA(n_components=6)
Y_sklearn = sklearn_pca.fit_transform(HR_comma_sep_X_std)
print(Y_sklearn)
Y_sklearn.shape


[[-1.90035018 -1.12083103 -0.0797787   0.03228437 -0.07256447  0.06063013]
 [ 2.1358322   0.2493369   0.0936161   0.50676925  1.2487747  -0.61378158]
 [ 3.05891625 -1.68312693 -0.301682   -0.4488635  -1.12495888  0.29066929]
 ...
 [-2.0507165  -1.182032   -0.04594506  0.02441143 -0.01553247  0.24980658]
 [ 2.91418496 -1.42752606 -0.36333357 -0.31517759 -0.97107375  0.51444624]
 [-1.91543672 -1.17021407 -0.07024077  0.01486762 -0.09545357  0.01773844]]





(14999, 6)
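The choice of n_components=6 can be confirmed directly from the fitted model (a sketch):

print(sklearn_pca.explained_variance_ratio_.sum())   # ≈ 0.92, i.e. more than 90% of the variance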

4. Summary

The first six principal components account for more than 90% of the cumulative variance, so the 7-dimensional feature space can be reduced to a 6-dimensional subspace.
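As a possible next step (not part of the original analysis), the 6-dimensional scores could be fed to a downstream model predicting the left flag, for example a quick logistic-regression sketch:

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

clf = LogisticRegression(max_iter=1000)                               # illustrative baseline only
scores = cross_val_score(clf, Y_sklearn, HR_comma_sep['left'], cv=5)  # 5-fold accuracy on the PCA scores
print(scores.mean())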
