数据工程师数据咖程序员

泰坦尼克(Titanic)的数据分析(特征工程部分)

2017-03-12  本文已影响1154人  苟雨

泰坦尼克数据集是一个好的可选数据集对于kaggle的新手,
而且很多获胜的kaggle竞赛的团队都对这个数据集有很好的分析.

import numpy as np
import pandas as pd
import re
import sklearn
train_ = pd.read_csv('train.csv')
# test_ = pd.read_csv('test.csv')
print(train_.head())
PassengerId = train_['PassengerId']
print('follow is passengerid')
print(PassengerId[:5])
PassengerId  Survived  Pclass  \

0            1         0       3

1            2         1       1

2            3         1       3

3            4         1       1

4            5         0       3

                                               Name     Sex   Age  SibSp  \

0                            Braund, Mr. Owen Harris    male  22.0      1

1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1

2                             Heikkinen, Miss. Laina  female  26.0      0

3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1

4                           Allen, Mr. William Henry    male  35.0      0

  Parch            Ticket     Fare Cabin Embarked

0      0         A/5 21171   7.2500   NaN        S

1      0          PC 17599  71.2833   C85        C

2      0  STON/O2. 3101282   7.9250   NaN        S

3      0            113803  53.1000  C123        S

4      0            373450   8.0500   NaN        S

follow is passengerid

0    1

1    2

2    3

3    4

4    5

Name: PassengerId, dtype: int64

In [3]:

print (train_[['Pclass', 'Survived']].groupby(['Pclass'], as_index=False).mean())

print (train_[["Sex", "Survived"]].groupby(['Sex'], as_index=False).mean())

Pclass  Survived

0       1  0.629630

1       2  0.472826

2       3  0.242363

Sex  Survived

0  female  0.742038

1    male  0.188908

In [4]:

train_['Age'].mean()

Out[4]:

29.69911764705882

数据清理

feature engineering 特征工程,构造出我们需要的特征
加上一些自己推出的特征

# 计算名字的长度
train_['Name_length'] = train_['Name'].apply(len)
# 将旅客是否住在头等舱二值化
train_['Has_Cabin'] = train_["Cabin"].apply(lambda x: 0 if type(x) == float else 1)
# 发现房子的大小
train_['FamilySize'] = train_['SibSp'] + train_['Parch'] + 1
# 发现是否独居
train_['IsAlone'] = 0
train_.loc[train_['FamilySize'] == 1, 'IsAlone'] = 1
# 移除所有穿上人员的 Embarked 的NULL值
train_['Embarked'] = train_['Embarked'].fillna('S')
train_['Fare'] = train_['Fare'].fillna(train_['Fare'].median())
train_['CategoricalFare'] = pd.qcut(train_['Fare'], 4)
age_avg = train_['Age'].mean()
age_std = train_['Age'].std()
age_null_count = train_['Age'].isnull().sum()
age_null_random_list = np.random.randint(age_avg - age_std, age_avg + age_std, size=age_null_count)
train_['Age'][np.isnan(train_['Age'])] = age_null_random_list
train_['Age'] = train_['Age'].astype(int)
train_['CategoricalAge'] = pd.cut(train_['Age'], 5)
# 定义函数导出旅客的Title
def get_title(name):
   title_search = re.search('([A-Za-z]+)\.',name)
   if title_search:
       return title_search.group(1)
   return ''
train_['Title'] = train_['Name'].apply(get_title)
train_['Title'] = train_['Title'].replace(['Lady', 'Countess','Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
train_['Title'] = train_['Title'].replace('Mlle', 'Miss')
train_['Title'] = train_['Title'].replace('Ms', 'Miss')
train_['Title'] = train_['Title'].replace('Mme', 'Mrs')
#  映射 Sex
train_['Sex'] = train_['Sex'].map( {'female': 0, 'male': 1} ).astype(int)
# 映射 titles
title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}
train_['Title'] = train_['Title'].map(title_mapping)
train_['Title'] = train_['Title'].fillna(0)
# 映射 Embarked
train_['Embarked'] = train_['Embarked'].map( {'S': 0, 'C': 1, 'Q': 2} ).astype(int)
# 映射 Fare
train_.loc[ train_['Fare'] <= 7.91, 'Fare'] = 0
train_.loc[(train_['Fare'] > 7.91) & (train_['Fare'] <= 14.454), 'Fare'] = 1
train_.loc[(train_['Fare'] > 14.454) & (train_['Fare'] <= 31), 'Fare']   = 2
train_.loc[ train_['Fare'] > 31, 'Fare'] = 3
train_['Fare'] = train_['Fare'].astype(int)
# 映射 Age
train_.loc[ train_['Age'] <= 16, 'Age'] = 0
train_.loc[(train_['Age'] > 16) & (train_['Age'] <= 32), 'Age'] = 1
train_.loc[(train_['Age'] > 32) & (train_['Age'] <= 48), 'Age'] = 2
train_.loc[(train_['Age'] > 48) & (train_['Age'] <= 64), 'Age'] = 3
train_.loc[train_['Age'] > 64, 'Age']

Out[5]:
33     66
54     65
96     71
116    70
280    65
456    65
493    71
630    80
672    70
745    70
851    74
Name: Age, dtype: int64

特征选择

drop_elements = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'SibSp']

train_ = train_.drop(drop_elements, axis = 1)

train_ = train_.drop(['CategoricalAge', 'CategoricalFare'], axis = 1)

# test_  = test_.drop(drop_elements, axis = 1)

In [7]:

train_.head()

Out[7]:

Survived    Pclass  Sex Age Parch   Fare    Embarked    Name_length Has_Cabin   FamilySize  IsAlone Title

0   0   3   1   1   0   0   0   23  0   2   0   1

1   1   1   0   2   0   3   1   51  1   2   0   3

2   1   3   0   1   0   1   0   22  0   1   1   2

3   1   1   0   2   0   3   0   44  1   2   0   3

4   0   3   1   2   0   1   0   24  0   1   1   1

可视化

Pearson Correlation Heatmap(Pearson相关性热力图)
# 可视化特征之间的关联

import seaborn as sns

import matplotlib.pyplot as plt

%matplotlib inline

colormap = plt.cm.viridis

plt.figure(figsize=(12,12))

plt.title('Pearson Correlation of Features', y=1.05, size=15)

sns.heatmap(train_.astype(float).corr(),linewidths=0.1,vmax=1.0, square=True, cmap=colormap, linecolor='white', annot=True)

__results___11_1.png

1.图中可以告诉我们特征之间没有太多的强关联
这对我们的模型来说是一个好消息,因为这意味着训练数据中没有太多的冗余信息,我们每一个特征都可以提供一个独立的信息
2.两个关联最强的特征是Family size and Parch

最后让我们生成一个 pairplots图来观察每个特征与其它特征之间的关系

g = sns.pairplot(train_[[u'Survived', u'Pclass', u'Sex', u'Age', u'Parch', u'Fare', u'Embarked',

      u'FamilySize', u'Title']], hue='Survived', palette = 'seismic',size=1.2,diag_kind = 'kde',diag_kws=dict(shade=True),plot_kws=dict(s=10) )

g.set(xticklabels=[])

用模型来选择特征的重要性

这里使用随机森林来得到每个特征的重要程度

In [60]:
import sklearn
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls

from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.svm import SVC
from sklearn.cross_validation import KFold

# 使用面向对象的编程方式(OOP),用python类来帮助我们生成多个实例,方便后面多个模型的构建与训练

# 下面创建SkleanHelper类它允许我们扩展内建的方法

ntrain = train_.shape[0]

SEED = 0 # 用于后面多个实例seed参数的重置

NFOLDS = 5 # set folds for out-of-fold prediction

kf = KFold(ntrain,n_folds=NFOLDS,random_state=SEED)

class SklearnHelper(object):

   def __init__(self,clf,seed=0,params=None):

       params['random_state'] = seed

       self.clf = clf(**params)

   def train(self, x_train, y_train):

       self.clf.fit(x_train, y_train)

   def predict(self, x):

       return self.clf.predict(x)



def fit(self,x,y):

return self.clf.fit(x,y)



def feature_importances(self,x,y):

print(self.clf.fit(x,y).feature_importances_)

这里有5个模型,它们都来自于sklearn工具包

# Random Forest classifier
# Extra Trees classifier
# AdaBoost classifer
# Gradient Boosting classifer
# Support Vector Machine

参数(parameters)

# Random Forest parameters

rf_params = {

'n_jobs': -1,

'n_estimators': 500,

'warm_start': True,

#'max_features': 0.2,

'max_depth': 6,

'min_samples_leaf': 2,

'max_features' : 'sqrt',

'verbose': 0

}

# Extra Trees Parameters

et_params = {

   'n_jobs': -1,

   'n_estimators':500,

   #'max_features': 0.5,

   'max_depth': 8,

   'min_samples_leaf': 2,

   'verbose': 0

}

# AdaBoost parameters

ada_params = {

   'n_estimators': 500,

   'learning_rate' : 0.75

}

# Gradient Boosting parameters

gb_params = {

   'n_estimators': 500,

    #'max_features': 0.2,

   'max_depth': 5,

   'min_samples_leaf': 2,

   'verbose': 0

}

# Support Vector Classifier parameters

svc_params = {

'kernel' : 'linear',

'C' : 0.025

}

并且我们创建5个对象来分别训练不同的模型

rf = SklearnHelper(clf=RandomForestClassifier, seed=SEED, params=rf_params)

et = SklearnHelper(clf=ExtraTreesClassifier, seed=SEED, params=et_params)

ada = SklearnHelper(clf=AdaBoostClassifier, seed=SEED, params=ada_params)

gb = SklearnHelper(clf=GradientBoostingClassifier, seed=SEED, params=gb_params)

svc = SklearnHelper(clf=SVC, seed=SEED, params=svc_params)

创建用于训练和测试的Numpy数组

y_train = train_['Survived'].ravel() # 把Survived作为分类的标签(label)

train = train_.drop(['Survived'], axis=1) # 除去Survived

x_train = train_.values

从不同的分类模型中得到不同的特征比重

# .featureimportances 可以返回每个特征的重要性比重

rf_feature = rf.feature_importances(x_train,y_train)

et_feature = et.feature_importances(x_train, y_train)

ada_feature = ada.feature_importances(x_train, y_train)

gb_feature = gb.feature_importances(x_train,y_train)

[ 0.65595918  0.03713221  0.09610253  0.00581797  0.00419678  0.01383511

0.00504756  0.02717307  0.02183406  0.0192449   0.00290235  0.11075428]

[ 0.72430616  0.03342865  0.12294606  0.00239191  0.00269613  0.01104144

0.00395793  0.00814264  0.02822738  0.00713839  0.00575401  0.04996929]

[ 1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]

[ 0.148  0.     0.     0.     0.     0.     0.     0.     0.     0.     0.

0.   ]

由于返回的数据并不能直接使用,所以复制粘贴一下,

rf_features = [0.10474135,  0.21837029,  0.04432652,  0.02249159,  0.05432591,  0.02854371

,0.07570305,  0.01088129 , 0.24247496,  0.13685733 , 0.06128402]

et_features = [ 0.12165657,  0.37098307  ,0.03129623 , 0.01591611 , 0.05525811 , 0.028157

,0.04589793 , 0.02030357 , 0.17289562 , 0.04853517,  0.08910063]

ada_features = [0.028 ,   0.008  ,      0.012   ,     0.05866667,   0.032 ,       0.008

,0.04666667 ,  0.     ,      0.05733333,   0.73866667,   0.01066667]

gb_features = [ 0.06796144 , 0.03889349 , 0.07237845 , 0.02628645 , 0.11194395,  0.04778854

,0.05965792 , 0.02774745,  0.07462718,  0.4593142 ,  0.01340093]

创建一个包含特征重要性的Dataframe用于可视化

cols = train.columns.values

print(cols)

print(rf_feature)

feature_dataframe = pd.DataFrame( {'features': cols,

'Random Forest feature importances': rf_features,

'Extra Trees  feature importances': et_features,

'AdaBoost feature importances': ada_features,

'Gradient Boost feature importances': gb_features

})

feature_dataframe.head()

['Pclass' 'Sex' 'Age' 'Parch' 'Fare' 'Embarked' 'Name_length' 'Has_Cabin'

'FamilySize' 'IsAlone' 'Title']

None

Out[109]:

AdaBoost feature importances    Extra Trees feature importances Gradient Boost feature importances  Random Forest feature importances   features

0   0.028000    0.121657    0.067961    0.104741    Pclass

1   0.008000    0.370983    0.038893    0.218370    Sex

2   0.012000    0.031296    0.072378    0.044327    Age

3   0.058667    0.015916    0.026286    0.022492    Parch

4   0.032000    0.055258    0.111944    0.054326    Fare

绘制散点图描述每个模型输出的特征重要度

trace = go.Scatter(

y = feature_dataframe['Random Forest feature importances'].values,

x = feature_dataframe['features'].values,

mode='markers',

marker=dict(

sizemode = 'diameter',

sizeref = 1,

size = 25,

#       size= feature_dataframe['AdaBoost feature importances'].values,

#color = np.random.randn(500), #set color equal to a variable

color = feature_dataframe['Random Forest feature importances'].values,

colorscale='Portland',

showscale=True

),

text = feature_dataframe['features'].values

)

data = [trace]

layout= go.Layout(

   autosize= True,

   title= 'Random Forest Feature Importance',

   hovermode= 'closest',

#     xaxis= dict(

#         title= 'Pop',

#         ticklen= 5,

#         zeroline= False,

#         gridwidth= 2,

#     ),

   yaxis=dict(

       title= 'Feature Importance',

       ticklen= 5,

       gridwidth= 2

   ),

   showlegend= False

)

fig = go.Figure(data=data, layout=layout)

py.iplot(fig,filename='scatter2010')

# Scatter plot

trace = go.Scatter(

y = feature_dataframe['Extra Trees  feature importances'].values,

x = feature_dataframe['features'].values,

mode='markers',

marker=dict(

sizemode = 'diameter',

sizeref = 1,

size = 25,

#       size= feature_dataframe['AdaBoost feature importances'].values,

#color = np.random.randn(500), #set color equal to a variable

color = feature_dataframe['Extra Trees  feature importances'].values,

colorscale='Portland',

showscale=True

),

text = feature_dataframe['features'].values

)

data = [trace]

layout= go.Layout(

   autosize= True,

   title= 'Extra Trees Feature Importance',

   hovermode= 'closest',

#     xaxis= dict(

#         title= 'Pop',

#         ticklen= 5,

#         zeroline= False,

#         gridwidth= 2,

#     ),

   yaxis=dict(

       title= 'Feature Importance',

       ticklen= 5,

       gridwidth= 2

   ),

   showlegend= False

)

fig = go.Figure(data=data, layout=layout)

py.iplot(fig,filename='scatter2010')

# Scatter plot

trace = go.Scatter(

y = feature_dataframe['AdaBoost feature importances'].values,

x = feature_dataframe['features'].values,

mode='markers',

marker=dict(

sizemode = 'diameter',

sizeref = 1,

size = 25,

#       size= feature_dataframe['AdaBoost feature importances'].values,

#color = np.random.randn(500), #set color equal to a variable

color = feature_dataframe['AdaBoost feature importances'].values,

colorscale='Portland',

showscale=True

),

text = feature_dataframe['features'].values

)

data = [trace]

layout= go.Layout(

   autosize= True,

   title= 'AdaBoost Feature Importance',

   hovermode= 'closest',

#     xaxis= dict(

#         title= 'Pop',

#         ticklen= 5,

#         zeroline= False,

#         gridwidth= 2,

#     ),

   yaxis=dict(

       title= 'Feature Importance',

       ticklen= 5,

       gridwidth= 2

   ),

   showlegend= False

)

fig = go.Figure(data=data, layout=layout)

py.iplot(fig,filename='scatter2010')

# Scatter plot

trace = go.Scatter(

y = feature_dataframe['Gradient Boost feature importances'].values,

x = feature_dataframe['features'].values,

mode='markers',

marker=dict(

sizemode = 'diameter',

sizeref = 1,

size = 25,

#       size= feature_dataframe['AdaBoost feature importances'].values,

#color = np.random.randn(500), #set color equal to a variable

color = feature_dataframe['Gradient Boost feature importances'].values,

colorscale='Portland',

showscale=True

),

text = feature_dataframe['features'].values

)

data = [trace]

layout= go.Layout(

   autosize= True,

   title= 'Gradient Boosting Feature Importance',

   hovermode= 'closest',

#     xaxis= dict(

#         title= 'Pop',

#         ticklen= 5,

#         zeroline= False,

#         gridwidth= 2,

#     ),

   yaxis=dict(

       title= 'Feature Importance',

       ticklen= 5,

       gridwidth= 2

   ),

   showlegend= False

)

fig = go.Figure(data=data, layout=layout)

py.iplot(fig,filename='scatter2010')

由于图片太多这里就不贴了 ,可以访问github

计算每个模型对每个特征重要性的平均值

feature_dataframe['mean'] = feature_dataframe.mean(axis= 1) # axis = 1 表示操作第二轴(横轴)

feature_dataframe.head(3)

Out[111]:

AdaBoost feature importances    Extra Trees feature importances Gradient Boost feature importances  Random Forest feature importances   features    mean

0   0.028   0.121657    0.067961    0.104741    Pclass  0.080590

1   0.008   0.370983    0.038893    0.218370    Sex 0.159062

2   0.012   0.031296    0.072378    0.044327    Age 0.040000

绘制的到的平均值

y = feature_dataframe['mean'].values

x = feature_dataframe['features'].values

data = [go.Bar(

           x= x,

            y= y,

           width = 0.5,

           marker=dict(

              color = feature_dataframe['mean'].values,

           colorscale='Portland',

           showscale=True,

           reversescale = False

           ),

           opacity=0.6

       )]

layout= go.Layout(

   autosize= True,

   title= 'Barplots of Mean Feature Importance',

   hovermode= 'closest',

#     xaxis= dict(

#         title= 'Pop',

#         ticklen= 5,

#         zeroline= False,

#         gridwidth= 2,

#     ),

   yaxis=dict(

       title= 'Feature Importance',

       ticklen= 5,

       gridwidth= 2

   ),

   showlegend= False

)

fig = go.Figure(data=data, layout=layout)

py.iplot(fig, filename='bar-direct-labels')
上一篇下一篇

猜你喜欢

热点阅读