2020-09-21

2020-09-21  本文已影响0人  池中之于

金融风控练习赛——Task3 特征工程

1. 数据预处理

1.1 数据清洗

数据清洗主要用于对原始数据中缺失值,异常值进行处理。


#数据读取及分类

import pandas as pd

import numpy as np

import matplotlib.pyplot as plt

import seaborn as sns

import datetime

from tqdm import tqdm

from sklearn.preprocessing import LabelEncoder

from sklearn.feature_selection import SelectKBest

from sklearn.feature_selection import chi2

from sklearn.preprocessing import MinMaxScaler

import xgboost as xgb

import lightgbm as lgb

from catboost import CatBoostRegressor

import warnings

from sklearn.model_selection import StratifiedKFold, KFold

from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, log_loss

warnings.filterwarnings('ignore')

data_train =pd.read_csv('../data/train.csv')

data_test_a = pd.read_csv('../data/testA.csv')

numerical_fea = list(data_train.select_dtypes(exclude=['object']).columns)

category_fea = list(filter(lambda x: x not in numerical_fea,list(data_train.columns)))

label = 'isDefault'

numerical_fea.remove(label)

# 数值型特征--平均数填充
data_train[numerical_fea] = data_train[numerical_fea].fillna(data_train[numerical_fea].median())
data_test_a[numerical_fea] = data_test_a[numerical_fea].fillna(data_train[numerical_fea].median())
# 类别型特征--众数填充
data_train[category_fea] = data_train[category_fea].fillna(data_train[category_fea].mode())
data_test_a[category_fea] = data_test_a[category_fea].fillna(data_train[category_fea].mode())

1.2 数据分箱

data['loanAmnt_bin'] = pd.qcut(data['loanAmnt'], 10, labels=False)

2. 变量选择

主要以变量的线性相关性为例。

x_train = data_train.drop(['isDefault'], axis=1)
#计算协方差
data_corr = x_train.corrwith(data_train.isDefault) #计算相关性
result = pd.DataFrame(columns=['features', 'corr'])
result['features'] = data_corr.index
result['corr'] = data_corr.values

方差选择特征

from sklearn.feature_selection import VarianceThreshold
#其中参数threshold为方差的阈值
VarianceThreshold(threshold=3).fit_transform(train,target_train)

3. 时间特征处理

#转化成时间格式
for data in [data_train, data_test_a]:
    data['issueDate'] = pd.to_datetime(data['issueDate'],format='%Y-%m-%d')
    startdate = datetime.datetime.strptime('2007-06-01', '%Y-%m-%d')
    #构造时间特征
    data['issueDateDT'] = data['issueDate'].apply(lambda x: x-startdate).dt.days

def to_employmentLength(s):
    if pd.isnull(s):
        return s
    else:
        return np.int8(s.split()[0])
for data in [data_train, data_test_a]:
    data['employmentLength'].replace(to_replace='10+ years', value='10 years', inplace=True)
    data['employmentLength'].replace('< 1 year', '0 years', inplace=True)
    data['employmentLength'] = data['employmentLength'].apply(employmentLength_to_int)

处理类别特征

#查看类别特征
category_fea
#['grade', 'subGrade', 'employmentLength', 'issueDate', 'earliesCreditLine']

for data in [data_train, data_test_a]:
    data['earliesCreditLine'] = data['earliesCreditLine'].apply(lambda s: int(s[-4:]))

for data in [data_train, data_test_a]:
    data['grade'] = data['grade'].map({'A':1,'B':2,'C':3,'D':4,'E':5,'F':6,'G':7})

for data in [data_train, data_test_a]:
    data = pd.get_dummies(data, columns=['subGrade', 'homeOwnership', 'verificationStatus', 'purpose', 'regionCode'], drop_first=True)
上一篇下一篇

猜你喜欢

热点阅读