信用评分模型开发-基于R语言（9）

2019-05-06 本文已影响37人 Liam_ml

scorecard 信用评分包

scorecard包在R中提供了一套完整的信用评分模型开发解决方案。本节会对这部分内容做详细的讲解。

首先是下载安装:

# Install scorecard from CRAN (one-time step); the package name must be a quoted string.
install.packages("scorecard")
# Attach the package; library() stops with an error if it is missing, whereas require() only returns FALSE.
library(scorecard)

split_df 划分数据集

这个函数是用于划分数据集，使用方法如下:

split_df(dt, y = NULL, ratio = 0.7, seed = 618)

y表示样本的标签，或者说是因变量
ratio 表示训练集占全部样本的比例，默认为0.7，其余样本作为测试集

# Load the germancredit data set.
data("germancredit")

# Example I: split into training and test sets with the default ratio and seed
dt_list <- split_df(germancredit, y = "creditability")
train <- dt_list[["train"]]
test <- dt_list[["test"]]


dim(germancredit)
## [1] 1000   21
dim(train)
## [1] 681  21
dim(test)
## [1] 319  21

IV 计算信息值

使用这个函数来计算特征的IV值，用于特征选择作为参考，使用方法如下：

iv(dt, y, x = NULL, positive = "bad|1", order = TRUE)

x表示自变量，默认为NULL，表示计算所有自变量的IV

order 表示根据IV进行排序
data("germancredit")

# Compute the information value (IV) of each variable against creditability
info_value <- iv(germancredit, y = "creditability")
print(info_value)
##                                                     variable   info_value
##  1:                      status.of.existing.checking.account 6.660115e-01
##  2:                                        duration.in.month 3.345035e-01
##  3:                                           credit.history 2.932335e-01
##  4:                                             age.in.years 2.596514e-01
##  5:                                savings.account.and.bonds 1.960096e-01
##  6:                                                  purpose 1.691951e-01
##  7:                                                 property 1.126383e-01
##  8:                                 present.employment.since 8.643363e-02
##  9:                                                  housing 8.329343e-02
## 10:                                  other.installment.plans 5.761454e-02
## 11:                                           foreign.worker 4.387741e-02
## 12:                                  personal.status.and.sex 4.268938e-02
## 13:                                            credit.amount 3.895727e-02
## 14:                              other.debtors.or.guarantors 3.201932e-02
## 15:      installment.rate.in.percentage.of.disposable.income 2.632209e-02
## 16:                  number.of.existing.credits.at.this.bank 1.326652e-02
## 17:                                                      job 8.762766e-03
## 18:                                                telephone 6.377605e-03
## 19:                                  present.residence.since 3.588773e-03
## 20: number.of.people.being.liable.to.provide.maintenance.for 4.339223e-05

var_filter 筛选变量

使用这个函数可以按照设定的标准（信息值、缺失率、同值率）筛选特征，使用方法如下:

var_filter(dt, y, x = NULL, iv_limit = 0.02, missing_limit = 0.95,
  identical_limit = 0.95, var_rm = NULL, var_kp = NULL,
  return_rm_reason = FALSE, positive = "bad|1")

iv_limit 表示信息值的下限，只有信息值不低于该值的特征才会被保留，默认是0.02
missing_limit 表示缺失率的上限，只有缺失率不超过该值的特征才会被保留，默认是0.95
identical_limit 表示同值率的上限，即某个特征中出现最多的同一取值所占的比例（不含缺失值）不超过该值时才保留，默认是0.95

data("germancredit")

# Filter variables using var_filter's default limits, then list the kept columns
dt_sel <- var_filter(germancredit, y = "creditability")
colnames(dt_sel)
##  [1] "status.of.existing.checking.account"                
##  [2] "duration.in.month"                                  
##  [3] "credit.history"                                     
##  [4] "purpose"                                            
##  [5] "credit.amount"                                      
##  [6] "savings.account.and.bonds"                          
##  [7] "present.employment.since"                           
##  [8] "installment.rate.in.percentage.of.disposable.income"
##  [9] "personal.status.and.sex"                            
## [10] "other.debtors.or.guarantors"                        
## [11] "property"                                           
## [12] "age.in.years"                                       
## [13] "other.installment.plans"                            
## [14] "housing"                                            
## [15] "creditability"

woebin 进行WOE变换

使用这个函数对变量进行WOE分箱，使用方法如下：

woebin(dt, y, x = NULL, var_skip = NULL, breaks_list = NULL,
  special_values = NULL, stop_limit = 0.1, count_distr_limit = 0.05,
  bin_num_limit = 8, positive = "bad|1", no_cores = NULL,
  print_step = 0L, method = "tree", save_breaks_list = NULL,
  ignore_const_cols = TRUE, ignore_datetime_cols = TRUE,
  check_cate_num = TRUE, replace_blank_na = TRUE, ...)

method 是使用分箱的方法，默认是使用决策树
breaks_list 可以指定自己的分箱规则
stop_limit 如果使用tree方法，当信息值增益比小于stop_limit时停止分箱分段; 如果使用chimerge方法，当最小卡方值大于 `qchisq(1 - stop_limit, 1)` 时停止合并。可接受的范围：0-0.5; 默认值为0.1。

# Bin the variables with the tree method, then inspect the bins of one variable
bins2_tree <- woebin(germancredit, y = "creditability", method = "tree")
bins2_tree[["status.of.existing.checking.account"]]
##                               variable
## 1: status.of.existing.checking.account
## 2: status.of.existing.checking.account
## 3: status.of.existing.checking.account
##                                                       bin count
## 1:                         ... < 0 DM%,%0 <= ... < 200 DM   543
## 2: ... >= 200 DM / salary assignments for at least 1 year    63
## 3:                                    no checking account   394
##    count_distr good bad   badprob        woe      bin_iv total_iv
## 1:       0.543  303 240 0.4419890  0.6142040 0.225500603 0.639372
## 2:       0.063   49  14 0.2222222 -0.4054651 0.009460853 0.639372
## 3:       0.394  348  46 0.1167513 -1.1762632 0.404410499 0.639372
##                                                    breaks
## 1:                         ... < 0 DM%,%0 <= ... < 200 DM
## 2: ... >= 200 DM / salary assignments for at least 1 year
## 3:                                    no checking account
##    is_special_values
## 1:             FALSE
## 2:             FALSE
## 3:             FALSE

woebin_ply 将原始数据转换成为WOE数据

WOE分箱的具体划分规则指定好了，使用woebin_ply将原始数据转化成为WOE数据，使用方法如下:

woebin_ply(dt, bins, no_cores = NULL, print_step = 0L,
  replace_blank_na = TRUE, ...)

dt 是原始数据
bins 是woebin 的返回结果

# Convert the original data into WOE values using the bins from woebin
dt_woe <- woebin_ply(germancredit, bins = bins2_tree)
head(dt_woe, n = 6L)
##    creditability status.of.existing.checking.account_woe
## 1:          good                                0.614204
## 2:           bad                                0.614204
## 3:          good                               -1.176263
## 4:          good                                0.614204
## 5:           bad                                0.614204
## 6:          good                               -1.176263
##    duration.in.month_woe credit.history_woe purpose_woe credit.amount_woe
## 1:            -1.3121864        -0.73374058  -0.4100628        0.03366128
## 2:             1.1349799         0.08831862  -0.4100628        0.39053946
## 3:            -0.3466246        -0.73374058   0.2799201       -0.25830746
## 4:             0.5245245         0.08831862   0.2799201        0.39053946
## 5:             0.1086883         0.08515781   0.2799201        0.39053946
## 6:             0.5245245         0.08831862   0.2799201        0.39053946
##    savings.account.and.bonds_woe present.employment.since_woe
## 1:                    -0.7621401                  -0.23556607
## 2:                     0.2713578                   0.03210325
## 3:                     0.2713578                  -0.39441527
## 4:                     0.2713578                  -0.39441527
## 5:                     0.2713578                   0.03210325
## 6:                    -0.7621401                   0.03210325
##    installment.rate.in.percentage.of.disposable.income_woe
## 1:                                               0.1039609
## 2:                                              -0.1554665
## 3:                                              -0.1554665
## 4:                                              -0.1554665
## 5:                                               0.1039609
## 6:                                              -0.1554665
##    personal.status.and.sex_woe other.debtors.or.guarantors_woe
## 1:                  -0.1655476                      0.02797385
## 2:                   0.2646926                      0.02797385
## 3:                  -0.1655476                      0.02797385
## 4:                  -0.1655476                     -0.58778666
## 5:                  -0.1655476                      0.02797385
## 6:                  -0.1655476                      0.02797385
##    present.residence.since_woe property_woe age.in.years_woe
## 1:                 -0.01359409  -0.46103496       -0.2123715
## 2:                  0.07015071  -0.46103496        0.5288441
## 3:                 -0.01359409  -0.46103496       -0.2123715
## 4:                 -0.01359409   0.02857337       -0.2123715
## 5:                 -0.01359409   0.58608236       -0.2123715
## 6:                 -0.01359409   0.58608236       -0.8724881
##    other.installment.plans_woe housing_woe
## 1:                  -0.1211786  -0.1941560
## 2:                  -0.1211786  -0.1941560
## 3:                  -0.1211786  -0.1941560
## 4:                  -0.1211786   0.4726044
## 5:                  -0.1211786   0.4726044
## 6:                  -0.1211786   0.4726044
##    number.of.existing.credits.at.this.bank_woe     job_woe
## 1:                                  -0.1347806 -0.02278003
## 2:                                   0.0748775 -0.02278003
## 3:                                   0.0748775 -0.07847162
## 4:                                   0.0748775 -0.02278003
## 5:                                  -0.1347806 -0.02278003
## 6:                                   0.0748775 -0.07847162
##    number.of.people.being.liable.to.provide.maintenance.for_woe
## 1:                                                            0
## 2:                                                            0
## 3:                                                            0
## 4:                                                            0
## 5:                                                            0
## 6:                                                            0
##    telephone_woe foreign.worker_woe
## 1:   -0.09863759                  0
## 2:    0.06469132                  0
## 3:    0.06469132                  0
## 4:    0.06469132                  0
## 5:    0.06469132                  0
## 6:   -0.09863759                  0

scorecard 构建评分卡

使用scorecard通过模型和woebin的结果构建出评分卡规则，使用方法如下：

scorecard(bins, model, points0 = 600, odds0 = 1/19, pdo = 50,
  basepoints_eq0 = FALSE)

bins 是woebin的返回结果
model 是构建好的逻辑回归模型
odds0 见第七章
pdo 见第七章

# Recode the label as a 0/1 factor: "good" becomes "0" and "bad" becomes "1".
dt_woe$creditability <- local({
  labels <- as.character(dt_woe$creditability)
  labels[labels == "good"] <- "0"
  labels[labels == "bad"] <- "1"
  as.factor(labels)
})

# Fit a logistic regression on all WOE columns, then run stepwise selection.
l <- glm(creditability ~ ., family = binomial(), data = dt_woe)
l <- step(l)

# Build the scorecard from the WOE bins and the selected model.
score <- scorecard(bins = bins2_tree, model = l)

score[["status.of.existing.checking.account"]]

##                               variable
## 1: status.of.existing.checking.account
## 2: status.of.existing.checking.account
## 3: status.of.existing.checking.account
##                                                       bin count
## 1:                         ... < 0 DM%,%0 <= ... < 200 DM   543
## 2: ... >= 200 DM / salary assignments for at least 1 year    63
## 3:                                    no checking account   394
##    count_distr good bad   badprob        woe      bin_iv total_iv
## 1:       0.543  303 240 0.4419890  0.6142040 0.225500603 0.639372
## 2:       0.063   49  14 0.2222222 -0.4054651 0.009460853 0.639372
## 3:       0.394  348  46 0.1167513 -1.1762632 0.404410499 0.639372
##                                                    breaks
## 1:                         ... < 0 DM%,%0 <= ... < 200 DM
## 2: ... >= 200 DM / salary assignments for at least 1 year
## 3:                                    no checking account
##    is_special_values points
## 1:             FALSE    -36
## 2:             FALSE     24
## 3:             FALSE     68

scorecard_ply

根据评分卡规则和用户的原始数据，计算出该用户的分数，使用方法如下：

scorecard_ply(dt, card, only_total_score = TRUE, print_step = 0L,
  replace_blank_na = TRUE, var_kp = NULL)

dt 是需要打分的原始数据集（本例中使用训练模型的原始数据集）
card 是使用scorecard 建立起来的评分卡规则

# Score every row of germancredit with the scorecard rules held in `score`.
# The variable was previously misspelled `resutl`.
result <- scorecard_ply(dt = germancredit, card = score)
result
##       score
##    1:   648
##    2:   314
##    3:   638
##    4:   439
##    5:   310
##   ---      
##  996:   535
##  997:   462
##  998:   559
##  999:   342
## 1000:   402