手机好文

信用评分模型开发-基于R语言(10)

2019-05-06  本文已影响60人  Liam_ml

案例

使用的数据集是德国的一个银行提供的数据集,这个数据集已经包含在了scorecard这个包里面,使用data(germancredit)就可以获取这个数据集合:

# Attach scorecard first: it ships the germancredit data set and the
# scorecard helpers called later in this walkthrough. Without this call
# data(germancredit) cannot find the data set in a fresh session.
library(scorecard)

# Load the German credit data and preview its first rows.
data("germancredit", package = "scorecard")
head(germancredit)
##   status.of.existing.checking.account duration.in.month
## 1                          ... < 0 DM                 6
## 2                   0 <= ... < 200 DM                48
## 3                 no checking account                12
## 4                          ... < 0 DM                42
## 5                          ... < 0 DM                24
## 6                 no checking account                36
##                                                credit.history
## 1 critical account/ other credits existing (not at this bank)
## 2                    existing credits paid back duly till now
## 3 critical account/ other credits existing (not at this bank)
## 4                    existing credits paid back duly till now
## 5                             delay in paying off in the past
## 6                    existing credits paid back duly till now

一共有20个特征,最后一列是样本的标签,bad代表坏客户,good代表好客户,因为数据已经准备好了,因此可以直接进行特征选择:

# Filter the variables of germancredit against the target column,
# using an information-value threshold of 0.1.
dt_f <- var_filter(
  dt = germancredit,
  y = "creditability",
  iv_limit = 0.1
)
## [INFO] filtering variables ...
# Show which columns are left after filtering.
names(dt_f)
## [1] "status.of.existing.checking.account"
## [2] "duration.in.month"                  
## [3] "credit.history"                     
## [4] "purpose"                            
## [5] "savings.account.and.bonds"          
## [6] "property"                           
## [7] "age.in.years"                       
## [8] "creditability"

经过筛选,保留了8列:7个特征变量,加上标签列creditability。

# Split the filtered data into partitions (ratio 0.6, fixed seed for
# reproducibility). The result has to be called dt_list: the label
# extraction, WOE conversion and scoring steps all read dt_list, and the
# original name t_list left that object undefined.
dt_list <- split_df(dt_f, y = "creditability", ratio = 0.6, seed = 30)
# Extract the target column of every partition for the evaluation calls.
label_list <- lapply(dt_list, function(x) x$creditability)
head(dt_list)
# NOTE(review): the bins are fitted on the full data set (dt_f) rather than
# on the training partition only, so the test partition influences the
# binning. Confirm this is intended.
bins <- woebin(dt_f, y = "creditability")
## [INFO] creating woe binning ...
# Print the binning table for the checking-account status variable.
bins[["status.of.existing.checking.account"]]
##                               variable
## 1: status.of.existing.checking.account
## 2: status.of.existing.checking.account
## 3: status.of.existing.checking.account
##                                                       bin count
## 1:                         ... < 0 DM%,%0 <= ... < 200 DM   543
## 2: ... >= 200 DM / salary assignments for at least 1 year    63
## 3:                                    no checking account   394
##    count_distr good bad   badprob        woe      bin_iv total_iv
## 1:       0.543  303 240 0.4419890  0.6142040 0.225500603 0.639372
## 2:       0.063   49  14 0.2222222 -0.4054651 0.009460853 0.639372
## 3:       0.394  348  46 0.1167513 -1.1762632 0.404410499 0.639372
##                                                    breaks
## 1:                         ... < 0 DM%,%0 <= ... < 200 DM
## 2: ... >= 200 DM / salary assignments for at least 1 year
## 3:                                    no checking account
##    is_special_values
## 1:             FALSE
## 2:             FALSE
## 3:             FALSE
# Print the binning table for the loan duration variable.
bins[["duration.in.month"]]
##             variable       bin count count_distr good bad   badprob
## 1: duration.in.month  [-Inf,8)    87       0.087   78   9 0.1034483
## 2: duration.in.month    [8,16)   344       0.344  264  80 0.2325581
## 3: duration.in.month   [16,34)   399       0.399  270 129 0.3233083
## 4: duration.in.month   [34,44)   100       0.100   58  42 0.4200000
## 5: duration.in.month [44, Inf)    70       0.070   30  40 0.5714286
##           woe      bin_iv  total_iv breaks is_special_values
## 1: -1.3121864 0.106849463 0.2826181      8             FALSE
## 2: -0.3466246 0.038293766 0.2826181     16             FALSE
## 3:  0.1086883 0.004813339 0.2826181     34             FALSE
## 4:  0.5245245 0.029972827 0.2826181     44             FALSE
## 5:  1.1349799 0.102688661 0.2826181    Inf             FALSE
# Replace the raw values of every partition with the WOE values from bins.
dt_woe_list <- lapply(
  dt_list,
  function(part) woebin_ply(dt = part, bins = bins)
)
## [INFO] converting into woe values ...
## [INFO] converting into woe values ...
# Preview the converted partitions.
head(dt_woe_list)
## $train
##      creditability status.of.existing.checking.account_woe
##   1:             0                                0.614204
##   2:             1                                0.614204
##   3:             0                               -1.176263
##   4:             0                                0.614204
##   5:             0                               -1.176263
##  ---                                                      
## 616:             0                               -1.176263
## 617:             0                                0.614204
## 618:             0                               -1.176263
## 619:             1                                0.614204
## 620:             0                                0.614204
##      duration.in.month_woe credit.history_woe purpose_woe
##   1:            -1.3121864        -0.73374058  -0.4100628
##   2:             1.1349799         0.08831862  -0.4100628
##   3:            -0.3466246        -0.73374058   0.2799201
##   4:             0.5245245         0.08831862   0.2799201
##   5:             0.5245245         0.08831862   0.2799201
##  ---                                                     
## 616:            -0.3466246         0.08831862   0.2799201
## 617:             0.1086883         0.08831862  -0.8056252
## 618:            -0.3466246         0.08831862  -0.4100628
## 619:             1.1349799         0.08831862  -0.4100628
## 620:             1.1349799        -0.73374058  -0.8056252
##      savings.account.and.bonds_woe property_woe age.in.years_woe
##   1:                    -0.7621401  -0.46103496       -0.2123715
##   2:                     0.2713578  -0.46103496        0.5288441
##   3:                     0.2713578  -0.46103496       -0.2123715
##   4:                     0.2713578   0.02857337       -0.2123715
##   5:                    -0.7621401   0.58608236       -0.8724881
##  ---                                                            
## 616:                     0.2713578  -0.46103496        0.1424546
## 617:                     0.2713578   0.02857337       -0.2123715
## 618:                     0.2713578   0.03419136       -0.2123715
## 619:                     0.2713578   0.58608236        0.5288441
## 620:                     0.1395519   0.03419136       -0.1609304
## 
## $test
##      creditability status.of.existing.checking.account_woe
##   1:             1                                0.614204
##   2:             0                               -1.176263
##   3:             1                                0.614204
##   4:             1                                0.614204
##   5:             0                                0.614204
##  ---                                                      
## 376:             1                                0.614204
## 377:             0                                0.614204
## 378:             0                               -1.176263
## 379:             0                                0.614204
## 380:             0                               -1.176263
##      duration.in.month_woe credit.history_woe purpose_woe
##   1:             0.1086883         0.08515781   0.2799201
##   2:            -0.3466246         0.08831862  -0.4100628
##   3:            -0.3466246         0.08831862   0.2799201
##   4:             0.1086883        -0.73374058   0.2799201
##   5:            -0.3466246         0.08831862   0.2799201
##  ---                                                     
## 376:             0.5245245         0.08831862  -0.8056252
## 377:            -0.3466246        -0.73374058   0.2799201
## 378:            -0.3466246         1.23407084  -0.4100628
## 379:             0.1086883         0.08831862  -0.4100628
## 380:            -0.3466246         0.08831862   0.2799201
##      savings.account.and.bonds_woe property_woe age.in.years_woe
##   1:                     0.2713578   0.58608236       -0.2123715
##   2:                    -0.7621401  -0.46103496       -0.2123715
##   3:                     0.2713578   0.03419136        0.5288441
##   4:                     0.2713578   0.03419136       -0.2123715
##   5:                     0.2713578   0.03419136        0.1424546
##  ---                                                            
## 376:                     0.2713578   0.02857337       -0.1609304
## 377:                     0.2713578   0.02857337        0.5288441
## 378:                     0.1395519   0.03419136        0.1424546
## 379:                    -0.7621401   0.03419136        0.5288441
## 380:                    -0.7621401   0.03419136       -0.2123715
# Logistic regression of creditability on all remaining (WOE) columns of
# the training partition.
m1 <- glm(creditability ~ ., family = binomial(), data = dt_woe_list$train)
# Stepwise selection in both directions, without printing the search trace.
m_step <- step(object = m1, direction = "both", trace = FALSE)
# Re-evaluate the call stored on the selected model; the resulting m2 is the
# model used for prediction and scorecard construction further down.
m2 <- eval(m_step$call)
summary(m_step)
## 
## Call:
## glm(formula = creditability ~ status.of.existing.checking.account_woe + 
##     duration.in.month_woe + credit.history_woe + purpose_woe + 
##     savings.account.and.bonds_woe + property_woe + age.in.years_woe, 
##     family = binomial(), data = dt_woe_list$train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.8673  -0.7379  -0.4205   0.7899   2.5882  
## 
## Coefficients:
##                                         Estimate Std. Error z value
## (Intercept)                              -0.9302     0.1060  -8.773
## status.of.existing.checking.account_woe   0.7343     0.1334   5.502
## duration.in.month_woe                     0.9731     0.2173   4.479
## credit.history_woe                        0.8721     0.1960   4.451
## purpose_woe                               0.8589     0.2653   3.238
## savings.account.and.bonds_woe             0.7163     0.2522   2.840
## property_woe                              0.5991     0.3229   1.855
## age.in.years_woe                          0.9456     0.2918   3.240
##                                         Pr(>|z|)    
## (Intercept)                              < 2e-16 ***
## status.of.existing.checking.account_woe 3.74e-08 ***
## duration.in.month_woe                   7.50e-06 ***
## credit.history_woe                      8.56e-06 ***
## purpose_woe                              0.00120 ** 
## savings.account.and.bonds_woe            0.00450 ** 
## property_woe                             0.06355 .  
## age.in.years_woe                         0.00119 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 747.03  on 619  degrees of freedom
## Residual deviance: 583.26  on 612  degrees of freedom
## AIC: 599.26
## 
## Number of Fisher Scoring iterations: 5

到这里,模型就已经训练好了

# Predictions from m2 on the response scale, one vector per partition.
pred_list <- lapply(
  dt_woe_list,
  function(part) predict(m2, part, type = "response")
)
## performance

# Evaluate the predictions against the labels and draw the listed plots.
perf <- perf_eva(
  pred = pred_list,
  label = label_list,
  show_plot = c("ks", "lift", "gain", "roc", "lz", "pr", "f1", "density")
)
image.png
# Build the scorecard from the bins and the fitted model m2.
card <- scorecard(bins, m2)
# Apply the card to every (un-encoded) partition to obtain the scores.
score_list <- lapply(
  dt_list,
  function(part) scorecard_ply(dt = part, card = card)
)

head(score_list)
## $train
##      score
##   1:   658
##   2:   331
##   3:   590
##   4:   361
##   5:   531
##  ---      
## 616:   514
## 617:   457
## 618:   559
## 619:   286
## 620:   441
## 
## $test
##      score
##   1:   367
##   2:   633
##   3:   372
##   4:   442
##   5:   398
##  ---      
## 376:   425
## 377:   424
## 378:   470
## 379:   435
## 380:   570
# Compare the score distributions of the partitions (PSI report).
perf_psi(
  score = score_list,
  label = label_list
)
image.png
上一篇下一篇

猜你喜欢

热点阅读