信用评分模型开发-基于R语言(10)
2019-05-06 本文已影响60人
Liam_ml
案例
使用的数据集是德国的一个银行提供的数据集,这个数据集已经包含在了scorecard这个包里面,使用data(germancredit)就可以获取这个数据集合:
# Attach the scorecard package: it ships the germancredit data set and the
# modelling helpers called in the rest of this walkthrough.
library(scorecard)
data(germancredit)
head(germancredit)
## status.of.existing.checking.account duration.in.month
## 1 ... < 0 DM 6
## 2 0 <= ... < 200 DM 48
## 3 no checking account 12
## 4 ... < 0 DM 42
## 5 ... < 0 DM 24
## 6 no checking account 36
## credit.history
## 1 critical account/ other credits existing (not at this bank)
## 2 existing credits paid back duly till now
## 3 critical account/ other credits existing (not at this bank)
## 4 existing credits paid back duly till now
## 5 delay in paying off in the past
## 6 existing credits paid back duly till now
一共有20个特征,最后一列是样本的标签,bad代表坏客户,good代表好客户,因为数据已经准备好了,因此可以直接进行特征选择:
- 特征选择:选择IV不小于0.1的特征
# Filter the candidate variables; iv_limit = 0.1 is the information-value
# threshold a variable has to reach to be kept.
dt_f <- var_filter(germancredit, y = "creditability", iv_limit = 0.1)
## [INFO] filtering variables ...
colnames(dt_f)
## [1] "status.of.existing.checking.account"
## [2] "duration.in.month"
## [3] "credit.history"
## [4] "purpose"
## [5] "savings.account.and.bonds"
## [6] "property"
## [7] "age.in.years"
## [8] "creditability"
筛选后保留8列:7个特征,加上标签列creditability
- 划分训练集合与测试集合
# Split the filtered data into train/test parts (ratio 0.6, fixed seed) and
# collect the label column of each part for the evaluation steps below.
# The result must be named dt_list: every later step reads that name.
dt_list <- split_df(dt_f, y = "creditability", ratio = 0.6, seed = 30)
label_list <- lapply(dt_list, function(x) x$creditability)
head(dt_list)
- 进行WOE binning
# WOE binning; note the bins are fitted on the full filtered data (dt_f),
# not only on the training split.
bins <- woebin(dt_f, y = "creditability")
## [INFO] creating woe binning ...
# Show the binning table of the checking-account status variable.
bins[["status.of.existing.checking.account"]]
## variable
## 1: status.of.existing.checking.account
## 2: status.of.existing.checking.account
## 3: status.of.existing.checking.account
## bin count
## 1: ... < 0 DM%,%0 <= ... < 200 DM 543
## 2: ... >= 200 DM / salary assignments for at least 1 year 63
## 3: no checking account 394
## count_distr good bad badprob woe bin_iv total_iv
## 1: 0.543 303 240 0.4419890 0.6142040 0.225500603 0.639372
## 2: 0.063 49 14 0.2222222 -0.4054651 0.009460853 0.639372
## 3: 0.394 348 46 0.1167513 -1.1762632 0.404410499 0.639372
## breaks
## 1: ... < 0 DM%,%0 <= ... < 200 DM
## 2: ... >= 200 DM / salary assignments for at least 1 year
## 3: no checking account
## is_special_values
## 1: FALSE
## 2: FALSE
## 3: FALSE
# Show the binning table of the loan-duration variable.
bins[["duration.in.month"]]
## variable bin count count_distr good bad badprob
## 1: duration.in.month [-Inf,8) 87 0.087 78 9 0.1034483
## 2: duration.in.month [8,16) 344 0.344 264 80 0.2325581
## 3: duration.in.month [16,34) 399 0.399 270 129 0.3233083
## 4: duration.in.month [34,44) 100 0.100 58 42 0.4200000
## 5: duration.in.month [44, Inf) 70 0.070 30 40 0.5714286
## woe bin_iv total_iv breaks is_special_values
## 1: -1.3121864 0.106849463 0.2826181 8 FALSE
## 2: -0.3466246 0.038293766 0.2826181 16 FALSE
## 3: 0.1086883 0.004813339 0.2826181 34 FALSE
## 4: 0.5245245 0.029972827 0.2826181 44 FALSE
## 5: 1.1349799 0.102688661 0.2826181 Inf FALSE
- 将数据转变成为WOE形式
# Apply the fitted bins to every split in dt_list.
dt_woe_list <- lapply(dt_list, woebin_ply, bins = bins)
## [INFO] converting into woe values ...
## [INFO] converting into woe values ...
head(dt_woe_list)
## $train
## creditability status.of.existing.checking.account_woe
## 1: 0 0.614204
## 2: 1 0.614204
## 3: 0 -1.176263
## 4: 0 0.614204
## 5: 0 -1.176263
## ---
## 616: 0 -1.176263
## 617: 0 0.614204
## 618: 0 -1.176263
## 619: 1 0.614204
## 620: 0 0.614204
## duration.in.month_woe credit.history_woe purpose_woe
## 1: -1.3121864 -0.73374058 -0.4100628
## 2: 1.1349799 0.08831862 -0.4100628
## 3: -0.3466246 -0.73374058 0.2799201
## 4: 0.5245245 0.08831862 0.2799201
## 5: 0.5245245 0.08831862 0.2799201
## ---
## 616: -0.3466246 0.08831862 0.2799201
## 617: 0.1086883 0.08831862 -0.8056252
## 618: -0.3466246 0.08831862 -0.4100628
## 619: 1.1349799 0.08831862 -0.4100628
## 620: 1.1349799 -0.73374058 -0.8056252
## savings.account.and.bonds_woe property_woe age.in.years_woe
## 1: -0.7621401 -0.46103496 -0.2123715
## 2: 0.2713578 -0.46103496 0.5288441
## 3: 0.2713578 -0.46103496 -0.2123715
## 4: 0.2713578 0.02857337 -0.2123715
## 5: -0.7621401 0.58608236 -0.8724881
## ---
## 616: 0.2713578 -0.46103496 0.1424546
## 617: 0.2713578 0.02857337 -0.2123715
## 618: 0.2713578 0.03419136 -0.2123715
## 619: 0.2713578 0.58608236 0.5288441
## 620: 0.1395519 0.03419136 -0.1609304
##
## $test
## creditability status.of.existing.checking.account_woe
## 1: 1 0.614204
## 2: 0 -1.176263
## 3: 1 0.614204
## 4: 1 0.614204
## 5: 0 0.614204
## ---
## 376: 1 0.614204
## 377: 0 0.614204
## 378: 0 -1.176263
## 379: 0 0.614204
## 380: 0 -1.176263
## duration.in.month_woe credit.history_woe purpose_woe
## 1: 0.1086883 0.08515781 0.2799201
## 2: -0.3466246 0.08831862 -0.4100628
## 3: -0.3466246 0.08831862 0.2799201
## 4: 0.1086883 -0.73374058 0.2799201
## 5: -0.3466246 0.08831862 0.2799201
## ---
## 376: 0.5245245 0.08831862 -0.8056252
## 377: -0.3466246 -0.73374058 0.2799201
## 378: -0.3466246 1.23407084 -0.4100628
## 379: 0.1086883 0.08831862 -0.4100628
## 380: -0.3466246 0.08831862 0.2799201
## savings.account.and.bonds_woe property_woe age.in.years_woe
## 1: 0.2713578 0.58608236 -0.2123715
## 2: -0.7621401 -0.46103496 -0.2123715
## 3: 0.2713578 0.03419136 0.5288441
## 4: 0.2713578 0.03419136 -0.2123715
## 5: 0.2713578 0.03419136 0.1424546
## ---
## 376: 0.2713578 0.02857337 -0.1609304
## 377: 0.2713578 0.02857337 0.5288441
## 378: 0.1395519 0.03419136 0.1424546
## 379: -0.7621401 0.03419136 0.5288441
## 380: -0.7621401 0.03419136 -0.2123715
- 训练模型
# Logistic regression on the WOE-transformed training split, using every
# remaining column as a predictor.
m1 <- glm(
  creditability ~ .,
  family = binomial(),
  data = dt_woe_list$train
)
# Stepwise selection in both directions, without printing the search trace.
m_step <- step(m1, direction = "both", trace = FALSE)
# Re-evaluate the call stored on the selected model to get the final fit.
m2 <- eval(m_step$call)
summary(m_step)
##
## Call:
## glm(formula = creditability ~ status.of.existing.checking.account_woe +
## duration.in.month_woe + credit.history_woe + purpose_woe +
## savings.account.and.bonds_woe + property_woe + age.in.years_woe,
## family = binomial(), data = dt_woe_list$train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.8673 -0.7379 -0.4205 0.7899 2.5882
##
## Coefficients:
## Estimate Std. Error z value
## (Intercept) -0.9302 0.1060 -8.773
## status.of.existing.checking.account_woe 0.7343 0.1334 5.502
## duration.in.month_woe 0.9731 0.2173 4.479
## credit.history_woe 0.8721 0.1960 4.451
## purpose_woe 0.8589 0.2653 3.238
## savings.account.and.bonds_woe 0.7163 0.2522 2.840
## property_woe 0.5991 0.3229 1.855
## age.in.years_woe 0.9456 0.2918 3.240
## Pr(>|z|)
## (Intercept) < 2e-16 ***
## status.of.existing.checking.account_woe 3.74e-08 ***
## duration.in.month_woe 7.50e-06 ***
## credit.history_woe 8.56e-06 ***
## purpose_woe 0.00120 **
## savings.account.and.bonds_woe 0.00450 **
## property_woe 0.06355 .
## age.in.years_woe 0.00119 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 747.03 on 619 degrees of freedom
## Residual deviance: 583.26 on 612 degrees of freedom
## AIC: 599.26
##
## Number of Fisher Scoring iterations: 5
到这里,模型就已经训练好了
- 模型评估
# Predicted response for each split from the final model.
pred_list <- lapply(
  dt_woe_list,
  function(woe_dt) predict(m2, woe_dt, type = "response")
)
## performance
perf <- perf_eva(
  pred = pred_list,
  label = label_list,
  show_plot = c("ks", "lift", "gain", "roc", "lz", "pr", "f1", "density")
)
image.png
- 生成评分卡
# Build the scorecard from the WOE bins and the final model.
card <- scorecard(bins, m2)
- 计算用户的得分
# Score every split in dt_list with the scorecard.
score_list <- lapply(dt_list, scorecard_ply, card = card)
head(score_list)
## $train
## score
## 1: 658
## 2: 331
## 3: 590
## 4: 361
## 5: 531
## ---
## 616: 514
## 617: 457
## 618: 559
## 619: 286
## 620: 441
##
## $test
## score
## 1: 367
## 2: 633
## 3: 372
## 4: 442
## 5: 398
## ---
## 376: 425
## 377: 424
## 378: 470
## 379: 435
## 380: 570
- 模型的稳定性评估
# Population stability check of the scores, using the per-split scores and
# labels collected above.
perf_psi(
  score = score_list,
  label = label_list
)
image.png