风控数据分析学习笔记(三)Python实现woe自动分箱转化
(一)(https://www.jianshu.com/p/2759e090bd53)中提到了woe、iv等评分模型处理的一些原理;(二)(https://www.jianshu.com/p/b1b1344bd99f)中利用python建立了评分卡模型,但是其中部分变量的分箱依然是手动划分转化。本文将在(一)(二)的基础上利用python实现woe自动转化。
参考单调分箱:https://statcompute.wordpress.com/2012/12/08/monotonic-binning-with-python/
数据来源:https://www.kaggle.com/c/GiveMeSomeCredit/data
流程:
- 连续变量单调分箱
- 处理无法单调分箱的连续变量
- 类别型变量分箱
- 以上三种情况组合
- 查看个别变量分箱、woe、iv等情况
- 查看所有变量分箱、woe、iv等情况
- 转换原始数据为对应woe值
- 问题点总结
一、连续变量单调分箱
假设因变量为好坏用户,其中1为坏用户,0为好用户
实现连续型变量单调分箱,在等频的基础上:
其中DF是导入的数据,Y是因变量的字段名,X是自变量的字段名
def mono_woe(DF,Y,X):
r = 0
bad=Y.sum() #坏客户数(假设因变量列为1的是坏客户)
good=Y.count()-bad #好客户数
n=5
while np.abs(r) < 1:
d1 = pd.DataFrame({"X": X, "Y": Y, "Bucket": pd.qcut(X, n,duplicates='drop')})
d2 = d1.groupby('Bucket', as_index = False)
r, p = stats.spearmanr(d2.mean().X, d2.mean().Y)
n = n - 1
d3 = pd.DataFrame(d2.X.min(), columns = ['min'])
d3['min']=d2.min().X
d3['max'] = d2.max().X
d3['sum'] = d2.sum().Y
d3['total'] = d2.count().Y
d3['bad_rate'] = d2.mean().Y
d3['group_rate']=d3['total']/(bad+good)
d3['woe']=np.log((d3['bad_rate']/(1-d3['bad_rate']))/(bad/good))
d3['iv']=(d3['sum']/bad-((d3['total']-d3['sum'])/good))*d3['woe']
iv=d3['iv'].sum()
d3['iv_sum']=iv
woe=list(d3['woe'].round(6)) #返回woe可以看到每组对应的woe值
cut=list(d3['min'].round(6))
cut1=list(d3['max'].round(6))
cut.append(cut1[-1]+1) #返回cut可以看到分箱切割点
x_woe=pd.cut(X,cut,right=False,labels=woe)
return d3,cut,woe,iv,x_woe
取apply_request_sum这个变量,只返回d3后可以看到分箱、woe、iv的各种情况,其中iv_sum是变量的iv值
二、处理无法单调分箱的连续变量
利用一的方法可以实现连续变量分箱,但由于是基于等频,所以如果某个连续型变量含有高频值的时候,会无法进行分割。例如在输入一个包含高频值得变量,返回d3会出现这种情况:
针对此种类型的连续变量:
def unmono_woe(DF,Y,X):
bad=Y.sum() #坏客户数(假设因变量列为1的是坏客户)
good=Y.count()-bad #好客户数
dn1 = pd.DataFrame({"X": X, "Y": Y, "Bucket": pd.cut(X, 50)})
dn2 = dn1.groupby('Bucket', as_index = False)
dn3 = pd.DataFrame(dn2.X.min(), columns = ['min'])
dn3['min']=dn2.min().X
dn3['max'] = dn2.max().X
dn3['sum'] = dn2.sum().Y
dn3['total'] = dn2.count().Y
while (1):
if (len(dn3)>5):
dn3_min_index = dn3[dn3.total == min(dn3.total)].index.values[0]
if (dn3_min_index!=0): #最小值非第一行的情况
dn3.iloc[dn3_min_index-1, 1] =dn3.iloc[dn3_min_index, 1]
dn3.iloc[dn3_min_index-1, 2] = dn3.iloc[dn3_min_index, 2] +dn3.iloc[dn3_min_index-1, 2]
dn3.iloc[dn3_min_index-1, 3] = dn3.iloc[dn3_min_index, 3] +dn3.iloc[dn3_min_index-1, 3]
dn3=dn3.drop([dn3_min_index])
dn3= dn3.reset_index(drop=True)
else: #最小值是第一行的情况
dn3.iloc[dn3_min_index+1, 0] =dn3.iloc[dn3_min_index, 0]
dn3.iloc[dn3_min_index+1, 2] = dn3.iloc[dn3_min_index, 2] +dn3.iloc[dn3_min_index+1, 2]
dn3.iloc[dn3_min_index+1, 3] = dn3.iloc[dn3_min_index, 3] +dn3.iloc[dn3_min_index+1, 3]
dn3=dn3.drop([dn3_min_index])
dn3= dn3.reset_index(drop=True)
else:
break
dn3['bad_rate'] =dn3['sum']/dn3['total']
dn3['group_rate']=dn3['total']/(bad+good)
dn3['woe']=np.log((dn3['bad_rate']/(1-dn3['bad_rate']))/(bad/good))
dn3['iv']=(dn3['sum']/bad-((dn3['total']-dn3['sum'])/good))*dn3['woe']
iv=dn3['iv'].sum()
dn3['iv_sum']=iv
woe=list(dn3['woe'].round(6))
cut=list(dn3['min'].round(6))
cut1=list(dn3['max'].round(6))
cut.append(cut1[-1]+1)
x_woe=pd.cut(X,cut,right=False,labels=woe)
return dn3
取apply_reject_sum这个变量,此时返回dn3:
三、类别型变量分箱
def cate_woe(DF,Y,X):
bad=Y.sum() #坏客户数
good=Y.count()-bad #好客户数
d1 = pd.DataFrame({"X": X, "Y": Y})
d2 = d1.groupby('X', as_index =True)
d3 = pd.DataFrame()
d3['sum'] = d2.sum().Y
d3['total'] = d2.count().Y
for c in range(d3.shape[0])[::-1]:
if ((d3.iloc[c,1]-d3.iloc[c,0])==0) or (d3.iloc[c,0]==0):
d3.iloc[c-1,0]=d3.iloc[c-1,0]+d3.iloc[c,0]
d3.iloc[c-1,1]=d3.iloc[c-1,1]+d3.iloc[c,1]
d3.drop(d3.index[c],inplace=True)
else:
continue
d3['min']=d3.index
d3['max'] = d3.index
d3['bad_rate'] =d3['sum']/d3['total']
d3['group_rate']=d3['total']/(bad+good)
d3['woe']=np.log((d3['bad_rate']/(1-d3['bad_rate']))/(bad/good))
d3['iv']=(d3['sum']/bad-((d3['total']-d3['sum'])/good))*d3['woe']
iv=d3['iv'].sum()
d3['iv_sum']=iv
d3=d3[['min','max','sum','total','bad_rate','group_rate','woe','iv','iv_sum']]
woe=list(d3['woe'].round(6))
cut=list(d3.index)
x_woe=X.replace(cut,woe)
return d3
取max_apply_request_amount_level这个类别型变量,返回d3:
四、以上三种情况组合
目的实现连续和类别变量自动识别,单调分箱和无法单调分箱变量识别,根据判断情况进行woe分箱。
def woe_single(DF,Y,X):
if X.nunique()>11:
r = 0
bad=Y.sum() #坏客户数(假设因变量列为1的是坏客户)
good=Y.count()-bad #好客户数
n=5
while np.abs(r) < 1:
d1 = pd.DataFrame({"X": X, "Y": Y, "Bucket": pd.qcut(X, n,duplicates='drop')})
d2 = d1.groupby('Bucket', as_index = False)
r, p = stats.spearmanr(d2.mean().X, d2.mean().Y)
n = n - 1
d3 = pd.DataFrame(d2.X.min(), columns = ['min'])
d3['min']=d2.min().X
d3['max'] = d2.max().X
d3['sum'] = d2.sum().Y
d3['total'] = d2.count().Y
d3['bad_rate'] = d2.mean().Y
d3['group_rate']=d3['total']/(bad+good)
d3['woe']=np.log((d3['bad_rate']/(1-d3['bad_rate']))/(bad/good))
d3['iv']=(d3['sum']/bad-((d3['total']-d3['sum'])/good))*d3['woe']
iv=d3['iv'].sum()
if iv!=0.0 and len(d2)>1:
d3['iv_sum']=iv
woe=list(d3['woe'].round(6))
cut=list(d3['min'].round(6))
cut1=list(d3['max'].round(6))
cut.append(cut1[-1]+1)
x_woe=pd.cut(X,cut,right=False,labels=woe)
return d3,cut,woe,iv,x_woe
else:
dn1 = pd.DataFrame({"X": X, "Y": Y, "Bucket": pd.cut(X, 100)})
dn2 = dn1.groupby('Bucket', as_index = False)
dn3 = pd.DataFrame(dn2.X.min(), columns = ['min'])
dn3['min']=dn2.min().X
dn3['max'] = dn2.max().X
dn3['sum'] = dn2.sum().Y
dn3['total'] = dn2.count().Y
while (1):
if (len(dn3)>4):
dn3_min_index = dn3[dn3.total == min(dn3.total)].index.values[0]
if (dn3_min_index!=0): #最小值非第一行的情况
dn3.iloc[dn3_min_index-1, 1] =dn3.iloc[dn3_min_index, 1]
dn3.iloc[dn3_min_index-1, 2] = dn3.iloc[dn3_min_index, 2] +dn3.iloc[dn3_min_index-1, 2]
dn3.iloc[dn3_min_index-1, 3] = dn3.iloc[dn3_min_index, 3] +dn3.iloc[dn3_min_index-1, 3]
dn3=dn3.drop([dn3_min_index])
dn3= dn3.reset_index(drop=True)
else: #最小值是第一行的情况
dn3.iloc[dn3_min_index+1, 0] =dn3.iloc[dn3_min_index, 0]
dn3.iloc[dn3_min_index+1, 2] = dn3.iloc[dn3_min_index, 2] +dn3.iloc[dn3_min_index+1, 2]
dn3.iloc[dn3_min_index+1, 3] = dn3.iloc[dn3_min_index, 3] +dn3.iloc[dn3_min_index+1, 3]
dn3=dn3.drop([dn3_min_index])
dn3= dn3.reset_index(drop=True)
else:
break
dn3['bad_rate'] =dn3['sum']/dn3['total']
dn3['group_rate']=dn3['total']/(bad+good)
dn3['woe']=np.log((dn3['bad_rate']/(1-dn3['bad_rate']))/(bad/good))
dn3['iv']=(dn3['sum']/bad-((dn3['total']-dn3['sum'])/good))*dn3['woe']
iv=dn3['iv'].sum()
dn3['iv_sum']=iv
woe=list(dn3['woe'].round(6))
cut=list(dn3['min'].round(6))
cut1=list(dn3['max'].round(6))
cut.append(cut1[-1]+1)
x_woe=pd.cut(X,cut,right=False,labels=woe)
return dn3,cut,woe,iv,x_woe
else :
bad=Y.sum() #坏客户数
good=Y.count()-bad #好客户数
d1 = pd.DataFrame({"X": X, "Y": Y})
d2 = d1.groupby('X', as_index =True)
d3 = pd.DataFrame()
d3['sum'] = d2.sum().Y
d3['total'] = d2.count().Y
for c in range(d3.shape[0])[::-1]:
if ((d3.iloc[c,1]-d3.iloc[c,0])==0) or (d3.iloc[c,0]==0):
d3.iloc[c-1,0]=d3.iloc[c-1,0]+d3.iloc[c,0]
d3.iloc[c-1,1]=d3.iloc[c-1,1]+d3.iloc[c,1]
d3.drop(d3.index[c],inplace=True)
else:
continue
d3['min']=d3.index
d3['max'] = d3.index
d3['bad_rate'] =d3['sum']/d3['total']
d3['group_rate']=d3['total']/(bad+good)
d3['woe']=np.log((d3['bad_rate']/(1-d3['bad_rate']))/(bad/good))
d3['iv']=(d3['sum']/bad-((d3['total']-d3['sum'])/good))*d3['woe']
iv=d3['iv'].sum()
d3['iv_sum']=iv
d3=d3[['min','max','sum','total','bad_rate','group_rate','woe','iv','iv_sum']]
woe=list(d3['woe'].round(6))
cut=list(d3.index)
x_woe=X.replace(cut,woe)
return d3,cut,woe,iv,x_woe
五、查看个别变量分箱、woe、iv等情况
选取apply_request_sum这个变量来看下情况:
(1)整体分箱woe、iv情况:
woe_single(df,df.overdue_7,df.apply_request_sum)[0]
(2)分割点:
woe_single(df,df.overdue_7,df.overdue_repay_sum)[1]
(3)woe:
woe_single(df,df.overdue_7,df.overdue_repay_sum)[2]
(4)iv:
woe_single(df,df.overdue_7,df.overdue_repay_sum)[3]
六、查看所有变量分箱、woe、iv等情况
woe=pd.DataFrame(columns=['min','max','sum','total','bad_rate','group_rate','woe','iv','iv_sum'])
for i in df.columns[5:]:
w=pd.DataFrame(index=[i])
w=pd.concat([w,woe_single(df,df.overdue_7,df[i])[0]])
frames = [woe,w]
woe=pd.concat(frames)
woe=woe[['min','max','sum','total','bad_rate','group_rate','woe','iv','iv_sum']] #所有变量输出
woe
也可以导出为excel文件,方便分析:
woe.to_excel('E:\woe_all_new.xlsx')
七、转换原始数据为对应woe值
直接定义一个新的函数:
def woe_all(DF,Y,X):
for i in X:
if DF[i].nunique()>11:
r = 0
bad=Y.sum() #坏客户数(假设因变量列为1的是坏客户)
good=Y.count()-bad #好客户数
n=5
while np.abs(r) < 1:
d1 = pd.DataFrame({"X": DF[i], "Y": Y, "Bucket": pd.qcut(DF[i], n,duplicates='drop')})
d2 = d1.groupby('Bucket', as_index = False)
r, p = stats.spearmanr(d2.mean().X, d2.mean().Y)
n = n - 1
d3 = pd.DataFrame(d2.X.min(), columns = ['min'])
d3['min']=d2.min().X
d3['max'] = d2.max().X
d3['sum'] = d2.sum().Y
d3['total'] = d2.count().Y
d3['bad_rate'] = d2.mean().Y
d3['group_rate']=d3['total']/(bad+good)
d3['woe']=np.log((d3['bad_rate']/(1-d3['bad_rate']))/(bad/good))
d3['iv']=(d3['sum']/bad-((d3['total']-d3['sum'])/good))*d3['woe']
iv=d3['iv'].sum()
if iv!=0.0:
woe=list(d3['woe'].round(5))
cut=list(d3['min'].round(5))
cut1=list(d3['max'].round(5))
cut.append(cut1[-1]+1)
x_woe=pd.cut(DF[i],cut,right=False,labels=woe)
DF[i]=x_woe
else:
dn1 = pd.DataFrame({"X": DF[i], "Y": Y, "Bucket": pd.cut(DF[i], 20)})
dn2 = dn1.groupby('Bucket', as_index = False)
dn3 = pd.DataFrame(dn2.X.min(), columns = ['min'])
dn3['min']=dn2.min().X
dn3['max'] = dn2.max().X
dn3['sum'] = dn2.sum().Y
dn3['total'] = dn2.count().Y
dn3=dn3.dropna()
dn3= dn3.reset_index(drop=True)
while (1):
if (len(dn3)>4):
dn3_min_index = dn3[dn3.total == min(dn3.total)].index.values[0]
if (dn3_min_index!=0): #最小值非第一行的情况
dn3.iloc[dn3_min_index-1, 1] =dn3.iloc[dn3_min_index, 1]
dn3.iloc[dn3_min_index-1, 2] = dn3.iloc[dn3_min_index, 2] +dn3.iloc[dn3_min_index-1, 2]
dn3.iloc[dn3_min_index-1, 3] = dn3.iloc[dn3_min_index, 3] +dn3.iloc[dn3_min_index-1, 3]
dn3=dn3.drop([dn3_min_index])
dn3= dn3.reset_index(drop=True)
else: #最小值是第一行的情况
dn3.iloc[dn3_min_index+1, 0] =dn3.iloc[dn3_min_index, 0]
dn3.iloc[dn3_min_index+1, 2] = dn3.iloc[dn3_min_index, 2] +dn3.iloc[dn3_min_index+1, 2]
dn3.iloc[dn3_min_index+1, 3] = dn3.iloc[dn3_min_index, 3] +dn3.iloc[dn3_min_index+1, 3]
dn3=dn3.drop([dn3_min_index])
dn3= dn3.reset_index(drop=True)
else:
break
dn3['bad_rate'] =dn3['sum']/dn3['total']
dn3['group_rate']=dn3['total']/(bad+good)
dn3['woe']=np.log((dn3['bad_rate']/(1-dn3['bad_rate']))/(bad/good))
dn3['iv']=(dn3['sum']/bad-((dn3['total']-dn3['sum'])/good))*dn3['woe']
iv=dn3['iv'].sum()
woe=list(dn3['woe'].round(5))
cut=list(dn3['min'].round(5))
cut1=list(dn3['max'].round(5))
cut.append(cut1[-1]+1)
x_woe=pd.cut(DF[i],cut,right=False,labels=woe)
DF[i]=x_woe
else :
bad=Y.sum() #坏客户数
good=Y.count()-bad #好客户数
d1 = pd.DataFrame({"X": DF[i], "Y": Y})
d2 = d1.groupby('X', as_index =True)
d3 = pd.DataFrame()
d3['sum'] = d2.sum().Y
d3['total'] = d2.count().Y
for c in range(d3.shape[0])[::-1]:
if ((d3.iloc[c,1]-d3.iloc[c,0])==0) or (d3.iloc[c,0]==0):
d3.iloc[c-1,0]=d3.iloc[c-1,0]+d3.iloc[c,0]
d3.iloc[c-1,1]=d3.iloc[c-1,1]+d3.iloc[c,1]
d3.drop(d3.index[c],inplace=True)
else:
continue
d3['bad_rate'] =d3['sum']/d3['total']
d3['group_rate']=d3['total']/(bad+good)
d3['woe']=np.log((d3['bad_rate']/(1-d3['bad_rate']))/(bad/good))
d3['iv']=(d3['sum']/bad-((d3['total']-d3['sum'])/good))*d3['woe']
iv=d3['iv'].sum()
woe=list(d3['woe'].round(5))
cut=list(d3.index)
x_woe=DF[i].replace(cut,woe)
DF[i]=x_woe
return DF
转换后保存到excel中:
woe=woe_all(df,df.overdue_7,df.iloc[:,1:])
woe.to_csv('E:\woe_all.csv')
八、问题点总结
目前单调分箱的连续型变量和类别变量基本与手动划分一致,高频型的连续变量还有待改进,例如在实际应用中,有时会出现如下报错:
查看分组情况:
发现该变量存在离群值,且有99.3%的变量值为0,所以实际上该变量的分析意义不大,可以删掉。由此,在进行转化之前建议优先进行数据清洗,排除离群值的干扰以及无意义的变量。