风控数据分析学习笔记（三）Python实现woe自动分箱转化

2018-08-31 本文已影响3674人 3e93a92f77da

（一）（https://www.jianshu.com/p/2759e090bd53）中提到了woe、iv等评分模型处理的一些原理；（二）（https://www.jianshu.com/p/b1b1344bd99f）中利用python建立了评分卡模型，但是其中部分变量的分箱依然是手动划分转化。本文将在（一）（二）的基础上利用python实现woe自动转化。
参考单调分箱：https://statcompute.wordpress.com/2012/12/08/monotonic-binning-with-python/
数据来源：https://www.kaggle.com/c/GiveMeSomeCredit/data

流程：

连续变量单调分箱
处理无法单调分箱的连续变量
类别型变量分箱
以上三种情况组合
查看个别变量分箱、woe、iv等情况
查看所有变量分箱、woe、iv等情况
转换原始数据为对应woe值
问题点总结

一、连续变量单调分箱

假设因变量为好坏用户，其中1为坏用户，0为好用户
实现连续型变量单调分箱，在等频的基础上：
其中DF是导入的数据，Y是因变量的字段名，X是自变量的字段名

def mono_woe(DF,Y,X):
    r = 0
    bad=Y.sum()      #坏客户数(假设因变量列为1的是坏客户)
    good=Y.count()-bad  #好客户数
    n=5
    while np.abs(r) < 1:
        d1 = pd.DataFrame({"X": X, "Y": Y, "Bucket": pd.qcut(X, n,duplicates='drop')})
        d2 = d1.groupby('Bucket', as_index = False)
        r, p = stats.spearmanr(d2.mean().X, d2.mean().Y)
        n = n - 1
    d3 = pd.DataFrame(d2.X.min(), columns = ['min'])
    d3['min']=d2.min().X    
    d3['max'] = d2.max().X
    d3['sum'] = d2.sum().Y
    d3['total'] = d2.count().Y
    d3['bad_rate'] = d2.mean().Y
    d3['group_rate']=d3['total']/(bad+good)
    d3['woe']=np.log((d3['bad_rate']/(1-d3['bad_rate']))/(bad/good))
    d3['iv']=(d3['sum']/bad-((d3['total']-d3['sum'])/good))*d3['woe']
    iv=d3['iv'].sum()
    d3['iv_sum']=iv
    woe=list(d3['woe'].round(6))   #返回woe可以看到每组对应的woe值
    cut=list(d3['min'].round(6))
    cut1=list(d3['max'].round(6))
    cut.append(cut1[-1]+1)   #返回cut可以看到分箱切割点
    x_woe=pd.cut(X,cut,right=False,labels=woe)
    return  d3,cut,woe,iv,x_woe

取apply_request_sum这个变量，只返回d3后可以看到分箱、woe、iv的各种情况，其中iv_sum是变量的iv值

二、处理无法单调分箱的连续变量

利用一的方法可以实现连续变量分箱，但由于是基于等频，所以如果某个连续型变量含有高频值的时候，会无法进行分割。例如在输入一个包含高频值得变量，返回d3会出现这种情况：

针对此种类型的连续变量：

def unmono_woe(DF,Y,X):
    bad=Y.sum()      #坏客户数(假设因变量列为1的是坏客户)
    good=Y.count()-bad  #好客户数
    dn1 = pd.DataFrame({"X": X, "Y": Y, "Bucket": pd.cut(X, 50)})
    dn2 = dn1.groupby('Bucket', as_index = False)
    dn3 = pd.DataFrame(dn2.X.min(), columns = ['min'])
    dn3['min']=dn2.min().X    
    dn3['max'] = dn2.max().X
    dn3['sum'] = dn2.sum().Y
    dn3['total'] = dn2.count().Y
    while (1):
            if  (len(dn3)>5):
                dn3_min_index = dn3[dn3.total == min(dn3.total)].index.values[0]
                if (dn3_min_index!=0):    #最小值非第一行的情况
                    dn3.iloc[dn3_min_index-1, 1] =dn3.iloc[dn3_min_index, 1] 
                    dn3.iloc[dn3_min_index-1, 2] = dn3.iloc[dn3_min_index, 2] +dn3.iloc[dn3_min_index-1, 2]
                    dn3.iloc[dn3_min_index-1, 3] = dn3.iloc[dn3_min_index, 3] +dn3.iloc[dn3_min_index-1, 3]
                    dn3=dn3.drop([dn3_min_index])
                    dn3= dn3.reset_index(drop=True)
                else:    #最小值是第一行的情况
                    dn3.iloc[dn3_min_index+1, 0] =dn3.iloc[dn3_min_index, 0] 
                    dn3.iloc[dn3_min_index+1, 2] = dn3.iloc[dn3_min_index, 2] +dn3.iloc[dn3_min_index+1, 2]
                    dn3.iloc[dn3_min_index+1, 3] = dn3.iloc[dn3_min_index, 3] +dn3.iloc[dn3_min_index+1, 3]
                    dn3=dn3.drop([dn3_min_index])
                    dn3= dn3.reset_index(drop=True)
            else:
                break
    dn3['bad_rate'] =dn3['sum']/dn3['total']
    dn3['group_rate']=dn3['total']/(bad+good)
    dn3['woe']=np.log((dn3['bad_rate']/(1-dn3['bad_rate']))/(bad/good))
    dn3['iv']=(dn3['sum']/bad-((dn3['total']-dn3['sum'])/good))*dn3['woe']
    iv=dn3['iv'].sum()
    dn3['iv_sum']=iv
    woe=list(dn3['woe'].round(6)) 
    cut=list(dn3['min'].round(6))
    cut1=list(dn3['max'].round(6))
    cut.append(cut1[-1]+1)
    x_woe=pd.cut(X,cut,right=False,labels=woe)
    return  dn3

取apply_reject_sum这个变量，此时返回dn3：

三、类别型变量分箱

def cate_woe(DF,Y,X):
    bad=Y.sum()      #坏客户数
    good=Y.count()-bad  #好客户数
    d1 = pd.DataFrame({"X": X, "Y": Y})
    d2 = d1.groupby('X', as_index =True)
    d3 = pd.DataFrame()
    d3['sum'] = d2.sum().Y
    d3['total'] = d2.count().Y
    for c in range(d3.shape[0])[::-1]:
        if ((d3.iloc[c,1]-d3.iloc[c,0])==0) or (d3.iloc[c,0]==0):
            d3.iloc[c-1,0]=d3.iloc[c-1,0]+d3.iloc[c,0]
            d3.iloc[c-1,1]=d3.iloc[c-1,1]+d3.iloc[c,1]
            d3.drop(d3.index[c],inplace=True)
        else:
            continue
        
    d3['min']=d3.index  
    d3['max'] = d3.index
    d3['bad_rate'] =d3['sum']/d3['total']
    d3['group_rate']=d3['total']/(bad+good)
    d3['woe']=np.log((d3['bad_rate']/(1-d3['bad_rate']))/(bad/good))
    d3['iv']=(d3['sum']/bad-((d3['total']-d3['sum'])/good))*d3['woe']
    iv=d3['iv'].sum()
    d3['iv_sum']=iv
    d3=d3[['min','max','sum','total','bad_rate','group_rate','woe','iv','iv_sum']]
    woe=list(d3['woe'].round(6))
    cut=list(d3.index)
    x_woe=X.replace(cut,woe)
    return  d3

取max_apply_request_amount_level这个类别型变量，返回d3：

四、以上三种情况组合

目的实现连续和类别变量自动识别，单调分箱和无法单调分箱变量识别，根据判断情况进行woe分箱。

def woe_single(DF,Y,X):
    if X.nunique()>11:
        r = 0
        bad=Y.sum()      #坏客户数(假设因变量列为1的是坏客户)
        good=Y.count()-bad  #好客户数
        n=5
        while np.abs(r) < 1:
            d1 = pd.DataFrame({"X": X, "Y": Y, "Bucket": pd.qcut(X, n,duplicates='drop')})
            d2 = d1.groupby('Bucket', as_index = False)
            r, p = stats.spearmanr(d2.mean().X, d2.mean().Y)
            n = n - 1
        d3 = pd.DataFrame(d2.X.min(), columns = ['min'])
        d3['min']=d2.min().X    
        d3['max'] = d2.max().X
        d3['sum'] = d2.sum().Y
        d3['total'] = d2.count().Y
        d3['bad_rate'] = d2.mean().Y
        d3['group_rate']=d3['total']/(bad+good)
        d3['woe']=np.log((d3['bad_rate']/(1-d3['bad_rate']))/(bad/good))
        d3['iv']=(d3['sum']/bad-((d3['total']-d3['sum'])/good))*d3['woe']
        iv=d3['iv'].sum()
        if iv!=0.0 and len(d2)>1:
            d3['iv_sum']=iv
            woe=list(d3['woe'].round(6))
            cut=list(d3['min'].round(6))
            cut1=list(d3['max'].round(6))
            cut.append(cut1[-1]+1)
            x_woe=pd.cut(X,cut,right=False,labels=woe)
            return  d3,cut,woe,iv,x_woe
        else:
            dn1 = pd.DataFrame({"X": X, "Y": Y, "Bucket": pd.cut(X, 100)})
            dn2 = dn1.groupby('Bucket', as_index = False)
            dn3 = pd.DataFrame(dn2.X.min(), columns = ['min'])
            dn3['min']=dn2.min().X    
            dn3['max'] = dn2.max().X
            dn3['sum'] = dn2.sum().Y
            dn3['total'] = dn2.count().Y
            while (1):
                    if  (len(dn3)>4):
                        dn3_min_index = dn3[dn3.total == min(dn3.total)].index.values[0]
                        if (dn3_min_index!=0):    #最小值非第一行的情况
                            dn3.iloc[dn3_min_index-1, 1] =dn3.iloc[dn3_min_index, 1] 
                            dn3.iloc[dn3_min_index-1, 2] = dn3.iloc[dn3_min_index, 2] +dn3.iloc[dn3_min_index-1, 2]
                            dn3.iloc[dn3_min_index-1, 3] = dn3.iloc[dn3_min_index, 3] +dn3.iloc[dn3_min_index-1, 3]
                            dn3=dn3.drop([dn3_min_index])
                            dn3= dn3.reset_index(drop=True)
                        else:    #最小值是第一行的情况
                            dn3.iloc[dn3_min_index+1, 0] =dn3.iloc[dn3_min_index, 0] 
                            dn3.iloc[dn3_min_index+1, 2] = dn3.iloc[dn3_min_index, 2] +dn3.iloc[dn3_min_index+1, 2]
                            dn3.iloc[dn3_min_index+1, 3] = dn3.iloc[dn3_min_index, 3] +dn3.iloc[dn3_min_index+1, 3]
                            dn3=dn3.drop([dn3_min_index])
                            dn3= dn3.reset_index(drop=True)
                    else:
                        break
            dn3['bad_rate'] =dn3['sum']/dn3['total']
            dn3['group_rate']=dn3['total']/(bad+good)
            dn3['woe']=np.log((dn3['bad_rate']/(1-dn3['bad_rate']))/(bad/good))
            dn3['iv']=(dn3['sum']/bad-((dn3['total']-dn3['sum'])/good))*dn3['woe']
            
            iv=dn3['iv'].sum()
            dn3['iv_sum']=iv
            woe=list(dn3['woe'].round(6)) 
            cut=list(dn3['min'].round(6))
            cut1=list(dn3['max'].round(6))
            cut.append(cut1[-1]+1)
            x_woe=pd.cut(X,cut,right=False,labels=woe)
            return  dn3,cut,woe,iv,x_woe
    else : 
        bad=Y.sum()      #坏客户数
        good=Y.count()-bad  #好客户数
        d1 = pd.DataFrame({"X": X, "Y": Y})
        d2 = d1.groupby('X', as_index =True)
        d3 = pd.DataFrame()
        
        d3['sum'] = d2.sum().Y
        d3['total'] = d2.count().Y
        for c in range(d3.shape[0])[::-1]:
            if ((d3.iloc[c,1]-d3.iloc[c,0])==0) or (d3.iloc[c,0]==0):
                d3.iloc[c-1,0]=d3.iloc[c-1,0]+d3.iloc[c,0]
                d3.iloc[c-1,1]=d3.iloc[c-1,1]+d3.iloc[c,1]
                d3.drop(d3.index[c],inplace=True)
            else:
                continue
        
        d3['min']=d3.index  
        d3['max'] = d3.index
        d3['bad_rate'] =d3['sum']/d3['total']
        d3['group_rate']=d3['total']/(bad+good)
        d3['woe']=np.log((d3['bad_rate']/(1-d3['bad_rate']))/(bad/good))
        d3['iv']=(d3['sum']/bad-((d3['total']-d3['sum'])/good))*d3['woe']
        iv=d3['iv'].sum()
        d3['iv_sum']=iv
        d3=d3[['min','max','sum','total','bad_rate','group_rate','woe','iv','iv_sum']]
        
        
        woe=list(d3['woe'].round(6))
        cut=list(d3.index)
        x_woe=X.replace(cut,woe)
        return d3,cut,woe,iv,x_woe

五、查看个别变量分箱、woe、iv等情况

选取apply_request_sum这个变量来看下情况：
（1）整体分箱woe、iv情况：

woe_single(df,df.overdue_7,df.apply_request_sum)[0]

（2）分割点：

woe_single(df,df.overdue_7,df.overdue_repay_sum)[1]

（3）woe：

woe_single(df,df.overdue_7,df.overdue_repay_sum)[2]

（4）iv:

woe_single(df,df.overdue_7,df.overdue_repay_sum)[3]

六、查看所有变量分箱、woe、iv等情况

woe=pd.DataFrame(columns=['min','max','sum','total','bad_rate','group_rate','woe','iv','iv_sum'])
for i in df.columns[5:]:
    w=pd.DataFrame(index=[i])
    w=pd.concat([w,woe_single(df,df.overdue_7,df[i])[0]])
    frames = [woe,w]
    woe=pd.concat(frames)
woe=woe[['min','max','sum','total','bad_rate','group_rate','woe','iv','iv_sum']]     #所有变量输出
woe

也可以导出为excel文件，方便分析：

woe.to_excel('E:\woe_all_new.xlsx')

七、转换原始数据为对应woe值

直接定义一个新的函数：

def woe_all(DF,Y,X):
    for i in X:
        if DF[i].nunique()>11:
            r = 0
            bad=Y.sum()      #坏客户数(假设因变量列为1的是坏客户)
            good=Y.count()-bad  #好客户数
            n=5
            while np.abs(r) < 1:
                d1 = pd.DataFrame({"X": DF[i], "Y": Y, "Bucket": pd.qcut(DF[i], n,duplicates='drop')})
                d2 = d1.groupby('Bucket', as_index = False)
                r, p = stats.spearmanr(d2.mean().X, d2.mean().Y)
                n = n - 1
            d3 = pd.DataFrame(d2.X.min(), columns = ['min'])
            d3['min']=d2.min().X    
            d3['max'] = d2.max().X
            d3['sum'] = d2.sum().Y
            d3['total'] = d2.count().Y
            d3['bad_rate'] = d2.mean().Y
            d3['group_rate']=d3['total']/(bad+good)
            d3['woe']=np.log((d3['bad_rate']/(1-d3['bad_rate']))/(bad/good))
            d3['iv']=(d3['sum']/bad-((d3['total']-d3['sum'])/good))*d3['woe']
            iv=d3['iv'].sum()
            if iv!=0.0:
                woe=list(d3['woe'].round(5))
                cut=list(d3['min'].round(5))
                cut1=list(d3['max'].round(5))
                cut.append(cut1[-1]+1)
                x_woe=pd.cut(DF[i],cut,right=False,labels=woe)
                DF[i]=x_woe
            else:
                dn1 = pd.DataFrame({"X": DF[i], "Y": Y, "Bucket": pd.cut(DF[i], 20)})
                dn2 = dn1.groupby('Bucket', as_index = False)
                dn3 = pd.DataFrame(dn2.X.min(), columns = ['min'])
                dn3['min']=dn2.min().X    
                dn3['max'] = dn2.max().X
                dn3['sum'] = dn2.sum().Y
                dn3['total'] = dn2.count().Y
                dn3=dn3.dropna()
                dn3= dn3.reset_index(drop=True)
                while (1):
                    if  (len(dn3)>4):
                        dn3_min_index = dn3[dn3.total == min(dn3.total)].index.values[0]
                        if (dn3_min_index!=0):    #最小值非第一行的情况
                            dn3.iloc[dn3_min_index-1, 1] =dn3.iloc[dn3_min_index, 1] 
                            dn3.iloc[dn3_min_index-1, 2] = dn3.iloc[dn3_min_index, 2] +dn3.iloc[dn3_min_index-1, 2]
                            dn3.iloc[dn3_min_index-1, 3] = dn3.iloc[dn3_min_index, 3] +dn3.iloc[dn3_min_index-1, 3]
                            dn3=dn3.drop([dn3_min_index])
                            dn3= dn3.reset_index(drop=True)
                        else:    #最小值是第一行的情况
                            dn3.iloc[dn3_min_index+1, 0] =dn3.iloc[dn3_min_index, 0] 
                            dn3.iloc[dn3_min_index+1, 2] = dn3.iloc[dn3_min_index, 2] +dn3.iloc[dn3_min_index+1, 2]
                            dn3.iloc[dn3_min_index+1, 3] = dn3.iloc[dn3_min_index, 3] +dn3.iloc[dn3_min_index+1, 3]
                            dn3=dn3.drop([dn3_min_index])
                            dn3= dn3.reset_index(drop=True)
                    else:
                        break
                dn3['bad_rate'] =dn3['sum']/dn3['total']
                dn3['group_rate']=dn3['total']/(bad+good)
                dn3['woe']=np.log((dn3['bad_rate']/(1-dn3['bad_rate']))/(bad/good))
                dn3['iv']=(dn3['sum']/bad-((dn3['total']-dn3['sum'])/good))*dn3['woe']
                iv=dn3['iv'].sum()
                woe=list(dn3['woe'].round(5)) 
                cut=list(dn3['min'].round(5))
                cut1=list(dn3['max'].round(5))
                cut.append(cut1[-1]+1)
                x_woe=pd.cut(DF[i],cut,right=False,labels=woe)
                DF[i]=x_woe
        else : 
            bad=Y.sum()      #坏客户数
            good=Y.count()-bad  #好客户数
            d1 = pd.DataFrame({"X": DF[i], "Y": Y})
            d2 = d1.groupby('X', as_index =True)
            d3 = pd.DataFrame()
            d3['sum'] = d2.sum().Y
            d3['total'] = d2.count().Y
            for c in range(d3.shape[0])[::-1]:
                if ((d3.iloc[c,1]-d3.iloc[c,0])==0) or (d3.iloc[c,0]==0):
                    d3.iloc[c-1,0]=d3.iloc[c-1,0]+d3.iloc[c,0]
                    d3.iloc[c-1,1]=d3.iloc[c-1,1]+d3.iloc[c,1]
                    d3.drop(d3.index[c],inplace=True)
                else:
                    continue
            d3['bad_rate'] =d3['sum']/d3['total']
            d3['group_rate']=d3['total']/(bad+good)
            d3['woe']=np.log((d3['bad_rate']/(1-d3['bad_rate']))/(bad/good))
            d3['iv']=(d3['sum']/bad-((d3['total']-d3['sum'])/good))*d3['woe']
            iv=d3['iv'].sum()
            woe=list(d3['woe'].round(5))
            cut=list(d3.index)
            x_woe=DF[i].replace(cut,woe)
            DF[i]=x_woe
    return DF

转换后保存到excel中：

woe=woe_all(df,df.overdue_7,df.iloc[:,1:])   
woe.to_csv('E:\woe_all.csv')

八、问题点总结

目前单调分箱的连续型变量和类别变量基本与手动划分一致，高频型的连续变量还有待改进，例如在实际应用中，有时会出现如下报错：

查看分组情况：

发现该变量存在离群值，且有99.3%的变量值为0，所以实际上该变量的分析意义不大，可以删掉。由此，在进行转化之前建议优先进行数据清洗，排除离群值的干扰以及无意义的变量。