每天AI你三千遍

统计特征IV值算法实现demo

2018-04-21  本文已影响89人  AI_Engine

import numpy as np

from math import log

from math import e

import os,sys

# Python 2-only idiom: reload the sys module so that setdefaultencoding()
# can be called, then make UTF-8 the interpreter-wide default encoding.
# NOTE(review): reload() is not a builtin on Python 3 — confirm the target
# interpreter before running this file.
reload(sys)

sys.setdefaultencoding('utf-8')

# Load the feature data and return it as a list.
def achieve_data(path):
    """Collect the contents of every ``.txt`` file directly under *path*.

    Each matching file is parsed with ``np.genfromtxt(..., dtype=float)`` and
    the parsed array is appended, item by item along its first axis, to one
    flat list. Files are visited in ``os.listdir`` order.

    Args:
        path: Directory to scan. If it is not an existing directory an empty
            list is returned.

    Returns:
        A list with the parsed entries of all ``.txt`` files.
    """
    feaValue_list = []
    if not os.path.isdir(path):
        return feaValue_list
    for f in os.listdir(path):
        # Only .txt files carry data. Skipping everything else here keeps a
        # stale (or still undefined) array from being appended for them.
        if not f.endswith('.txt'):
            continue
        # os.path.join keeps the path valid on every platform, not only
        # where the separator is a backslash.
        fea_array = np.genfromtxt(os.path.join(path, f), dtype=float)
        # NOTE(review): a file with a single row or a single column parses
        # to a 1-D array, so its scalars are appended individually — confirm
        # that inputs are always 2-D.
        feaValue_list.extend(fea_array)
    return feaValue_list

# Compute the bin boundary values of every feature; four bins are used.
def cal_stage_vlaue(sample_list):
    """Return the five bin boundaries of each feature column.

    For every column the values are sorted ascending and the entries at
    positions ``n // 4``, ``2 * (n // 4)``, ``3 * (n // 4)`` and the last
    position are taken, where ``n`` is the number of samples.

    Args:
        sample_list: Sequence of samples, each a sequence of numeric feature
            values with the same length.

    Returns:
        One list ``[0, q1, q2, q3, max]`` per feature. The first boundary is
        the constant 0, not the column minimum. An empty input yields ``[]``.
    """
    sample_array_len = len(sample_list)
    if sample_array_len == 0:
        return []

    # Floor division keeps the step usable as an index on Python 2 and 3.
    step_num = sample_array_len // 4
    fea_num = len(sample_list[0])
    sample_array = np.array(sample_list)

    all_fea_step = []
    for i in range(fea_num):
        # Only this column's sorted values are needed, so sort the column
        # alone instead of reordering every row of the matrix.
        sorted_col = np.sort(sample_array[:, i])
        one_fea_step = [
            0,
            sorted_col[step_num],
            sorted_col[2 * step_num],
            sorted_col[3 * step_num],
            sorted_col[-1],
        ]
        all_fea_step.append(one_fea_step)
    return all_fea_step

# Count, for every feature and every bin, how many black and white samples
# fall into that bin.
def _count_per_stage(sample_list, fea_index, bounds):
    """Count the samples whose feature *fea_index* falls in each of 4 bins."""
    counts = [0, 0, 0, 0]
    last = len(counts) - 1
    for sample in sample_list:
        value = sample[fea_index]
        for group in range(len(counts)):
            lower = bounds[group]
            upper = bounds[group + 1]
            # Bins are half-open [lower, upper). The last bin is closed so
            # that a sample equal to the top boundary is still counted.
            below_upper = value <= upper if group == last else value < upper
            if value >= lower and below_upper:
                counts[group] += 1
                break
    return counts


def cal_proportion(stage_array,white_list,black_list):
    """Count black and white samples per bin for every feature.

    Args:
        stage_array: One entry per feature, each holding five ascending
            boundaries ``[b0, b1, b2, b3, b4]`` that delimit four bins.
        white_list: White samples, each indexable by feature position.
        black_list: Black samples, each indexable by feature position.

    Returns:
        One ``[black_counts, white_counts]`` pair per feature, each a list
        of four counts, after being passed through ``fix_pro``. A sample
        outside ``[b0, b4]`` is not counted in any bin.
    """
    all_fea_pro_list = []
    for i, bounds in enumerate(stage_array):
        # Both classes use the same rule, including the closed upper edge
        # of the last bin.
        black_fea_pro_list = _count_per_stage(black_list, i, bounds)
        white_fea_pro_list = _count_per_stage(white_list, i, bounds)
        all_fea_pro_list.append([black_fea_pro_list, white_fea_pro_list])

    all_fea_pro_list = fix_pro(all_fea_pro_list)
    return all_fea_pro_list

# Manually patch bins that contain no black or no white samples.
def fix_pro(pro_list):
    """Replace every zero count in the nested count lists with 1, in place.

    Args:
        pro_list: Three-level nesting of counts; the innermost sequences
            must support item assignment.

    Returns:
        The same ``pro_list`` object after modification.
    """
    for fea_counts in pro_list:
        for label_counts in fea_counts:
            for pos, count in enumerate(label_counts):
                if count == 0:
                    label_counts[pos] = 1
    return pro_list

# Compute the IV (information value) of every feature.
def cal_IV(propor_array):
    """Compute one IV per feature from per-bin black/white counts.

    For each bin the black share ``DB`` and white share ``DG`` of the
    respective class total are formed, and the feature's IV is the sum over
    bins of ``(DG - DB) * ln(DG / DB)``.

    Args:
        propor_array: One ``[black_counts, white_counts]`` pair per feature.
            Nested lists and numpy arrays are both accepted.

    Returns:
        A list of floats, one IV per feature, in input order.

    Raises:
        ZeroDivisionError: If a class total or a black bin count is zero.
        ValueError: If a white bin count is zero (logarithm of zero).
    """
    IV_list = []
    for fea_counts in propor_array:
        # asarray lets callers pass plain nested lists as well as ndarrays.
        black_counts, white_counts = np.asarray(fea_counts)
        black_sum = float(black_counts.sum())
        white_sum = float(white_counts.sum())

        IV = 0.0
        for black_num, white_num in zip(black_counts, white_counts):
            # Plain Python floats keep zero counts raising an exception
            # instead of silently producing inf or nan.
            DB = float(black_num) / black_sum
            DG = float(white_num) / white_sum
            IV += (DG - DB) * log(DG / DB)
        IV_list.append(IV)
    return IV_list

上一篇下一篇

猜你喜欢

热点阅读