# Demo implementation of the Information Value (IV) algorithm for statistical features.
import numpy as np
from math import log
from math import e
import os,sys
reload(sys)
sys.setdefaultencoding('utf-8')
# Load the data and return it as a list.
def achieve_data(path):
    """Load every ``.txt`` file found directly inside a directory.

    Each file is parsed with ``np.genfromtxt`` as floats and its rows are
    appended to one flat list.

    Args:
        path: Directory that contains the ``.txt`` data files.

    Returns:
        A list with one entry per parsed row (each entry is a 1-D numpy
        array). The list is empty when ``path`` is not a directory or holds
        no ``.txt`` file.
    """
    feaValue_list = []
    if not os.path.isdir(path):
        return feaValue_list
    # Sorted so that the row order does not depend on the file system.
    for f in sorted(os.listdir(path)):
        if f.endswith('.txt'):
            # os.path.join instead of a hard-coded '\\' keeps this portable.
            fea_array = np.genfromtxt(os.path.join(path, f), dtype=float)
            # NOTE: a file holding a single row is parsed as a 1-D array, so
            # extend() would then add its values one by one.
            feaValue_list.extend(fea_array)
    return feaValue_list
# Compute the group boundary values of every feature; four groups are used here.
def cal_stage_vlaue(sample_list):
    """Compute the boundary values that split every feature into 4 groups.

    For each feature column the values are sorted and the entries at
    positions ``n//4``, ``2*(n//4)`` and ``3*(n//4)`` plus the largest value
    are taken as upper boundaries (``n`` is the number of samples).

    Args:
        sample_list: Sequence of samples, each a sequence of feature values.

    Returns:
        A list with one entry per feature, each of the form
        ``[0, bound1, bound2, bound3, max_value]``. The lower boundary is
        fixed at 0. An empty list is returned for empty input.
    """
    sample_count = len(sample_list)
    if sample_count == 0:
        return []
    # Floor division: the result is used as an index and must stay an int.
    step_num = sample_count // 4
    fea_num = len(sample_list[0])
    sample_array = np.array(sample_list)
    all_fea_step = []
    for i in range(fea_num):
        # Only the column itself has to be sorted to read the boundaries.
        sorted_col = np.sort(sample_array[:, i])
        all_fea_step.append([
            0,
            sorted_col[step_num],
            sorted_col[2 * step_num],
            sorted_col[3 * step_num],
            sorted_col[-1],
        ])
    return all_fea_step
# Count, for every feature and every group, how many black and white samples fall into it.
def cal_proportion(stage_array, white_list, black_list):
    """Count the black and white samples that fall into each feature group.

    Args:
        stage_array: One boundary list per feature, e.g.
            ``[lower, bound1, bound2, bound3, upper]``.
        white_list: Sequence of white samples (each indexable by feature).
        black_list: Sequence of black samples (each indexable by feature).

    Returns:
        A list with one entry per feature, each of the form
        ``[black_counts, white_counts]`` where both are per-group counts.
        The result is passed through ``fix_pro`` before being returned.
    """
    all_fea_pro_list = []
    for fea_idx, bounds in enumerate(stage_array):
        black_fea_pro_list = _count_per_group(black_list, fea_idx, bounds)
        white_fea_pro_list = _count_per_group(white_list, fea_idx, bounds)
        all_fea_pro_list.append([black_fea_pro_list, white_fea_pro_list])
    return fix_pro(all_fea_pro_list)


def _count_per_group(sample_list, fea_idx, bounds):
    """Count how many samples have feature ``fea_idx`` in each group of ``bounds``."""
    group_num = len(bounds) - 1
    counts = [0] * group_num
    for sample in sample_list:
        value = sample[fea_idx]
        for g in range(group_num):
            lower, upper = bounds[g], bounds[g + 1]
            if g == group_num - 1:
                # The last group is closed on the right so that the maximum
                # value is counted, for black and white samples alike.
                hit = lower <= value <= upper
            else:
                hit = lower <= value < upper
            if hit:
                counts[g] += 1
                break
        # Values outside [bounds[0], bounds[-1]] are not counted at all.
    return counts
# Manually handle groups in which the black or white sample count is 0.
def fix_pro(pro_list):
    """Replace every zero count with 1, in place.

    Args:
        pro_list: Three-level nested list of counts
            (feature -> sample class -> group).

    Returns:
        The same ``pro_list`` object, after modification.
    """
    for fea_counts in pro_list:
        for group_counts in fea_counts:
            for idx, count in enumerate(group_counts):
                if count == 0:
                    group_counts[idx] = 1
    return pro_list
# Compute the IV value of every feature.
def cal_IV(propor_array):
    """Compute the IV value of every feature from its per-group counts.

    For each feature the counts of both sample classes are turned into
    per-group shares, and ``(white_share - black_share) *
    ln(white_share / black_share)`` is summed over the groups.

    Args:
        propor_array: One entry per feature, each a 2 x groups table whose
            first row holds the black counts and second row the white
            counts. Nested lists and numpy arrays are both accepted.

    Returns:
        A list with one IV value (float) per feature.

    Raises:
        ZeroDivisionError: If a class total or a black count is 0.
        ValueError: If a white count is 0 (logarithm of 0).
    """
    IV_list = []
    for fea_counts in propor_array:
        # asarray lets plain nested lists be used as well as numpy arrays.
        pro_part_array = np.asarray(fea_counts).T
        # Debug output kept from the original demo (valid on Python 2 and 3).
        print(pro_part_array)
        black_sum, white_sum = pro_part_array.sum(axis=0)
        print("%s %s" % (black_sum, white_sum))
        IV = 0.0
        # Each row of the transposed table is one group: (black, white).
        for black_count, white_count in pro_part_array:
            DB = float(black_count) / float(black_sum)
            DG = float(white_count) / float(white_sum)
            IV += (DG - DB) * log(DG / DB)
        IV_list.append(IV)
    return IV_list