
Getting to Know Data Algorithms: Logistic Regression

2018-08-15 · 我叫钱小钱

Special note: this article is for study and exchange only; interested readers are welcome to leave a comment and discuss.


# 1. StandardScaler implements z-score standardization
# 2. MinMaxScaler implements min-max scaling
import pandas as pd
from sklearn.preprocessing import StandardScaler

# StandardScaler expects a 2-D array, hence the reshape(-1, 1)
source_data['New_Amount'] = StandardScaler().fit_transform(source_data['Amount'].values.reshape(-1, 1))
data = source_data.drop(columns=['Amount', 'Time'])

# Check how imbalanced the positive/negative labels are
source_data['Class'].value_counts()
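To make the two scalers concrete, here is a minimal, self-contained sketch on toy data of my own choosing (not from the article):

import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler

toy = np.array([[1.0], [2.0], [3.0], [4.0]])        # hypothetical toy column
print(StandardScaler().fit_transform(toy).ravel())  # z-score: (x - mean) / std
print(MinMaxScaler().fit_transform(toy).ravel())    # min-max: (x - min) / (max - min)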

Resampling addresses the imbalance between the label classes. There are two common approaches: undersampling (representative algorithm: EasyEnsemble) and oversampling (representative algorithm: SMOTE). Both ideas are quite simple.

1. Undersampling: discard some negative (majority-class) examples until the positive and negative counts are close, then train. Because many negatives are thrown away, the training set becomes much smaller than the original one, which can lead to underfitting.

2. Oversampling: generate additional positive (minority-class) examples until the counts are close, then train. Note that you cannot simply duplicate the original positives, or the model will overfit badly; SMOTE instead borrows another machine-learning algorithm, KNN (k-nearest neighbors), to synthesize new samples, as sketched below.
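A minimal sketch of the core SMOTE idea (my own illustration, not the imblearn implementation): pick a minority sample, pick one of its k nearest minority-class neighbors, and interpolate between the two at a random fraction:

import numpy as np
from sklearn.neighbors import NearestNeighbors

def smote_one_sample(X_minority, k=5, rng=np.random.default_rng(0)):
    """Synthesize one new minority sample by KNN interpolation."""
    nn = NearestNeighbors(n_neighbors=k + 1).fit(X_minority)
    i = rng.integers(len(X_minority))
    # kneighbors returns the point itself first, so sample from the rest
    neighbors = nn.kneighbors(X_minority[i:i + 1], return_distance=False)[0]
    j = rng.choice(neighbors[1:])
    lam = rng.random()  # random fraction in [0, 1)
    return X_minority[i] + lam * (X_minority[j] - X_minority[i])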

from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix

# The positive and negative labels in this dataset are wildly imbalanced, so we resample
# Two common approaches: 1. undersampling (e.g. EasyEnsemble)  2. oversampling (e.g. SMOTE)

# Separate features and labels, then split off a test set
labels = data['Class']
features = data.drop(columns=['Class'])

x_train, x_test, y_train, y_test = train_test_split(features,
                                                    labels,
                                                    test_size=0.3,
                                                    random_state=0)

# Apply SMOTE oversampling -- fit it on the training set only,
# so no synthetic information leaks into the test set
over_sample = SMOTE(random_state=0)
os_x_train, os_y_train = over_sample.fit_resample(x_train, y_train)
os_x_train = pd.DataFrame(os_x_train)
os_y_train = pd.DataFrame(os_y_train)
import numpy as np
from sklearn.metrics import roc_curve, auc

# Tiny worked example (from the scikit-learn docs): label 2 is the positive class
y = np.array([1, 1, 2, 2])
pred = np.array([0.1, 0.4, 0.35, 0.8])
fpr, tpr, thresholds = roc_curve(y, pred, pos_label=2)
fpr             # array([ 0. ,  0.5,  0.5,  1. ])
tpr             # array([ 0.5,  0.5,  1. ,  1. ])
thresholds      # array([ 0.8 ,  0.4 ,  0.35,  0.1 ])
auc(fpr, tpr)   # 0.75  (newer sklearn versions prepend an extra (0, 0) point)
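These arrays are all you need to draw the ROC curve; a minimal matplotlib sketch of the kind of plot discussed next:

import matplotlib.pyplot as plt

plt.plot(fpr, tpr, label='ROC curve (AUC = %.2f)' % auc(fpr, tpr))
plt.plot([0, 1], [0, 1], linestyle='--', label='random guess')  # chance diagonal
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc='lower right')
plt.show()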

With that, this ROC plot should now be quite easy to understand.


The following two links explain the confusion matrix and the metrics derived from it very clearly; if anything is still unclear, they are worth a read.
Extended reading on confusion matrices and evaluation metrics, part 1 (external link)
Extended reading on confusion matrices and evaluation metrics, part 2 (external link)
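As a quick refresher (toy numbers of my own choosing), recall and precision fall straight out of the matrix's four cells:

import numpy as np
from sklearn.metrics import confusion_matrix

y_true = np.array([0, 0, 1, 1, 1, 0, 1, 0])  # hypothetical labels
y_pred = np.array([0, 1, 1, 1, 0, 0, 1, 0])  # hypothetical predictions
tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
recall = tp / (tp + fn)     # of all true positives, how many were caught
precision = tp / (tp + fp)  # of all predicted positives, how many were right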
import itertools
import numpy as np
import matplotlib.pyplot as plt

def plot_confusion_matrix(cm,
                          classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

    im = plt.imshow(cm, interpolation='nearest', cmap=cmap, vmax=cm.max() / 3.)
    plt.title(title, fontsize=16)
    plt.colorbar(im, shrink=0.63, pad=0.05)
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=0, fontsize=14)
    plt.yticks(tick_marks, classes, fontsize=14, rotation=90)

    # Print counts as integers, normalized values with two decimals
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 fontsize=16,
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label', fontsize=14)
    plt.xlabel('Predicted label', fontsize=14)
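The threshold sweep below uses a fitted model `lr` that the excerpt never shows being trained; a minimal sketch, assuming the SMOTE variables from earlier:

from sklearn.linear_model import LogisticRegression

# C = 0.01 is one plausible choice; the cross-validation routine further
# below is what actually searches for a good value
lr = LogisticRegression(C=0.01, penalty='l1', solver='liblinear')
lr.fit(os_x_train.values, os_y_train.values.ravel())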

# Evaluation metric: recall = TP / (TP + FN)
plt.figure(figsize=(12, 6))

# Sweep the decision threshold from 0.3 to 0.8 and plot a confusion
# matrix for each, to see the precision/recall trade-off
for i, t in enumerate(range(3, 9), start=1):
    threshold = t / 10
    y_proba = lr.predict_proba(x_test.values)[:, 1] > threshold
    plt.subplot(2, 3, i)
    matrix = confusion_matrix(y_test, y_proba)
    plot_confusion_matrix(matrix, classes=['Label_0', 'Label_1'], title='Thresholds > %s' % threshold)
# Cross-validation: split the (oversampled) training set into K folds
import numpy as np
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import recall_score

def kfold_result_info(x, y, n=5):
    os_x_train = x
    os_y_train = y
    # K-fold cross-validation
    kfold = KFold(n_splits=n, shuffle=False)
    for num in range(5):
        # C is the inverse of the regularization strength; try a range of values
        c_param = 0.01 * 10 ** num
        print('Regularization L1 Param %s :' % c_param)
        recall_list = []

        for i, (train_idx, val_idx) in enumerate(kfold.split(os_x_train), start=1):
            # Build a logistic regression model with L1 regularization
            lr = LogisticRegression(C=c_param, penalty='l1', solver='liblinear')
            lr.fit(os_x_train.iloc[train_idx, :], os_y_train.iloc[train_idx, :].values.ravel())
            os_pre_result = lr.predict(os_x_train.iloc[val_idx, :].values)
            # Evaluation metric: recall = TP / (TP + FN)
            re_call = recall_score(os_y_train.iloc[val_idx, :].values, os_pre_result)
            recall_list.append(re_call)

            print('\t Cross-val ReCall_Score: %s' % re_call)

        mean_score = np.mean(recall_list)
        print('\nRound %s Mean Score : %s' % (num + 1, mean_score))
        print('\n-----------------------------------------------')
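A typical invocation, using the oversampled training set from the SMOTE step above:

kfold_result_info(os_x_train, os_y_train, n=5)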
