13. ROC-AUC-KS and Clustering

2020-04-08  羽天驿

I. ROC, AUC, and KS

1. ROC: the Receiver Operating Characteristic curve


(1) Implementation by calling the library function:

Code:

import numpy as np
from sklearn.metrics import roc_curve
import matplotlib.pyplot as plt
y_true = [1,1,1,1,1,1,0,0,0,1,1,0,0,1,1,0,0,1,1,0,0]  # true class labels
# Probabilities: classification problems are, at bottom, probability problems.
# Is KNN classification probability-based? Yes: the nearest neighbours vote, and the vote share is a probability.
# Is LogisticRegression probability-based? Yes: the sigmoid turns the linear combination into a probability.
# Is a random forest probability-based? Yes: many decision trees vote, and the vote share is a probability.
# Is DecisionTreeClassifier probability-based? Not really: it builds a tree using entropy or the Gini index.
# proba is the predicted probability that a sample belongs to class 1; the probability of class 0 is 1 - proba.
proba = np.array([0.42,0.73,0.55,0.37,0.57,0.70,0.25,0.23,0.46,0.62,0.76,0.46,0.55,0.56,0.56,0.38,0.37,0.73,0.77,0.21,0.39])

fpr,tpr,thresholds = roc_curve(y_true,proba)  # returns 3 arrays: fpr (x-axis), tpr (y-axis), and the thresholds used
plt.plot(fpr,tpr,color = 'red')
plt.xlabel('FPR',fontsize = 15)
plt.ylabel('TPR',fontsize = 15)
plt.fill_between(fpr,tpr,color = 'green',alpha = 0.3)
(Figure output_1_1.png: the ROC curve drawn from fpr and tpr.)
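The comments above point out that most classifiers ultimately produce probabilities; in practice a proba vector like the hard-coded one would come from a fitted model's predict_proba. A minimal sketch of that step (the dataset, model, and split below are illustrative assumptions, not part of this example):

from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# assumed example data and model, only to show where a proba vector comes from
X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

model = LogisticRegression(max_iter=5000)  # raise max_iter so the solver converges
model.fit(X_train, y_train)

# predict_proba returns one column per class; column 1 is P(class = 1),
# which is exactly what roc_curve expects as its score argument
proba_test = model.predict_proba(X_test)[:, 1]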
# convert the probabilities to class labels with a threshold of 0.5
y_ = (proba >= 0.5).astype(np.int8)
display(y_,(y_ == y_true).mean())
array([0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0], dtype=int8)
0.8571428571428571
# convert the probabilities to class labels with a threshold of 0.8
y_ = (proba >= 0.8).astype(np.int8)
display(y_,(y_ == y_true).mean())
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int8)
0.42857142857142855
# convert the probabilities to class labels with a threshold of 0.4
y_ = (proba >= 0.4).astype(np.int8)
display(y_ ,(y_==y_true).mean())
array([1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0], dtype=int8)
0.8095238095238095
for threshold in np.arange(0.1,0.9,0.05):
    y_ = (proba >= threshold).astype(np.int8)
    print(threshold,(y_ == y_true).mean())
0.1 0.5714285714285714
0.15000000000000002 0.5714285714285714
0.20000000000000004 0.5714285714285714
0.25000000000000006 0.7142857142857143
0.30000000000000004 0.7142857142857143
0.3500000000000001 0.7142857142857143
0.40000000000000013 0.8095238095238095
0.45000000000000007 0.7619047619047619
0.5000000000000001 0.8571428571428571
0.5500000000000002 0.8571428571428571
0.6000000000000002 0.7142857142857143
0.6500000000000001 0.6666666666666666
0.7000000000000002 0.6190476190476191
0.7500000000000002 0.5238095238095238
0.8000000000000002 0.42857142857142855
0.8500000000000002 0.42857142857142855
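The section title also mentions KS. For a binary classifier, the KS (Kolmogorov-Smirnov) statistic is just the largest gap between TPR and FPR over all thresholds, so it can be read straight off the arrays that roc_curve returned above. A small added sketch (not part of the original notebook):

# KS is the maximum vertical distance between the TPR and FPR curves
ks = np.max(tpr - fpr)
ks_threshold = thresholds[np.argmax(tpr - fpr)]  # the threshold at which that maximum gap occurs
print('KS statistic:', ks, 'at threshold', ks_threshold)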

(2) Implementing it by hand:

Code:

import numpy as np

import matplotlib.pyplot as plt

# confusion matrix
from sklearn.metrics import confusion_matrix

# AUC ----> area under the (ROC) curve
from sklearn.metrics import auc  # area between the ROC curve and the axes; the larger the better, 1 at most
y_true = [1,1,1,1,1,1,0,0,0,1,1,0,0,1,1,0,0,1,1,0,0]  # true class labels
proba = np.array([0.42,0.73,0.55,0.37,0.57,0.70,0.25,0.23,0.46,0.62,0.76,0.46,0.55,0.56,0.56,0.38,0.37,0.73,0.77,0.21,0.39])
# thresholds
thresholds = np.arange(0,1.0,0.05)
thresholds
array([0.  , 0.05, 0.1 , 0.15, 0.2 , 0.25, 0.3 , 0.35, 0.4 , 0.45, 0.5 ,
       0.55, 0.6 , 0.65, 0.7 , 0.75, 0.8 , 0.85, 0.9 , 0.95])
y_pred = (proba >= 0.4).astype(np.int8)
cm = confusion_matrix(y_true,y_pred)
cm
array([[ 6,  3],
       [ 1, 11]], dtype=int64)
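sklearn lays the confusion matrix out with true labels on the rows and predicted labels on the columns, in sorted label order, so for labels 0 and 1 the cells are [[TN, FP], [FN, TP]]. A quick way to unpack it (a small illustrative snippet, not from the original notebook):

# ravel() flattens the 2x2 matrix row by row: TN, FP, FN, TP
tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
print('TN:', tn, 'FP:', fp, 'FN:', fn, 'TP:', tp)  # here: TN=6, FP=3, FN=1, TP=11
print('TPR:', tp / (tp + fn), 'FPR:', fp / (fp + tn))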
tprs = []
fprs = []
# the thresholds here are chosen by hand
for t in thresholds:
    # use the threshold to turn the probabilities proba into class labels
    y_pred = (proba >= t).astype(np.int8)
    cm = confusion_matrix(y_true,y_pred)  # 2x2 matrix laid out as [[TN, FP], [FN, TP]]
    tpr = cm[1,1]/(cm[1,0] + cm[1,1])  # TPR = TP / (TP + FN)
    tprs.append(tpr)
    fpr = cm[0,1]/(cm[0,0] + cm[0,1])  # FPR = FP / (FP + TN)
    fprs.append(fpr)
tprs = np.asarray(tprs)
fprs = np.asarray(fprs)
plt.plot(fprs,tprs,color = 'red')
plt.xlabel('FPR')
plt.ylabel('TPR')
plt.fill_between(fprs,tprs,color ='green',alpha = 0.3)
area = auc(fprs,tprs)
print('AUC:',area)
AUC: 0.912037037037037
(Figure output_4_1.png: the hand-computed ROC curve.)
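As a sanity check (an added sketch, not in the original notebook), the hand-rolled trapezoidal area can be compared against sklearn's own roc_auc_score on the same labels and probabilities; the two should agree closely, with any small difference coming from the coarse 0.05 threshold grid used above:

from sklearn.metrics import roc_auc_score

# roc_auc_score works directly from the true labels and predicted probabilities,
# scanning every distinct score as a threshold instead of a fixed grid
print('sklearn AUC:', roc_auc_score(y_true, proba))
print('hand-rolled AUC:', auc(fprs, tprs))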

(3) How to use ROC-AUC in practice:

Example code:

Multi-class ROC-AUC example with KNN

Import packages and load the data

import numpy as np
import matplotlib.pyplot as plt

from sklearn import datasets
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split

# one-hot encoded output
from sklearn.preprocessing import label_binarize  # represent the labels in binary (indicator) form
from scipy import interp
from sklearn.metrics import roc_auc_score

# load the data
iris = datasets.load_iris()
X = iris.data
y = iris.target

Add noise to the original data, then split it

# make the data a bit harder
X = np.c_[X, np.random.randn(150, 800)]  # append 800 columns of random noise, i.e. 800 extra features

# shuffle and split training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.5)

Declare the algorithm, train it, and predict probabilities

classifier = KNeighborsClassifier()
classifier.fit(X_train,y_train)
y_score = classifier.predict_proba(X_test)

Compute fpr, tpr, and roc-auc

fpr = dict()
tpr = dict()
roc_auc = dict()
# label_binarize: one-hot encode the test labels
y_test = label_binarize(y_test, classes=[0, 1, 2])
n_classes = y_test.shape[1]  # number of classes (3), used by the loop below
for i in range(n_classes):
    # compute fpr and tpr for each class separately
    fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# compute the micro-average ROC curve and ROC area
fpr["micro"], tpr["micro"], _ = roc_curve(y_test.reshape(-1), y_score.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])
print(roc_auc)
{0: 0.73070987654321, 1: 0.36663879598662213, 2: 0.5972, 'micro': 0.5669333333333333}
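label_binarize turns the integer labels into a one-hot indicator matrix with one column per class, which is what allows each column to be scored as its own binary ROC problem. A tiny standalone illustration (with made-up labels):

from sklearn.preprocessing import label_binarize

print(label_binarize([0, 2, 1, 0], classes=[0, 1, 2]))
# [[1 0 0]
#  [0 0 1]
#  [0 1 0]
#  [1 0 0]]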


(4) Multi-class ROC-AUC with LogisticRegression (micro and macro averages)

Import packages and load the data

import numpy as np
import matplotlib.pyplot as plt

from sklearn import datasets
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split

# one-hot encoded output
from sklearn.preprocessing import label_binarize  # represent the labels in binary (indicator) form
from scipy import interp
from sklearn.metrics import roc_auc_score

# load the data
iris = datasets.load_iris()
X = iris.data
y = iris.target

Add noise to the original data, then split it

# make the data a bit harder
X = np.c_[X, np.random.randn(150, 800)]  # append 800 columns of random noise, i.e. 800 extra features
# shuffle and split training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.5)

Declare the algorithm, train it, and predict probabilities

classifier = LogisticRegression()
classifier.fit(X_train,y_train)
y_score = classifier.predict_proba(X_test)

Compute fpr, tpr, and roc-auc

fpr = dict()
tpr = dict()
roc_auc = dict()
# label_binarize: one-hot encode the test labels
y_test = label_binarize(y_test, classes=[0, 1, 2])
for i in range(3):
    # compute fpr and tpr for each class separately
    fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# compute the micro-average ROC curve and ROC area
fpr["micro"], tpr["micro"], _ = roc_curve(y_test.reshape(-1), y_score.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])
# macro average: interpolate all per-class ROC curves onto a common FPR grid and average them
fpr_mean = np.linspace(0,1.0,50)
tpr_mean = np.zeros_like(fpr_mean)
for i in range(3):
    tpr_mean += np.interp(fpr_mean, fpr[i], tpr[i])/3
fpr["macro"] = fpr_mean
tpr["macro"] = tpr_mean
roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])
print(roc_auc)
print((roc_auc[0] + roc_auc[1] + roc_auc[2])/3)
{0: 0.8385185185185186, 1: 0.39869281045751637, 2: 0.7231040564373898, 'micro': 0.6441777777777777, 'macro': 0.6529721412374474}
0.653438461804475
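The last printed value shows that the unweighted mean of the three per-class AUCs is very close to the interpolated macro value. As a cross-check (an added sketch, not in the original notebook), roc_auc_score can compute that macro average directly from the binarized labels and the score matrix:

from sklearn.metrics import roc_auc_score

# y_test is already the one-hot indicator matrix and y_score holds one probability column per class,
# so average='macro' returns the unweighted mean of the three per-class AUCs
print(roc_auc_score(y_test, y_score, average='macro'))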

Plot the ROC curve for one specific class, which reduces to a binary problem

plt.figure()
lw = 2
plt.plot(fpr[1], tpr[1], color='darkorange',
         lw=lw, label='ROC curve (area = %0.2f)' % roc_auc[1])
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
# plt.xlim([0.0, 1.0])
# plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend()
plt.show()
(Figure output_9_0.png: ROC curve for class 1.)

Plot the ROC-AUC curves for all classes

# Plot all ROC curves
plt.figure(figsize=(9,6))
# plot the micro-average ROC curve
plt.plot(fpr["micro"], tpr["micro"],
         label='micro-average ROC curve (area = {0:0.2f})'.format(roc_auc["micro"]),
          marker  = '*', linewidth=2)

# plot the macro-average ROC curve
plt.plot(fpr["macro"], tpr["macro"],
         label='macro-average ROC curve (area = {0:0.2f})'.format(roc_auc["macro"]),
         color='navy', linestyle=':', linewidth=2)

for i in range(3):
    plt.plot(fpr[i], tpr[i], label='ROC curve of class {0} (area = {1:0.2f})'.format(i, roc_auc[i]))
plt.plot([0, 1], [0, 1], 'k--', lw=lw)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('多分类受试者工作特征曲线',fontproperties = 'KaiTi',size = 20)  # Chinese title: multi-class receiver operating characteristic curves
plt.legend(loc="lower right")
plt.show()
(Figure output_11_0.png: micro-average, macro-average, and per-class ROC curves.)