Sklearn: Ensembles, Hyperparameter Tuning (Grid Search), and Cross-Validation
I have been experimenting with machine-learning methods for disease risk prediction. For text and numeric data, the tasks broadly divide into classification and regression. Many medical papers use Logistic Regression, a binary classifier, for basic risk prediction, answering yes/no questions; for predicting survival time, Cox Regression is the standard choice. Because these models are applied to medical data, they are usually expected to combine high accuracy, strong interpretability, and good generalization, and these three requirements can conflict. The more interpretable approaches, such as rule-based methods, Logistic Regression (LR), Decision Trees (DT), and Bayesian methods, often fall short on accuracy, while the more accurate ones, such as Support Vector Machines (SVM), multilayer perceptrons (ANN/MLP), and Deep Learning (DL), offer little interpretability.
Since I lean toward working on interpretability, and LR is already widely used, I planned to improve on LR. By coincidence, I came across the idea of transforming the raw features with GBDT and then feeding the transformed features into LR. The approach comes from a 2014 Facebook study run on millions of the company's records, and it has inspired related research since. The paper also explains the motivation: the boosted decision trees perform a non-linear transformation of the features, and the result is then fed into LR (Figure 1).
Figure 1
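To make the pipeline concrete before the full experiments below, here is a minimal sketch of the idea (my own toy example, not the paper's exact setup): each sample is mapped to the index of the leaf it lands in within every tree, those indices are one-hot encoded, and that sparse binary vector is what LR consumes.
# Minimal sketch of the GBDT leaf encoding feeding LR (toy data, illustrative only)
from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

X, y = make_classification(n_samples=1000, random_state=0)
# Disjoint halves: one grows the trees, the other fits the LR on top
X_gbdt, X_lr, y_gbdt, y_lr = train_test_split(X, y, test_size=0.5, random_state=0)
gbdt = GradientBoostingClassifier(n_estimators=20).fit(X_gbdt, y_gbdt)
enc = OneHotEncoder(categories='auto')
# apply() returns the leaf index per sample per tree: shape (n_samples, n_trees, 1)
enc.fit(gbdt.apply(X_gbdt)[:, :, 0])
lr = LogisticRegression(solver='liblinear').fit(
    enc.transform(gbdt.apply(X_lr)[:, :, 0]), y_lr)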
The boosted decision trees can be GBDT (Gradient Boosting Decision Tree) or XGBoost; for this purpose there is no significant difference between the two. For convenience, sklearn provides ready-made GBDT and LR implementations that can be called directly; if you prefer XGBoost, an official Python package is available.
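If XGBoost is used, its sklearn wrapper exposes, as far as I can tell, the same leaf-index interface through apply(), except that it returns a 2-D array, so no [:, :, 0] slice is needed; a sketch:
# XGBoost as a drop-in for the leaf encoding (sketch; same toy setup as above)
import xgboost as xgb
from sklearn.datasets import make_classification
X, y = make_classification(n_samples=1000, random_state=0)
bst = xgb.XGBClassifier(n_estimators=20).fit(X, y)
leaves = bst.apply(X)  # shape (n_samples, n_trees): one leaf id per tree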
However, much like several other bloggers' experiments, my validation on two disease datasets showed that GBDT+LR brings no significant improvement and can even underperform GBDT alone; the reasons are worth investigating. The Facebook paper's choice of evaluation metric is also interesting: instead of emphasizing the usual AUC or F1 score, it relies on Normalized Entropy (NE), and the justification for that choice can be found in the original paper.
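For reference, my reading of the paper's definition is that NE is the average log loss of the predictions normalized by the entropy of the background positive rate, so a value below 1 means the model beats always predicting the base rate; a minimal sketch:
# Normalized Entropy (NE), as I understand the Facebook paper's definition
import numpy as np
from sklearn.metrics import log_loss

def normalized_entropy(y_true, y_prob):
    p = np.mean(y_true)  # background positive rate (assumed strictly between 0 and 1)
    base_entropy = -(p * np.log(p) + (1 - p) * np.log(1 - p))
    return log_loss(y_true, y_prob) / base_entropy  # < 1 beats the base-rate predictor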
Below is a test I ran on the breast cancer dataset that ships with sklearn.
1. Loading the data
#load dataset from the library
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer

data = load_breast_cancer(return_X_y=True)  # tuple: (feature matrix, labels)
data_p = []  # positive samples (label 1)
data_n = []  # negative samples (label 0)
for i in range(len(data[1])):
    if data[1][i] == 1:
        data_p.append(data[0][i])
    else:
        data_n.append(data[0][i])
data_p = pd.DataFrame(np.array(data_p))
data_n = np.array(data_n)
# Balance the classes by downsampling the positives to the number of negatives
data_p = data_p.sample(data_n.shape[0], axis=0)
label_p = [1]*data_p.shape[0]
label_n = [0]*data_n.shape[0]
data_comb = np.vstack((data_p, data_n))  # balanced feature matrix
label_p.extend(label_n)
label = np.array(label_p)  # matching label vector
print(label)
print(data_comb.shape)
2. Model building and hyperparameter tuning
#Cross-Validation & GridSearchCV
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from matplotlib.colors import ListedColormap
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier,GradientBoostingClassifier,BaggingClassifier,ExtraTreesClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import make_scorer,accuracy_score, precision_score, recall_score,f1_score,confusion_matrix,classification_report,roc_auc_score
import xgboost as xgb
x,y = shuffle(data_comb,label, random_state=10)
x = StandardScaler().fit_transform(x)
print(x)
print(y)
'''
#lsvc = LinearSVC(C=0.01, penalty="l1", dual=False).fit(x, y)
lsvc = LogisticRegression().fit(x,y)
print(lsvc.coef_)
model = SelectFromModel(lsvc, prefit=True)
#print(model.threshold_)
x_new = model.transform(x)
print(x_new.shape)
'''
#x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
#parameter for LogisticRegression
#parameter = {'C': [1, 10, 100, 1000],'solver':['liblinear','newton-cg','lbfgs','sag','saga']}
#parameter for SVM
#parameter = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],'C': [1, 10, 100, 1000]},{'kernel': ['linear'], 'C': [1, 10, 100, 1000]},{'kernel':['poly'], 'gamma': [1e-3, 1e-4],'C': [1, 10, 100, 1000]},{'kernel':['sigmoid'], 'gamma': [1e-3, 1e-4],'C': [1, 10, 100, 1000]}]
#parameter for DecisionTree
#parameter = {'criterion':['gini','entropy'],'max_features':['sqrt','log2']}
#parameter for RandomForest
#parameter = {'n_estimators':range(5,100,5),'criterion':['gini','entropy'],'max_depth':[1,2,3,4,5,6,7,8,9,10],'max_features':['sqrt','log2']}
#parameter for xgboost
#parameter = {'learning_rate': np.linspace(0.01,0.3,num=30),'n_estimators':range(5,300,5),'max_depth': [1,2,3, 4, 5, 6, 7, 8, 9, 10]}
#parameter for adaboost
#parameter = {'n_estimators': range(5,300,5),"learning_rate":[0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]}
#parameter for bagging
#parameter = {'n_estimators': range(5,100,5)}
#parameter for GBDT
parameter = {'learning_rate': np.linspace(0.10,0.3,num=20),'n_estimators':range(10,100,5),'max_depth': [3, 4, 5, 6, 7, 8]}
#parameter for MLP
'''
parameter = {
    'hidden_layer_sizes': [(50,50,50), (50,100,50), (100,)],
    'activation': ['tanh', 'relu'],
    'solver': ['sgd', 'adam'],
    'alpha': [0.0001, 0.05],
    'learning_rate': ['constant','adaptive'],
}
'''
#scoring = {'roc_auc':'roc_auc','accuracy':make_scorer(accuracy_score),'precision':make_scorer(precision_score),'recall':make_scorer(recall_score),'f1':make_scorer(f1_score)}
scoring = {'roc_auc':'roc_auc'}
#clf = LogisticRegression(max_iter=500)
#clf = SVC()
#clf = DecisionTreeClassifier()
#clf = RandomForestClassifier()
#clf = xgb.XGBClassifier( booster='gbtree')
#clf = AdaBoostClassifier()
#clf = BaggingClassifier(LogisticRegression())
clf = GradientBoostingClassifier()
#clf = MLPClassifier(max_iter=500)
#clf = GaussianNB()
model = GridSearchCV(clf,parameter,cv=5,scoring=scoring,n_jobs=-1,verbose=2,return_train_score=False,refit = 'roc_auc')
#fit = model.fit(x_train,y_train)
fit = model.fit(x,y)
#results = fit.cv_results_
#print("results : ",results)
print("best estimator : ",fit.best_estimator_)
print("best score : ",fit.best_score_)
print("best parameters : ",fit.best_params_)
3. Standalone GBDT and LR models
#Cross-validation with StratifiedKFold
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from matplotlib.colors import ListedColormap
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, BaggingClassifier, ExtraTreesClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.model_selection import GridSearchCV, cross_val_score, cross_validate, StratifiedKFold
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report, roc_auc_score
import xgboost as xgb

x, y = shuffle(data_comb, label, random_state=10)
x = StandardScaler().fit_transform(x)
print(x.shape)

# GBDT-based feature selection: keep the features whose importance clears the threshold
#lsvc = LinearSVC(C=0.01, penalty="l1", dual=False).fit(x, y)
#lsvc = LogisticRegression().fit(x,y)
lsvc = GradientBoostingClassifier(random_state=0).fit(x, y)
#print(lsvc.feature_importances_)
#print(lsvc.coef_)
model = SelectFromModel(lsvc, threshold=0.0001, prefit=True)
#print(model.threshold_)
x_new = model.transform(x)
x_new = StandardScaler().fit_transform(x_new)
print(x_new.shape)

skf = StratifiedKFold(n_splits=5)
#scoring = {'roc_auc':'roc_auc','accuracy':make_scorer(accuracy_score),'precision':make_scorer(precision_score),'recall':make_scorer(recall_score),'f1':make_scorer(f1_score)}
#scoring = {'roc_auc':'roc_auc'}
AUC = []
Accuracy = []
Sensitivity = []
Specificity = []
F1 = []
for train_index, test_index in skf.split(x_new, y):
    x_train, x_test = x_new[train_index], x_new[test_index]
    y_train, y_test = y[train_index], y[test_index]
    # Pick one classifier; the commented alternatives allow quick switching
    #clf = LogisticRegression(max_iter=1000,solver='liblinear', C=1)
    #clf = SVC(C=1, kernel='rbf', probability=True)
    #clf = DecisionTreeClassifier(criterion='entropy',max_features='log2')
    #clf = RandomForestClassifier()
    #clf = xgb.XGBClassifier()
    #clf = AdaBoostClassifier()
    #clf = BaggingClassifier(LogisticRegression())
    #clf = GradientBoostingClassifier(learning_rate=0.268, max_depth=3,n_estimators=85)
    clf = GradientBoostingClassifier()
    #clf = MLPClassifier(activation='relu', alpha=0.05, hidden_layer_sizes=(50,50), learning_rate='constant',solver='adam')
    #clf = GaussianNB()
    fit = clf.fit(x_train, y_train)
    y_pred = fit.predict(x_test)
    y_test_pred_proba = fit.predict_proba(x_test)
    y_true = y_test
    confusion = confusion_matrix(y_true, y_pred)
    tn, fp, fn, tp = confusion.ravel()
    f1 = f1_score(y_true, y_pred)
    accuracy = accuracy_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_test_pred_proba[:, 1])
    sensitivity = tp / (tp + fn)  # true positive rate (= recall)
    specificity = tn / (tn + fp)  # true negative rate
    F1.append(f1)
    AUC.append(roc_auc)
    Accuracy.append(accuracy)
    Sensitivity.append(sensitivity)
    Specificity.append(specificity)
# Append the cross-fold mean as the final entry of each list
AUC.append(np.mean(AUC))
F1.append(np.mean(F1))
Accuracy.append(np.mean(Accuracy))
Sensitivity.append(np.mean(Sensitivity))
Specificity.append(np.mean(Specificity))
print("Sensitivity : ", Sensitivity)
print("Specificity : ", Specificity)
print("Accuracy : ", Accuracy)
print("F1 Score : ", F1)
print("AUC : ", AUC)
4. GBDT+LR
import numpy as np
np.random.seed(10)
import pandas as pd
import matplotlib.pyplot as plt
import itertools
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import (RandomTreesEmbedding, RandomForestClassifier,
                              GradientBoostingClassifier)
from sklearn.svm import SVC
from sklearn.feature_selection import SelectFromModel
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, BaggingClassifier, ExtraTreesClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import roc_curve
from sklearn.pipeline import make_pipeline
from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report, roc_auc_score

n_estimator = 5  # number of boosting trees used for the leaf encoding
x, y = shuffle(data_comb, label, random_state=10)
x = StandardScaler().fit_transform(x)

# Same GBDT-based feature selection as in section 3
#lsvc = LinearSVC(C=0.01, penalty="l1", dual=False).fit(x, y)
#lsvc = LogisticRegression().fit(x,y)
lsvc = GradientBoostingClassifier(random_state=0).fit(x, y)
#print(lsvc.feature_importances_)
#print(lsvc.coef_)
model = SelectFromModel(lsvc, threshold=0.0001, prefit=True)
#print(model.threshold_)
x_new = model.transform(x)
#x_new = StandardScaler().fit_transform(x_new)
print(x_new.shape)

AUC = []
Accuracy = []
Sensitivity = []
Specificity = []
F1 = []
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=10)  # random_state requires shuffle=True
for train_index, test_index in skf.split(x_new, y):
    X_train, X_test = x_new[train_index], x_new[test_index]
    y_train, y_test = y[train_index], y[test_index]
    # Split the training fold in half: one part grows the trees, the other fits
    # the LR on the leaf features, so the LR is not trained on leaves the trees
    # have already memorized
    X_train, X_train_lr, y_train, y_train_lr = train_test_split(
        X_train, y_train, test_size=0.5, random_state=10)
    """
    X_train_original = X_train[:,:43]
    X_train_lr_original = X_train_lr[:,:43]
    X_test_original = X_test[:,:43]
    X_train_cross = X_train[:,43:]
    X_train_lr_cross = X_train_lr[:,43:]
    X_test_cross = X_test[:,43:]
    """
    # Supervised transformation based on gradient boosted trees
    grd = GradientBoostingClassifier(n_estimators=n_estimator)
    grd_enc = OneHotEncoder(categories='auto')
    grd_lm = LogisticRegression(solver='liblinear', max_iter=1000)
    #grd_lm = GradientBoostingClassifier()
    #grd_lm = DecisionTreeClassifier()
    grd.fit(X_train, y_train)
    print("*******************apply*****************************")
    # apply() yields the leaf index per sample per tree; one-hot encode those indices
    grd_enc.fit(grd.apply(X_train)[:, :, 0])
    onehot_train_lr = grd_enc.transform(grd.apply(X_train_lr)[:, :, 0]).toarray()
    # Concatenate the one-hot leaf features with the original features
    total_train_lr = np.hstack((onehot_train_lr, X_train_lr))
    print(total_train_lr.shape)
    #total_train_lr = StandardScaler().fit_transform(total_train_lr)
    grd_lm.fit(total_train_lr, y_train_lr)
    #grd_lm.fit(grd_enc.transform(grd.apply(X_train_lr)[:, :, 0]), y_train_lr)
    onehot_test = grd_enc.transform(grd.apply(X_test)[:, :, 0]).toarray()
    total_test = np.hstack((onehot_test, X_test))
    #total_test = StandardScaler().fit_transform(total_test)
    """
    y_pred_grd_lm_proba = grd_lm.predict_proba(
        grd_enc.transform(grd.apply(X_test)[:, :, 0]))
    y_pred_grd_lm = grd_lm.predict(
        grd_enc.transform(grd.apply(X_test)[:, :, 0]))
    """
    y_pred_grd_lm_proba = grd_lm.predict_proba(total_test)
    y_pred_grd_lm = grd_lm.predict(total_test)
    # ROC curve should be computed from scores, not hard labels
    fpr_grd_lm, tpr_grd_lm, _ = roc_curve(y_test, y_pred_grd_lm_proba[:, 1])
    f1 = f1_score(y_test, y_pred_grd_lm)
    confusion = confusion_matrix(y_test, y_pred_grd_lm)
    tn, fp, fn, tp = confusion.ravel()
    accuracy = accuracy_score(y_test, y_pred_grd_lm)
    roc_auc = roc_auc_score(y_test, y_pred_grd_lm_proba[:, 1])
    sensitivity = tp / (tp + fn)
    specificity = tn / (tn + fp)
    F1.append(f1)
    AUC.append(roc_auc)
    Accuracy.append(accuracy)
    Sensitivity.append(sensitivity)
    Specificity.append(specificity)
# Append the cross-fold mean as the final entry of each list
AUC.append(np.mean(AUC))
F1.append(np.mean(F1))
Accuracy.append(np.mean(Accuracy))
Sensitivity.append(np.mean(Sensitivity))
Specificity.append(np.mean(Specificity))
print("Sensitivity : ", Sensitivity)
print("Specificity : ", Specificity)
print("Accuracy : ", Accuracy)
print("F1 Score : ", F1)
print("AUC : ", AUC)