XGBoost

from xgboost import XGBRegressor as XGBR
from sklearn.ensemble import RandomForestRegressor as RFR
from sklearn.linear_model import LinearRegression as LinearR
from sklearn.model_selection import KFold, cross_val_score as CVS, train_test_split as TTS
from sklearn.metrics import mean_squared_error as MSE
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from time import time
import datetime
import xgboost as xgb
xgb.set_config(verbosity=0)  # silence xgboost's global logging
from xgboost import XGBClassifier as XGBC
from sklearn.datasets import make_blobs  # for generating synthetic datasets
from sklearn.metrics import confusion_matrix as cm, recall_score as recall, roc_auc_score as auc
feature = pd.read_csv('C:/Users/Administrator.DESKTOP-4UQ3Q0K/Desktop/TCR_CMV_pred-master/code/feature_ac_h1.tsv',sep='\t')

X = feature[['TCR_sum', 'uniques']]
y = feature['status']

X = np.array(X)
y = np.array(y)

from sklearn.preprocessing import StandardScaler
# Standardize each feature to zero mean and unit variance
X = StandardScaler().fit_transform(X)
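A quick sanity check confirms what standardization actually does (means near 0, standard deviations near 1), rather than squeezing values into (0, 1):

print(X.mean(axis=0))  # each feature's mean is now approximately 0
print(X.std(axis=0))   # each feature's standard deviation is approximately 1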

Xtrain, Xtest, Ytrain, Ytest = TTS(X,y,test_size=0.3,random_state=420)  # random train/test split

# Modeling with the sklearn API #

clf = XGBC().fit(Xtrain,Ytrain)
ypred = clf.predict(Xtest)

clf.score(Xtest,Ytest)  # default evaluation metric - accuracy

cm(Ytest,ypred,labels=[1,0])  # list the majority class first
# 15 and 23 are correct classifications, 10 and 0 are errors; 15 minority-class
# samples are classified correctly and 0 minority-class samples are misclassified
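As a sketch of how to read this layout: with labels=[1,0], rows are the true classes (1 first) and columns the predictions, so ravel() yields TP, FN, FP, TN in that order:

# Unpack the matrix; with labels=[1,0] the order is TP, FN, FP, TN
tp, fn, fp, tn = cm(Ytest, ypred, labels=[1,0]).ravel()
print("TP:", tp, "FN:", fn, "FP:", fp, "TN:", tn)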

recall(Ytest,ypred)

auc(Ytest,clf.predict_proba(Xtest)[:,1])

# ratio of negative to positive samples
clf_ = XGBC(scale_pos_weight=3).fit(Xtrain,Ytrain)  # scale_pos_weight = (# negative)/(# positive), to handle class imbalance
ypred_ = clf_.predict(Xtest)
clf_.score(Xtest,Ytest)
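Rather than guessing the weight, the starting point recommended by the XGBoost documentation is the ratio of negative to positive training samples; a minimal sketch:

# Conventional default: scale_pos_weight = (# negative) / (# positive)
neg, pos = (Ytrain == 0).sum(), (Ytrain == 1).sum()
print("suggested scale_pos_weight:", neg / pos)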
# How do the model's recall, AUC and accuracy change as the positive-sample weight grows?
for i in [1,2,3,4,5]:
    clf_ = XGBC(scale_pos_weight=i).fit(Xtrain,Ytrain)
    ypred_ = clf_.predict(Xtest)
    print(i)
    print("\tAccuracy:{}".format(clf_.score(Xtest,Ytest)))
    print("\tRecall:{}".format(recall(Ytest,ypred_)))
    print("\tAUC:{}".format(auc(Ytest,clf_.predict_proba(Xtest)[:,1])))
dtrain = xgb.DMatrix(Xtrain,Ytrain)
dtest = xgb.DMatrix(Xtest,Ytest)

# Try the predict interface of the native xgboost library
param = {'verbosity':0,'objective':'binary:logistic',"eta":0.1,"scale_pos_weight":1}
num_round = 100

bst = xgb.train(param, dtrain, num_round)

preds = bst.predict(dtest)

# set the classification threshold manually
ypred = preds.copy()

ypred[preds > 0.5] = 1

ypred[ypred != 1] = 0
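The same hard labels can be produced in one step, since the boolean comparison casts directly to 0/1:

ypred = (preds > 0.5).astype(int)  # equivalent one-line thresholding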

Tuning scale_pos_weight

# spell out the parameter candidates
scale_pos_weight = [180,200,220]
names = ["negative vs positive: 180"
         ,"negative vs positive: 200"
         ,"negative vs positive: 220"]

[*zip(names,scale_pos_weight)]  # preview the (name, weight) pairs

# import evaluation metrics
from sklearn.metrics import accuracy_score as accuracy, recall_score as recall, roc_auc_score as auc

for name,i in zip(names,scale_pos_weight):
    param = {'verbosity':0,'objective':'binary:logistic'
            ,"eta":0.1,"scale_pos_weight":i}
    num_round = 100
    clf = xgb.train(param, dtrain, num_round)
    preds = clf.predict(dtest)
    ypred = preds.copy()
    ypred[preds > 0.5] = 1
    ypred[ypred != 1] = 0
    print(name)
    print("\tAccuracy:{}".format(accuracy(Ytest,ypred)))
    print("\tRecall:{}".format(recall(Ytest,ypred)))
    print("\tAUC:{}".format(auc(Ytest,preds)))
# We can also try different thresholds (train once per weight, then sweep thresholds)
for name,i in zip(names,scale_pos_weight):
    param = {'verbosity':0,'objective':'binary:logistic'
            ,"eta":0.1,"scale_pos_weight":i}
    clf = xgb.train(param, dtrain, num_round)
    preds = clf.predict(dtest)
    for thres in [0.3,0.5,0.7,0.9]:
        ypred = preds.copy()
        ypred[preds > thres] = 1
        ypred[ypred != 1] = 0
        print("{},thresholds:{}".format(name,thres))
        print("\tAccuracy:{}".format(accuracy(Ytest,ypred)))
        print("\tRecall:{}".format(recall(Ytest,ypred)))
        print("\tAUC:{}".format(auc(Ytest,preds)))
# the best is negative vs positive: 200, threshold 0.5
#   Accuracy:0.85416
#   Recall:0.81818
#   AUC:0.935353

Tuning the threshold doesn't seem to bring much improvement.
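If a finer sweep were still wanted, sklearn's precision_recall_curve evaluates every threshold implied by the scores in one pass; a minimal sketch, assuming preds still holds the predicted probabilities for dtest:

from sklearn.metrics import precision_recall_curve

# precision/recall at every candidate threshold, in a single pass
prec, rec, thresholds = precision_recall_curve(Ytest, preds)
for p, r, t in zip(prec, rec, thresholds):
    if r >= 0.8:  # e.g. only consider thresholds keeping recall >= 0.8
        print("threshold {:.3f}: precision {:.3f}, recall {:.3f}".format(t, p, r))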

def plot_learning_curve(estimator,title, X, y, 
                        ax=None, #which subplot to draw on
                        ylim=None, #y-axis limits
                        cv=None, #cross-validation strategy
                        n_jobs=None #number of parallel jobs
                       ):
    
    from sklearn.model_selection import learning_curve
    import matplotlib.pyplot as plt
    import numpy as np
    
    train_sizes, train_scores, test_scores = learning_curve(estimator, X, y
                                                            ,shuffle=True
                                                            ,cv=cv
                                                            ,random_state=420
                                                            ,n_jobs=n_jobs)      
    if ax is None:
        ax = plt.gca()
    ax.set_title(title)
    if ylim is not None:
        ax.set_ylim(*ylim)
    ax.set_xlabel("Training examples")
    ax.set_ylabel("Score")
    ax.grid() # draw a grid (optional)
    ax.plot(train_sizes, np.mean(train_scores, axis=1), 'o-'
            , color="r",label="Training score")
    ax.plot(train_sizes, np.mean(test_scores, axis=1), 'o-'
            , color="g",label="Test score")
    ax.legend(loc="best")
    return ax
cv = KFold(n_splits=10, shuffle = True, random_state=42) # cross-validation scheme
plot_learning_curve(XGBC(n_estimators=100,random_state=420)
                    ,"xgb",Xtrain,Ytrain,ax=None,cv=cv)
plt.show()

Choosing n_estimators (learning curve)

#=====【TIME WARNING:25 seconds】=====#

axisx = range(10,1010,50)
rs = []
for i in axisx:
    reg = XGBC(n_estimators=i,random_state=420)
    rs.append(CVS(reg,Xtrain,Ytrain,cv=cv).mean())
print(axisx[rs.index(max(rs))],max(rs))
plt.figure(figsize=(20,5))
plt.plot(axisx,rs,c="red",label="XGB")
plt.legend()
plt.show()
#======【TIME WARNING: 20s】=======#
axisx = range(10,50,5)
rs = []
var = []
ge = []
for i in axisx:
    reg = XGBC(n_estimators=i,random_state=420)
    cvresult = CVS(reg,Xtrain,Ytrain,cv=cv)
    # record the mean score (the bias term is 1 - mean)
    rs.append(cvresult.mean())
    # record the variance
    var.append(cvresult.var())
    # compute the controllable part of the generalization error
    ge.append((1 - cvresult.mean())**2+cvresult.var())
# print the parameter value with the highest accuracy, plus its variance
print(axisx[rs.index(max(rs))],max(rs),var[rs.index(max(rs))])
# print the parameter value with the lowest variance, plus its accuracy
print(axisx[var.index(min(var))],rs[var.index(min(var))],min(var))
# print the parameter value minimizing the controllable generalization error, plus its accuracy, variance and that error
print(axisx[ge.index(min(ge))],rs[ge.index(min(ge))],var[ge.index(min(ge))],min(ge))
plt.figure(figsize=(20,5))
plt.plot(axisx,rs,c="red",label="XGB")
plt.legend()
plt.show()

This shows that, scanning from 10 to 50 in steps of 5, 15 trees gives the best result.
So next we scan from 1 to 25 in steps of 1.

axisx = range(1,25,1)
rs = []
var = []
ge = []
for i in axisx:
    reg = XGBC(n_estimators=i,random_state=420)
    cvresult = CVS(reg,Xtrain,Ytrain,cv=cv)
    rs.append(cvresult.mean())
    var.append(cvresult.var())
    ge.append((1 - cvresult.mean())**2+cvresult.var())
print(axisx[rs.index(max(rs))],max(rs),var[rs.index(max(rs))])
print(axisx[var.index(min(var))],rs[var.index(min(var))],min(var))
print(axisx[ge.index(min(ge))],rs[ge.index(min(ge))],var[ge.index(min(ge))],min(ge))
rs = np.array(rs)
var = np.array(var)*0.01  # scale the variance down for plotting
plt.figure(figsize=(20,5))
plt.plot(axisx,rs,c="black",label="XGB")
# add the variance band lines
plt.plot(axisx,rs+var,c="red",linestyle='-.')
plt.plot(axisx,rs-var,c="red",linestyle='-.')
plt.legend()
plt.show()
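To close the loop, the chosen tree count can be refit on the training set and checked once against the held-out test set; a sketch assuming the fine scan also favors a small forest (substitute the winner printed above for the 15 used here):

# Refit with the chosen tree count and evaluate on the held-out split
final = XGBC(n_estimators=15, random_state=420).fit(Xtrain, Ytrain)
print("Accuracy:", final.score(Xtest, Ytest))
print("Recall:", recall(Ytest, final.predict(Xtest)))
print("AUC:", auc(Ytest, final.predict_proba(Xtest)[:, 1]))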