1.xgboost_cheatsheet
2021-01-30 本文已影响0人
许志辉Albert
1. xgboost速查表
1.1内置建模方式
- xgb.train训练方式
- DMatrix数据形态,不是DataFrame
1.1.1 读取libsvm格式数据并指定参数建模
import numpy as np
import pandas as pd
import pickle
import xgboost as xgb
# 基本例子,从libsvm文件中读取数据,做二分类
# 数据是libsvm的格式
#1 3:1 10:1 11:1 21:1 30:1 34:1 36:1 40:1 41:1 53:1 58:1 65:1 69:1 77:1 86:1 88:1 92:1 95:1 102:1 105:1 117:1 124:1
#0 3:1 10:1 20:1 21:1 23:1 34:1 36:1 39:1 41:1 53:1 56:1 65:1 69:1 77:1 86:1 88:1 92:1 95:1 102:1 106:1 116:1 120:1
#0 1:1 10:1 19:1 21:1 24:1 34:1 36:1 39:1 42:1 53:1 56:1 65:1 69:1 77:1 86:1 88:1 92:1 95:1 102:1 106:1 116:1 122:1
# Load the libsvm-format text files directly into DMatrix objects.
# NOTE(review): the train file is read from the working directory while the
# test file is read from ./data/ -- confirm that this asymmetry is intended.
dtrain = xgb.DMatrix('agaricus.txt.train')
dtest = xgb.DMatrix('./data/agaricus.txt.test')

# Hyper-parameters. The objective name must be exactly 'binary:logistic';
# the original had a stray space ('binary :logistic'), which is not a valid
# objective name.
param = {'max_depth': 2, 'eta': 1, 'silent': 1, 'objective': 'binary:logistic'}

# Datasets whose metrics are reported after every boosting round.
watchlist = [(dtest, 'eval'), (dtrain, 'train')]
num_round = 2
# `evals` is passed by keyword: recent xgboost releases only accept it that
# way, and older releases accept the keyword form as well.
bst = xgb.train(param, dtrain, num_round, evals=watchlist)

# Predicted probabilities for the positive class.
preds = bst.predict(dtest)

# Error rate: share of samples whose thresholded prediction differs from the label.
labels = dtest.get_label()
error_rate = float(np.mean((preds > 0.5) != labels))
print('错误率为%f' % error_rate)

# Persist the trained booster (the ./model directory must already exist).
bst.save_model('./model/0001.model')
1
1.1.2 配合pandas DataFrame格式数据建模
# 皮马印第安人糖尿病数据集 包含很多字段:怀孕次数 口服葡萄糖耐量试验中血浆葡萄糖浓度 舒张压(mm Hg) 三头肌组织褶厚度(mm)
# 2小时血清胰岛素(μU/ml) 体重指数(kg/(身高(m))^2) 糖尿病谱系功能 年龄(岁)
import pandas as pd
# Read the Pima Indians Diabetes CSV into a DataFrame and preview its first rows.
diabetes_csv = './data/Pima-Indians-Diabetes.csv'
data = pd.read_csv(diabetes_csv)
data.head()
2
import numpy as np
import pandas as pd
import pickle
import xgboost as xgb
from sklearn.model_selection import train_test_split
# Basic example: read a CSV file with pandas and train a binary classifier.
data = pd.read_csv('./data/Pima-Indians-Diabetes.csv')

# Split the rows into a train part and a test part (train_test_split defaults).
train, test = train_test_split(data)

# Column names used as features and as the classification target.
feature_columns = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness',
                   'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']
target_column = 'Outcome'

# Build DMatrix objects from the underlying numpy arrays (features, labels).
xgtrain = xgb.DMatrix(train[feature_columns].values, train[target_column].values)
xgtest = xgb.DMatrix(test[feature_columns].values, test[target_column].values)

# Hyper-parameters.
param = {'max_depth': 5, 'eta': 0.1, 'silent': 1, 'subsample': 0.7,
         'colsample_bytree': 0.7, 'objective': 'binary:logistic'}

# Datasets whose metrics are reported after every boosting round.
watchlist = [(xgtest, 'eval'), (xgtrain, 'train')]
num_round = 10
# `evals` is passed by keyword so the call works on old and new xgboost alike.
bst = xgb.train(param, xgtrain, num_round, evals=watchlist)

# Predicted probabilities for the positive class.
preds = bst.predict(xgtest)

# Error rate: share of samples whose thresholded prediction differs from the label.
labels = xgtest.get_label()
error_rate = float(np.mean((preds > 0.5) != labels))
# The message previously read '错误类为' (a typo); the sibling examples print '错误率为'.
print('错误率为%f' % error_rate)

# Persist the trained booster (the ./model directory must already exist).
bst.save_model('./model/0002.model')
3
1.2预估器建模方式(sklearn形态)
import warnings

# Silence library warnings for the rest of this example.
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import pickle
import xgboost as xgb
from sklearn.model_selection import train_test_split

try:
    import joblib
except ImportError:
    # Old scikit-learn releases shipped joblib under sklearn.externals.
    from sklearn.externals import joblib

# Basic example: read a CSV file with pandas and train a binary classifier
# through the scikit-learn style estimator API.
data = pd.read_csv('./data/Pima-Indians-Diabetes.csv')

# Split the rows into a train part and a test part (train_test_split defaults).
train, test = train_test_split(data)

# Separate the feature matrix X and the target vector y for both parts.
feature_columns = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness',
                   'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']
target_column = 'Outcome'
train_X = train[feature_columns].values
train_y = train[target_column].values
test_X = test[feature_columns].values
test_y = test[target_column].values

# Initialise the estimator. The original assigned it to `xgb_classfier` and
# passed `learing_rate`; both were typos (the first caused a NameError below).
xgb_classifier = xgb.XGBClassifier(
    n_estimators=20,
    max_depth=4,
    learning_rate=0.1,
    subsample=0.7,
    colsample_bytree=0.7,
)

# Fit on the training split.
xgb_classifier.fit(train_X, train_y)

# Predict class labels for the test split.
preds = xgb_classifier.predict(test_X)

# Error rate. The whole ratio is parenthesised so that `%` formats the final
# number instead of being applied before the division.
print('错误率为%f' % ((preds != test_y).sum() / float(test_y.shape[0])))

# Persist the fitted estimator (the ./model directory must already exist).
joblib.dump(xgb_classifier, './model/0003.model')
4
1.3 内置建模方式:交叉验证与高级功能
1.3.1 交叉验证
# 5-fold cross-validation with the 'error' metric and a fixed seed.
xgb.cv(
    param,
    dtrain,
    num_round,
    nfold=5,
    metrics={'error'},
    seed=0,
)
5
1.3.2 添加预处理的交叉验证
# 计算正负样本比,调整样本权重
def fpreproc(dtrain, dtest, param):
    """Set ``scale_pos_weight`` from the class balance of ``dtrain``.

    Args:
        dtrain: object exposing ``get_label()`` that returns the training
            labels as an array of 0/1 values.
        dtest: passed through untouched.
        param: parameter dict; updated in place with ``scale_pos_weight``
            set to (number of 0 labels) / (number of 1 labels).

    Returns:
        The tuple ``(dtrain, dtest, param)``.
    """
    label = dtrain.get_label()
    negatives = np.sum(label == 0)
    # The original wrote `label = 1` (assignment) here instead of a comparison.
    positives = np.sum(label == 1)
    param['scale_pos_weight'] = float(negatives) / positives
    return (dtrain, dtest, param)
# Cross-validate with fpreproc supplied as xgb.cv's preprocessing hook, so the
# sample weighting is computed before training; AUC is the reported metric.
cv_options = dict(nfold=5, metrics={'auc'}, seed=0, fpreproc=fpreproc)
xgb.cv(param, dtrain, num_round, **cv_options)
6
1.3.3 自定义损失函数与评估准则
# Announce the demo that uses a custom objective and a custom evaluation metric.
print('使用自定义函数进行交叉验证')
# Custom objective: it must supply the first and second derivative of the loss.
def logregobj(preds, dtrain):
    """Return gradient and hessian of the logistic loss.

    Args:
        preds: array of raw scores; a sigmoid is applied to them here.
        dtrain: object exposing ``get_label()`` that returns the true labels.

    Returns:
        ``(grad, hess)`` where ``grad = p - labels`` and ``hess = p * (1 - p)``
        with ``p = sigmoid(preds)``.
    """
    labels = dtrain.get_label()
    preds = 1.0 / (1.0 + np.exp(-preds))
    grad = preds - labels
    hess = preds * (1.0 - preds)
    return grad, hess
# Custom evaluation metric: measures how far predictions are from the labels.
def evalerror(preds, dtrain):
    """Return the classification error rate as a ``(name, value)`` pair.

    Predictions are thresholded at 0.0 rather than 0.5.
    NOTE(review): this presumably relies on xgboost handing raw margins (not
    probabilities) to the metric when a custom objective is used -- confirm
    against the xgboost documentation.

    Args:
        preds: array of predicted scores.
        dtrain: object exposing ``get_label()`` that returns the true labels.

    Returns:
        ``('error', fraction of samples where labels != (preds > 0.0))``.
    """
    labels = dtrain.get_label()
    # The original referenced the misspelled name `lables` here (NameError).
    mistakes = labels != (preds > 0.0)
    return 'error', float(np.sum(mistakes)) / len(labels)
# Datasets whose metrics are reported after every boosting round.
watchlist = [(dtest, 'eval'), (dtrain, 'train')]
# No 'objective' entry: the loss is supplied through the custom `obj` callable.
param = {'max_depth': 3, 'eta': 0.1, 'silent': 1}
num_round = 5

# Train with the custom objective and custom metric. `evals`, `obj` and `feval`
# are passed by keyword: recent xgboost releases no longer accept them
# positionally, while older releases accept the keyword form as well.
bst = xgb.train(param, dtrain, num_round,
                evals=watchlist, obj=logregobj, feval=evalerror)

# Cross-validate with the same custom objective and metric.
xgb.cv(param, dtrain, num_round, nfold=5, seed=0,
       obj=logregobj, feval=evalerror)
7
1.3.4 只用前n棵树预测
import numpy as np
import pandas as pd
import pickle
import xgboost as xgb
from sklearn.model_selection import train_test_split
# Basic example: read a CSV file with pandas and train a binary classifier.
data = pd.read_csv('./data/Pima-Indians-Diabetes.csv')

# Split the rows into a train part and a test part (train_test_split defaults).
train, test = train_test_split(data)

# Feature / target column names. 'Pregnancies' and 'SkinThickness' were
# misspelled in the original list, which disagreed with the same list used in
# the earlier examples on this dataset.
feature_columns = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness',
                   'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']
target_column = 'Outcome'

# Build DMatrix objects from the underlying numpy arrays (features, labels).
# The test labels previously used the undefined name `target_columns`.
xgtrain = xgb.DMatrix(train[feature_columns].values, train[target_column].values)
xgtest = xgb.DMatrix(test[feature_columns].values, test[target_column].values)

# Hyper-parameters ('subsample' is now a number, matching the other examples).
param = {'max_depth': 5, 'eta': 0.1, 'silent': 1, 'subsample': 0.7,
         'colsample_bytree': 0.7, 'objective': 'binary:logistic'}

# Datasets whose metrics are reported after every boosting round.
watchlist = [(xgtest, 'eval'), (xgtrain, 'train')]
num_round = 10
# `evals` is passed by keyword so the call works on old and new xgboost alike.
bst = xgb.train(param, xgtrain, num_round, evals=watchlist)

# NOTE(review): `ntree_limit` is the spelling used by older xgboost releases;
# newer releases express the same thing with `iteration_range=(0, n)` --
# confirm against the installed version.
# Predict using only the first tree.
ypred1 = bst.predict(xgtest, ntree_limit=1)
# Predict using only the first 9 trees.
ypred2 = bst.predict(xgtest, ntree_limit=9)

label = xgtest.get_label()
print('用前1颗树预测的错误率为 %f' % (np.sum((ypred1 > 0.5) != label) / float(len(label))))
print('用前9颗树预测的错误率为 %f' % (np.sum((ypred2 > 0.5) != label) / float(len(label))))
8
1.4 预估器建模方式:sklearn与xgboost配合使用
1.4.1 Xgboost建模 sklearn评估
import pickle
import xgboost as xgb
import numpy as np
from sklearn.model_selection import KFold , train_test_split , GridSearchCV
from sklearn.metrics import confusion_matrix , mean_squared_error
from sklearn.datasets import load_iris , load_digits , load_boston
# Shared random state used by the KFold splitters in this section.
rng = np.random.RandomState(31337)

# Binary classification: confusion matrix.
print('数字0和1的二分类问题')
# Keep only the first two digit classes. `n_class` is passed by keyword
# because recent scikit-learn releases no longer accept it positionally.
digits = load_digits(n_class=2)
y = digits['target']
X = digits['data']

# Splitter object for 2-fold cross-validation.
kf = KFold(n_splits=2, shuffle=True, random_state=rng)
print('在2折数据上的交叉验证')

# Fit on one fold, predict on the other, and print the confusion matrix.
for train_index, test_index in kf.split(X):
    xgb_model = xgb.XGBClassifier().fit(X[train_index], y[train_index])
    # The original stored this under the misspelled name `preditcions`, so the
    # confusion_matrix call below raised NameError.
    predictions = xgb_model.predict(X[test_index])
    actuals = y[test_index]
    print('混淆矩阵:')
    print(confusion_matrix(actuals, predictions))
# Multi-class classification: confusion matrix.
print('\nIris:多分类')
# The dataset has to be loaded first; the original used `iris` without ever
# assigning it.
iris = load_iris()
y = iris['target']
X = iris['data']

# Splitter object for 2-fold cross-validation.
kf = KFold(n_splits=2, shuffle=True, random_state=rng)
print('在2折数据上交叉验证')

# Fit on one fold, predict on the other, and print the confusion matrix.
for train_index, test_index in kf.split(X):
    xgb_model = xgb.XGBClassifier().fit(X[train_index], y[train_index])
    predictions = xgb_model.predict(X[test_index])
    actuals = y[test_index]
    print('混淆矩阵')
    # `confusion_matrxi` in the original was a typo for confusion_matrix.
    print(confusion_matrix(actuals, predictions))
# Regression: mean squared error.
# The title previously read '放假' (holiday), a typo for '房价' (house price).
print('\n波士顿房价回归预测问题')
# NOTE(review): load_boston is not available in recent scikit-learn releases
# -- confirm that the installed version still provides it.
boston = load_boston()
y = boston['target']
# The original assigned the features to lowercase `x`, so the loop below kept
# using the stale `X` from the previous example together with these targets.
X = boston['data']

# Splitter object for 2-fold cross-validation.
kf = KFold(n_splits=2, shuffle=True, random_state=rng)
print('在2折数据上的交叉验证')

# Fit on one fold, predict on the other, and print the MSE.
for train_index, test_index in kf.split(X):
    xgb_model = xgb.XGBRegressor().fit(X[train_index], y[train_index])
    predictions = xgb_model.predict(X[test_index])
    actuals = y[test_index]
    print("MSE:", mean_squared_error(actuals, predictions))
9
1.4.2 网格搜索交叉验证查找最优参数
# Grid search with cross-validation over max_depth and n_estimators.
print('参数最优化:')
y = boston['target']
# Assigned to uppercase `X` (the original used `x`, leaving `clf.fit(X, y)`
# to run on whatever `X` held before this cell).
X = boston['data']
xgb_model = xgb.XGBRegressor()
param_grid = {'max_depth': [2, 4, 6], 'n_estimators': [20, 100, 200]}
clf = GridSearchCV(xgb_model, param_grid, verbose=1)
clf.fit(X, y)

# Best cross-validated score and the parameter combination that produced it.
print(clf.best_score_)
print(clf.best_params_)
10
1.4.3 early-stopping早停
# Early stopping: learn on the training set, adding trees one at a time while
# watching the metric on a validation set; stop adding trees once the metric
# no longer improves.
X = digits['data']
y = digits['target']
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=0)
clf = xgb.XGBClassifier()
# The keyword is `early_stopping_rounds` (plural); the original misspelled it
# as `early_stopping_round`.
# NOTE(review): newer xgboost releases expect early_stopping_rounds and
# eval_metric on the estimator constructor instead of fit() -- confirm against
# the installed version.
clf.fit(X_train, y_train,
        early_stopping_rounds=10,
        eval_metric='auc',
        eval_set=[(X_val, y_val)])
11
12
1.4.4 特征重要度
# Feature importance on the iris dataset.
iris = load_iris()
# 'target' was misspelled as 'traget' in the original.
y = iris['target']
# Assigned to uppercase `X` (the original used `x`, so the fit below ran on a
# stale `X` from an earlier example).
X = iris['data']
xgb_model = xgb.XGBClassifier().fit(X, y)

print('特征排序')
feature_names = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
feature_importances = xgb_model.feature_importances_
# Indices of the features ordered from most to least important.
indices = np.argsort(feature_importances)[::-1]

for index in indices:
    print("特征 %s 重要度为 %f" % (feature_names[index], feature_importances[index]))
%matplotlib inline
import matplotlib.pyplot as plt
plt.figure(figsize=(16,8))
plt.title("feature importances")
plt.bar(range(len(feature_importances)), feature_importances[indices], color='b')
plt.xticks(range(len(feature_importances)), np.array(feature_names)[indices], color='b')
13
14
1.4.5 并行加速
import os

# Parallel grid search demo. Everything runs under the __main__ guard so that
# worker processes which re-import this module do not re-run the search.
if __name__ == "__main__":
    try:
        from multiprocessing import set_start_method
    except ImportError as err:
        raise ImportError("Unable to import multiprocessing.set_start_method."
                          " This example only runs on Python 3.4") from err
    # Left disabled, as in the original; set_start_method is imported above
    # but never called.
    #set_start_method("forkserver")

    import numpy as np
    from sklearn.model_selection import GridSearchCV
    from sklearn.datasets import load_boston
    import xgboost as xgb

    # Kept from the original; not used below.
    rng = np.random.RandomState(31337)

    print("Parallel Parameter optimization")
    # NOTE(review): load_boston is not available in recent scikit-learn
    # releases -- confirm that the installed version still provides it.
    boston = load_boston()

    os.environ["OMP_NUM_THREADS"] = "2"  # or to whatever you want
    y = boston['target']
    X = boston['data']
    xgb_model = xgb.XGBRegressor()
    # n_jobs=2 lets GridSearchCV evaluate parameter combinations in parallel.
    clf = GridSearchCV(xgb_model, {'max_depth': [2, 4, 6],
                                   'n_estimators': [50, 100, 200]}, verbose=1,
                       n_jobs=2)
    clf.fit(X, y)
    print(clf.best_score_)
    print(clf.best_params_)
15