1.xgboost_cheatsheet

2021-01-30  本文已影响0人  许志辉Albert

1. xgboost速查表

1.1内置建模方式

1.1.1 读取libsvm格式数据并指定参数建模

import numpy as np
import pandas as pd
import pickle 
import xgboost as xgb  

# Basic example: read LIBSVM-format data from disk and train a binary classifier.
# Each line of a LIBSVM file is "<label> <feature_index>:<value> ...", for example:
#1 3:1 10:1 11:1 21:1 30:1 34:1 36:1 40:1 41:1 53:1 58:1 65:1 69:1 77:1 86:1 88:1 92:1 95:1 102:1 105:1 117:1 124:1
#0 3:1 10:1 20:1 21:1 23:1 34:1 36:1 39:1 41:1 53:1 56:1 65:1 69:1 77:1 86:1 88:1 92:1 95:1 102:1 106:1 116:1 120:1
#0 1:1 10:1 19:1 21:1 24:1 34:1 36:1 39:1 42:1 53:1 56:1 65:1 69:1 77:1 86:1 88:1 92:1 95:1 102:1 106:1 116:1 122:1

# Both splits are read from the same ./data directory.
dtrain = xgb.DMatrix('./data/agaricus.txt.train')
dtest = xgb.DMatrix('./data/agaricus.txt.test')

# Hyper-parameters. The objective name must be spelled exactly
# 'binary:logistic'; a stray space makes it an unknown objective.
param = {'max_depth': 2, 'eta': 1, 'silent': 1, 'objective': 'binary:logistic'}

# Watchlist: the datasets whose metrics are reported during training.
watchlist = [(dtest, 'eval'), (dtrain, 'train')]
num_round = 2
# `evals` is passed by keyword so the call does not depend on positional order.
bst = xgb.train(param, dtrain, num_round, evals=watchlist)

# Predict on the held-out set.
preds = bst.predict(dtest)

# Error rate: a prediction counts as class 1 when it exceeds 0.5.
labels = dtest.get_label()
print('错误率为%f' % np.mean((preds > 0.5) != labels))

# Persist the trained booster.
bst.save_model('./model/0001.model')
1

1.1.2 配合pandas DataFrame格式数据建模

# Pima Indians Diabetes dataset. Fields include: number of pregnancies, plasma
# glucose concentration from an oral glucose tolerance test, diastolic blood
# pressure (mm Hg), triceps skin-fold thickness (mm), 2-hour serum insulin
# (μU/ml), body mass index (kg / (height in m)^2), diabetes pedigree function,
# and age (years).
import pandas as pd
data = pd.read_csv('./data/Pima-Indians-Diabetes.csv')
# Preview the first rows (shown as the cell output when run in a notebook).
data.head()
2
import numpy as np
import pandas as pd
import pickle
import xgboost as xgb
from sklearn.model_selection import train_test_split

# Basic example: read a CSV file with pandas and train a binary classifier.

# Load the data with pandas.
data = pd.read_csv('./data/Pima-Indians-Diabetes.csv')

# Split into train and test partitions (train_test_split defaults).
train, test = train_test_split(data)

# Convert to DMatrix, initialised from plain numpy arrays.
feature_columns = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']
target_column = 'Outcome'
xgtrain = xgb.DMatrix(train[feature_columns].values, train[target_column].values)
xgtest = xgb.DMatrix(test[feature_columns].values, test[target_column].values)

# Hyper-parameters.
param = {'max_depth': 5, 'eta': 0.1, 'silent': 1, 'subsample': 0.7, 'colsample_bytree': 0.7, 'objective': 'binary:logistic'}

# Watchlist: the datasets whose metrics are reported during training.
watchlist = [(xgtest, 'eval'), (xgtrain, 'train')]
num_round = 10
# `evals` is passed by keyword so the call does not depend on positional order.
bst = xgb.train(param, xgtrain, num_round, evals=watchlist)

# Predict on the held-out set.
preds = bst.predict(xgtest)

# Error rate: a prediction counts as class 1 when it exceeds 0.5.
# The printed label reads "error rate" (the original text had a typo).
labels = xgtest.get_label()
print('错误率为%f' % np.mean((preds > 0.5) != labels))

# Persist the trained booster.
bst.save_model('./model/0002.model')
3

1.2预估器建模方式(sklearn形态)

import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import pickle
import xgboost as xgb
from sklearn.model_selection import train_test_split
try:
    import joblib
except ImportError:
    # Fallback for old scikit-learn releases that still expose the bundled copy.
    from sklearn.externals import joblib

# Basic example: read a CSV file and train a binary classifier through the
# scikit-learn style estimator API.

# Load the data with pandas.
data = pd.read_csv('./data/Pima-Indians-Diabetes.csv')

# Split into train and test partitions (train_test_split defaults).
train, test = train_test_split(data)

# Separate the feature matrix X from the classification target y.
feature_columns = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']
target_column = 'Outcome'
train_X = train[feature_columns].values
train_y = train[target_column].values
test_X = test[feature_columns].values
test_y = test[target_column].values

# Build the model. The keyword is `learning_rate`, and the variable name must
# match the one used by the fit/predict/dump calls below.
xgb_classifier = xgb.XGBClassifier(n_estimators=20, max_depth=4, learning_rate=0.1, subsample=0.7, colsample_bytree=0.7)

# Fit the model.
xgb_classifier.fit(train_X, train_y)

# Predict class labels for the test partition.
preds = xgb_classifier.predict(test_X)

# Error rate. The whole ratio is parenthesised so that `%` formats the number
# instead of being applied to the count before the division.
print('错误率为%f' % ((preds != test_y).sum() / float(test_y.shape[0])))

# Persist the fitted estimator.
joblib.dump(xgb_classifier, './model/0003.model')
4

1.3 内置建模方式:交叉验证与高级功能

1.3.1 交叉验证

# 5-fold cross-validation with a fixed seed, reporting the 'error' metric.
xgb.cv(
    params=param,
    dtrain=dtrain,
    num_boost_round=num_round,
    nfold=5,
    metrics={'error'},
    seed=0,
)
5

1.3.2 添加预处理的交叉验证

# Compute the negative/positive label ratio and use it as the class weight.
def fpreproc(dtrain, dtest, param):
    """Set ``scale_pos_weight`` from the label balance of ``dtrain``.

    Args:
        dtrain: Training data exposing ``get_label()``.
        dtest: Evaluation data; returned untouched.
        param: Parameter dict; ``'scale_pos_weight'`` is written into it in place.

    Returns:
        The tuple ``(dtrain, dtest, param)``.
    """
    label = dtrain.get_label()
    # Count of label-0 rows divided by count of label-1 rows. The comparison
    # must be `==`; the ratio assumes at least one positive row is present.
    ratio = float(np.sum(label == 0) / np.sum(label == 1))
    param['scale_pos_weight'] = ratio
    return (dtrain, dtest, param)


# Run the preprocessing hook first (it sets the class weight), then
# cross-validate with 5 folds and report AUC.
xgb.cv(
    params=param,
    dtrain=dtrain,
    num_boost_round=num_round,
    nfold=5,
    metrics={'auc'},
    seed=0,
    fpreproc=fpreproc,
)

6

1.3.3 自定义损失函数与评估准则

print('使用自定义函数进行交叉验证')


# A custom objective has to supply the first and second derivative of the loss.
def logregobj(preds, dtrain):
    """Return gradient and hessian of the logistic loss.

    Args:
        preds: Raw scores; the sigmoid is applied to them here.
        dtrain: Data object exposing ``get_label()``.

    Returns:
        ``(grad, hess)`` where ``grad = p - y`` and ``hess = p * (1 - p)``,
        with ``p`` the sigmoid of ``preds`` and ``y`` the labels.
    """
    y_true = dtrain.get_label()
    prob = 1.0 / (1.0 + np.exp(-preds))
    return prob - y_true, prob * (1.0 - prob)

# Custom evaluation metric: how far the predictions are from the true labels.
def evalerror(preds, dtrain):
    """Return the fraction of misclassified rows as ``('error', value)``.

    Args:
        preds: Scores thresholded at 0.0 to obtain the predicted class.
        dtrain: Data object exposing ``get_label()``.

    Returns:
        A ``(name, value)`` pair; ``value`` is mismatches divided by row count.
    """
    labels = dtrain.get_label()
    mismatches = np.sum(labels != (preds > 0.0))
    return 'error', float(mismatches) / len(labels)

watchlist = [(dtest, 'eval'), (dtrain, 'train')]
# No 'objective' entry here: the loss is supplied by logregobj below.
param = {'max_depth': 3, 'eta': 0.1, 'silent': 1}
num_round = 5

# Train with the custom objective and metric. The optional arguments are
# passed by keyword so the call does not depend on their positional order.
bst = xgb.train(param, dtrain, num_round, evals=watchlist, obj=logregobj, feval=evalerror)

# Cross-validate with the same custom objective and metric.
xgb.cv(param, dtrain, num_round, nfold=5, seed=0, obj=logregobj, feval=evalerror)
7

1.3.4 只用前n棵树预测

import numpy as np
import pandas as pd
import pickle
import xgboost as xgb
from sklearn.model_selection import train_test_split

# Basic example: read a CSV file and train a binary classifier, then predict
# with only a prefix of the boosted trees.

# Load the data with pandas.
data = pd.read_csv('./data/Pima-Indians-Diabetes.csv')

# Split into train and test partitions (train_test_split defaults).
train, test = train_test_split(data)

# Convert to DMatrix. Column names are spelled as in the other examples on
# this dataset ('Pregnancies', 'SkinThickness').
feature_columns = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']
target_column = 'Outcome'
xgtrain = xgb.DMatrix(train[feature_columns].values, train[target_column].values)
xgtest = xgb.DMatrix(test[feature_columns].values, test[target_column].values)

# Hyper-parameters; subsample is numeric like colsample_bytree.
param = {'max_depth': 5, 'eta': 0.1, 'silent': 1, 'subsample': 0.7, 'colsample_bytree': 0.7, 'objective': 'binary:logistic'}

# Watchlist: the datasets whose metrics are reported during training.
watchlist = [(xgtest, 'eval'), (xgtrain, 'train')]
num_round = 10
bst = xgb.train(param, xgtrain, num_round, evals=watchlist)

# NOTE(review): later XGBoost releases replace `ntree_limit=n` with
# `iteration_range=(0, n)`; switch if the installed version rejects it.
# Predict with the first tree only.
ypred1 = bst.predict(xgtest, ntree_limit=1)

# Predict with the first 9 trees.
ypred2 = bst.predict(xgtest, ntree_limit=9)
label = xgtest.get_label()
print('用前1颗树预测的错误率为 %f' % (np.sum((ypred1 > 0.5) != label) / float(len(label))))
print('用前9颗树预测的错误率为 %f' % (np.sum((ypred2 > 0.5) != label) / float(len(label))))
8

1.4 预估器建模方式:sklearn与xgboost配合使用

1.4.1 Xgboost建模 sklearn评估

import pickle 
import xgboost as xgb

import numpy as np
from sklearn.model_selection  import KFold , train_test_split , GridSearchCV
from sklearn.metrics import confusion_matrix , mean_squared_error
from sklearn.datasets import load_iris , load_digits , load_boston

rng = np.random.RandomState(31337)


# Binary classification: confusion matrix.
print('数字0和1的二分类问题')
# n_class is passed by keyword; restricts the digits data to classes 0 and 1.
digits = load_digits(n_class=2)
y = digits['target']
X = digits['data']

# Fold splitter.
kf = KFold(n_splits=2, shuffle=True, random_state=rng)
print('在2折数据上的交叉验证')

# 2-fold cross-validation.
for train_index, test_index in kf.split(X):
    xgb_model = xgb.XGBClassifier().fit(X[train_index], y[train_index])
    predictions = xgb_model.predict(X[test_index])
    actuals = y[test_index]
    print('混淆矩阵:')
    print(confusion_matrix(actuals, predictions))

# Multi-class classification: confusion matrix.
print('\nIris:多分类')
# The dataset has to be loaded before it is indexed.
iris = load_iris()
y = iris['target']
X = iris['data']
kf = KFold(n_splits=2, shuffle=True, random_state=rng)
print('在2折数据上交叉验证')
for train_index, test_index in kf.split(X):
    xgb_model = xgb.XGBClassifier().fit(X[train_index], y[train_index])
    predictions = xgb_model.predict(X[test_index])
    actuals = y[test_index]
    print('混淆矩阵')
    print(confusion_matrix(actuals, predictions))

# Regression: mean squared error.
# NOTE(review): load_boston is no longer shipped by recent scikit-learn
# releases; this section needs a scikit-learn version that still provides it.
print('\n波士顿房价回归预测问题')
boston = load_boston()
y = boston['target']
# Must be upper-case X: the loop below reads X, which would otherwise still
# hold the Iris features.
X = boston['data']
kf = KFold(n_splits=2, shuffle=True, random_state=rng)
print('在2折数据上的交叉验证')
for train_index, test_index in kf.split(X):
    xgb_model = xgb.XGBRegressor().fit(X[train_index], y[train_index])
    predictions = xgb_model.predict(X[test_index])
    actuals = y[test_index]
    print("MSE:", mean_squared_error(actuals, predictions))

9

1.4.2 网格搜索交叉验证查找最优参数

print('参数最优化:')
y = boston['target']
# Upper-case X so that the fit below uses the Boston features assigned here
# rather than whatever X was bound to earlier.
X = boston['data']
xgb_model = xgb.XGBRegressor()
# Exhaustive search over tree depth and number of trees.
clf = GridSearchCV(xgb_model, {'max_depth': [2, 4, 6], 'n_estimators': [20, 100, 200]}, verbose=1)

clf.fit(X, y)
print(clf.best_score_)
print(clf.best_params_)
10

1.4.3 early-stopping 早停

# Fit on the training split, adding trees one at a time while watching the
# validation split; stop adding trees once the validation metric stops improving.

X = digits['data']
y = digits['target']

X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=0)
clf = xgb.XGBClassifier()
# The keyword is `early_stopping_rounds` (plural).
# NOTE(review): recent XGBoost releases expect early_stopping_rounds and
# eval_metric on the estimator constructor instead of fit(); confirm against
# the installed version.
clf.fit(X_train, y_train, early_stopping_rounds=10, eval_metric='auc', eval_set=[(X_val, y_val)])
11
12

1.4.4 特征重要度

iris = load_iris()
y = iris['target']
# Upper-case X so that the fit below uses the Iris features assigned here
# rather than whatever X was bound to earlier.
X = iris['data']
xgb_model = xgb.XGBClassifier().fit(X, y)

print('特征排序')
feature_names = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
feature_importances = xgb_model.feature_importances_
# Feature indices ordered from most to least important.
indices = np.argsort(feature_importances)[::-1]

for index in indices:
    print("特征 %s 重要度为 %f" % (feature_names[index], feature_importances[index]))

%matplotlib inline
import matplotlib.pyplot as plt
plt.figure(figsize=(16,8))
plt.title("feature importances")
plt.bar(range(len(feature_importances)), feature_importances[indices], color='b')
plt.xticks(range(len(feature_importances)), np.array(feature_names)[indices], color='b')
13 14

1.4.5 并行加速

import os

if __name__ == "__main__":
    # Fail early when the interpreter lacks multiprocessing.set_start_method.
    try:
        from multiprocessing import set_start_method
    except ImportError:
        raise ImportError("Unable to import multiprocessing.set_start_method."
                          " This example only runs on Python 3.4")
    # The start method is deliberately left at its default:
    # set_start_method("forkserver")

    import numpy as np
    from sklearn.model_selection import GridSearchCV
    from sklearn.datasets import load_boston
    import xgboost as xgb

    rng = np.random.RandomState(31337)

    print("Parallel Parameter optimization")
    boston = load_boston()

    # Thread count for OpenMP; set to whatever suits the machine.
    os.environ["OMP_NUM_THREADS"] = "2"
    X = boston['data']
    y = boston['target']
    xgb_model = xgb.XGBRegressor()
    # Two grid-search workers run in parallel (n_jobs=2).
    search_grid = {
        'max_depth': [2, 4, 6],
        'n_estimators': [50, 100, 200],
    }
    clf = GridSearchCV(xgb_model, search_grid, verbose=1, n_jobs=2)
    clf.fit(X, y)
    print(clf.best_score_)
    print(clf.best_params_)
15
上一篇下一篇

猜你喜欢

热点阅读