gbdt 源码分析
1. 源码分析
源码阅读的是Python著名的库sklearn里的代码。sklearn里gbdt(sklearn/ensemble/gradient_boosting.py)相关的类有 GradientBoostingRegressor
和GradientBoostingClassifier
,共同的父类是BaseGradientBoosting
.boost的基本实现在BaseGradientBoosting
里。主要的几个参数是(更详细的看sklearn的文档):
loss : {'ls', 'lad', 'huber', 'quantile'}, optional (default='ls')
loss function to be optimized. 'ls' refers to least squares
regression. 'lad' (least absolute deviation) is a highly robust
loss function solely based on order information of the input
variables. 'huber' is a combination of the two. 'quantile'
allows quantile regression (use `alpha` to specify the quantile).
learning_rate : float, optional (default=0.1)
learning rate shrinks the contribution of each tree by `learning_rate`.
There is a trade-off between learning_rate and n_estimators.
n_estimators : int (default=100)
The number of boosting stages to perform. Gradient boosting
is fairly robust to over-fitting so a large number usually
results in better performance.
max_depth : integer, optional (default=3)
maximum depth of the individual regression estimators. The maximum
depth limits the number of nodes in the tree. Tune this parameter
for best performance; the best value depends on the interaction
of the input variables.
....
init : BaseEstimator, None, optional (default=None)
An estimator object that is used to compute the initial
predictions. ``init`` has to provide ``fit`` and ``predict``.
If None it uses ``loss.init_estimator``.
....
BaseGradientBoosting
主要的相关函数是fit
函数。
初始化
sklearn提供了多种estimator
来做算法的第一步-初始化的工作。默认选用的是MeanEstimator
,即使用均值来作为初始的预测值。其中fit()
函数是计算了mean值,predict()
将X样本的所有初始预测值y_pred
设为之前计算的mean值。(其他初始化方法类似)
class MeanEstimator(BaseEstimator):
"""An estimator predicting the mean of the training targets."""
def fit(self, X, y, sample_weight=None):
if sample_weight is None:
self.mean = np.mean(y)
else:
self.mean = np.average(y, weights=sample_weight)
def predict(self, X):
check_is_fitted(self, 'mean')
y = np.empty((X.shape[0], 1), dtype=np.float64)
y.fill(self.mean)
return y
计算残差
sklearn提供了多种损失函数LossFunction
,默认为最小平方损失LeastSquaresError
,在损失函数中通过计算负梯度来计算‘‘伪残差’’。LeastSquaresError
的负梯度negative_gradient(
是y-y_pred
,还有一个重要的函数是update_terminal_regions
用来迭代的时候更新最终的预测值y_pred
class LeastSquaresError(RegressionLossFunction):
"""Loss function for least squares (LS) estimation.
Terminal regions need not to be updated for least squares. """
def init_estimator(self):
return MeanEstimator()
def __call__(self, y, pred, sample_weight=None):
if sample_weight is None:
return np.mean((y - pred.ravel()) ** 2.0)
else:
return (1.0 / sample_weight.sum() *
np.sum(sample_weight * ((y - pred.ravel()) ** 2.0)))
def negative_gradient(self, y, pred, **kargs):
return y - pred.ravel()
def update_terminal_regions(self, tree, X, y, residual, y_pred,
sample_weight, sample_mask,
learning_rate=1.0, k=0):
"""Least squares does not need to update terminal regions.
But it has to update the predictions.
"""
# update predictions
y_pred[:, k] += learning_rate * tree.predict(X).ravel()
def _update_terminal_region(self, tree, terminal_regions, leaf, X, y,
residual, pred, sample_weight):
pass
迭代构建模型
BaseGradientBoosting
的最重要的函数是fit()
函数。fit()
的开始是check_input、check_params等检查的功能。在check_params
检查参数的时候初始化了损失函数self.loss_
。回归树‘‘self.n_classes_=1’’(不等于1是多分类的情况),这对应这self.loss_.K=1。
----_check_params()--------
if self.loss == 'deviance':
loss_class = (MultinomialDeviance
if len(self.classes_) > 2
else BinomialDeviance)
else:
loss_class = LOSS_FUNCTIONS[self.loss]
if self.loss in ('huber', 'quantile'):
self.loss_ = loss_class(self.n_classes_, self.alpha)
else:
self.loss_ = loss_class(self.n_classes_)
接着判断是否初始化过(主要是区分是否是热启动的),如果没有初始化(我们主要看不是热启动的情况,下面代码中的if),
--------fit()----------
if not self._is_initialized():
# init state
self._init_state()
# fit initial model - FIXME make sample_weight optional
self.init_.fit(X, y, sample_weight)
# init predictions
y_pred = self.init_.predict(X)
begin_at_stage = 0
else:
# add more estimators to fitted model
# invariant: warm_start = True
if self.n_estimators < self.estimators_.shape[0]:
raise ValueError('n_estimators=%d must be larger or equal to '
'estimators_.shape[0]=%d when '
'warm_start==True'
% (self.n_estimators,
self.estimators_.shape[0]))
begin_at_stage = self.estimators_.shape[0]
y_pred = self._decision_function(X)
self._resize_state()
初始化的时候,主要初始化最开始的模型,在_init_state()
创建初始模型用的estimator
----_init_state()----
def _init_state(self):
"""Initialize model state and allocate model state data structures. """
if self.init is None:
self.init_ = self.loss_.init_estimator()
elif isinstance(self.init, six.string_types):
self.init_ = INIT_ESTIMATORS[self.init]()
else:
self.init_ = self.init
self.estimators_ = np.empty((self.n_estimators, self.loss_.K),
dtype=np.object)
self.train_score_ = np.zeros((self.n_estimators,), dtype=np.float64)
# do oob?
if self.subsample < 1.0:
self.oob_improvement_ = np.zeros((self.n_estimators),
dtype=np.float64)
利用创建的self.init_
得到模型迭代之前最初始的y_pred。
然后是在_fit_stages()
函数里构建boosting的过程。
_fit_stages()
中最重要的是迭代的更新每一个基础的learner。
------_fit_stages()------
# perform boosting iterations
i = begin_at_stage
for i in range(begin_at_stage, self.n_estimators):
# subsampling
if do_oob:
sample_mask = _random_sample_mask(n_samples, n_inbag,
random_state)
# OOB score before adding this stage
old_oob_score = loss_(y[~sample_mask],
y_pred[~sample_mask],
sample_weight[~sample_mask])
# fit next stage of trees
y_pred = self._fit_stage(i, X, y, y_pred, sample_weight,
sample_mask, random_state, X_idx_sorted,
X_csc, X_csr)
其中_fit_stage
是每一次的迭代,创建一棵决策回归树进行学习的过程。具体见代码中的中文注释。
def _fit_stage(self, i, X, y, y_pred, sample_weight, sample_mask,
random_state, X_idx_sorted, X_csc=None, X_csr=None):
"""Fit another stage of ``n_classes_`` trees to the boosting model. """
assert sample_mask.dtype == np.bool
loss = self.loss_
original_y = y
#回归树与二分类K=1
for k in range(loss.K):
if loss.is_multi_class:
y = np.array(original_y == k, dtype=np.float64)
#得到真正的y与当前的y_pred的残差(残差用的是负梯度计算的。)
residual = loss.negative_gradient(y, y_pred, k=k,
sample_weight=sample_weight)
# induce regression tree on residuals
#以残差作为当前这棵树要预测的目标值
tree = DecisionTreeRegressor(
criterion=self.criterion,
splitter='best',
max_depth=self.max_depth,
min_samples_split=self.min_samples_split,
min_samples_leaf=self.min_samples_leaf,
min_weight_fraction_leaf=self.min_weight_fraction_leaf,
max_features=self.max_features,
max_leaf_nodes=self.max_leaf_nodes,
random_state=random_state,
presort=self.presort)
if self.subsample < 1.0:
# no inplace multiplication!
sample_weight = sample_weight * sample_mask.astype(np.float64)
#以残差作为当前这棵树要预测的目标值
if X_csc is not None:
tree.fit(X_csc, residual, sample_weight=sample_weight,
check_input=False, X_idx_sorted=X_idx_sorted)
else:
tree.fit(X, residual, sample_weight=sample_weight,
check_input=False, X_idx_sorted=X_idx_sorted)
# update tree leaves
#调用‘‘ loss.update_terminal_regions’’用当前训练的这棵树的对X的预测结果来更新整个模型的y_pred
if X_csr is not None:
loss.update_terminal_regions(tree.tree_, X_csr, y, residual, y_pred,
sample_weight, sample_mask,
self.learning_rate, k=k)
else:
loss.update_terminal_regions(tree.tree_, X, y, residual, y_pred,
sample_weight, sample_mask,
self.learning_rate, k=k)
# add tree to ensemble
#保留这一棵树
self.estimators_[i, k] = tree
#返回这一次迭代的更新了以后的y_pred
return y_pred
这就是整个fit
也就是整个模型构建的过程。
除了构建模型,即fit.GradientBoostingRegressor
还要实现自己另外一个重要的函数predict
,通过构建的模型进行预测。
def predict(self, X):
"""Predict regression target for X.
Parameters
----------
X : array-like of shape = [n_samples, n_features]
The input samples.
Returns
-------
y : array of shape = [n_samples]
The predicted values.
"""
X = check_array(X, dtype=DTYPE, order="C")
return self._decision_function(X).ravel()
主要调用了self._decision_function
这个也是基类BaseGradientBoosting
的函数。
def _decision_function(self, X):
# for use in inner loop, not raveling the output in single-class case,
# not doing input validation.
score = self._init_decision_function(X)
predict_stages(self.estimators_, X, self.learning_rate, score)
return score
其中score = self._init_decision_function(X)
是利用self._init
初始化(如同训练模型时的那样)最开始的y_pred。
def _init_decision_function(self, X):
"""Check input and compute prediction of ``init``. """
self._check_initialized()
X = self.estimators_[0, 0]._validate_X_predict(X, check_input=True)
if X.shape[1] != self.n_features:
raise ValueError("X.shape[1] should be {0:d}, not {1:d}.".format(
self.n_features, X.shape[1]))
score = self.init_.predict(X).astype(np.float64)
return score
然后是predict_stages
利用之前构建的模型(多棵树)迭代地获得最终的预测值。