Backpropagation Implementation
How the backpropagation algorithm is implemented in the sklearn source code.
sklearn/neural_network/multilayer_perceptron.py
Parameter initialization
n_samples, n_features = X.shape  # number of samples, input feature dimension
self.n_outputs_ = y.shape[1]  # output dimension
layer_units = ([n_features] + hidden_layer_sizes + [self.n_outputs_])  # number of units in each layer
# Initialize coefficient and intercept layers
self.coefs_ = []
self.intercepts_ = []
for i in range(self.n_layers_ - 1):
coef_init, intercept_init = self._init_coef(layer_units[i], layer_units[i + 1])
self.coefs_.append(coef_init)
self.intercepts_.append(intercept_init)
# Initialize lists
activations = [X] # the first layer's activations are simply the input features
activations.extend(np.empty((batch_size, n_fan_out))
for n_fan_out in layer_units[1:])
deltas = [np.empty_like(a_layer) for a_layer in activations]
coef_grads = [np.empty((n_fan_in_, n_fan_out_))
              for n_fan_in_, n_fan_out_ in zip(layer_units[:-1], layer_units[1:])]
intercept_grads = [np.empty(n_fan_out_) for n_fan_out_ in layer_units[1:]]
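As a concrete illustration of the shapes these lists end up with, here is a small self-contained sketch (a toy configuration of my own, not part of the sklearn source) for a network with 20 input features, one hidden layer of 5 units, and 3 outputs:

import numpy as np

# hypothetical configuration: 20 inputs, hidden_layer_sizes = [5], 3 outputs
layer_units = [20] + [5] + [3]   # [20, 5, 3] -> a 3-layer network
batch_size = 8

# coefs_[i] connects layer i to layer i + 1; intercepts_[i] is the bias of layer i + 1
coef_shapes = [(n_in, n_out) for n_in, n_out in zip(layer_units[:-1], layer_units[1:])]
intercept_shapes = [(n_out,) for n_out in layer_units[1:]]
print(coef_shapes)        # [(20, 5), (5, 3)]
print(intercept_shapes)   # [(5,), (3,)]

# activations holds the input batch plus one array per subsequent layer
activations = [np.zeros((batch_size, n)) for n in layer_units]
print([a.shape for a in activations])   # [(8, 20), (8, 5), (8, 3)]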
Parameter definitions
def _backprop(self, X, y, activations, deltas, coef_grads,
intercept_grads):
"""Compute the MLP loss function and its corresponding derivatives
with respect to each parameter: weights and bias vectors.
Parameters
----------
X : {array-like, sparse matrix}, shape (n_samples, n_features)
The input data.
y : array-like, shape (n_samples,)
The target values.
activations : list, length = n_layers - 1
The ith element of the list holds the values of the ith layer.
deltas : list, length = n_layers - 1
The ith element of the list holds the difference between the
activations of the i + 1 layer and the backpropagated error.
More specifically, deltas are gradients of loss with respect to z
in each layer, where z = wx + b is the value of a particular layer
before passing through the activation function
coef_grads : list, length = n_layers - 1
The ith element contains the amount of change used to update the
coefficient parameters of the ith layer in an iteration.
intercept_grads : list, length = n_layers - 1
The ith element contains the amount of change used to update the
intercept parameters of the ith layer in an iteration.
Returns
-------
loss : float
coef_grads : list, length = n_layers - 1
intercept_grads : list, length = n_layers - 1
"""
Inputs
X: the features, shape (n_samples, n_features)
y: the labels, shape (n_samples,)
activations[i]: the activations of layer i of the network
deltas[i]: the error term of layer i + 1
coef_grads[i]: the gradient of the weights of layer i
intercept_grads[i]: the gradient of the biases of layer i
Outputs
loss, coef_grads, intercept_grads
n_samples = X.shape[0] # number of samples in this iteration (the batch)
# Forward propagate
# forward pass: compute the activations of every layer of the network
activations = self._forward_pass(activations)
# Get loss
loss_func_name = self.loss
if loss_func_name == 'log_loss' and self.out_activation_ == 'logistic':
loss_func_name = 'binary_log_loss'
# compute the loss from the predicted y and the true y
loss = LOSS_FUNCTIONS[loss_func_name](y, activations[-1])
# Add L2 regularization term to loss
# add the L2 regularization term of the weights to the loss
values = np.sum(
np.array([np.dot(s.ravel(), s.ravel()) for s in self.coefs_]))
loss += (0.5 * self.alpha) * values / n_samples
# Backward propagate
# index of the output-layer delta
last = self.n_layers_ - 2
# The calculation of delta[last] here works with following
# combinations of output activation and loss function:
# sigmoid and binary cross entropy, softmax and categorical cross
# entropy, and identity with squared loss
# compute the output-layer delta; e.g. with an identity output activation and squared loss it is simply the prediction minus the target
deltas[last] = activations[-1] - y
# Compute gradient for the last layer
# compute the output-layer parameter gradients from the activations and deltas
coef_grads, intercept_grads = self._compute_loss_grad(
last, n_samples, activations, deltas, coef_grads, intercept_grads)
# Iterate over the hidden layers
# backpropagate the error through the hidden layers
for i in range(self.n_layers_ - 2, 0, -1):
deltas[i - 1] = safe_sparse_dot(deltas[i], self.coefs_[i].T)
inplace_derivative = DERIVATIVES[self.activation]
# apply the derivative of the hidden-layer activation to obtain this layer's delta
inplace_derivative(activations[i], deltas[i - 1])
# compute this layer's parameter gradients from the activations and deltas
coef_grads, intercept_grads = self._compute_loss_grad(
i - 1, n_samples, activations, deltas, coef_grads,
intercept_grads)
# the per-layer gradients are returned and handed to the optimizer for the parameter update
return loss, coef_grads, intercept_grads
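To make the delta recursion above concrete, here is a minimal standalone sketch (my own code, not taken from sklearn) of the same computation for a single hidden layer with relu activation, an identity output, and squared loss, with the L2 term omitted (alpha = 0). Variable names loosely follow the source; everything else is illustrative:

import numpy as np

rng = np.random.RandomState(0)
n, d, h, k = 8, 4, 5, 3                      # samples, inputs, hidden units, outputs
X, y = rng.randn(n, d), rng.randn(n, k)
W0, b0 = rng.randn(d, h), np.zeros(h)
W1, b1 = rng.randn(h, k), np.zeros(k)

# forward pass: relu hidden layer, identity output
a1 = np.maximum(X @ W0 + b0, 0)
a2 = a1 @ W1 + b1

# output-layer delta: activations[-1] - y
delta2 = a2 - y
grad_W1 = a1.T @ delta2 / n
grad_b1 = delta2.mean(axis=0)

# hidden-layer delta: backpropagate through W1, then apply the relu derivative in place
delta1 = delta2 @ W1.T
delta1[a1 == 0] = 0
grad_W0 = X.T @ delta1 / n
grad_b0 = delta1.mean(axis=0)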
Computing the parameter gradients
def _compute_loss_grad(self, layer, n_samples, activations, deltas,
coef_grads, intercept_grads):
"""Compute the gradient of loss with respect to coefs and intercept for
specified layer.
This function does backpropagation for the specified one layer.
"""
# gradient of the weights
coef_grads[layer] = safe_sparse_dot(activations[layer].T,
deltas[layer])
# add the gradient of the L2 regularization term
coef_grads[layer] += (self.alpha * self.coefs_[layer])
coef_grads[layer] /= n_samples
intercept_grads[layer] = np.mean(deltas[layer], 0)
return coef_grads, intercept_grads
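These lines implement dW = activations[layer].T @ deltas[layer] / n_samples (plus the L2 term) and db = mean(deltas[layer]). A quick way to convince yourself is a finite-difference check on a single linear layer with one output unit and no regularization; the snippet below is a sketch with made-up toy data:

import numpy as np

rng = np.random.RandomState(0)
n = 16
X = rng.randn(n, 3)
y = rng.randn(n, 1)
W = rng.randn(3, 1)
b = np.zeros(1)

def loss(W, b):
    # squared loss with a single output unit, no L2 term (alpha = 0)
    return ((X @ W + b - y) ** 2).mean() / 2

# analytic gradients, mirroring _compute_loss_grad
delta = X @ W + b - y
grad_W = X.T @ delta / n
grad_b = delta.mean(axis=0)

# finite-difference check on one weight
eps = 1e-6
W_plus = W.copy()
W_plus[0, 0] += eps
W_minus = W.copy()
W_minus[0, 0] -= eps
numeric = (loss(W_plus, b) - loss(W_minus, b)) / (2 * eps)
print(np.allclose(numeric, grad_W[0, 0]))   # True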
Forward propagation
def _forward_pass(self, activations):
"""Perform a forward pass on the network by computing the values
of the neurons in the hidden layers and the output layer.
Parameters
----------
activations : list, length = n_layers - 1
The ith element of the list holds the values of the ith layer.
"""
# the hidden-layer activation function
hidden_activation = ACTIVATIONS[self.activation]
# Iterate over the hidden layers
for i in range(self.n_layers_ - 1):
    activations[i + 1] = safe_sparse_dot(activations[i], self.coefs_[i])
    activations[i + 1] += self.intercepts_[i]
    # For the hidden layers
    if (i + 1) != (self.n_layers_ - 1):
        activations[i + 1] = hidden_activation(activations[i + 1])
# For the last layer (applied once, after the loop ends)
output_activation = ACTIVATIONS[self.out_activation_]
activations[i + 1] = output_activation(activations[i + 1])
# return the activations of every layer of the network
return activations
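The attributes used throughout this walkthrough can be inspected on a fitted estimator. A short usage example (toy data and hyperparameters are my own; the expected outputs in the comments assume that setup):

import numpy as np
from sklearn.neural_network import MLPClassifier

rng = np.random.RandomState(0)
X = rng.randn(200, 20)
y = (X[:, 0] > 0).astype(int)                # a toy binary target

clf = MLPClassifier(hidden_layer_sizes=(5,), max_iter=500, random_state=0)
clf.fit(X, y)

print(clf.n_layers_)                         # 3: input, one hidden layer, output
print([w.shape for w in clf.coefs_])         # [(20, 5), (5, 1)]
print([b.shape for b in clf.intercepts_])    # [(5,), (1,)]
print(clf.out_activation_)                   # 'logistic' for binary classification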
sklearn/neural_network/_base.py
Activation functions
ACTIVATIONS = {'identity': identity, 'tanh': tanh, 'logistic': logistic, 'relu': relu, 'softmax': softmax}
def identity(X):
"""Simply return the input array.
Parameters
----------
X : {array-like, sparse matrix}, shape (n_samples, n_features)
Data, where n_samples is the number of samples
and n_features is the number of features.
Returns
-------
X : {array-like, sparse matrix}, shape (n_samples, n_features)
Same as the input data.
"""
# equivalent to applying no activation function at all
return X
def logistic(X):
"""Compute the logistic function inplace.
Parameters
----------
X : {array-like, sparse matrix}, shape (n_samples, n_features)
The input data.
Returns
-------
X_new : {array-like, sparse matrix}, shape (n_samples, n_features)
The transformed data.
"""
# the sigmoid activation function
return logistic_sigmoid(X, out=X)
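Here logistic_sigmoid is, as far as I can tell from the imports at the top of _base.py, scipy.special.expit under another name, i.e. a numerically stable 1 / (1 + exp(-x)). A quick equivalence check with a made-up array:

import numpy as np
from scipy.special import expit

X = np.array([[-2.0, 0.0, 2.0]])
print(expit(X))                    # [[0.11920292 0.5        0.88079708]]
print(1.0 / (1.0 + np.exp(-X)))    # same values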
def relu(X):
"""Compute the rectified linear unit function inplace.
Parameters
----------
X : {array-like, sparse matrix}, shape (n_samples, n_features)
The input data.
Returns
-------
X_new : {array-like, sparse matrix}, shape (n_samples, n_features)
The transformed data.
"""
# values of X between 0 and the dtype's maximum are left unchanged; negative values are set to 0
np.clip(X, 0, np.finfo(X.dtype).max, out=X)
return X
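Because out=X is passed to np.clip, the input array is modified in place rather than copied; a tiny demonstration with a made-up array:

import numpy as np

X = np.array([[-1.5, 0.0, 2.0]])
out = np.clip(X, 0, np.finfo(X.dtype).max, out=X)
print(out)          # [[0. 0. 2.]]
print(out is X)     # True: the original array was overwritten in place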
DERIVATIVES = {'identity': inplace_identity_derivative,
'tanh': inplace_tanh_derivative,
'logistic': inplace_logistic_derivative,
'relu': inplace_relu_derivative}
def inplace_relu_derivative(Z, delta):
"""Apply the derivative of the relu function.
It exploits the fact that the derivative is a simple function of the output
value from rectified linear units activation function.
Parameters
----------
Z : {array-like, sparse matrix}, shape (n_samples, n_features)
The data which was output from the rectified linear units activation
function during the forward pass.
delta : {array-like}, shape (n_samples, n_features)
The backpropagated error signal to be modified inplace.
"""
delta[Z == 0] = 0
The arguments are the activations of layer i, Z = activations[i], and the error of layer i - 1, deltas[i - 1].
Where an activation Z = relu(X) is 0, the input satisfied X <= 0, the derivative of relu is 0, and the error is set to 0;
where an activation is > 0, the derivative of relu is 1 and the error is left unchanged.
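A small numeric example of this masking, with made-up arrays:

import numpy as np

Z = np.array([[0.0, 0.7, 0.0, 2.1]])         # relu outputs from the forward pass
delta = np.array([[0.5, -0.3, 1.0, 0.2]])    # backpropagated error signal
delta[Z == 0] = 0
print(delta)                                  # [[ 0.  -0.3  0.   0.2]]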
def inplace_logistic_derivative(Z, delta):
"""Apply the derivative of the logistic sigmoid function.
It exploits the fact that the derivative is a simple function of the output
value from logistic function.
Parameters
----------
Z : {array-like, sparse matrix}, shape (n_samples, n_features)
The data which was output from the logistic activation function during
the forward pass.
delta : {array-like}, shape (n_samples, n_features)
The backpropagated error signal to be modified inplace.
"""
delta *= Z
delta *= (1 - Z)
Here Z = sigmoid(X) is the activation value passed in; the derivative of sigmoid(X) is sigmoid(X)(1 - sigmoid(X)) = Z(1 - Z).
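The identity can be verified numerically against a centered finite difference of the sigmoid (a sketch with my own toy values):

import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

x = np.array([-1.0, 0.3, 2.0])
Z = sigmoid(x)
analytic = Z * (1 - Z)                                      # the factor applied to delta
eps = 1e-6
numeric = (sigmoid(x + eps) - sigmoid(x - eps)) / (2 * eps)
print(np.allclose(analytic, numeric))                       # True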
LOSS_FUNCTIONS = {'squared_loss': squared_loss, 'log_loss': log_loss, 'binary_log_loss': binary_log_loss}
def binary_log_loss(y_true, y_prob):
"""Compute binary logistic loss for classification.
This is identical to log_loss in binary classification case,
but is kept for its use in multilabel case.
Parameters
----------
y_true : array-like or label indicator matrix
Ground truth (correct) labels.
y_prob : array-like of float, shape = (n_samples, n_classes)
Predicted probabilities, as returned by a classifier's
predict_proba method.
Returns
-------
loss : float
The degree to which the samples are correctly predicted.
"""
y_prob = np.clip(y_prob, 1e-10, 1 - 1e-10)
return -np.sum(y_true * np.log(y_prob) +
(1 - y_true) * np.log(1 - y_prob)) / y_prob.shape[0]
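To get a feel for the numbers, a tiny hand computation with made-up labels and probabilities (not from the source):

import numpy as np

y_true = np.array([[1.0], [0.0], [1.0]])
y_prob = np.array([[0.9], [0.2], [0.6]])
loss = -np.sum(y_true * np.log(y_prob) +
               (1 - y_true) * np.log(1 - y_prob)) / y_prob.shape[0]
print(loss)   # roughly 0.28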