Deep Belief Network
References:
http://deeplearning.net/tutorial/DBN.html
Part 1
# Imports used throughout this walkthrough; HiddenLayer, RBM and
# LogisticRegression come from the tutorial's companion files
# mlp.py, rbm.py and logistic_sgd.py.
import timeit
import numpy
import theano
import theano.tensor as T
from theano.sandbox.rng_mrg import MRG_RandomStreams
from logistic_sgd import LogisticRegression, load_data
from mlp import HiddenLayer
from rbm import RBM


class DBN(object):
    """Deep Belief Network

    A deep belief network is obtained by stacking several RBMs on top of
    each other. The hidden layer of the RBM at layer `i` becomes the input
    of the RBM at layer `i+1`. The first-layer RBM gets the input of the
    network as its input, and the hidden layer of the last RBM represents
    the output. When used as a classifier, the DBN is treated as an MLP by
    adding a logistic regression layer on top.
    """

    def __init__(self, numpy_rng, theano_rng=None, n_ins=784,
                 hidden_layers_sizes=[500, 500], n_outs=10):
        """This class is made to support a variable number of layers.

        :type numpy_rng: numpy.random.RandomState
        :param numpy_rng: numpy random number generator used to draw
                          initial weights

        :type theano_rng: theano.tensor.shared_randomstreams.RandomStreams
        :param theano_rng: Theano random generator; if None is given, one is
                           generated based on a seed drawn from `rng`

        :type n_ins: int
        :param n_ins: dimension of the input to the DBN

        :type hidden_layers_sizes: list of ints
        :param hidden_layers_sizes: intermediate layers size, must contain
                                    at least one value

        :type n_outs: int
        :param n_outs: dimension of the output of the network
        """
        self.sigmoid_layers = []
        self.rbm_layers = []
        self.params = []
        self.n_layers = len(hidden_layers_sizes)

        assert self.n_layers > 0

        if not theano_rng:
            theano_rng = MRG_RandomStreams(numpy_rng.randint(2 ** 30))

        # allocate symbolic variables for the data;
        # the data is presented as rasterized images
        self.x = T.matrix('x')
        # the labels are presented as a 1D vector of [int] labels
        self.y = T.ivector('y')
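Concretely, self.x will later be bound to minibatches of rasterized MNIST digits (one flattened 28 * 28 = 784-dimensional row per image) and self.y to the matching integer labels. A tiny standalone illustration of those shapes (not part of the tutorial code):

import numpy

# a hypothetical minibatch of 10 MNIST images and their labels
minibatch_x = numpy.zeros((10, 28 * 28), dtype='float32')  # one flattened image per row
minibatch_y = numpy.zeros(10, dtype='int32')               # one integer class label per image
print(minibatch_x.shape, minibatch_y.shape)                # (10, 784) (10,)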
Part 2
        for i in range(self.n_layers):
            # construct the sigmoidal layer

            # the size of the input is either the number of hidden
            # units of the layer below or the input size if we are on
            # the first layer
            if i == 0:
                input_size = n_ins
            else:
                input_size = hidden_layers_sizes[i - 1]

            # the input to this layer is either the activation of the
            # hidden layer below or the input of the DBN if you are on
            # the first layer
            if i == 0:
                layer_input = self.x
            else:
                layer_input = self.sigmoid_layers[-1].output

            sigmoid_layer = HiddenLayer(rng=numpy_rng,
                                        input=layer_input,
                                        n_in=input_size,
                                        n_out=hidden_layers_sizes[i],
                                        activation=T.nnet.sigmoid)

            # add the layer to our list of layers
            self.sigmoid_layers.append(sigmoid_layer)

            # only the parameters of the sigmoid layers are declared to be
            # parameters of the DBN; the visible biases of the RBMs are
            # parameters of those RBMs, but not of the DBN
            self.params.extend(sigmoid_layer.params)

            # construct an RBM that shares weights with this layer
            rbm_layer = RBM(numpy_rng=numpy_rng,
                            theano_rng=theano_rng,
                            input=layer_input,
                            n_visible=input_size,
                            n_hidden=hidden_layers_sizes[i],
                            W=sigmoid_layer.W,
                            hbias=sigmoid_layer.b)
            self.rbm_layers.append(rbm_layer)
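To make the wiring explicit, here is a small standalone trace (plain Python, not part of the tutorial code) of the sizes the loop above produces for the default architecture n_ins=784, hidden_layers_sizes=[500, 500]; each RBM sits between consecutive layers and shares its W and hbias with the corresponding sigmoid layer.

n_ins = 784
hidden_layers_sizes = [500, 500]

for i in range(len(hidden_layers_sizes)):
    input_size = n_ins if i == 0 else hidden_layers_sizes[i - 1]
    print('layer %i: sigmoid/RBM pair with n_visible=%i, n_hidden=%i '
          '(W and hbias shared)' % (i, input_size, hidden_layers_sizes[i]))
# layer 0: sigmoid/RBM pair with n_visible=784, n_hidden=500 (W and hbias shared)
# layer 1: sigmoid/RBM pair with n_visible=500, n_hidden=500 (W and hbias shared)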
Part 3
        # We now add a logistic regression layer on top of the MLP
        self.logLayer = LogisticRegression(
            input=self.sigmoid_layers[-1].output,
            n_in=hidden_layers_sizes[-1],
            n_out=n_outs)
        self.params.extend(self.logLayer.params)

        # compute the cost for the second phase of training, defined as the
        # negative log likelihood of the logistic regression (output) layer
        self.finetune_cost = self.logLayer.negative_log_likelihood(self.y)

        # symbolic variable that points to the number of errors made on the
        # minibatch given by self.x and self.y
        self.errors = self.logLayer.errors(self.y)
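As a reminder of what negative_log_likelihood computes, here is a small numpy-only illustration (hypothetical softmax outputs, not the tutorial's code): finetune_cost is the mean over the minibatch of -log p(y_i | x_i).

import numpy

# hypothetical class probabilities for a minibatch of 3 examples, 4 classes
p_y_given_x = numpy.array([[0.7, 0.1, 0.1, 0.1],
                           [0.2, 0.5, 0.2, 0.1],
                           [0.1, 0.1, 0.1, 0.7]])
y = numpy.array([0, 1, 3])  # correct labels

nll = -numpy.mean(numpy.log(p_y_given_x[numpy.arange(y.shape[0]), y]))
print(nll)  # mean of -log(0.7), -log(0.5), -log(0.7) ~= 0.469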
Part 4
    def pretraining_functions(self, train_set_x, batch_size, k):
        '''Generates a list of functions, for performing one step of
        gradient descent at a given layer. The function will require as
        input the minibatch index, and to train an RBM you just need to
        iterate, calling the corresponding function on all minibatch
        indexes.

        :type train_set_x: theano.tensor.TensorType
        :param train_set_x: Shared var. that contains all datapoints used
                            for training the RBM
        :type batch_size: int
        :param batch_size: size of a [mini]batch
        :param k: number of Gibbs steps to do in CD-k / PCD-k
        '''
        index = T.lscalar('index')  # index to a [mini]batch
Part 5
        learning_rate = T.scalar('lr')  # learning rate to use

        # beginning of a batch, given `index`
        batch_begin = index * batch_size
        # ending of a batch, given `index`
        batch_end = batch_begin + batch_size

        pretrain_fns = []
        for rbm in self.rbm_layers:
            # get the cost and the updates list;
            # using CD-k here (persistent=None) for training each RBM.
            # TODO: change cost function to reconstruction error
            cost, updates = rbm.get_cost_updates(learning_rate,
                                                 persistent=None, k=k)

            # compile the theano function
            fn = theano.function(
                inputs=[index, theano.In(learning_rate, value=0.1)],
                outputs=cost,
                updates=updates,
                givens={
                    self.x: train_set_x[batch_begin:batch_end]
                }
            )
            # append `fn` to the list of functions
            pretrain_fns.append(fn)

        return pretrain_fns
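Two details of the compiled functions are worth noting: theano.In(learning_rate, value=0.1) gives the 'lr' argument a default, so each pretraining function can be called as fn(index=i) or fn(index=i, lr=0.01); and the givens dictionary substitutes the symbolic self.x with a slice of the shared dataset, so only the minibatch index crosses the Python/Theano boundary. A minimal standalone sketch of the givens pattern (illustrative values, not part of the tutorial code):

import numpy
import theano
import theano.tensor as T

# the whole (toy) dataset lives in a shared variable, e.g. on the GPU
data = theano.shared(numpy.arange(20, dtype=theano.config.floatX).reshape(10, 2))

x = T.matrix('x')
index = T.lscalar('index')
batch_size = 2

# x is replaced by a minibatch-sized slice of the shared data
f = theano.function(
    [index], x.sum(),
    givens={x: data[index * batch_size: (index + 1) * batch_size]}
)
print(f(0))  # sum of rows 0 and 1 of the shared data: 0+1+2+3 = 6.0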
Part 6
    def build_finetune_functions(self, datasets, batch_size, learning_rate):
        '''Generates a function `train` that implements one step of
        finetuning, a function `validate` that computes the error on a
        batch from the validation set, and a function `test` that
        computes the error on a batch from the testing set.

        :type datasets: list of pairs of theano.tensor.TensorType
        :param datasets: It is a list that contains all the datasets;
                         it has to contain three pairs, `train`,
                         `valid`, `test` in this order, where each pair
                         is formed of two Theano variables, one for the
                         datapoints, the other for the labels
        :type batch_size: int
        :param batch_size: size of a minibatch
        :type learning_rate: float
        :param learning_rate: learning rate used during the finetune stage
        '''
        (train_set_x, train_set_y) = datasets[0]
        (valid_set_x, valid_set_y) = datasets[1]
        (test_set_x, test_set_y) = datasets[2]

        # compute number of minibatches for validation and testing
        n_valid_batches = valid_set_x.get_value(borrow=True).shape[0]
        n_valid_batches //= batch_size
        n_test_batches = test_set_x.get_value(borrow=True).shape[0]
        n_test_batches //= batch_size

        index = T.lscalar('index')  # index to a [mini]batch

        # compute the gradients with respect to the model parameters
        gparams = T.grad(self.finetune_cost, self.params)

        # compute list of fine-tuning updates
        updates = []
        for param, gparam in zip(self.params, gparams):
            updates.append((param, param - gparam * learning_rate))

        train_fn = theano.function(
            inputs=[index],
            outputs=self.finetune_cost,
            updates=updates,
            givens={
                self.x: train_set_x[
                    index * batch_size: (index + 1) * batch_size
                ],
                self.y: train_set_y[
                    index * batch_size: (index + 1) * batch_size
                ]
            }
        )

        test_score_i = theano.function(
            [index],
            self.errors,
            givens={
                self.x: test_set_x[
                    index * batch_size: (index + 1) * batch_size
                ],
                self.y: test_set_y[
                    index * batch_size: (index + 1) * batch_size
                ]
            }
        )

        valid_score_i = theano.function(
            [index],
            self.errors,
            givens={
                self.x: valid_set_x[
                    index * batch_size: (index + 1) * batch_size
                ],
                self.y: valid_set_y[
                    index * batch_size: (index + 1) * batch_size
                ]
            }
        )

        # Create a function that scans the entire validation set
        def valid_score():
            return [valid_score_i(i) for i in range(n_valid_batches)]

        # Create a function that scans the entire test set
        def test_score():
            return [test_score_i(i) for i in range(n_test_batches)]

        return train_fn, valid_score, test_score
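The tutorial's driver (not shown in this walkthrough) wraps these three callables in an early-stopping loop. A simplified sketch of the calling pattern, assuming the usual driver variables (datasets, batch_size, finetune_lr, training_epochs, n_train_batches) have already been set up:

# Sketch only: the real driver adds patience-based early stopping around this loop.
train_fn, valid_score, test_score = dbn.build_finetune_functions(
    datasets=datasets, batch_size=batch_size, learning_rate=finetune_lr)

best_validation_loss = numpy.inf
for epoch in range(training_epochs):
    for minibatch_index in range(n_train_batches):
        train_fn(minibatch_index)                       # one SGD step on one minibatch
    this_validation_loss = numpy.mean(valid_score())    # mean error over the validation set
    if this_validation_loss < best_validation_loss:     # keep the test error measured at the
        best_validation_loss = this_validation_loss     # best validation point
        test_loss = numpy.mean(test_score())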
Part 7
numpy_rng = numpy.random.RandomState(123)
print('... building the model')
# construct the Deep Belief Network
dbn = DBN(numpy_rng=numpy_rng, n_ins=28 * 28,
          hidden_layers_sizes=[1000, 1000, 1000],
          n_outs=10)
Part 8
#########################
# PRETRAINING THE MODEL #
#########################
print('... getting the pretraining functions')
pretraining_fns = dbn.pretraining_functions(train_set_x=train_set_x,
                                            batch_size=batch_size,
                                            k=k)

print('... pre-training the model')
start_time = timeit.default_timer()
# Pre-train layer-wise
for i in range(dbn.n_layers):
    # go through pretraining epochs
    for epoch in range(pretraining_epochs):
        # go through the training set
        c = []
        for batch_index in range(n_train_batches):
            c.append(pretraining_fns[i](index=batch_index,
                                        lr=pretrain_lr))
        print('Pre-training layer %i, epoch %d, cost ' % (i, epoch), end=' ')
        print(numpy.mean(c, dtype='float64'))

end_time = timeit.default_timer()
With the default parameters, the code runs for 100 pre-training epochs with mini-batches of size 10. This corresponds to performing 500,000 unsupervised parameter updates. We use an unsupervised learning rate of 0.01 and a supervised learning rate of 0.1. The DBN itself consists of three hidden layers with 1000 units per layer. With early stopping, this configuration achieved a minimal validation error of 1.27 percent, with a corresponding test error of 1.34 percent, after 46 supervised epochs.
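The 500,000 figure follows directly from the size of the MNIST training split used by the tutorial (50,000 examples), since each RBM is trained for 100 epochs over that split:

n_train_examples = 50000       # MNIST training split in mnist.pkl.gz
batch_size = 10
pretraining_epochs = 100

n_train_batches = n_train_examples // batch_size          # 5,000 minibatches per epoch
updates_per_layer = n_train_batches * pretraining_epochs
print(updates_per_layer)                                  # 500000 unsupervised updates per layer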
On an Intel(R) Xeon(R) CPU X5560 running at 2.80GHz, using a multi-threaded MKL library (running on 4 cores), pretraining took 615 minutes with an average of 2.05 mins/(layer * epoch). Fine-tuning took only 101 minutes or approximately 2.20 mins/epoch.
Hyper-parameters were selected by optimizing on the validation error. We tested unsupervised learning rates in {10^-1, ..., 10^-5} and supervised learning rates in {10^-1, ..., 10^-4}. We did not use any form of regularization besides early stopping, nor did we optimize over the number of pretraining updates.