Deep Belief Network
References:
http://deeplearning.net/tutorial/DBN.html
Part 1
# Imports used throughout this walkthrough; HiddenLayer, RBM and
# LogisticRegression come from the tutorial's companion files
# mlp.py, rbm.py and logistic_sgd.py.
import timeit
import numpy
import theano
import theano.tensor as T
from theano.sandbox.rng_mrg import MRG_RandomStreams
from logistic_sgd import LogisticRegression, load_data
from mlp import HiddenLayer
from rbm import RBM


class DBN(object):
    """Deep Belief Network

    A deep belief network is obtained by stacking several RBMs on top of
    each other. The hidden layer of the RBM at layer `i` becomes the input
    of the RBM at layer `i+1`. The first-layer RBM gets the input of the
    network as its input, and the hidden layer of the last RBM represents
    the output. When used as a classifier, the DBN is treated as an MLP by
    adding a logistic regression layer on top.
    """

    def __init__(self, numpy_rng, theano_rng=None, n_ins=784,
                 hidden_layers_sizes=[500, 500], n_outs=10):
        """This class is made to support a variable number of layers.

        :type numpy_rng: numpy.random.RandomState
        :param numpy_rng: numpy random number generator used to draw
                          initial weights

        :type theano_rng: theano.tensor.shared_randomstreams.RandomStreams
        :param theano_rng: Theano random generator; if None is given, one is
                           generated based on a seed drawn from `rng`

        :type n_ins: int
        :param n_ins: dimension of the input to the DBN

        :type hidden_layers_sizes: list of ints
        :param hidden_layers_sizes: intermediate layers size, must contain
                                    at least one value

        :type n_outs: int
        :param n_outs: dimension of the output of the network
        """
        self.sigmoid_layers = []
        self.rbm_layers = []
        self.params = []
        self.n_layers = len(hidden_layers_sizes)

        assert self.n_layers > 0

        if not theano_rng:
            theano_rng = MRG_RandomStreams(numpy_rng.randint(2 ** 30))

        # allocate symbolic variables for the data;
        # the data is presented as rasterized images
        self.x = T.matrix('x')
        # the labels are presented as a 1D vector of [int] labels
        self.y = T.ivector('y')
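Concretely, self.x will later be bound to minibatches of rasterized MNIST digits (one flattened 28 * 28 = 784-dimensional row per image) and self.y to the matching integer labels. A tiny standalone illustration of those shapes (not part of the tutorial code):

import numpy

# a hypothetical minibatch of 10 MNIST images and their labels
minibatch_x = numpy.zeros((10, 28 * 28), dtype='float32')  # one flattened image per row
minibatch_y = numpy.zeros(10, dtype='int32')               # one integer class label per image
print(minibatch_x.shape, minibatch_y.shape)                # (10, 784) (10,)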
Part 2
        for i in range(self.n_layers):
            # construct the sigmoidal layer

            # the size of the input is either the number of hidden
            # units of the layer below or the input size if we are on
            # the first layer
            if i == 0:
                input_size = n_ins
            else:
                input_size = hidden_layers_sizes[i - 1]

            # the input to this layer is either the activation of the
            # hidden layer below or the input of the DBN if you are on
            # the first layer
            if i == 0:
                layer_input = self.x
            else:
                layer_input = self.sigmoid_layers[-1].output

            sigmoid_layer = HiddenLayer(rng=numpy_rng,
                                        input=layer_input,
                                        n_in=input_size,
                                        n_out=hidden_layers_sizes[i],
                                        activation=T.nnet.sigmoid)

            # add the layer to our list of layers
            self.sigmoid_layers.append(sigmoid_layer)

            # only the parameters of the sigmoid layers are declared to be
            # parameters of the DBN; the visible biases of the RBMs are
            # parameters of those RBMs, but not of the DBN
            self.params.extend(sigmoid_layer.params)

            # construct an RBM that shares weights with this layer
            rbm_layer = RBM(numpy_rng=numpy_rng,
                            theano_rng=theano_rng,
                            input=layer_input,
                            n_visible=input_size,
                            n_hidden=hidden_layers_sizes[i],
                            W=sigmoid_layer.W,
                            hbias=sigmoid_layer.b)
            self.rbm_layers.append(rbm_layer)
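To make the wiring explicit, here is a small standalone trace (plain Python, not part of the tutorial code) of the sizes the loop above produces for the default architecture n_ins=784, hidden_layers_sizes=[500, 500]; each RBM sits between consecutive layers and shares its W and hbias with the corresponding sigmoid layer.

n_ins = 784
hidden_layers_sizes = [500, 500]

for i in range(len(hidden_layers_sizes)):
    input_size = n_ins if i == 0 else hidden_layers_sizes[i - 1]
    print('layer %i: sigmoid/RBM pair with n_visible=%i, n_hidden=%i '
          '(W and hbias shared)' % (i, input_size, hidden_layers_sizes[i]))
# layer 0: sigmoid/RBM pair with n_visible=784, n_hidden=500 (W and hbias shared)
# layer 1: sigmoid/RBM pair with n_visible=500, n_hidden=500 (W and hbias shared)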
Part 3
        # We now add a logistic regression layer on top of the MLP
        self.logLayer = LogisticRegression(
            input=self.sigmoid_layers[-1].output,
            n_in=hidden_layers_sizes[-1],
            n_out=n_outs)
        self.params.extend(self.logLayer.params)

        # compute the cost for the second phase of training, defined as the
        # negative log likelihood of the logistic regression (output) layer
        self.finetune_cost = self.logLayer.negative_log_likelihood(self.y)

        # symbolic variable that points to the number of errors made on the
        # minibatch given by self.x and self.y
        self.errors = self.logLayer.errors(self.y)
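As a reminder of what negative_log_likelihood computes, here is a small numpy-only illustration (hypothetical softmax outputs, not the tutorial's code): finetune_cost is the mean over the minibatch of -log p(y_i | x_i).

import numpy

# hypothetical class probabilities for a minibatch of 3 examples, 4 classes
p_y_given_x = numpy.array([[0.7, 0.1, 0.1, 0.1],
                           [0.2, 0.5, 0.2, 0.1],
                           [0.1, 0.1, 0.1, 0.7]])
y = numpy.array([0, 1, 3])  # correct labels

nll = -numpy.mean(numpy.log(p_y_given_x[numpy.arange(y.shape[0]), y]))
print(nll)  # mean of -log(0.7), -log(0.5), -log(0.7) ~= 0.469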
Part 4
    def pretraining_functions(self, train_set_x, batch_size, k):
        '''Generates a list of functions, for performing one step of
        gradient descent at a given layer. The function will require as
        input the minibatch index, and to train an RBM you just need to
        iterate, calling the corresponding function on all minibatch
        indexes.

        :type train_set_x: theano.tensor.TensorType
        :param train_set_x: Shared var. that contains all datapoints used
                            for training the RBM
        :type batch_size: int
        :param batch_size: size of a [mini]batch
        :param k: number of Gibbs steps to do in CD-k / PCD-k
        '''
        index = T.lscalar('index')  # index to a [mini]batch
Part 5
        learning_rate = T.scalar('lr')  # learning rate to use

        # beginning of a batch, given `index`
        batch_begin = index * batch_size
        # ending of a batch, given `index`
        batch_end = batch_begin + batch_size

        pretrain_fns = []
        for rbm in self.rbm_layers:
            # get the cost and the updates list;
            # using CD-k here (persistent=None) for training each RBM.
            # TODO: change cost function to reconstruction error
            cost, updates = rbm.get_cost_updates(learning_rate,
                                                 persistent=None, k=k)

            # compile the theano function
            fn = theano.function(
                inputs=[index, theano.In(learning_rate, value=0.1)],
                outputs=cost,
                updates=updates,
                givens={
                    self.x: train_set_x[batch_begin:batch_end]
                }
            )
            # append `fn` to the list of functions
            pretrain_fns.append(fn)

        return pretrain_fns
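Two details of the compiled functions are worth noting: theano.In(learning_rate, value=0.1) gives the 'lr' argument a default, so each pretraining function can be called as fn(index=i) or fn(index=i, lr=0.01); and the givens dictionary substitutes the symbolic self.x with a slice of the shared dataset, so only the minibatch index crosses the Python/Theano boundary. A minimal standalone sketch of the givens pattern (illustrative values, not part of the tutorial code):

import numpy
import theano
import theano.tensor as T

# the whole (toy) dataset lives in a shared variable, e.g. on the GPU
data = theano.shared(numpy.arange(20, dtype=theano.config.floatX).reshape(10, 2))

x = T.matrix('x')
index = T.lscalar('index')
batch_size = 2

# x is replaced by a minibatch-sized slice of the shared data
f = theano.function(
    [index], x.sum(),
    givens={x: data[index * batch_size: (index + 1) * batch_size]}
)
print(f(0))  # sum of rows 0 and 1 of the shared data: 0+1+2+3 = 6.0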
Part 6
    def build_finetune_functions(self, datasets, batch_size, learning_rate):
        '''Generates a function `train` that implements one step of
        finetuning, a function `validate` that computes the error on a
        batch from the validation set, and a function `test` that
        computes the error on a batch from the testing set.

        :type datasets: list of pairs of theano.tensor.TensorType
        :param datasets: It is a list that contains all the datasets;
                         it has to contain three pairs, `train`,
                         `valid`, `test` in this order, where each pair
                         is formed of two Theano variables, one for the
                         datapoints, the other for the labels
        :type batch_size: int
        :param batch_size: size of a minibatch
        :type learning_rate: float
        :param learning_rate: learning rate used during the finetune stage
        '''
        (train_set_x, train_set_y) = datasets[0]
        (valid_set_x, valid_set_y) = datasets[1]
        (test_set_x, test_set_y) = datasets[2]

        # compute number of minibatches for validation and testing
        n_valid_batches = valid_set_x.get_value(borrow=True).shape[0]
        n_valid_batches //= batch_size
        n_test_batches = test_set_x.get_value(borrow=True).shape[0]
        n_test_batches //= batch_size

        index = T.lscalar('index')  # index to a [mini]batch

        # compute the gradients with respect to the model parameters
        gparams = T.grad(self.finetune_cost, self.params)

        # compute list of fine-tuning updates
        updates = []
        for param, gparam in zip(self.params, gparams):
            updates.append((param, param - gparam * learning_rate))

        train_fn = theano.function(
            inputs=[index],
            outputs=self.finetune_cost,
            updates=updates,
            givens={
                self.x: train_set_x[
                    index * batch_size: (index + 1) * batch_size
                ],
                self.y: train_set_y[
                    index * batch_size: (index + 1) * batch_size
                ]
            }
        )

        test_score_i = theano.function(
            [index],
            self.errors,
            givens={
                self.x: test_set_x[
                    index * batch_size: (index + 1) * batch_size
                ],
                self.y: test_set_y[
                    index * batch_size: (index + 1) * batch_size
                ]
            }
        )

        valid_score_i = theano.function(
            [index],
            self.errors,
            givens={
                self.x: valid_set_x[
                    index * batch_size: (index + 1) * batch_size
                ],
                self.y: valid_set_y[
                    index * batch_size: (index + 1) * batch_size
                ]
            }
        )

        # Create a function that scans the entire validation set
        def valid_score():
            return [valid_score_i(i) for i in range(n_valid_batches)]

        # Create a function that scans the entire test set
        def test_score():
            return [test_score_i(i) for i in range(n_test_batches)]

        return train_fn, valid_score, test_score
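The tutorial's driver (not shown in this walkthrough) wraps these three callables in an early-stopping loop. A simplified sketch of the calling pattern, assuming the usual driver variables (datasets, batch_size, finetune_lr, training_epochs, n_train_batches) have already been set up:

# Sketch only: the real driver adds patience-based early stopping around this loop.
train_fn, valid_score, test_score = dbn.build_finetune_functions(
    datasets=datasets, batch_size=batch_size, learning_rate=finetune_lr)

best_validation_loss = numpy.inf
for epoch in range(training_epochs):
    for minibatch_index in range(n_train_batches):
        train_fn(minibatch_index)                       # one SGD step on one minibatch
    this_validation_loss = numpy.mean(valid_score())    # mean error over the validation set
    if this_validation_loss < best_validation_loss:     # keep the test error measured at the
        best_validation_loss = this_validation_loss     # best validation point
        test_loss = numpy.mean(test_score())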
Part 7
numpy_rng = numpy.random.RandomState(123)
print('... building the model')
# construct the Deep Belief Network
dbn = DBN(numpy_rng=numpy_rng, n_ins=28 * 28,
          hidden_layers_sizes=[1000, 1000, 1000],
          n_outs=10)
Part 8
#########################
# PRETRAINING THE MODEL #
#########################
print('... getting the pretraining functions')
pretraining_fns = dbn.pretraining_functions(train_set_x=train_set_x,
                                            batch_size=batch_size,
                                            k=k)

print('... pre-training the model')
start_time = timeit.default_timer()
# Pre-train layer-wise
for i in range(dbn.n_layers):
    # go through pretraining epochs
    for epoch in range(pretraining_epochs):
        # go through the training set
        c = []
        for batch_index in range(n_train_batches):
            c.append(pretraining_fns[i](index=batch_index,
                                        lr=pretrain_lr))
        print('Pre-training layer %i, epoch %d, cost ' % (i, epoch), end=' ')
        print(numpy.mean(c, dtype='float64'))

end_time = timeit.default_timer()
With the default parameters, the code runs for 100 pre-training epochs with mini-batches of size 10. This corresponds to performing 500,000 unsupervised parameter updates. We use an unsupervised learning rate of 0.01 and a supervised learning rate of 0.1. The DBN itself consists of three hidden layers with 1000 units per layer. With early stopping, this configuration achieved a minimal validation error of 1.27 percent, with a corresponding test error of 1.34 percent, after 46 supervised epochs.
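The 500,000 figure follows directly from the size of the MNIST training split used by the tutorial (50,000 examples), since each RBM is trained for 100 epochs over that split:

n_train_examples = 50000       # MNIST training split in mnist.pkl.gz
batch_size = 10
pretraining_epochs = 100

n_train_batches = n_train_examples // batch_size          # 5,000 minibatches per epoch
updates_per_layer = n_train_batches * pretraining_epochs
print(updates_per_layer)                                  # 500000 unsupervised updates per layer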
On an Intel(R) Xeon(R) CPU X5560 running at 2.80GHz, using a multi-threaded MKL library (running on 4 cores), pretraining took 615 minutes with an average of 2.05 mins/(layer * epoch). Fine-tuning took only 101 minutes or approximately 2.20 mins/epoch.
Hyper-parameters were selected by optimizing on the validation error. We tested unsupervised learning rates in {10^-1, ..., 10^-5} and supervised learning rates in {10^-1, ..., 10^-4}. We did not use any form of regularization besides early stopping, nor did we optimize over the number of pretraining updates.