Appendix: Implementation Code
# -*- coding: utf-8 -*-
"""
This tutorial introduces the multilayer perceptron using Theano.
A multilayer perceptron is a logistic regressor where
instead of feeding the input to the logistic regression you insert a
intermediate layer, called the hidden layer, that has a nonlinear
activation function (usually tanh or sigmoid) . One can use many such
hidden layers making the architecture deep. The tutorial will also tackle
the problem of MNIST digit classification.
.. math::
f(x) = G( b^{(2)} + W^{(2)}( s( b^{(1)} + W^{(1)} x))),
References:
- textbooks: "Pattern Recognition and Machine Learning" -
Christopher M. Bishop, section 5
"""
"""
本文仅在源代码注释基础上加入了部分中文注释便于理解
logistic_sgd相关定义请参阅
http://deeplearning.net/tutorial/gettingstarted.html#gettingstarted
"""
from __future__ import print_function
__docformat__ = 'restructuredtext en'
import os
import sys
import timeit
import numpy
import theano
import theano.tensor as T
from logistic_sgd import LogisticRegression, load_data
# start-snippet-1
class HiddenLayer(object):
    def __init__(self, rng, input, n_in, n_out, W=None, b=None,
                 activation=T.tanh):
        """
        Typical hidden layer of an MLP: units are fully connected and have a
        sigmoidal activation function. Weight matrix W is of shape (n_in, n_out)
        and the bias vector b is of shape (n_out,).

        NOTE: the default nonlinearity is tanh; the hidden-unit activation is
        then given by tanh(dot(input, W) + b)

        :type rng: numpy.random.RandomState
        :param rng: a random number generator used to initialize the weights

        :type input: theano.tensor.dmatrix
        :param input: a symbolic tensor of shape (n_examples, n_in)
                      describing the input

        :type n_in: int
        :param n_in: dimensionality of the input

        :type n_out: int
        :param n_out: number of hidden units

        :type activation: theano.Op or function
        :param activation: nonlinearity to be applied in the hidden layer
        """
        self.input = input
        # end-snippet-1
        # if W is not given, initialize it automatically
        if W is None:
            # sample W uniformly from the range suggested in Section 5.2
            W_values = numpy.asarray(
                rng.uniform(
                    low=-numpy.sqrt(6. / (n_in + n_out)),
                    high=numpy.sqrt(6. / (n_in + n_out)),
                    size=(n_in, n_out)
                ),
                dtype=theano.config.floatX
            )
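            # The bounds +/- sqrt(6 / (n_in + n_out)) are the tanh
            # initialization suggested by Glorot & Bengio (2010); converting
            # the result to theano.config.floatX keeps the weights in Theano's
            # float type so the code can also run on a GPU.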
            # for the sigmoid (logistic) function -- since the name "sigmoid"
            # is somewhat ambiguous, this article follows the naming used in
            # Andrew Ng's Machine Learning course -- the range is scaled up
            # by a factor of four
            if activation == theano.tensor.nnet.sigmoid:
                W_values *= 4

            W = theano.shared(value=W_values, name='W', borrow=True)

        # the bias b is initialized to zeros
        if b is None:
            b_values = numpy.zeros((n_out,), dtype=theano.config.floatX)
            b = theano.shared(value=b_values, name='b', borrow=True)

        self.W = W
        self.b = b

        # linear output of the layer: dot(input, W) + b
        lin_output = T.dot(input, self.W) + self.b
        self.output = (
            lin_output if activation is None
            else activation(lin_output)
        )
        # parameters of the model
        self.params = [self.W, self.b]
# start-snippet-2
class MLP(object):
"""
多层神经网络定义
在本例中仅包含一个隐层
"""
    def __init__(self, rng, input, n_in, n_hidden, n_out):
        """Initialize the parameters of the multilayer perceptron.

        :type rng: numpy.random.RandomState
        :param rng: a random number generator used to initialize the weights

        :type input: theano.tensor.TensorType
        :param input: symbolic variable describing the input of the
                      architecture (one minibatch)

        :type n_in: int
        :param n_in: number of input units, the dimension of the data

        :type n_hidden: int
        :param n_hidden: number of hidden units

        :type n_out: int
        :param n_out: number of output units, the dimension of the output
        """
        # initialize the hidden layer, with tanh as its activation function
        self.hiddenLayer = HiddenLayer(
            rng=rng,
            input=input,
            n_in=n_in,
            n_out=n_hidden,
            activation=T.tanh
        )

        # the logistic regression layer (output layer) takes the hidden
        # layer's output as its input
        self.logRegressionLayer = LogisticRegression(
            input=self.hiddenLayer.output,
            n_in=n_hidden,
            n_out=n_out
        )
        # end-snippet-2 start-snippet-3
        # L1 norm of the weights, used for regularization
        self.L1 = (
            abs(self.hiddenLayer.W).sum()
            + abs(self.logRegressionLayer.W).sum()
        )

        # squared L2 norm of the weights, used for regularization
        self.L2_sqr = (
            (self.hiddenLayer.W ** 2).sum()
            + (self.logRegressionLayer.W ** 2).sum()
        )
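
        # penalizing the L1 norm pushes many weights towards exactly zero
        # (a sparsity effect), while penalizing the squared L2 norm keeps all
        # weights small; their relative importance is set by the L1_reg and
        # L2_reg arguments of test_mlp below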
        # negative log likelihood of the MLP, given by the negative log
        # likelihood of the output (logistic regression) layer
        self.negative_log_likelihood = (
            self.logRegressionLayer.negative_log_likelihood
        )
        # error function (defined in logistic_sgd)
        self.errors = self.logRegressionLayer.errors

        # the parameters to be optimized
        self.params = self.hiddenLayer.params + self.logRegressionLayer.params
        # end-snippet-3

        self.input = input
def test_mlp(learning_rate=0.01, L1_reg=0.00, L2_reg=0.0001, n_epochs=1000,
             dataset='mnist.pkl.gz', batch_size=20, n_hidden=500):
    """
    Demonstrate stochastic gradient descent training of an MLP on MNIST.

    :type learning_rate: float
    :param learning_rate: learning rate (step size) for gradient descent

    :type L1_reg: float
    :param L1_reg: weight of the L1 regularization term

    :type L2_reg: float
    :param L2_reg: weight of the L2 regularization term

    :type n_epochs: int
    :param n_epochs: maximal number of training epochs

    :type dataset: string
    :param dataset: path to the dataset; if you do not have it, download it
        from http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz
    """
    datasets = load_data(dataset)

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    # compute the number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] // batch_size
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] // batch_size
    n_test_batches = test_set_x.get_value(borrow=True).shape[0] // batch_size
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print('... building the model')

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch
    x = T.matrix('x')    # the data is presented as rasterized images
    y = T.ivector('y')   # the labels are presented as 1D vector of
                         # [int] labels

    rng = numpy.random.RandomState(1234)

    # construct the MLP class
    classifier = MLP(
        rng=rng,
        input=x,
        n_in=28 * 28,
        n_hidden=n_hidden,
        n_out=10
    )
    # start-snippet-4
    # define the cost function: the negative log likelihood of the model
    # plus the L1 and L2 regularization terms
    cost = (
        classifier.negative_log_likelihood(y)
        + L1_reg * classifier.L1
        + L2_reg * classifier.L2_sqr
    )
    # end-snippet-4
    # compile Theano functions that compute the mistakes the model makes
    # on a minibatch of the test / validation set
    test_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: test_set_x[index * batch_size:(index + 1) * batch_size],
            y: test_set_y[index * batch_size:(index + 1) * batch_size]
        }
    )

    validate_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: valid_set_x[index * batch_size:(index + 1) * batch_size],
            y: valid_set_y[index * batch_size:(index + 1) * batch_size]
        }
    )
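
    # in both functions above, `givens` replaces the symbolic variables x and
    # y with the slice of the corresponding shared dataset that belongs to
    # minibatch number `index`, so no data has to be passed in explicitly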
    # start-snippet-5
    # compute the gradient of the cost with respect to each parameter;
    # the resulting gradients are stored in the list gparams
    gparams = [T.grad(cost, param) for param in classifier.params]

    # specify how to update the parameters of the model as a list of
    # (variable, update expression) pairs

    # given two lists of the same length, A = [a1, a2, a3, a4] and
    # B = [b1, b2, b3, b4], zip generates a list C of the same size, where
    # each element is a pair formed from the two lists:
    # C = [(a1, b1), (a2, b2), (a3, b3), (a4, b4)]
    updates = [
        (param, param - learning_rate * gparam)
        for param, gparam in zip(classifier.params, gparams)
    ]
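
    # the update rule above is plain minibatch stochastic gradient descent:
    # each parameter is moved one step of size `learning_rate` against its
    # gradient, param <- param - learning_rate * d(cost)/d(param)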
    # compile a Theano function that returns the cost on a training minibatch
    # and, at the same time, applies the parameter updates
    train_model = theano.function(
        inputs=[index],
        outputs=cost,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size: (index + 1) * batch_size],
            y: train_set_y[index * batch_size: (index + 1) * batch_size]
        }
    )
    # end-snippet-5
    ###############
    # TRAIN MODEL #
    ###############
    print('... training')

    # early-stopping parameters, used to guard against overfitting:
    # training is stopped once further training starts to hurt accuracy
    # on the validation set
    patience = 10000  # look at this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
                           # found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
    validation_frequency = min(n_train_batches, patience // 2)
                                  # go through this many
                                  # minibatches before checking the network
                                  # on the validation set; in this case we
                                  # check every epoch

    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = timeit.default_timer()

    epoch = 0
    done_looping = False
    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in range(n_train_batches):

            minibatch_avg_cost = train_model(minibatch_index)
            # iteration number
            iter = (epoch - 1) * n_train_batches + minibatch_index

            if (iter + 1) % validation_frequency == 0:
                # compute zero-one loss on validation set
                validation_losses = [validate_model(i) for i
                                     in range(n_valid_batches)]
                this_validation_loss = numpy.mean(validation_losses)

                print(
                    'epoch %i, minibatch %i/%i, validation error %f %%' %
                    (
                        epoch,
                        minibatch_index + 1,
                        n_train_batches,
                        this_validation_loss * 100.
                    )
                )

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:
                    # improve patience if loss improvement is good enough
                    if (
                        this_validation_loss < best_validation_loss *
                        improvement_threshold
                    ):
                        patience = max(patience, iter * patience_increase)

                    best_validation_loss = this_validation_loss
                    best_iter = iter

                    # test it on the test set
                    test_losses = [test_model(i) for i
                                   in range(n_test_batches)]
                    test_score = numpy.mean(test_losses)

                    print((' epoch %i, minibatch %i/%i, test error of '
                           'best model %f %%') %
                          (epoch, minibatch_index + 1, n_train_batches,
                           test_score * 100.))

            if patience <= iter:
                done_looping = True
                break
    end_time = timeit.default_timer()
    print(('Optimization complete. Best validation score of %f %% '
           'obtained at iteration %i, with test performance %f %%') %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print(('The code for file ' +
           os.path.split(__file__)[1] +
           ' ran for %.2fm' % ((end_time - start_time) / 60.)), file=sys.stderr)
if __name__ == '__main__':
    test_mlp()
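    # For a quick sanity check one could instead call test_mlp with much
    # smaller settings, e.g. test_mlp(n_epochs=2, n_hidden=50); these values
    # are illustrative only, not the tutorial's defaults.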