Parts of this article are adapted from zouxy09's blog (many thanks!): http://blog.csdn.net/zouxy09/article/details/9993371
and from the Stanford deep learning tutorial (UFLDL): http://ufldl.stanford.edu/wiki/index.php/UFLDL教程
A CNN has far more connections than weights, because the weights are shared. Through data-driven training, a CNN learns a set of filters that serve as a way of extracting features from the input.
A typical CNN starts with alternating convolution and downsampling (subsampling) layers, followed by a few fully connected layers at the end. Before the fully connected layers, all of the 2-D feature maps are flattened into a single 1-D input vector, as sketched below.
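As an illustration of that flattening step, the following minimal MATLAB sketch mirrors what cnnff.m (listed later) does at its end; the variable names maps and fv and the toy sizes are only illustrative, not part of the toolbox:

% Hypothetical sketch: flatten the final 2-D feature maps into one feature vector per sample.
maps = {rand(4, 4, 50), rand(4, 4, 50)};               % two 4x4 maps for 50 samples (toy data)
fv = [];
for j = 1 : numel(maps)
    sa = size(maps{j});                                % [height width numSamples]
    fv = [fv; reshape(maps{j}, sa(1) * sa(2), sa(3))]; % one 16 x 50 block per map
end
% fv is now 32 x 50: (height*width*numMaps) rows, one column per sample.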
Suppose the network handles a c-class classification problem and there are N training samples in total. Define the squared-error cost function

E = (1/2) * Σ_{n=1..N} Σ_{k=1..c} (t_k^n − y_k^n)^2,

where t_k^n is the k-th component of the n-th target (label) vector and y_k^n is the corresponding network output. Training the network means adjusting the parameters so that this cost is minimized.
Batch gradient descent is a commonly used method for optimizing such an objective: differentiate the objective with respect to the parameters and move the parameters along the negative gradient, so that the objective descends toward a minimum. Each iteration therefore updates every parameter θ (a weight or a bias) as

θ = θ − α * ∂E/∂θ,

where α is the learning rate. The gradients ∂E/∂θ are computed with the backpropagation (BP) algorithm; the following paragraphs describe how BP applies to the convolutional and subsampling layers of a CNN.
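As a minimal sketch of that update rule (all names here are illustrative and self-contained, not taken from the toolbox; alpha plays the role of opts.alpha in the code below):

% Minimal sketch of one gradient-descent parameter update (illustrative names):
W  = randn(10, 4);  b  = zeros(10, 1);   % some parameters
dW = randn(10, 4);  db = randn(10, 1);   % pretend these are dE/dW and dE/db from backprop
alpha = 1;                               % learning rate (opts.alpha in the code below)
W = W - alpha * dW;                      % move the weights against the gradient
b = b - alpha * db;                      % same update for the biases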
BP updates for the convolutional layers of a CNN. In a convolutional layer, the feature maps of the previous layer are convolved with learnable kernels and passed through an activation function to produce the output feature maps. Each output map may combine convolutions over several input maps; the forward pass is

x_j^l = f( Σ_{i ∈ M_j} x_i^{l-1} * k_{ij}^l + b_j^l ),

where M_j is the set of input maps that feed output map j, k_{ij}^l is the kernel connecting input map i to output map j, b_j^l is the bias of output map j, and f is the activation function (a sigmoid in the code below).
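A toy MATLAB sketch of that forward computation for one output map, in the style of cnnff.m (the input maps, kernels and bias below are made-up data, not the toolbox's):

% Toy forward pass for one convolutional output map (all data is illustrative):
sigm = @(x) 1 ./ (1 + exp(-x));            % logistic activation, as in the toolbox
in1 = rand(28, 28);  in2 = rand(28, 28);   % two input feature maps
k1  = randn(5, 5);   k2  = randn(5, 5);    % one 5x5 kernel per input map
b   = 0;                                   % bias of this output map
z   = convn(in1, k1, 'valid') + convn(in2, k2, 'valid');  % sum of 'valid' convolutions
out = sigm(z + b);                         % 24x24 output feature map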
When adjusting the parameters of a convolutional layer, the deltas (error terms) are computed from the connections between the convolutional layer and the subsampling layer above it, whereas the parameter updates themselves use the connections between the preceding layer and the convolutional layer.
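The following sketch shows the delta computation for a convolutional layer followed by a 2x2 mean-pooling layer, mirroring the expand-based upsampling in cnnbp.m; kron is used here in place of the toolbox's expand helper, and the data is illustrative:

% Toy delta for a conv layer followed by a 2x2 mean-pooling layer (illustrative data):
a      = rand(24, 24);                        % conv-layer activations (sigmoid outputs)
d_pool = randn(12, 12);                       % deltas of the pooling layer above
scale  = 2;
d_up   = kron(d_pool, ones(scale)) / scale^2; % upsample, divide by the pooling area
d_conv = a .* (1 - a) .* d_up;                % multiply by the sigmoid derivative a.*(1-a)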
For a subsampling (downsampling) layer, N input feature maps give N output maps; each output map is just a shrunken version of the corresponding input map. The forward pass is

x_j^l = down(x_j^{l-1}),

where down(·) averages each scale x scale block of the input map into a single output value. (The general formulation in the literature also allows a multiplicative coefficient β_j^l and a bias b_j^l per map; this implementation uses plain mean pooling with neither.)
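A minimal sketch of the mean pooling that cnnff.m performs for a subsampling layer: convolve with an averaging kernel, then keep every scale-th element (the input map here is toy data):

% Toy 2x2 mean pooling, as done in cnnff.m (illustrative input):
in    = rand(24, 24);                                  % one input feature map
scale = 2;
z     = convn(in, ones(scale) / (scale ^ 2), 'valid'); % moving 2x2 average (23x23)
out   = z(1 : scale : end, 1 : scale : end);           % keep every 2nd row/column -> 12x12 output map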
1、cnnapplygrads.m
function net = cnnapplygrads(net, opts)
% Apply one step of gradient descent with learning rate opts.alpha,
% using the gradients computed in cnnbp.m.
for l = 2 : numel(net.layers)
if strcmp(net.layers{l}.type, 'c')
for j = 1 : numel(net.layers{l}.a)
for ii = 1 : numel(net.layers{l - 1}.a)
% update the kernel connecting input map ii to output map j
net.layers{l}.k{ii}{j} = net.layers{l}.k{ii}{j} - opts.alpha * net.layers{l}.dk{ii}{j};
end
% update the bias of output map j
net.layers{l}.b{j} = net.layers{l}.b{j} - opts.alpha * net.layers{l}.db{j};
end
end
end
% update the fully connected output layer's weights and biases
net.ffW = net.ffW - opts.alpha * net.dffW;
net.ffb = net.ffb - opts.alpha * net.dffb;
end
2、cnnbp.m
function net = cnnbp(net, y)
% Backpropagation: compute the output error, propagate the deltas back through
% the layers, and accumulate the gradients dk, db, dffW and dffb.
n = numel(net.layers);
% error
net.e = net.o - y;
% loss function
net.L = 1/2* sum(net.e(:) .^ 2) / size(net.e, 2);
%% backprop deltas
net.od = net.e .* (net.o .* (1 - net.o)); % output delta
net.fvd = (net.ffW' * net.od); % feature vector delta
if strcmp(net.layers{n}.type, 'c') % only conv layers have a sigm nonlinearity
net.fvd = net.fvd .* (net.fv .* (1 - net.fv));
end
% reshape feature vector deltas into output map style
sa = size(net.layers{n}.a{1});
fvnum = sa(1) * sa(2);
for j = 1 : numel(net.layers{n}.a)
net.layers{n}.d{j} = reshape(net.fvd(((j - 1) * fvnum + 1) : j * fvnum, :), sa(1), sa(2), sa(3));
end
for l = (n - 1) : -1 : 1
if strcmp(net.layers{l}.type, 'c')
for j = 1 : numel(net.layers{l}.a)
net.layers{l}.d{j} = net.layers{l}.a{j} .* (1 - net.layers{l}.a{j}) .* (expand(net.layers{l + 1}.d{j}, [net.layers{l + 1}.scale net.layers{l + 1}.scale 1]) / net.layers{l + 1}.scale ^ 2);
end
elseif strcmp(net.layers{l}.type, 's')
for i = 1 : numel(net.layers{l}.a)
z = zeros(size(net.layers{l}.a{1}));
for j = 1 : numel(net.layers{l + 1}.a)
z = z + convn(net.layers{l + 1}.d{j}, rot180(net.layers{l + 1}.k{i}{j}), 'full');
end
net.layers{l}.d{i} = z;
end
end
end
%% calc gradients
for l = 2 : n
if strcmp(net.layers{l}.type, 'c')
for j = 1 : numel(net.layers{l}.a)
for i = 1 : numel(net.layers{l - 1}.a)
net.layers{l}.dk{i}{j} = convn(flipall(net.layers{l - 1}.a{i}), net.layers{l}.d{j}, 'valid') / size(net.layers{l}.d{j}, 3);
end
net.layers{l}.db{j} = sum(net.layers{l}.d{j}(:)) / size(net.layers{l}.d{j}, 3);
end
end
end
net.dffW = net.od * (net.fv)' / size(net.od, 2);
net.dffb = mean(net.od, 2);
function X = rot180(X)
X = flipdim(flipdim(X, 1), 2);
end
end
3、cnnff.m
function net = cnnff(net, x)
% Feedforward pass: run the input through every conv ('c') and subsampling ('s')
% layer, then through the fully connected output layer.
n = numel(net.layers);
net.layers{1}.a{1} = x; % the input layer's single feature map is the raw input
inputmaps = 1;
for l = 2 : n % for each layer
if strcmp(net.layers{l}.type, 'c')
% !!below can probably be handled by insane matrix operations
for j = 1 : net.layers{l}.outputmaps % for each output map
% create temp output map
z = zeros(size(net.layers{l - 1}.a{1}) - [net.layers{l}.kernelsize - 1 net.layers{l}.kernelsize - 1 0]);
for i = 1 : inputmaps % for each input map
% convolve with corresponding kernel and add to temp output map
z = z + convn(net.layers{l - 1}.a{i}, net.layers{l}.k{i}{j}, 'valid');
end
% add bias, pass through nonlinearity
net.layers{l}.a{j} = sigm(z + net.layers{l}.b{j});
end
% set number of input maps to this layers number of outputmaps
inputmaps = net.layers{l}.outputmaps;
elseif strcmp(net.layers{l}.type, 's')
% downsample
for j = 1 : inputmaps
z = convn(net.layers{l - 1}.a{j}, ones(net.layers{l}.scale) / (net.layers{l}.scale ^ 2), 'valid'); % !! replace with variable
net.layers{l}.a{j} = z(1 : net.layers{l}.scale : end, 1 : net.layers{l}.scale : end, :);
end
end
end
% concatenate all end layer feature maps into vector
net.fv = [];
for j = 1 : numel(net.layers{n}.a)
sa = size(net.layers{n}.a{j});
net.fv = [net.fv; reshape(net.layers{n}.a{j}, sa(1) * sa(2), sa(3))];
end
% feedforward into output perceptrons
net.o = sigm(net.ffW * net.fv + repmat(net.ffb, 1, size(net.fv, 2)));
end
4、cnnnumgradcheck.m
function cnnnumgradcheck(net, x, y)
% Compare the analytic gradients from cnnbp against central-difference
% numerical gradients; raise an error if any difference exceeds er.
epsilon = 1e-4;
er = 1e-8;
n = numel(net.layers);
for j = 1 : numel(net.ffb)
net_m = net; net_p = net;
net_p.ffb(j) = net_m.ffb(j) + epsilon;
net_m.ffb(j) = net_m.ffb(j) - epsilon;
net_m = cnnff(net_m, x); net_m = cnnbp(net_m, y);
net_p = cnnff(net_p, x); net_p = cnnbp(net_p, y);
d = (net_p.L - net_m.L) / (2 * epsilon);
e = abs(d - net.dffb(j));
if e > er
error('numerical gradient checking failed');
end
end
for i = 1 : size(net.ffW, 1)
for u = 1 : size(net.ffW, 2)
net_m = net; net_p = net;
net_p.ffW(i, u) = net_m.ffW(i, u) + epsilon;
net_m.ffW(i, u) = net_m.ffW(i, u) - epsilon;
net_m = cnnff(net_m, x); net_m = cnnbp(net_m, y);
net_p = cnnff(net_p, x); net_p = cnnbp(net_p, y);
d = (net_p.L - net_m.L) / (2 * epsilon);
e = abs(d - net.dffW(i, u));
if e > er
error('numerical gradient checking failed');
end
end
end
for l = n : -1 : 2
if strcmp(net.layers{l}.type, 'c')
for j = 1 : numel(net.layers{l}.a)
net_m = net; net_p = net;
net_p.layers{l}.b{j} = net_m.layers{l}.b{j} + epsilon;
net_m.layers{l}.b{j} = net_m.layers{l}.b{j} - epsilon;
net_m = cnnff(net_m, x); net_m = cnnbp(net_m, y);
net_p = cnnff(net_p, x); net_p = cnnbp(net_p, y);
d = (net_p.L - net_m.L) / (2 * epsilon);
e = abs(d - net.layers{l}.db{j});
if e > er
error('numerical gradient checking failed');
end
for i = 1 : numel(net.layers{l - 1}.a)
for u = 1 : size(net.layers{l}.k{i}{j}, 1)
for v = 1 : size(net.layers{l}.k{i}{j}, 2)
net_m = net; net_p = net;
net_p.layers{l}.k{i}{j}(u, v) = net_p.layers{l}.k{i}{j}(u, v) + epsilon;
net_m.layers{l}.k{i}{j}(u, v) = net_m.layers{l}.k{i}{j}(u, v) - epsilon;
net_m = cnnff(net_m, x); net_m = cnnbp(net_m, y);
net_p = cnnff(net_p, x); net_p = cnnbp(net_p, y);
d = (net_p.L - net_m.L) / (2 * epsilon);
e = abs(d - net.layers{l}.dk{i}{j}(u, v));
if e > er
error('numerical gradient checking failed');
end
end
end
end
end
elseif strcmp(net.layers{l}.type, 's')
% for j = 1 : numel(net.layers{l}.a)
% net_m = net; net_p = net;
% net_p.layers{l}.b{j} = net_m.layers{l}.b{j} + epsilon;
% net_m.layers{l}.b{j} = net_m.layers{l}.b{j} - epsilon;
% net_m = cnnff(net_m, x); net_m = cnnbp(net_m, y);
% net_p = cnnff(net_p, x); net_p = cnnbp(net_p, y);
% d = (net_p.L - net_m.L) / (2 * epsilon);
% e = abs(d - net.layers{l}.db{j});
% if e > er
% error('numerical gradient checking failed');
% end
% end
end
end
% keyboard
end
5、cnnsetup.m
function net = cnnsetup(net, x, y)
% Initialize the network: work out the feature-map sizes layer by layer, zero
% the biases, and randomly initialize the kernels and the output-layer weights.
assert(~isOctave() || compare_versions(OCTAVE_VERSION, '3.8.0', '>='), ['Octave 3.8.0 or greater is required for CNNs as there is a bug in convolution in previous versions. See http://savannah.gnu.org/bugs/?39314. Your version is ' myOctaveVersion]);
inputmaps = 1;
mapsize = size(squeeze(x(:, :, 1)));
for l = 1 : numel(net.layers) % layer
if strcmp(net.layers{l}.type, 's')
mapsize = mapsize / net.layers{l}.scale;
assert(all(floor(mapsize)==mapsize), ['Layer ' num2str(l) ' size must be integer. Actual: ' num2str(mapsize)]);
for j = 1 : inputmaps
net.layers{l}.b{j} = 0;
end
end
if strcmp(net.layers{l}.type, 'c')
mapsize = mapsize - net.layers{l}.kernelsize + 1;
fan_out = net.layers{l}.outputmaps * net.layers{l}.kernelsize ^ 2;
for j = 1 : net.layers{l}.outputmaps % output map
fan_in = inputmaps * net.layers{l}.kernelsize ^ 2;
for i = 1 : inputmaps % input map
net.layers{l}.k{i}{j} = (rand(net.layers{l}.kernelsize) - 0.5) * 2 * sqrt(6 / (fan_in + fan_out)); % uniform in [-sqrt(6/(fan_in+fan_out)), +sqrt(6/(fan_in+fan_out))] (normalized initialization)
end
net.layers{l}.b{j} = 0;
end
inputmaps = net.layers{l}.outputmaps;
end
end
% 'onum' is the number of labels, which is why it is computed as size(y, 1). If you have 20 labels, the network output will have 20 neurons.
% 'fvnum' is the number of output neurons at the last layer, the layer just before the output layer.
% 'ffb' is the biases of the output neurons.
% 'ffW' is the weights between the last layer and the output neurons. Note that the last layer is fully connected to the output layer, that's why the size of the weights is (onum * fvnum)
fvnum = prod(mapsize) * inputmaps;
onum = size(y, 1);
net.ffb = zeros(onum, 1);
net.ffW = (rand(onum, fvnum) - 0.5) * 2 * sqrt(6 / (onum + fvnum));
end
6、cnntest.m
function [er, bad] = cnntest(net, x, y)
% Run the network on x and compare the predicted class (largest output) with
% the label in y; return the error rate er and the misclassified indices bad.
% feedforward
net = cnnff(net, x);
[~, h] = max(net.o);
[~, a] = max(y);
bad = find(h ~= a);
er = numel(bad) / size(y, 2);
end
7、cnntrain.m
function net = cnntrain(net, x, y, opts)
% Train the CNN with mini-batch gradient descent: shuffle the samples each
% epoch, then run cnnff / cnnbp / cnnapplygrads on every batch.
m = size(x, 3);
numbatches = m / opts.batchsize;
if rem(numbatches, 1) ~= 0
error('numbatches not integer');
end
net.rL = [];
for i = 1 : opts.numepochs
disp(['epoch ' num2str(i) '/' num2str(opts.numepochs)]);
tic;
kk = randperm(m);
for l = 1 : numbatches
batch_x = x(:, :, kk((l - 1) * opts.batchsize + 1 : l * opts.batchsize));
batch_y = y(:, kk((l - 1) * opts.batchsize + 1 : l * opts.batchsize));
net = cnnff(net, batch_x);
net = cnnbp(net, batch_y);
net = cnnapplygrads(net, opts);
if isempty(net.rL)
net.rL(1) = net.L; % initialize the running loss with the first batch loss
end
net.rL(end + 1) = 0.99 * net.rL(end) + 0.01 * net.L; % exponentially smoothed loss, plotted by test_example_CNN
end
toc;
end
end
8、test_example_CNN.m
function test_example_CNN
load mnist_uint8;
train_x = double(reshape(train_x',28,28,60000))/255;
test_x = double(reshape(test_x',28,28,10000))/255;
train_y = double(train_y');
test_y = double(test_y');
%% ex1 Train a 6c-2s-12c-2s Convolutional neural network
%will run 1 epoch in about 200 seconds and get around 11% error.
%With 100 epochs you'll get around 1.2% error
rand('state',0)
cnn.layers = {
struct('type', 'i') %input layer
struct('type', 'c', 'outputmaps', 6, 'kernelsize', 5) %convolution layer
struct('type', 's', 'scale', 2) %sub sampling layer
struct('type', 'c', 'outputmaps', 12, 'kernelsize', 5) %convolution layer
struct('type', 's', 'scale', 2) %subsampling layer
};
opts.alpha = 1;
opts.batchsize = 50;
opts.numepochs = 1;
cnn = cnnsetup(cnn, train_x, train_y);
cnn = cnntrain(cnn, train_x, train_y, opts);
[er, bad] = cnntest(cnn, test_x, test_y);
er
%plot mean squared error
figure; plot(cnn.rL);
assert(er<0.12, 'Too big error');
Note: For the complete MATLAB implementation of this CNN with detailed line-by-line explanations, see zouxy09's blog: http://blog.csdn.net/zouxy09/article/details/9993743/ . The author explains everything very thoroughly there and has also written many other excellent notes on deep learning; my respect and thanks to the author!