CNN Source Code Walkthrough for DeepLearnToolbox

         DeepLearnToolbox is a toolbox that makes it easy to follow what a CNN actually does; it can be downloaded from GitHub. To understand the CNN pipeline, I have annotated the CNN part of its source code. The formulas used in the computations can be derived as in my previous blog post.

         Note: the code does not learn any parameters for the subsampling layers: the subsampling weight w is simply fixed at 0.25 and the bias b at 0. A convolution layer computes each output feature map as the sum of the convolutions with all feature maps of the previous layer, adds a bias, and passes the result through the sigmoid function. A subsampling layer sums each 2*2 block of the corresponding feature map in the previous layer and takes the average; no bias is added and no sigmoid is applied. The last hidden layer consists of 12 feature maps of size 4*4, so it yields a 192-dimensional feature vector, and the fully connected output layer therefore has 10 (number of classes) * 192 weights.
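To make the two layer types concrete, here is a minimal MATLAB sketch of the forward computation for a single output feature map of a convolution layer followed by a subsampling layer. The variable names (a_prev, k, b) are illustrative only; cnnff.m below does exactly this for all maps and layers at once.

% convolution layer: sum of 'valid' convolutions over all input maps, plus a bias, then sigmoid
z = zeros(size(a_prev{1}) - [4 4 0]);          % a 5*5 kernel shrinks each map by 4 in height and width
for i = 1 : numel(a_prev)
    z = z + convn(a_prev{i}, k{i}, 'valid');   % k{i}: 5*5 kernel linking input map i to this output map
end
a_conv = 1 ./ (1 + exp(-(z + b)));             % sigm(z + b)

% subsampling layer: average of each non-overlapping 2*2 block, no bias, no sigmoid
z = convn(a_conv, ones(2) / 4, 'valid');       % sliding 2*2 mean
a_pool = z(1 : 2 : end, 1 : 2 : end, :);       % keep every other row/column, i.e. the non-overlapping blocks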

 

In addition, some of the fields of net deserve explanation:

net.fv: feature matrix of the last hidden layer, flattened into fully connected form (one column per sample)

net.o: final output of the network; each column is the result for one sample

net.od: delta (error term) of the output layer

net.fvd: delta of the last hidden layer, in the same flattened fully connected form
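With the 6c-2s-12c-2s architecture and batchsize 50 used below, these matrices end up with the following shapes (a quick orientation of my own, not part of the toolbox):

net.fv  : 192 x 50   (12 maps of size 4*4, flattened; one column per sample)
net.o   : 10 x 50    (one column of class scores per sample)
net.od  : 10 x 50
net.fvd : 192 x 50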

 

test_example_CNN.m

 

%function test_example_CNN
addpath D:\DeepLearning\DeepLearnToolbox-master\data\
addpath D:\DeepLearning\DeepLearnToolbox-master\CNN\
addpath D:\DeepLearning\DeepLearnToolbox-master\util\
load mnist_uint8;

train_x = double(reshape(train_x',28,28,60000))/255;   % reshape the training set into 60000 images of size 28*28 (28*28*60000) and normalize the pixels to [0,1]
test_x = double(reshape(test_x',28,28,10000))/255;      % test set: 28*28*10000
train_y = double(train_y');   % 10*60000, each column is a one-hot label
test_y = double(test_y');

%% ex1 Train a 6c-2s-12c-2s Convolutional neural network 
%will run 1 epoch in about 200 seconds and get around 11% error. 
%With 100 epochs you'll get around 1.2% error

rand('state',0)

cnn.layers = {        %%% set the number of feature maps, the kernel size, etc. for each layer
    struct('type', 'i') %input layer
    struct('type', 'c', 'outputmaps', 6, 'kernelsize', 5) %convolution layer
    struct('type', 's', 'scale', 2) %subsampling layer
    struct('type', 'c', 'outputmaps', 12, 'kernelsize', 5) %convolution layer
    struct('type', 's', 'scale', 2) %subsampling layer
};


opts.alpha = 0.01;   % learning rate for the gradient descent updates
opts.batchsize = 50;   % mini-batch size: stochastic gradient descent, 50 samples per update
opts.numepochs = 50;   % number of epochs (passes over the training set)

cnn = cnnsetup(cnn, train_x, train_y);      % initialize the parameters (weights and biases) of every layer
cnn = cnntrain(cnn, train_x, train_y, opts);  % training: forward pass, backpropagation and parameter updates

[er, bad] = cnntest(cnn, test_x, test_y);

%plot mean squared error
figure; plot(cnn.rL);
% assert(er<0.12, 'Too big error');
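For reference, the spatial sizes this architecture produces are easy to trace by hand: 28*28 input -> 5*5 convolution -> 24*24 (6 maps) -> 2*2 subsampling -> 12*12 -> 5*5 convolution -> 8*8 (12 maps) -> 2*2 subsampling -> 4*4. The flattened feature vector fed to the output layer therefore has 12*4*4 = 192 dimensions, matching the 10*192 fully connected weight matrix mentioned above.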

cnnsetup.m

 

function net = cnnsetup(net, x, y)
%     assert(~isOctave() || compare_versions(OCTAVE_VERSION, '3.8.0', '>='), ['Octave 3.8.0 or greater is required for CNNs as there is a bug in convolution in previous versions. See http://savannah.gnu.org/bugs/?39314. Your version is ' myOctaveVersion]);
    inputmaps = 1;   % number of input feature maps (1 grayscale channel for MNIST)
    mapsize = size(squeeze(x(:, :, 1)));    % spatial size of the input maps: 28 28 (squeeze is not strictly needed here)

    for l = 1 : numel(net.layers)   %  layer
        if strcmp(net.layers{l}.type, 's')
            mapsize = mapsize / net.layers{l}.scale;    %% each subsampling feature map is half the height and width of the previous convolution layer's map
            assert(all(floor(mapsize)==mapsize), ['Layer ' num2str(l) ' size must be integer. Actual: ' num2str(mapsize)]);
            for j = 1 : inputmaps
                net.layers{l}.b{j} = 0;   % bias initialized to 0; in this code the subsampling weight is fixed at 1/4 and the bias at 0, so the subsampling layers have no trainable parameters
            end
        end
        if strcmp(net.layers{l}.type, 'c')
            mapsize = mapsize - net.layers{l}.kernelsize + 1;   % size of this layer's feature maps after a 'valid' convolution
            fan_out = net.layers{l}.outputmaps * net.layers{l}.kernelsize ^ 2;   % fan_out and fan_in set the scale of the kernel initialization (see the note after this listing)
            for j = 1 : net.layers{l}.outputmaps  %  output map: number of feature maps in the current layer
                fan_in = inputmaps * net.layers{l}.kernelsize ^ 2;
                for i = 1 : inputmaps  %  input map: weights are shared, so there are inputmaps*outputmaps kernels, each 5*5
                    net.layers{l}.k{i}{j} = (rand(net.layers{l}.kernelsize) - 0.5) * 2 * sqrt(6 / (fan_in + fan_out));   %% kernel linking input map i to output map j: (rand - 0.5) * 2 is uniform on [-1,1], then scaled to [-sqrt(6/(fan_in+fan_out)), +sqrt(6/(fan_in+fan_out))] (see the note after this listing)
                end
                net.layers{l}.b{j} = 0;  % bias of each output feature map initialized to 0
            end
            inputmaps = net.layers{l}.outputmaps;   % update the number of input maps for the next layer
        end
    end
    % 'onum' is the number of labels, that's why it is calculated using size(y, 1). If you have 20 labels so the output of the network will be 20 neurons.
    % 'fvnum' is the number of output neurons at the last layer, the layer just before the output layer.
    % 'ffb' is the biases of the output neurons.
    % 'ffW' is the weights between the last layer and the output neurons. Note that the last layer is fully connected to the output layer, that's why the size of the weights is (onum * fvnum)
    fvnum = prod(mapsize) * inputmaps;             % number of units in the last subsampling layer, used as the fully connected feature vector: 12*4*4 = 192
    onum = size(y, 1);        % number of classes: 10

    net.ffb = zeros(onum, 1);     % biases of the output layer
    net.ffW = (rand(onum, fvnum) - 0.5) * 2 * sqrt(6 / (onum + fvnum));    %% weights of the fully connected output layer: 10*192
end
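A note on the initialization that the comments above ask about: as far as I can tell, drawing the weights uniformly from

    w ~ U[-r, r],   r = sqrt(6 / (fan_in + fan_out))

is the "normalized initialization" of Glorot and Bengio (2010). Scaling the range by the fan-in and fan-out keeps the variance of the activations and of the back-propagated gradients roughly constant across layers, which helps keep the sigmoids out of their saturated regions at the start of training. Here fan_in = inputmaps * kernelsize^2 is the number of inputs feeding one output unit, and fan_out = outputmaps * kernelsize^2 is the number of units each input feeds.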

cnntrain.m

 

function net = cnntrain(net, x, y, opts)
    m = size(x, 3);   %% total number of training images: 60000
    numbatches = m / opts.batchsize;    % number of mini-batches per epoch: 60000 / 50 = 1200, each using 50 samples
    if rem(numbatches, 1) ~= 0
        error('numbatches not integer');
    end
    net.rL = [];
    for i = 1 : opts.numepochs    
        disp(['epoch ' num2str(i) '/' num2str(opts.numepochs)]);
        tic;
        kk = randperm(m);  %% random permutation of 1..m, so the samples are shuffled every epoch
        for l = 1 : numbatches    %% loop over the 1200 mini-batches, updating with 50 samples each time
            batch_x = x(:, :, kk((l - 1) * opts.batchsize + 1 : l * opts.batchsize));    % training images of the 50 samples in this batch
            batch_y = y(:,    kk((l - 1) * opts.batchsize + 1 : l * opts.batchsize));    % labels of the same 50 samples

            net = cnnff(net, batch_x);        % forward pass
            net = cnnbp(net, batch_y);      % backpropagation: compute all gradients
            opts.i = i;
            opts.l = l;
            net = cnnapplygrads(net, opts);   %% apply the gradients to update the parameters
            if isempty(net.rL)
                net.rL(1) = net.L;
            end
            net.rL(end + 1) = 0.99 * net.rL(end) + 0.01 * net.L;   % net.L is the cost on the current batch; net.rL is an exponentially weighted moving average of it (smoothing over roughly the last 100 batches), kept only for the error plot
        end
        toc;
    end
    
end

cnnff.m

function net = cnnff(net, x)
    n = numel(net.layers);    %% number of layers
    net.layers{1}.a{1} = x;    % the activation of the first layer is the input itself
    inputmaps = 1;

    for l = 2 : n   %  for each layer after the input
        if strcmp(net.layers{l}.type, 'c')
            %  !!below can probably be handled by insane matrix operations
            for j = 1 : net.layers{l}.outputmaps   %  for each output feature map of this layer
                %  create temp output map
                z = zeros(size(net.layers{l - 1}.a{1}) - [net.layers{l}.kernelsize - 1 net.layers{l}.kernelsize - 1 0]);   % size of this layer's feature maps; the last dimension is the number of images in the batch; initialized to 0
                for i = 1 : inputmaps   %  for each input map  %针对每一个输入feature map
                    %  convolve with corresponding kernel and add to temp output map
                    z = z + convn(net.layers{l - 1}.a{i}, net.layers{l}.k{i}{j}, 'valid');  % convolution: k{i}{j} is a 5*5 double kernel, a{i} is the i-th input feature map, e.g. 28*28*50 where 50 is the number of images
                end                                                                         %% convn performs a true convolution (it flips the kernel internally); since the forward and backward passes use it consistently, the learned kernels simply absorb the flip and no extra 180-degree rotation is needed here
                %  add bias, pass through nonlinearity
                net.layers{l}.a{j} = sigm(z + net.layers{l}.b{j});   %% add the bias and apply the sigmoid
            end
            %  set number of input maps to this layers number of outputmaps
            inputmaps = net.layers{l}.outputmaps;   %% update the number of input maps for the next layer
        elseif strcmp(net.layers{l}.type, 's')   %%% downsampling: each 2*2 block is summed and weighted by 1/4; no bias is added and no sigmoid is applied
            %  downsample
            for j = 1 : inputmaps
                z = convn(net.layers{l - 1}.a{j}, ones(net.layers{l}.scale) / (net.layers{l}.scale ^ 2), 'valid');   %  !! replace with variable   %% first convolve with a 2*2 averaging kernel
                net.layers{l}.a{j} = z(1 : net.layers{l}.scale : end, 1 : net.layers{l}.scale : end, :);    % then keep every scale-th row/column: each map is half the size of the previous conv layer's map; note that a = z here (identity activation)
            end
        end
    end

    %  concatenate all end layer feature maps into vector
    net.fv = [];   %% flatten the feature maps of the last hidden layer into one fully connected feature vector per sample
    for j = 1 : numel(net.layers{n}.a)      % number of feature maps in the last hidden layer
        sa = size(net.layers{n}.a{j});
        net.fv = [net.fv; reshape(net.layers{n}.a{j}, sa(1) * sa(2), sa(3))];  %% result is a 192*50 matrix, one column per sample image; 192 = 4*4*12 from the 12 feature maps
    end
    %  feedforward into output perceptrons
    net.o = sigm(net.ffW * net.fv + repmat(net.ffb, 1, size(net.fv, 2)));    %% result is a 10*50 matrix, one column of class scores per sample; the sigmoid makes this 10 independent binary classifiers (the classes are not mutually exclusive); it could be replaced by softmax regression as in the commented-out line below
   % net.o = softmax(net.ffW * net.fv + repmat(net.ffb, 1, size(net.fv, 2))); 
end
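The downsampling trick above is worth spelling out: convolving with ones(scale)/scale^2 in 'valid' mode computes the mean of every sliding scale*scale window, and the subsequent indexing with step scale keeps only the non-overlapping windows, which together amount to plain 2*2 average pooling. A tiny standalone check (my own, not part of the toolbox):

% check that convolution + strided indexing equals 2*2 average pooling
A = magic(4);                          % any 4*4 test matrix
z = convn(A, ones(2) / 4, 'valid');    % mean of every sliding 2*2 window -> 3*3
pooled = z(1 : 2 : end, 1 : 2 : end);  % keep the non-overlapping windows -> 2*2
% pooled(1,1) equals mean(mean(A(1:2,1:2))), pooled(1,2) equals mean(mean(A(1:2,3:4))), and so on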
 

cnnbp.m

 

function net = cnnbp(net, y)
    n = numel(net.layers);    % number of layers

    %   error
    net.e = net.o - y;   % 10*50, one column per sample image
    %  loss function
    net.L = 1/2* sum(net.e(:) .^ 2) / size(net.e, 2);   %% cost function: half the squared error, averaged over the batch (no weight-decay / prior term is added)

    %%  backprop deltas
    net.od = net.e .* (net.o .* (1 - net.o));   %  output delta, 10*50: the error times the derivative of the sigmoid
    net.fvd = (net.ffW' * net.od);              %  feature vector delta, 192*50: delta of the last hidden layer. If that layer is a subsampling layer then a = z, the derivative is 1 and this already is the delta; if it is a convolution layer we still have to multiply by the sigmoid derivative below
    if strcmp(net.layers{n}.type, 'c')         %  only conv layers have a sigm function
        net.fvd = net.fvd .* (net.fv .* (1 - net.fv));     %% if the last hidden layer is a convolution layer, multiplying by the sigmoid derivative gives its delta directly
    end

    %  reshape feature vector deltas into output map style
    sa = size(net.layers{n}.a{1});    %% layers{n} has 12 maps a{j}, each of size 4*4*50 where 50 is the number of sample images; n is the last hidden layer
    fvnum = sa(1) * sa(2);
    for j = 1 : numel(net.layers{n}.a)   %% for each of the last hidden layer's feature maps: its delta d{j} is reshaped into 4*4*50 form (50 = batch size), which makes computing the deltas of the earlier layers easier
        net.layers{n}.d{j} = reshape(net.fvd(((j - 1) * fvnum + 1) : j * fvnum, :), sa(1), sa(2), sa(3));   % put the flattened deltas back into feature map form, so the previous convolution layer's delta can be computed map by map
    end

    for l = (n - 1) : -1 : 1      % in practice this stops at layer 2; layer 1 is the input and needs no delta
        if strcmp(net.layers{l}.type, 'c')   % delta of a convolution layer
            for j = 1 : numel(net.layers{l}.a)   % for each feature map of this layer: d{j} is e.g. 8*8*50. The next layer is a subsampling layer, so its delta is upsampled (each entry copied into a 2*2 block) and weighted by 1/(scale^2) = 1/4, exactly the bp formula for this connection pattern
                net.layers{l}.d{j} = net.layers{l}.a{j} .* (1 - net.layers{l}.a{j}) .* (expand(net.layers{l + 1}.d{j}, [net.layers{l + 1}.scale net.layers{l + 1}.scale 1]) / net.layers{l + 1}.scale ^ 2);
            end
        elseif strcmp(net.layers{l}.type, 's')   % delta of a subsampling layer
            for i = 1 : numel(net.layers{l}.a)   % for each of this layer's feature maps; each a is e.g. 12*12*50, where 50 is the number of sample images
                z = zeros(size(net.layers{l}.a{1}));      % accumulator with the size of this layer's feature maps
                for j = 1 : numel(net.layers{l + 1}.a)     % formula from Bouvrie's "Notes on Convolutional Neural Networks": each map of this subsampling layer is connected to every feature map of the next convolution layer, so the bp formula sums over all of them
                     z = z + convn(net.layers{l + 1}.d{j}, rot180(net.layers{l + 1}.k{i}{j}), 'full');   %%% 'full' convolution of the next layer's delta with the rotated kernel; every unit here contributes to several units in the next layer, so their contributions add up
                end                                                                                      %% 'valid' keeps only the positions where the kernel fully overlaps the input, while 'full' zero-pads so every partial overlap is returned (the output grows by kernelsize-1 in each dimension)
                net.layers{l}.d{i} = z;          %% since this is a subsampling layer a = z, so f'(z) = 1 and the delta is just the weighted sum of the deltas of the connected units in the next layer
            end
        end
    end

    %%  calc gradients: the gradient w.r.t. each kernel entry is the sum over all output positions of (input activation * delta), which is exactly what the 'valid' convolution below computes, then averaged over the batch
    for l = 2 : n
        if strcmp(net.layers{l}.type, 'c')
            for j = 1 : numel(net.layers{l}.a)
                for i = 1 : numel(net.layers{l - 1}.a)
                    net.layers{l}.dk{i}{j} = convn(flipall(net.layers{l - 1}.a{i}), net.layers{l}.d{j}, 'valid') / size(net.layers{l}.d{j}, 3);   % see the derivation in the paper: flipping the whole input with flipall and then convolving gives the same result as rotating k by 180 degrees first
                end
                net.layers{l}.db{j} = sum(net.layers{l}.d{j}(:)) / size(net.layers{l}.d{j}, 3);   %% gradient w.r.t. the bias b: sum of the deltas, averaged over the batch
            end
        end
    end
    net.dffW = net.od * (net.fv)' / size(net.od, 2);      % gradient of the fully connected output-layer weights, averaged over the batch
    net.dffb = mean(net.od, 2);                %% gradient of the output-layer biases

    function X = rot180(X)
        X = flipdim(flipdim(X, 1), 2);  % flipdim(X, 1) flips the rows, flipdim(X, 2) flips the columns; together this is a 180-degree rotation
    end
end
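Written out as formulas, the quantities computed by cnnbp above are (my own summary of the code, in loose notation):

    od       = (o - y) .* o .* (1 - o)                                   output delta
    fvd      = ffW' * od          (then .* fv .* (1 - fv) if the last hidden layer is a conv layer)
    d (conv) = a .* (1 - a) .* expand(d_next) / scale^2                  conv layer followed by subsampling
    d (sub)  = sum over j of convn(d_next{j}, rot180(k{i}{j}), 'full')   subsampling layer followed by convolution
    dk{i}{j} = convn(flipall(a_prev{i}), d{j}, 'valid') / batchsize      kernel gradient
    db{j}    = sum(d{j}(:)) / batchsize                                  bias gradient
    dffW     = od * fv' / batchsize,   dffb = mean(od, 2)                output-layer gradients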

cnnapplygrads.m

function net = cnnapplygrads(net, opts)
    for l = 2 : numel(net.layers)
        if strcmp(net.layers{l}.type, 'c')
            for j = 1 : numel(net.layers{l}.a)
                for ii = 1 : numel(net.layers{l - 1}.a)
                    net.layers{l}.k{ii}{j} = net.layers{l}.k{ii}{j} - 1/(opts.i+opts.alpha + opts.l) * net.layers{l}.dk{ii}{j};   %% gradient descent step; the step size 1/(opts.i + opts.alpha + opts.l) shrinks as the epoch index i and batch index l grow (see the note after this listing)
                end
                net.layers{l}.b{j} = net.layers{l}.b{j} - 1/(opts.i+opts.alpha + opts.l) * net.layers{l}.db{j};
            end
        end
    end

    net.ffW = net.ffW - 1/(opts.i+opts.alpha + opts.l) * net.dffW;
    net.ffb = net.ffb - 1/(opts.i+opts.alpha + opts.l) * net.dffb;
end
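A note on the step size: 1/(opts.i + opts.alpha + opts.l) is a decaying learning-rate schedule driven by the epoch index i and batch index l that cnntrain stores in opts; opts.alpha = 0.01 barely contributes to it. If I remember correctly, the upstream DeepLearnToolbox simply multiplies every gradient by opts.alpha; a sketch of that plain fixed-rate update, assuming the same field names, would be:

    % fixed learning rate, as in the upstream toolbox (sketch)
    net.layers{l}.k{ii}{j} = net.layers{l}.k{ii}{j} - opts.alpha * net.layers{l}.dk{ii}{j};
    net.layers{l}.b{j}     = net.layers{l}.b{j}     - opts.alpha * net.layers{l}.db{j};
    net.ffW = net.ffW - opts.alpha * net.dffW;
    net.ffb = net.ffb - opts.alpha * net.dffb;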


cnntest.m

 

function [er, bad] = cnntest(net, x, y)
    %  feedforward
    net = cnnff(net, x);
    [~, h] = max(net.o);   % predicted class: index of the largest output in each column
    [~, a] = max(y);       % true class from the one-hot labels
    bad = find(h ~= a);    % indices of the misclassified samples

    er = numel(bad) / size(y, 2);   % error rate: fraction of misclassified samples
end

Setting opts.numepochs in the code to 1 and 10 gave error rates of 11.13% and 2.73%, respectively.

Replacing the classifier in the final layer with softmax regression gave error rates of 16.01% and 5.10%. Of course, tuning the number of epochs and the learning-rate schedule might well give better results!
