DeepLearnToolbox_NN notes

DeepLearnToolbox usage: in MATLAB, go to Set Path -> Add with Subfolders and add the toolbox directory.


DeepLearnToolbox_NN   (collated according to the order in which the functions are called)

Contents

  • ex1 vanilla neural net
  • ex2 neural net with L2 weight decay
  • ex3 neural net with dropout
  • ex4 neural net with sigmoid activation function
  • ex5 plotting functionality
  • ex6 neural net with sigmoid activation and plotting of validation and training error
function test_example_NN
%rewritten and annotated by Kylin, 2013/12/11
load mnist_uint8;

ex_choise = 5; % select which experiment to run (1-6)
% scale the data to [0,1]
train_x = double(train_x) / 255;
test_x  = double(test_x)  / 255;
train_y = double(train_y);
test_y  = double(test_y);

% normalize
% standardize: zero mean, unit standard deviation
[train_x, mu, sigma] = zscore(train_x);
test_x = normalize(test_x, mu, sigma);

ex1 vanilla neural net

if (ex_choise==1)
    rand('state',0)
    nn = nnsetup([784 100 10]); % build a 784-100-10 network
    opts.numepochs =  1;   %  Number of full sweeps through the data
    opts.batchsize = 100;  %  Take a mean gradient step over this many samples
    [nn, L] = nntrain(nn, train_x, train_y, opts);

    [er, bad] = nntest(nn, test_x, test_y);

    assert(er < 0.08, 'Too big error'); % fails with an error if er >= 0.08
end
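
For orientation: nnsetup (defined further down) stores the bias as an extra first column of each weight matrix, so the shapes are one column wider than the layer sizes suggest. A quick sanity check, assuming DeepLearnToolbox is on the path (an illustrative snippet, not part of the original script):

    nn = nnsetup([784 100 10]);
    disp(size(nn.W{1}))   % 100 x 785: 100 hidden units, 784 inputs + 1 bias
    disp(size(nn.W{2}))   % 10 x 101:  10 output units, 100 hidden units + 1 bias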

ex2 neural net with L2 weight decay

if ex_choise==2
    rand('state',0)
    nn = nnsetup([784 100 10]);

    nn.weightPenaltyL2 = 1e-4;  %  L2 weight decay
    opts.numepochs =  1;        %  Number of full sweeps through data
    opts.batchsize = 100;       %  Take a mean gradient step over this many samples

    nn = nntrain(nn, train_x, train_y, opts);

    [er, bad] = nntest(nn, test_x, test_y);
    assert(er < 0.1, 'Too big error');
end
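
Setting nn.weightPenaltyL2 makes nnapplygrads (further down) add weightPenaltyL2 * W to each gradient, with the bias column excluded. This is the gradient of an L2-penalized loss; with \lambda = nn.weightPenaltyL2:

    E_{reg} = E + \frac{\lambda}{2} \sum_i \lVert W^{(i)} \rVert_F^2,
    \qquad
    \frac{\partial E_{reg}}{\partial W^{(i)}} = \frac{\partial E}{\partial W^{(i)}} + \lambda\, W^{(i)}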

ex3 neural net with dropout

if ex_choise==3
    rand('state',0)
    nn = nnsetup([784 100 10]);

    nn.dropoutFraction = 0.5;   %  Dropout fraction
    opts.numepochs =  1;        %  Number of full sweeps through data
    opts.batchsize = 100;       %  Take a mean gradient step over this many samples

    nn = nntrain(nn, train_x, train_y, opts);

    [er, bad] = nntest(nn, test_x, test_y);
    assert(er < 0.1, 'Too big error');
end
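
The dropout switched on here is implemented inside nnff (further down): during training each hidden unit is multiplied by an independent 0/1 mask that keeps it with probability 1 - dropoutFraction, and at test time the activations are instead scaled by 1 - dropoutFraction. A minimal stand-alone illustration of that masking logic on a toy matrix (a sketch of the idea, not the toolbox API):

    p = 0.5;                      % plays the role of nn.dropoutFraction
    a = rand(3, 4);               % pretend hidden activations for a tiny batch
    mask = rand(size(a)) > p;     % training: keep each unit with probability 1-p
    a_train = a .* mask;          % dropped units are zeroed
    a_test  = a .* (1 - p);       % test time: scale instead of dropping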

ex4 neural net with sigmoid activation function

if ex_choise==4
    rand('state',0)
    nn = nnsetup([784 100 10]);

    nn.activation_function = 'sigm';    %  Sigmoid activation function
    nn.learningRate = 1;                %  sigm requires a lower learning rate
    opts.numepochs =  1;                %  Number of full sweeps through data
    opts.batchsize = 100;               %  Take a mean gradient step over this many samples

    nn = nntrain(nn, train_x, train_y, opts);

    [er, bad] = nntest(nn, test_x, test_y);
    assert(er < 0.1, 'Too big error');
end
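
sigm and tanh_opt are small helpers from the toolbox's util directory and are not reproduced in these notes. Minimal sketches consistent with the derivatives used in nnbp below (a .* (1 - a) for sigm, and 1.7159 * 2/3 * (1 - a.^2 / 1.7159^2) for tanh_opt) would be:

    function X = sigm(P)
        % logistic sigmoid, 1/(1 + exp(-x))
        X = 1 ./ (1 + exp(-P));
    end

    function f = tanh_opt(A)
        % scaled "optimal" tanh, 1.7159 * tanh(2/3 * x)
        f = 1.7159 * tanh(2/3 .* A);
    end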

ex5 plotting functionality

if ex_choise==5
    rand('state',0)
    nn = nnsetup([784 20 10]);
    opts.numepochs         = 5;            %  Number of full sweeps through data
    nn.output              = 'softmax';    %  use softmax output
    opts.batchsize         = 1000;         %  Take a mean gradient step over this many samples
    opts.plot              = 1;            %  enable plotting

    nn = nntrain(nn, train_x, train_y, opts);

    [er, bad] = nntest(nn, test_x, test_y);
    assert(er < 0.1, 'Too big error');
end
Sample output from running ex5:

epoch 1/5. Took 1.2234 seconds. Mini-batch mean squared error on training set is 0.94451; Full-batch train err = 0.387429
epoch 2/5. Took 1.2288 seconds. Mini-batch mean squared error on training set is 0.35993; Full-batch train err = 0.310213
epoch 3/5. Took 1.2134 seconds. Mini-batch mean squared error on training set is 0.30622; Full-batch train err = 0.279947
epoch 4/5. Took 1.2075 seconds. Mini-batch mean squared error on training set is 0.27597; Full-batch train err = 0.249559
epoch 5/5. Took 1.2745 seconds. Mini-batch mean squared error on training set is 0.25535; Full-batch train err = 0.234196
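
The 'softmax' output chosen in ex5 is computed in nnff with the usual max-subtraction trick for numerical stability: with z = a^{(n-1)} W^{(n-1)\top},

    a^{(n)}_k = \frac{\exp\left(z_k - \max_j z_j\right)}{\sum_{k'} \exp\left(z_{k'} - \max_j z_j\right)}

Subtracting the row maximum leaves the result unchanged but keeps exp from overflowing.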

ex6 neural net with sigmoid activation and plotting of validation and training error

if ex_choise==6
    % split training data into training and validation data
    vx   = train_x(1:10000,:);
    tx = train_x(10001:end,:);
    vy   = train_y(1:10000,:);
    ty = train_y(10001:end,:);

    rand('state',0)
    nn                      = nnsetup([784 20 10]);
    nn.output               = 'softmax';                   %  use softmax output
    opts.numepochs          = 5;                           %  Number of full sweeps through data
    opts.batchsize          = 1000;                        %  Take a mean gradient step over this many samples
    opts.plot               = 1;                           %  enable plotting
    nn = nntrain(nn, tx, ty, opts, vx, vy);                %  nntrain takes validation set as last two arguments (optionally)

    [er, bad] = nntest(nn, test_x, test_y);
    assert(er < 0.1, 'Too big error');
end
function [x, mu, sigma] = zscore(x)
% standardize each column to zero mean and unit standard deviation,
% returning the statistics so they can be reused on the test set
    mu    = mean(x);
    sigma = max(std(x), eps);   % guard against division by zero
    x = bsxfun(@minus, x, mu);
    x = bsxfun(@rdivide, x, sigma);
end
function x = normalize(x, mu, sigma)
% apply previously computed (training-set) statistics to new data
    x = bsxfun(@minus, x, mu);
    x = bsxfun(@rdivide, x, sigma);
end
function nn = nnsetup(architecture)
%NNSETUP creates a Feedforward Backpropagate Neural Network
% nn = nnsetup(architecture) returns a neural network structure with n = numel(architecture)
% layers, architecture being an n x 1 vector of layer sizes, e.g. [784 100 10]

nn.size   = architecture;
nn.n      = numel(nn.size);

nn.activation_function              = 'tanh_opt';   %  Activation functions of hidden layers: 'sigm' (sigmoid) or 'tanh_opt' (optimal tanh).
nn.learningRate                     = 2;            %  learning rate Note: typically needs to be lower when using 'sigm' activation function and non-normalized inputs.
nn.momentum                         = 0.5;          %  Momentum
nn.scaling_learningRate             = 1;            %  Scaling factor for the learning rate (each epoch)
nn.weightPenaltyL2                  = 0;            %  L2 regularization
nn.nonSparsityPenalty               = 0;            %  Non sparsity penalty (weight of the sparsity constraint)
nn.sparsityTarget                   = 0.05;         %  Sparsity target
nn.inputZeroMaskedFraction          = 0;            %  Used for Denoising AutoEncoders
nn.dropoutFraction                  = 0;            %  Dropout level (http://www.cs.toronto.edu/~hinton/absps/dropout.pdf)
nn.testing                          = 0;            %  Internal variable. nntest sets this to one.
nn.output                           = 'sigm';       %  output unit 'sigm' (=logistic), 'softmax' and 'linear'

for i = 2 : nn.n
    % initialize weights and weight momentum (e.g. W{1} is 100 x 785 for [784 100 10])
    nn.W{i - 1} = (rand(nn.size(i), nn.size(i - 1)+1) - 0.5) * 2 * 4 * sqrt(6 / (nn.size(i) + nn.size(i - 1)));
    nn.vW{i - 1} = zeros(size(nn.W{i - 1})); % vW{1} is likewise 100 x 785

    % average activations (for use with sparsity, e.g. the KL-divergence constraint of a sparse autoencoder)
    nn.p{i}     = zeros(1, nn.size(i));
end
end
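
The initialization above draws every weight (including the bias column) uniformly from a symmetric interval whose half-width depends on the fan-in and fan-out of the layer, a Glorot-style range scaled by 4:

    W^{(i-1)}_{jk} \sim U(-r,\, r), \qquad r = 4\sqrt{\frac{6}{\text{size}(i) + \text{size}(i-1)}}

with W^{(i-1)} of shape size(i) x (size(i-1)+1).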
function [nn, L]  = nntrain(nn, train_x, train_y, opts, val_x, val_y)
%NNTRAIN trains a neural net
% [nn, L] = nntrain(nn, x, y, opts) trains the neural network nn with input x and
% output y for opts.numepochs epochs, with minibatches of size
% opts.batchsize. Returns a neural network nn with updated activations,
% errors, weights and biases, (nn.a, nn.e, nn.W, nn.b) and L, the sum
% squared error for each training minibatch.

assert(isfloat(train_x), 'train_x must be a float');
assert(nargin == 4 || nargin == 6,'number of input arguments must be 4 or 6')

loss.train.e               = [];
loss.train.e_frac          = [];
loss.val.e                 = [];
loss.val.e_frac            = [];
opts.validation = 0;
if nargin == 6
    opts.validation = 1;
end

fhandle = [];
if isfield(opts,'plot') && opts.plot == 1
    fhandle = figure();
end

m = size(train_x, 1); % m is the number of training samples (60000 for MNIST)

% read the opts parameters
batchsize = opts.batchsize; % mini-batch size: 100 in ex1-ex4, 1000 in ex5/ex6
numepochs = opts.numepochs; % number of epochs: 1 in ex1-ex4, 5 in ex5/ex6

numbatches = m / batchsize; % number of mini-batches: 600 here (60000/100); 60 in ex5 (60000/1000)

assert(rem(numbatches, 1) == 0, 'numbatches must be an integer');

L = zeros(numepochs*numbatches,1);% one loss value per mini-batch gradient step:
                                  % numepochs*numbatches = 600 here (1*600), 300 (5*60) in ex5
n = 1;
for i = 1 : numepochs
    tic;

    kk = randperm(m); % random permutation of all sample indices (1 x 60000)
    for l = 1 : numbatches % l = 1..600
        % pull out one mini-batch (batchsize samples) of training data
        batch_x = train_x(kk((l - 1) * batchsize + 1 : l * batchsize), :);

        %Add noise to input (for use in denoising autoencoder)
        if(nn.inputZeroMaskedFraction ~= 0)
            % randomly zero out each input element with probability nn.inputZeroMaskedFraction
            batch_x = batch_x.*(rand(size(batch_x))>nn.inputZeroMaskedFraction);
        end
        % labels for this mini-batch
        batch_y = train_y(kk((l - 1) * batchsize + 1 : l * batchsize), :);
        % forward propagation:
        % NNFF performs a feedforward pass
        % nn = nnff(nn, x, y) returns a neural network structure with updated
        % layer activations, error and loss (nn.a, nn.e and nn.L)
        nn = nnff(nn, batch_x, batch_y);
        % backpropagate the error:
        % NNBP performs backpropagation
        % nn = nnbp(nn) returns a neural network structure with updated weights
        nn = nnbp(nn);
        % gradient descent step (SGD):
        % NNAPPLYGRADS updates weights and biases with calculated gradients
        % nn = nnapplygrads(nn) returns a neural network structure with updated
        % weights and biases
        nn = nnapplygrads(nn);
        % L has one element per mini-batch gradient step (600 per epoch here)
        L(n) = nn.L; % record this mini-batch's loss

        n = n + 1;
    end

    t = toc;

    if opts.validation == 1
        loss = nneval(nn, loss, train_x, train_y, val_x, val_y);
        str_perf = sprintf('; Full-batch train mse = %f, val mse = %f', loss.train.e(end), loss.val.e(end));
    else
        loss = nneval(nn, loss, train_x, train_y); % evaluate on the full training set to get one error value per epoch
        str_perf = sprintf('; Full-batch train err = %f', loss.train.e(end));
    end
    if ishandle(fhandle)
        nnupdatefigures(nn, fhandle, loss, opts, i);
    end

    disp(['epoch ' num2str(i) '/' num2str(opts.numepochs) '. Took ' num2str(t) ' seconds' '. Mini-batch mean squared error on training set is ' num2str(mean(L((n-numbatches):(n-1)))) str_perf]);
    nn.learningRate = nn.learningRate * nn.scaling_learningRate; % scale the learning rate after each epoch
end
end
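
The last line of the epoch loop rescales the learning rate by scaling_learningRate after every epoch, so with the default value of 1 it stays constant, and a value below 1 gives a simple geometric decay:

    \eta_t = \eta_0 \cdot s^{t}, \qquad s = \text{nn.scaling\_learningRate}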
function nn = nnff(nn, x, y)
%NNFF performs a feedforward pass
% nn = nnff(nn, x, y) returns a neural network structure with updated
% layer activations, error and loss (nn.a, nn.e and nn.L)

    n = nn.n;
    m = size(x, 1); % number of samples in the batch (10000 at test time)

    x = [ones(m,1) x]; % prepend the bias column
    nn.a{1} = x; % a{1} is the (bias-augmented) input x, i.e. the output of the input layer

    %feedforward pass
    for i = 2 : n-1
        switch nn.activation_function
            case 'sigm'
                % Calculate the unit's outputs (including the bias term)
                nn.a{i} = sigm(nn.a{i - 1} * nn.W{i - 1}');
            case 'tanh_opt'
                nn.a{i} = tanh_opt(nn.a{i - 1} * nn.W{i - 1}');
        end

        %dropout
        if(nn.dropoutFraction > 0)
            if(nn.testing) % at test time
                nn.a{i} = nn.a{i}.*(1 - nn.dropoutFraction); % scale activations by (1 - p) instead of dropping
            else
                % rand(size(nn.a{i})) is a uniform (0,1) matrix; entries > dropoutFraction become 1, the rest 0
                nn.dropOutMask{i} = (rand(size(nn.a{i}))>nn.dropoutFraction);
                nn.a{i} = nn.a{i}.*nn.dropOutMask{i}; % zero out the dropped units, keep the rest
            end
        end

        %calculate running exponential activations for use with sparsity
        % nonSparsityPenalty weights the penalty for deviating from sparsityTarget
        if(nn.nonSparsityPenalty>0)
            nn.p{i} = 0.99 * nn.p{i} + 0.01 * mean(nn.a{i}, 1); % p{i}: exponential moving average of layer i's mean activation
        end

        %Add the bias term
        nn.a{i} = [ones(m,1) nn.a{i}]; % prepend the bias column: 100 x 100 becomes 100 x 101
    end
    switch nn.output
        case 'sigm'
            nn.a{n} = sigm(nn.a{n - 1} * nn.W{n - 1}');
        case 'linear'
            nn.a{n} = nn.a{n - 1} * nn.W{n - 1}';
        case 'softmax'
            nn.a{n} = nn.a{n - 1} * nn.W{n - 1}';
            nn.a{n} = exp(bsxfun(@minus, nn.a{n}, max(nn.a{n},[],2)));
            nn.a{n} = bsxfun(@rdivide, nn.a{n}, sum(nn.a{n}, 2));
    end

    %error and loss
    nn.e = y - nn.a{n};

    switch nn.output
        case {'sigm', 'linear'}
            nn.L = 1/2 * sum(sum(nn.e .^ 2)) / m;
        case 'softmax'
            nn.L = -sum(sum(y .* log(nn.a{n}))) / m;
    end
end
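
The two losses at the end of nnff are the mean squared error (for 'sigm' and 'linear' outputs) and the mean cross-entropy (for 'softmax'), both averaged over the m samples of the mini-batch:

    L_{mse} = \frac{1}{2m} \sum_{s=1}^{m} \lVert y_s - a^{(n)}_s \rVert^2,
    \qquad
    L_{ce} = -\frac{1}{m} \sum_{s=1}^{m} \sum_{k} y_{sk} \log a^{(n)}_{sk}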
function nn = nnbp(nn)
%NNBP performs backpropagation
% nn = nnbp(nn) returns a neural network structure with updated weights

    n = nn.n;
    sparsityError = 0;
    switch nn.output % output layer
        case 'sigm'
            d{n} = - nn.e .* (nn.a{n} .* (1 - nn.a{n}));%-(y-a(n)).*f'(z(n))
        case {'softmax','linear'}
            d{n} = - nn.e;
    end
    for i = (n - 1) : -1 : 2 % from the second-to-last layer back to the second layer
        % Derivative of the activation function
        switch nn.activation_function
            case 'sigm'
                d_act = nn.a{i} .* (1 - nn.a{i});%f'(z(i))
            case 'tanh_opt'
                d_act = 1.7159 * 2/3 * (1 - 1/(1.7159)^2 * nn.a{i}.^2);
        end

        if(nn.nonSparsityPenalty>0)
            pi = repmat(nn.p{i}, size(nn.a{i}, 1), 1);
            sparsityError = [zeros(size(nn.a{i},1),1) nn.nonSparsityPenalty * (-nn.sparsityTarget ./ pi + (1 - nn.sparsityTarget) ./ (1 - pi))];
        end

        % Backpropagate first derivatives
        if i+1==n % d{n} carries no bias term to strip (the output layer has no bias column)
            d{i} = (d{i + 1} * nn.W{i} + sparsityError) .* d_act; % Bishop (5.56); delta is also called the residual
        else % strip the bias column of d{i+1}; the bias does not take part in error backpropagation
            d{i} = (d{i + 1}(:,2:end) * nn.W{i} + sparsityError) .* d_act;
        end

        if(nn.dropoutFraction>0) % ones(size(d{i},1),1) is the bias column of the mask
            d{i} = d{i} .* [ones(size(d{i},1),1) nn.dropOutMask{i}]; % the deltas must be masked by the same dropout mask
        end

    end

    for i = 1 : (n - 1) % compute the gradients dW of the loss with respect to the weights
        if i+1==n
            nn.dW{i} = (d{i + 1}' * nn.a{i}) / size(d{i + 1}, 1);
        else
            nn.dW{i} = (d{i + 1}(:,2:end)' * nn.a{i}) / size(d{i + 1}, 1);
        end
    end
end
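
In equation form (batch of m samples, ⊙ denoting element-wise multiplication, sparsity term omitted), nnbp computes:

    \delta^{(n)} = \begin{cases} -(y - a^{(n)}) \odot a^{(n)} \odot (1 - a^{(n)}) & \text{'sigm' output} \\ -(y - a^{(n)}) & \text{'softmax' / 'linear' output} \end{cases}

    \delta^{(i)} = \left( \delta^{(i+1)} W^{(i)} \right) \odot f'\!\left(a^{(i)}\right), \quad i = n-1, \dots, 2

    \nabla W^{(i)} = \frac{1}{m}\, \delta^{(i+1)\top} a^{(i)}

where the bias column of \delta^{(i+1)} is stripped whenever i+1 < n, and the dropout mask is re-applied to \delta^{(i)} when dropout is active.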
function nn = nnapplygrads(nn)
%NNAPPLYGRADS updates weights and biases with calculated gradients
% nn = nnapplygrads(nn) returns a neural network structure with updated
% weights and biases

    for i = 1 : (nn.n - 1) % nn.n is the number of layers
        if(nn.weightPenaltyL2>0) % weightPenaltyL2 is the L2 regularization coefficient (bias column excluded from the decay)
            dW = nn.dW{i} + nn.weightPenaltyL2 * [zeros(size(nn.W{i},1),1) nn.W{i}(:,2:end)];
        else
            dW = nn.dW{i};
        end

        dW = nn.learningRate * dW;

        if(nn.momentum>0) % momentum term
            nn.vW{i} = nn.momentum*nn.vW{i} + dW;
            dW = nn.vW{i};
        end

        nn.W{i} = nn.W{i} - dW; % weight update (the bias is stored as the first column of the weight matrix)
    end
end
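
Per mini-batch, the update is therefore plain SGD with L2 weight decay and classical momentum. With \eta = learningRate, \lambda = weightPenaltyL2 and \mu = momentum:

    \Delta W^{(i)} = \eta \left( \nabla W^{(i)} + \lambda\, [\,0,\ W^{(i)}_{:,2:\text{end}}\,] \right),
    \qquad
    v^{(i)} \leftarrow \mu\, v^{(i)} + \Delta W^{(i)},
    \qquad
    W^{(i)} \leftarrow W^{(i)} - v^{(i)}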
function nnupdatefigures(nn,fhandle,L,opts,i)
%NNUPDATEFIGURES updates figures during training
if i > 1 %don't plot first point, its only a point
    x_ax = 1:i;
    % create legend
    if opts.validation == 1
        M            = {'Training','Validation'};
    else
        M            = {'Training'};
    end

    %create data for plots
    if strcmp(nn.output,'softmax')%Compare strings (case sensitive)
        plot_x       = x_ax';
        plot_ye      = L.train.e';
        plot_yfrac   = L.train.e_frac';

    else
        plot_x       = x_ax';
        plot_ye      = L.train.e';
    end

    %add error on validation data if present
    if opts.validation == 1
        plot_x       = [plot_x, x_ax'];
        plot_ye      = [plot_ye,L.val.e'];
    end


    %add classification error on validation data if present
    if opts.validation == 1 && strcmp(nn.output,'softmax')
        plot_yfrac   = [plot_yfrac, L.val.e_frac'];
    end

%    plotting
    figure(fhandle);
    if strcmp(nn.output,'softmax')  %also plot classification error
        %subplot(m,n,p) breaks the figure window into an m-by-n grid and creates an axes object in the pth location
        %for the current plot, and returns the axes handle.
        p1 = subplot(1,2,1); % p1 is the handle of the first subplot
        plot(plot_x,plot_ye);
        xlabel('Number of epochs'); ylabel('Error');
        title('Error')
        legend(p1, M,'Location','NorthEast'); % add the 'Training' (and 'Validation') legend
        set(p1, 'Xlim',[0,opts.numepochs + 1]) % set the x-axis range

        p2 = subplot(1,2,2);
        plot(plot_x,plot_yfrac);
        xlabel('Number of epochs'); ylabel('Misclassification rate');
        title('Misclassification rate')
        legend(p2, M,'Location','NorthEast');
        set(p2, 'Xlim',[0,opts.numepochs + 1])

    else

        p = plot(plot_x,plot_ye);
        xlabel('Number of epochs'); ylabel('Error');title('Error');
        legend(p, M,'Location','NorthEast');
        set(gca, 'Xlim',[0,opts.numepochs + 1])

    end
    drawnow; % refresh the figure
end
end
function [er, bad] = nntest(nn, x, y)
    labels = nnpredict(nn, x);
    [~, expected] = max(y,[],2); % target labels
    bad = find(labels ~= expected);
    er = numel(bad) / size(x, 1);
end
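
nntest relies on nnpredict, which is not reproduced in these notes. A minimal sketch consistent with nnff and the nn.testing flag (the actual toolbox file may differ in detail):

    function labels = nnpredict(nn, x)
        nn.testing = 1;                                    % so nnff rescales dropout correctly
        nn = nnff(nn, x, zeros(size(x,1), nn.size(end)));  % dummy targets; we only need nn.a{end}
        nn.testing = 0;
        [~, labels] = max(nn.a{end}, [], 2);               % predicted class = argmax over the output units
    end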