第1步:利用训练样本集训练第一个稀疏自编码器
第2步:利用第一个稀疏自编码器提取到的特征训练第二个稀疏自编码器
第3步:利用第二个稀疏自编码器提取到的特征训练softmax回归模型
第4步:利用误差反向传播进行微调
第5步:利用测试样本集对得到的分类器进行精度测试
下面将程序实现过程中的关键代码贴出,欢迎各位网友指正!
stackedAEExercise.m
% stackedAEExercise: trains two stacked sparse autoencoders plus a softmax
% classifier on MNIST, fine-tunes the whole network with back-propagation,
% and prints the test accuracy before and after fine-tuning.
clc
clear
close all
addpath ../common/
addpath ../common/minFunc

%%======================================================================
%% STEP 0: Parameters of the stacked autoencoder
% Input/output structure of the whole network
inputSize = 28 * 28;
numClasses = 10;
% Sparse autoencoder structure
hiddenSizeL1 = 200;    % Layer 1 Hidden Size
hiddenSizeL2 = 200;    % Layer 2 Hidden Size
% Weights of the individual cost terms
sparsityParam = 0.1;   % desired average activation of the hidden units (rho in the lecture)
beta = 3;              % weight of sparsity penalty term
lambda = 3e-3;         % weight decay parameter

%%======================================================================
%% STEP 1: Load the MNIST images and labels
% Forward slash keeps the path usable on every platform (the data files
% below are already addressed with '/').
addpath mnist/
trainData = loadMNISTImages('mnist/train-images-idx3-ubyte');
trainLabels = loadMNISTLabels('mnist/train-labels-idx1-ubyte');
trainLabels(trainLabels == 0) = 10; % Remap 0 to 10 since our labels need to start from 1

%%======================================================================
%% STEP 2: Train the first sparse autoencoder
% trainData is used here as an unlabeled training set.
% Randomly initialize the parameters
sae1Theta = initializeParameters(hiddenSizeL1, inputSize);
% Optimizer settings (shared by the later minFunc/softmaxTrain calls)
options.Method = 'lbfgs';
options.maxIter = 400;   % Maximum number of iterations of L-BFGS to run
options.display = 'on';
% Run the optimizer; the learned parameters end up in sae1OptTheta
[sae1OptTheta, ~] = minFunc( @(p) sparseAutoencoderCost(p, ...
                                   inputSize, hiddenSizeL1, ... % input size, hidden size
                                   lambda, sparsityParam, ...
                                   beta, trainData), ...
                             sae1Theta, options);
%save('sae1OptTheta.mat','sae1OptTheta')
% % Visualize weights
% W11 = reshape(sae1OptTheta(1:hiddenSizeL1 * inputSize), hiddenSizeL1, inputSize);
% display_network(W11');
% load('sae1OptTheta.mat');

%%======================================================================
%% STEP 3: Train the second sparse autoencoder
% Its training data are the features extracted by the first autoencoder.
% Output of the first autoencoder, sae1Features (hiddenSizeL1 rows)
[sae1Features] = feedForwardAutoencoder(sae1OptTheta, hiddenSizeL1, ...
                                        inputSize, trainData);
% Randomly initialize the parameters
sae2Theta = initializeParameters(hiddenSizeL2, hiddenSizeL1);
% Train the second autoencoder (input size hiddenSizeL1, hidden size
% hiddenSizeL2); the learned parameters end up in sae2OptTheta
[sae2OptTheta, ~] = minFunc( @(p) sparseAutoencoderCost(p, ...
                                   hiddenSizeL1, hiddenSizeL2, ... % input size, hidden size
                                   lambda, sparsityParam, ...
                                   beta, sae1Features), ...
                             sae2Theta, options);
% save('sae2OptTheta.mat','sae2OptTheta')
% % Visualize weights
% % W21 = reshape(sae2OptTheta(1:hiddenSizeL2 * hiddenSizeL1), hiddenSizeL2, hiddenSizeL1);
% % display_network(W21'); % cannot be visualized!
% load('sae2OptTheta.mat');

%%======================================================================
%% STEP 4: Train the softmax classifier
% Its input is sae2Features, the output of the second autoencoder
% (hiddenSizeL2 rows).
[sae2Features] = feedForwardAutoencoder(sae2OptTheta, hiddenSizeL2, ...
                                        hiddenSizeL1, sae1Features);
% No initial theta is created here: softmaxTrain is not given one, so a
% locally generated starting point would never be used.
% NOTE: maxIter is lowered to 100 here and, because the same options struct
% is reused, the fine-tuning run in STEP 5 is also limited to 100 iterations.
options.maxIter = 100;
softmaxModel = softmaxTrain(size(sae2Features,1), numClasses, lambda, ...
                            sae2Features, trainLabels, options);
saeSoftmaxOptTheta = softmaxModel.optTheta(:);
% load('saeSoftmaxOptTheta.mat')

%%======================================================================
%% STEP 5: Fine-tune the stacked autoencoder
% The parameters learned by the two autoencoders (stack) and by the softmax
% classifier (saeSoftmaxOptTheta) are the starting point for fine-tuning.
% Only the encoder half of each autoencoder is kept: W1 is the first
% hidden*visible entries of the parameter vector, b1 follows W1 and W2.
stack = cell(2,1);
stack{1}.w = reshape(sae1OptTheta(1:hiddenSizeL1*inputSize), ...
                     hiddenSizeL1, inputSize);
stack{1}.b = sae1OptTheta(2*hiddenSizeL1*inputSize+1:2*hiddenSizeL1*inputSize+hiddenSizeL1);
stack{2}.w = reshape(sae2OptTheta(1:hiddenSizeL2*hiddenSizeL1), ...
                     hiddenSizeL2, hiddenSizeL1);
stack{2}.b = sae2OptTheta(2*hiddenSizeL2*hiddenSizeL1+1:2*hiddenSizeL2*hiddenSizeL1+hiddenSizeL2);
% Flatten the stack into a vector and capture the network structure
[stackparams, netconfig] = stack2params(stack);
% Parameters of the whole model: softmax parameters first, then the stack
stackedAETheta = [ saeSoftmaxOptTheta ; stackparams ];

% Optional gradient check
DEBUG = 1;
if DEBUG
    checkStackedAECost()
end

% Fine-tune (use minFunc to minimize the function)
[stackedAEOptTheta, cost] = minFunc( @(p) stackedAECost(p, ...
                                   inputSize, hiddenSizeL2, ... % input size, last hidden layer size
                                   numClasses, netconfig, ...   % network structure of the stack
                                   lambda, trainData, trainLabels), ...
                             stackedAETheta, options);

%%======================================================================
%% STEP 6: Test
% Load the labeled test set
testData = loadMNISTImages('mnist/t10k-images-idx3-ubyte');
testLabels = loadMNISTLabels('mnist/t10k-labels-idx1-ubyte');
testLabels(testLabels == 0) = 10; % Remap 0 to 10

% Predict with the fine-tuned parameters
[pred] = stackedAEPredict(stackedAEOptTheta, inputSize, hiddenSizeL2, ...
                          numClasses, netconfig, testData);
acc = mean(testLabels(:) == pred(:)); % fraction of correctly classified images
fprintf('After Finetuning Test Accuracy: %0.3f%%\n', acc * 100);

% Predict with the parameters from before fine-tuning
[pred] = stackedAEPredict(stackedAETheta, inputSize, hiddenSizeL2, ...
                          numClasses, netconfig, testData);
acc = mean(testLabels(:) == pred(:)); % fraction of correctly classified images
fprintf('Before Finetuning Test Accuracy: %0.3f%%\n', acc * 100);

% Accuracy is the proportion of correctly classified images
% The results for our implementation were:
%
% Before Finetuning Test Accuracy: 87.7%
% After Finetuning Test Accuracy:  97.6%
%
% If your values are too low (accuracy less than 95%), you should check
% your code for errors, and make sure you are training on the
% entire data set of 60000 28x28 training images
% (unless you modified the loading code, this should be the case)
stackedAEPredict.m
function [pred] = stackedAEPredict(theta, inputSize, hiddenSize, numClasses, netconfig, data)
% stackedAEPredict  Predict class labels with a trained stacked autoencoder.
%
%   pred = stackedAEPredict(theta, inputSize, hiddenSize, numClasses, netconfig, data)
%
% Inputs:
%   theta      - parameter vector: the first hiddenSize*numClasses entries
%                are the softmax weights, the rest is the flattened stack
%   inputSize  - number of input units (not used by this function; kept
%                for interface compatibility)
%   hiddenSize - number of columns of the softmax weight matrix, i.e. the
%                size of the last hidden layer feeding the classifier
%   numClasses - number of categories
%   netconfig  - network configuration passed to params2stack
%   data       - matrix with one example per column, data(:,i) is the
%                i-th example
%
% Output:
%   pred       - 1-by-N row vector, pred(i) is argmax_c P(y = c | x(i)),
%                with labels starting from 1

%% Unroll theta parameter
% Softmax weights come first in theta
softmaxTheta = reshape(theta(1:hiddenSize*numClasses), numClasses, hiddenSize);
% Extract out the "stack"
stack = params2stack(theta(hiddenSize*numClasses+1:end), netconfig);

%% Forward propagation through the autoencoder layers
depth = numel(netconfig.layersizes);
a = cell(depth+1, 1);
a{1} = data;
for i = 1:depth
    a{i+1} = sigmoid(bsxfun(@plus, stack{i}.w*a{i}, stack{i}.b));
end

%% Class scores of the softmax layer
% The softmax function is monotonic within each column, so the most
% probable class is simply the row with the largest score; exponentiating
% and normalizing is not needed for prediction.
scores = softmaxTheta*a{depth+1};

%% Row index of the largest score in each column is the predicted class
% The dimension is given explicitly so that a score matrix with a single
% row is still reduced per column.
[~, pred] = max(scores, [], 1);

end

% Element-wise logistic sigmoid
function sigm = sigmoid(x)
    sigm = 1 ./ (1 + exp(-x));
end
stackedAECost.m
function [ cost, grad ] = stackedAECost(theta, ...
                                        inputSize, hiddenSize, ...
                                        numClasses, netconfig, ...
                                        lambda, data, labels)
% stackedAECost  Cost and gradient of a stacked autoencoder with a softmax
% output layer. Used for fine-tuning.
%
% Inputs:
%   theta      - parameter vector of the whole network: the first
%                hiddenSize*numClasses entries are the softmax weights,
%                the rest is the flattened stack
%   inputSize  - dimension of the network input (not used by this
%                function; kept for interface compatibility)
%   hiddenSize - hidden layer size of the last sparse autoencoder
%   numClasses - number of classes
%   netconfig  - the network configuration of the stack
%   lambda     - the weight regularization penalty
%   data       - training set, data(:,i) is the i-th training example
%   labels     - labels(i) is the label (1..numClasses) of the i-th example
%
% Outputs:
%   cost - average cross-entropy plus weight decay on the softmax weights
%          (the stack weights are not regularized)
%   grad - gradient vector, laid out like theta

%% Split theta into the softmax weights and the autoencoder stack
softmaxTheta = reshape(theta(1:hiddenSize*numClasses), numClasses, hiddenSize);
stack = params2stack(theta(hiddenSize*numClasses+1:end), netconfig);

%% Setup
numCases = size(data, 2);
% Indicator matrix of the labels. The size is given explicitly so that the
% matrix always has numClasses rows, even when the highest class label does
% not occur in this batch.
groundTruth = full(sparse(labels, 1:numCases, 1, numClasses, numCases));

%% Forward propagation
depth = numel(stack);      % number of autoencoder layers
a = cell(depth+1, 1);      % activations; a{1} is the input layer
a{1} = data;
for i = 1:depth
    a{i+1} = sigmoid(bsxfun(@plus, stack{i}.w*a{i}, stack{i}.b));
end

% Softmax layer; its input is the output of the last autoencoder layer
softmaxData = a{depth+1};
M = softmaxTheta*softmaxData;
M = bsxfun(@minus, M, max(M, [], 1));   % subtract the column maximum to avoid overflow
expM = exp(M);
colSum = sum(expM, 1);
Htheta = bsxfun(@rdivide, expM, colSum);
% Log-probabilities computed directly from the shifted scores: this stays
% finite even when a probability underflows to 0, where log(Htheta) would
% give -Inf and 0 * -Inf would turn the cost into NaN.
logHtheta = bsxfun(@minus, M, log(colSum));

%% Cost: cross-entropy plus weight decay on the softmax weights only
cost = -sum(sum(groundTruth.*logHtheta))/numCases + lambda*sum(softmaxTheta(:).^2)/2;

%% Gradients
% Softmax layer
outErr = Htheta - groundTruth;
softmaxThetaGrad = outErr*softmaxData'/numCases + lambda*softmaxTheta;

% Error terms of the autoencoder layers; a.*(1-a) is the sigmoid derivative
delta = cell(depth+1, 1);
delta{depth+1} = (softmaxTheta'*outErr).*a{depth+1}.*(1-a{depth+1});
for i = depth:-1:2
    delta{i} = (stack{i}.w'*delta{i+1}).*a{i}.*(1-a{i});
end

% Weight and bias gradients of each autoencoder layer
stackgrad = cell(size(stack));
for i = 1:depth
    stackgrad{i}.w = delta{i+1}*a{i}'/numCases;
    stackgrad{i}.b = sum(delta{i+1}, 2)/numCases;   % column vector
end

%% Roll gradient vector
grad = [softmaxThetaGrad(:) ; stack2params(stackgrad)];

end

% Element-wise logistic sigmoid
function sigm = sigmoid(x)
    sigm = 1 ./ (1 + exp(-x));
end