There should be no need to introduce neural networks themselves here; I may write some posts on deep learning later. The network described in this post is the feed-forward network (Feed-Forward Network). It is called feed-forward because its topology contains no backward edges, i.e. no cycles; the name does not mean that signals cannot be propagated backwards (they are, during backpropagation).
For a single neuron, Shark provides several activation functions to choose from:
5. The linear function. It is usually a poor choice as the activation of hidden neurons: a composition of linear functions is still a linear function, so the whole network would be unable to fit particularly complex functions. It can still be useful in some intermediate layers; if the following layer has fewer neurons, it acts as a dimensionality reduction.
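To see why stacking purely linear layers adds no expressive power, compose two of them (a small worked identity, not Shark code):

$$W_2(W_1 x + b_1) + b_2 = (W_2 W_1)\,x + (W_2 b_1 + b_2),$$

which is again a single linear map; only a non-linearity between the layers prevents this collapse.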
As the base class of all neuron types, it defines the neuron's activation function and the corresponding derivative. The class is defined in <shark/Models/Neurons.h>.
template<class Derived> //note that the template parameter is the derived class type; we explain this trick below
class NeuronBase{
private:
template<class T>
struct Function{ //the neuron's activation function
typedef T argument_type;
typedef argument_type result_type;
static const bool zero_identity = false;
Function(NeuronBase const* self):m_self(static_cast<Derived const*>(self)){} //note the cast: the base class casts itself down to the derived type
//overload operator() to compute the activation
result_type operator()(argument_type x)const{
return m_self->function(x);
}
Derived const* m_self;
};
//the derivative of the activation function; the structure mirrors Function
template<class T>
struct FunctionDerivative{
typedef T argument_type;
typedef argument_type result_type;
static const bool zero_identity = false;
FunctionDerivative(NeuronBase const* self):m_self(static_cast<Derived const*>(self)){}
result_type operator()(argument_type x)const{
return m_self->functionDerivative(x);
}
Derived const* m_self;
};
public:
//compute the activation for every element of the input
template<class E>
blas::vector_unary<E, Function<typename E::value_type> > operator()(blas::vector_expression<E> const& x)const{
typedef Function<typename E::value_type> functor_type;
return blas::vector_unary<E, functor_type>(x,functor_type(this));
}
template<class E>
blas::matrix_unary<E, Function<typename E::value_type> > operator()(blas::matrix_expression<E> const& x)const{
typedef Function<typename E::value_type> functor_type;
return blas::matrix_unary<E, functor_type>(x,functor_type(this));
}
//compute the derivative of the activation for every element of the input
template<class E>
blas::vector_unary<E, FunctionDerivative<typename E::value_type> > derivative(blas::vector_expression<E> const& x)const{
typedef FunctionDerivative<typename E::value_type> functor_type;
return blas::vector_unary<E, functor_type>(x,functor_type(this));
}
template<class E>
blas::matrix_unary<E, FunctionDerivative<typename E::value_type> > derivative(blas::matrix_expression<E> const& x)const{
typedef FunctionDerivative<typename E::value_type> functor_type;
return blas::matrix_unary<E, functor_type>(x,functor_type(this));
}
};
A subclass must define the concrete activation function and its derivative. Let us look at a concrete neuron class.
struct LogisticNeuron : public detail::NeuronBase<LogisticNeuron>{
template<class T>
T function(T x)const{
return sigmoid(x);
}
template<class T>
T functionDerivative(T y)const{
return y * (1 - y);
}
};
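Note that functionDerivative receives the neuron's output $y$ rather than its input $x$. This works because of the identity

$$\sigma'(x) = \sigma(x)\,(1-\sigma(x)) = y\,(1-y),$$

so backpropagation can reuse the activations that are already stored instead of recomputing $\sigma(x)$.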
The implementation of this class itself is quite simple. But viewed together with its base class, it uses templates to implement polymorphism, a technique I had not come across before. Let us spend a little space on it.
We call the traditional, virtual-function implementation dynamic polymorphism, and the template-based one static polymorphism. The difference: dynamic polymorphism resolves the call through the vtable at run time, while static polymorphism resolves it at compile time, so there is no virtual-call overhead (at the price that the base class must know the derived type as a template parameter).
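Here is a minimal, self-contained sketch of the pattern, known as the curiously recurring template pattern (CRTP); the class and member names are invented for illustration and are not from Shark:

#include <iostream>

//the base class is parameterized by its own derived type, so it can call
//members of the derived class without any virtual functions
template<class Derived>
struct ActivationBase{
	double apply(double x)const{
		//the cast is safe because Derived inherits from ActivationBase<Derived>
		return static_cast<Derived const*>(this)->function(x);
	}
};

//the derived class only needs to provide the actual function
struct Square : public ActivationBase<Square>{
	double function(double x)const{ return x * x; }
};

int main(){
	Square s;
	//the call to function() is resolved at compile time; no vtable lookup
	std::cout << s.apply(3.0) << std::endl; //prints 9
}

This is exactly the relationship between NeuronBase and LogisticNeuron above: the base class provides the vectorized operator() and derivative(), and dispatches to the derived class's function() and functionDerivative() without any virtual calls.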
The remaining neuron classes look much like LogisticNeuron, so I will not go through them one by one. Among them, however, there is a neuron type that has not been introduced before.
template<class Neuron>
struct DropoutNeuron: public detail::NeuronBase<DropoutNeuron<Neuron> >{
DropoutNeuron():m_probability(0.5),m_stochastic(true){}
template<class T>
T function(T x)const{
if(m_stochastic && Rng::coinToss(m_probability)){
return T(0);
}
else if(!m_stochastic){
return (1-m_probability)*m_neuron.function(x);
}else{
return m_neuron.function(x);
}
}
template<class T>
T functionDerivative(T y)const{
if(!m_stochastic){
return (1-m_probability)*m_neuron.functionDerivative(y/ (1-m_probability));
}else{
return m_neuron.functionDerivative(y);
}
}
void setProbability(double probability){m_probability = probability;}
void setStochastic(bool stochastic){m_stochastic = stochastic;}
private:
double m_probability; //probability of setting the output to 0
bool m_stochastic;
Neuron m_neuron;
};
This class wraps one of the basic neuron types introduced above, such as LogisticNeuron, and applies the dropout technique to it, which has lately attracted a lot of attention. The idea is very simple: by setting a neuron's output to 0 with some probability, the model becomes less prone to overfitting. One question remains: when m_stochastic is true we get this stochastic dropout behaviour, but what if it is set to false? Looking at the code, that is the evaluation-time mode: the output is scaled by (1-m_probability) instead of being dropped.
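The (1-m_probability) factor in the deterministic branch is simply the expectation of the stochastic training-time output; with $p$ denoting m_probability:

$$\mathbb{E}[\text{output}] = p \cdot 0 + (1-p)\,f(x) = (1-p)\,f(x),$$

so evaluating the network with m_stochastic set to false reproduces, on average, what it computed during training.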
This class defines the concrete structure of the network. All hidden units in the network use the same activation function, but the output layer's activation function may differ from the hidden layers'. The class is defined in <shark/Models/FFNet.h>.
Let us first look at the network's connection schemes.
struct FFNetStructures{
enum ConnectionType{
Normal, //no connections that skip layers
InputOutputShortcut, //additional direct connections from the input layer to the output layer
Full //each layer is connected to all layers below it
};
};
template<class HiddenNeuron,class OutputNeuron>
class FFNet :public AbstractModel<RealVector,RealVector>
{
//total number of neurons, number of input neurons, number of output neurons
std::size_t m_numberOfNeurons;
std::size_t m_inputNeurons;
std::size_t m_outputNeurons;
//weight matrices for the connections between layers
std::vector<RealMatrix> m_layerMatrix;
//weights connecting the input layer directly to the output layer; only meaningful when the connection type is InputOutputShortcut
RealMatrix m_inputOutputShortcut;
//the connection weights as seen from the output side of the network, essentially the transpose of m_layerMatrix; used during backpropagation
std::vector<RealMatrix> m_backpropMatrix;
//neuron biases; only hidden and output neurons can have one
RealVector m_bias;
//the hidden-layer and output-layer neuron types
HiddenNeuron m_hiddenNeuron;
OutputNeuron m_outputNeuron;
struct InternalState: public State{
//activations of every neuron in the network for the last batch of inputs
RealMatrix responses;
void resize(std::size_t neurons, std::size_t patterns){
responses.resize(neurons,patterns);
}
};
public:
FFNet()
:m_numberOfNeurons(0),m_inputNeurons(0),m_outputNeurons(0){
m_features|=HAS_FIRST_PARAMETER_DERIVATIVE;
m_features|=HAS_FIRST_INPUT_DERIVATIVE;
}
std::string name() const
{ return "FFNet"; }
std::size_t inputSize()const{
return m_inputNeurons;
}
std::size_t outputSize()const{
return m_outputNeurons;
}
std::size_t numberOfNeurons()const{
return m_numberOfNeurons;
}
//number of hidden neurons; this value is not stored directly in the class
std::size_t numberOfHiddenNeurons()const{
return numberOfNeurons() - inputSize() -outputSize();
}
std::vector<RealMatrix> const& layerMatrices()const{
return m_layerMatrix;
}
RealMatrix const& layerMatrix(std::size_t layer)const{
return m_layerMatrix[layer];
}
//set the weight matrix and bias between the given layers; the backprop matrices must be updated accordingly
void setLayer(std::size_t layerNumber, RealMatrix const& m, RealVector const& bias){
SIZE_CHECK(m.size1() == bias.size());
SIZE_CHECK(m.size1() == m_layerMatrix[layerNumber].size1());
SIZE_CHECK(m.size2() == m_layerMatrix[layerNumber].size2());
m_layerMatrix[layerNumber] = m;
std::size_t start = 0;
for(std::size_t i = 0; i != layerNumber; ++i){
start += m_layerMatrix[i].size1();
}
noalias(subrange(m_bias,start,start+bias.size())) = bias;
//set backprop matrices
setParameterVector(parameterVector());
}
std::vector<RealMatrix> const& backpropMatrices()const{
return m_backpropMatrix;
}
RealMatrix const& inputOutputShortcut() const{
return m_inputOutputShortcut;
}
HiddenNeuron const& hiddenActivationFunction()const{
return m_hiddenNeuron;
}
OutputNeuron const& outputActivationFunction()const{
return m_outputNeuron;
}
HiddenNeuron& hiddenActivationFunction(){
return m_hiddenNeuron;
}
OutputNeuron& outputActivationFunction(){
return m_outputNeuron;
}
const RealVector& bias()const{
return m_bias;
}
//returns the bias vector of the given layer
RealVector bias(std::size_t layer)const{
std::size_t start = 0;
for(std::size_t i = 0; i != layer; ++i){
start +=layerMatrices()[i].size1();
}
return subrange(m_bias,start,start+layerMatrices()[layer].size1());
}
std::size_t numberOfParameters()const{
std::size_t numParams = m_inputOutputShortcut.size1()*m_inputOutputShortcut.size2();
numParams += bias().size();
for(std::size_t i = 0; i != layerMatrices().size(); ++i){
numParams += layerMatrices()[i].size1()*layerMatrices()[i].size2();
}
return numParams;
}
//return all network parameters as a single vector; note the order: layer weight matrices, then the biases, then the m_inputOutputShortcut matrix
RealVector parameterVector() const{
RealVector parameters(numberOfParameters());
init(parameters) << matrixSet(m_layerMatrix),m_bias,toVector(m_inputOutputShortcut);
return parameters;
}
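//a concrete count (hypothetical sizes, not from the Shark sources): for a
//Normal-connectivity network built via setStructure(2,3,1) with biases enabled,
//numberOfParameters() = 3*2 + 1*3 + (3+1) = 13: weight matrices of shape 3x2
//and 1x3, one bias per hidden and output neuron, and no shortcut matrix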
//set the network parameters from a vector; the order matches the one used by parameterVector()
void setParameterVector(RealVector const& newParameters){
init(newParameters) >> matrixSet(m_layerMatrix),m_bias,toVector(m_inputOutputShortcut);
//if only adjacent layers are connected, the backprop matrix is simply the transpose of the weight matrix
//the code below handles the more general case where connections may skip layers, as with the Full connection type
std::size_t layeriStart = 0;//number of neurons below layer i
for(std::size_t layeri = 0; layeri != m_layerMatrix.size(); ++layeri){
std::size_t columni = 0;
std::size_t neuronsi = inputSize(); //number of neurons in layer i
if(layeri > 0)
neuronsi = m_layerMatrix[layeri-1].size1();
std::size_t layerjStart = layeriStart + neuronsi;//number of neurons below layer j
for(std::size_t layerj = layeri; layerj != m_layerMatrix.size(); ++layerj){
std::size_t neuronsj = m_layerMatrix[layerj].size1();//number of neurons in layer j
//check whether the inputs of layer j reach down to layer i:
//if they do (the subtraction below stays <= layeriStart), the two layers overlap and a block must be transposed
if(layerjStart-m_layerMatrix[layerj].size2() <= layeriStart){
std::size_t weightStartj = layeriStart -(layerjStart - m_layerMatrix[layerj].size2());
noalias(columns(m_backpropMatrix[layeri],columni,columni+neuronsj))
= trans(columns(m_layerMatrix[layerj],weightStartj,weightStartj+neuronsi));
}
columni += neuronsj; //to handle the Full connection type, each layer below is processed separately
layerjStart += neuronsj;
}
layeriStart += neuronsi;
}
}
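//worked example (hypothetical 2-3-1 network, Normal connectivity): m_layerMatrix
//holds matrices of shape 3x2 and 1x3, and the loop above fills m_backpropMatrix
//with their transposes, of shape 2x3 and 3x1; with Full connectivity the second
//layer matrix is 1x5, and only the overlapping column blocks get transposed into
//each backprop matrix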
RealMatrix const& neuronResponses(State const& state)const{
InternalState const& s = state.toState<InternalState>();
return s.responses;
}
boost::shared_ptr<State> createState()const{
return boost::shared_ptr<State>(new InternalState());
}
//compute the activations of the neurons in the given layer; patterns holds that layer's inputs
void evalLayer(std::size_t layer,RealMatrix const& patterns,RealMatrix& outputs)const{
std::size_t numPatterns = patterns.size1();
std::size_t numOutputs = m_layerMatrix[layer].size1();
outputs.resize(numPatterns,numOutputs);
outputs.clear();
noalias(outputs) = prod(patterns,trans(layerMatrix(layer)));
if(!bias().empty()){
noalias(outputs) += repeat(bias(layer),numPatterns);
}
//distinguish the output layer, since its activation function may differ from the hidden layers'
if(layer < m_layerMatrix.size()-1) {
noalias(outputs) = m_hiddenNeuron(outputs);
}
else {
noalias(outputs) = m_outputNeuron(outputs);
}
}
Data<RealVector> evalLayer(std::size_t layer, Data<RealVector> const& patterns)const{
int batches = (int) patterns.numberOfBatches();
Data<RealVector> result(batches);
SHARK_PARALLEL_FOR(int i = 0; i < batches; ++i){
evalLayer(layer,patterns.batch(i),result.batch(i));
}
return result;
}
//compute the network output; a copy of all intermediate activations is kept in responses; patterns is the input batch
void eval(RealMatrix const& patterns,RealMatrix& output, State& state)const{
InternalState& s = state.toState<InternalState>();
std::size_t numPatterns = patterns.size1();
s.resize(numberOfNeurons(),numPatterns);
s.responses.clear();
noalias(rows(s.responses,0,m_inputNeurons)) = trans(patterns);
std::size_t beginNeuron = m_inputNeurons;
for(std::size_t layer = 0; layer != m_layerMatrix.size();++layer){
const RealMatrix& weights = m_layerMatrix[layer];
//a column of s.responses stores the activations of all neurons for one pattern; it is filled layer by layer, with beginNeuron marking where the current layer starts and endNeuron where it ends
//the weight matrix of a layer has one row per neuron of this layer and one column per neuron it receives input from, i.e. the layer(s) below it
std::size_t endNeuron = beginNeuron + weights.size1();
//the input of this layer, i.e. the outputs of the layer(s) below
RealSubMatrix const input = rows(s.responses,beginNeuron - weights.size2(),beginNeuron);
//the activations of this layer
RealSubMatrix responses = rows(s.responses,beginNeuron,endNeuron);
noalias(responses) = prod(weights,input);
if(!bias().empty()){
ConstRealVectorRange bias = subrange(m_bias,beginNeuron-inputSize(),endNeuron-inputSize());
noalias(responses) += trans(repeat(bias,numPatterns));
}
SHARK_CRITICAL_REGION{
//beware Dropout Neurons!
//distinguish the output layer again, since its neuron type may differ
if(layer < m_layerMatrix.size()-1) {
noalias(responses) = m_hiddenNeuron(responses);
}
else {
//add shortcuts if necessary
if(m_inputOutputShortcut.size1() != 0){
noalias(responses) += prod(m_inputOutputShortcut,trans(patterns));
}
noalias(responses) = m_outputNeuron(responses);
}
}
//go to the next layer
beginNeuron = endNeuron;
}
//Sanity check
SIZE_CHECK(beginNeuron == m_numberOfNeurons);
//copy output layer into output
output.resize(numPatterns,m_outputNeurons);
noalias(output) = trans(rows(s.responses,m_numberOfNeurons-outputSize(),m_numberOfNeurons)); //the result is read directly from the stored responses
}
using AbstractModel<RealVector,RealVector>::eval;
//backpropagate the gradient: patterns is the network input, coefficients the gradient of the objective with respect to the outputs, and gradient receives the derivative with respect to the parameters; derivatives with respect to the input-layer values are not needed here
void weightedParameterDerivative(
BatchInputType const& patterns, RealMatrix const& coefficients, State const& state, RealVector& gradient
)const{
SIZE_CHECK(coefficients.size2() == m_outputNeurons);
SIZE_CHECK(coefficients.size1() == patterns.size1());
std::size_t numPatterns=patterns.size1();
RealMatrix delta(numberOfNeurons(),numPatterns,0.0);
RealSubMatrix outputDelta = rows(delta,delta.size1()-outputSize(),delta.size1());
noalias(outputDelta) = trans(coefficients);
computeDelta(delta,state,false);
computeParameterDerivative(delta,state,gradient);
}
//here the deltas of all neurons, including the input layer, are needed; inputDerivative receives the derivative with respect to the inputs
void weightedInputDerivative(
BatchInputType const& patterns, RealMatrix const& coefficients, State const& state, BatchInputType& inputDerivative
)const{
SIZE_CHECK(coefficients.size2() == m_outputNeurons);
SIZE_CHECK(coefficients.size1() == patterns.size1());
std::size_t numPatterns=patterns.size1();
RealMatrix delta(numberOfNeurons(),numPatterns,0.0);
RealSubMatrix outputDelta = rows(delta,delta.size1()-outputSize(),delta.size1());
noalias(outputDelta) = trans(coefficients);
computeDelta(delta,state,true);
inputDerivative.resize(numPatterns,inputSize());
noalias(inputDerivative) = trans(rows(delta,0,inputSize()));
}
virtual void weightedDerivatives(
BatchInputType const & patterns,
BatchOutputType const & coefficients,
State const& state,
RealVector& parameterDerivative,
BatchInputType& inputDerivative
)const{
SIZE_CHECK(coefficients.size2() == m_outputNeurons);
SIZE_CHECK(coefficients.size1() == patterns.size1());
std::size_t numPatterns = patterns.size1();
RealMatrix delta(numberOfNeurons(),numPatterns,0.0);
RealSubMatrix outputDelta = rows(delta,delta.size1()-outputSize(),delta.size1());
noalias(outputDelta) = trans(coefficients);
computeDelta(delta,state,true);
inputDerivative.resize(numPatterns,inputSize());
noalias(inputDerivative) = trans(rows(delta,0,inputSize()));
computeParameterDerivative(delta,state,parameterDerivative);
}
void weightedParameterDerivativeFullDelta(
RealMatrix const& patterns, RealMatrix& delta, State const& state, RealVector& gradient
)const{
InternalState const& s = state.toState<InternalState>();
SIZE_CHECK(delta.size1() == m_numberOfNeurons);
SIZE_CHECK(delta.size2() == patterns.size1());
SIZE_CHECK(s.responses.size2() == patterns.size1());
computeDelta(delta,state,false);
//now compute the parameter derivative from the delta values
computeParameterDerivative(delta,state,gradient);
}
//set up the network structure; layers holds the number of neurons of each layer, and biasNeuron selects whether every hidden and output neuron gets a bias
void setStructure(
std::vector<std::size_t> const& layers,
FFNetStructures::ConnectionType connectivity = FFNetStructures::Normal,
bool biasNeuron = true
){
SIZE_CHECK(layers.size() >= 2);
//there is one weight matrix fewer than there are layers
m_layerMatrix.resize(layers.size()-1);
m_backpropMatrix.resize(layers.size()-1);
//an InputOutputShortcut network with only three layers is structurally equivalent to Full
if(connectivity == FFNetStructures::InputOutputShortcut && layers.size() ==3)
connectivity = FFNetStructures::Full;
m_inputNeurons = layers.front();
m_outputNeurons = layers.back();
m_numberOfNeurons = 0;
for(std::size_t i = 0; i != layers.size(); ++i){
m_numberOfNeurons += layers[i];
}
if(biasNeuron){
m_bias.resize(m_numberOfNeurons - m_inputNeurons);
}
if(connectivity == FFNetStructures::Full){
//with Full connectivity a layer is connected to all layers below it, so the neuron counts are accumulated as we go
std::size_t numNeurons = layers[0];
for(std::size_t i = 0; i != m_layerMatrix.size(); ++i){
m_layerMatrix[i].resize(layers[i+1],numNeurons);
m_backpropMatrix[i].resize(layers[i],m_numberOfNeurons-numNeurons);
numNeurons += layers[i+1];
}
m_inputOutputShortcut.resize(0,0);
}else{
//only connect with the previous layer
for(std::size_t i = 0; i != m_layerMatrix.size(); ++i){
m_layerMatrix[i].resize(layers[i+1],layers[i]);
m_backpropMatrix[i].resize(layers[i],layers[i+1]);
}
if(connectivity == FFNetStructures::InputOutputShortcut){
m_inputOutputShortcut.resize(m_outputNeurons,m_inputNeurons);
}
}
}
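//shape check (hypothetical layers {2,3,1}): Normal connectivity gives m_layerMatrix
//shapes {3x2, 1x3}; Full gives {3x2, 1x5}, since the output layer also sees the two
//inputs; InputOutputShortcut (for networks with more than one hidden layer) instead
//stores the extra output-to-input weights in the separate 1x2 m_inputOutputShortcut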
void setStructure(
std::size_t in,
std::size_t hidden,
std::size_t out,
FFNetStructures::ConnectionType connectivity = FFNetStructures::Normal,
bool bias = true
){
std::vector<std::size_t> layer(3);
layer[0] = in;
layer[1] = hidden;
layer[2] = out;
setStructure(layer, connectivity, bias);
}
void setStructure(
std::size_t in,
std::size_t hidden1,
std::size_t hidden2,
std::size_t out,
FFNetStructures::ConnectionType connectivity = FFNetStructures::Normal,
bool bias = true
){
std::vector<std::size_t> layer(4);
layer[0] = in;
layer[1] = hidden1;
layer[2] = hidden2;
layer[3] = out;
setStructure(layer, connectivity, bias);
}
void read( InArchive & archive ){
archive>>m_inputNeurons;
archive>>m_outputNeurons;
archive>>m_numberOfNeurons;
archive>>m_layerMatrix;
archive>>m_backpropMatrix;
archive>>m_inputOutputShortcut;
archive>>m_bias;
}
void write( OutArchive & archive ) const{
archive<<m_inputNeurons;
archive<<m_outputNeurons;
archive<<m_numberOfNeurons;
archive<<m_layerMatrix;
archive<<m_backpropMatrix;
archive<<m_inputOutputShortcut;
archive<<m_bias;
}
private:
//compute the delta values of every layer; the bool selects whether the input layer's deltas are needed as well
void computeDelta(
RealMatrix& delta, State const& state, bool computeInputDelta
)const{
SIZE_CHECK(delta.size1() == numberOfNeurons());
InternalState const& s = state.toState<InternalState>();
//initialize output neurons using coefficients
RealSubMatrix outputDelta = rows(delta,delta.size1()-outputSize(),delta.size1());
ConstRealSubMatrix outputResponse = rows(s.responses,delta.size1()-outputSize(),delta.size1());
noalias(outputDelta) *= m_outputNeuron.derivative(outputResponse); //multiply by the derivative of the output activations
std::size_t endNeuron = delta.size1()-outputSize();
std::size_t layer = m_backpropMatrix.size()-1; //note: initialized to the number of weight layers minus 1, i.e. the topmost layer
std::size_t endIndex = computeInputDelta? 0: inputSize(); //whether the input deltas are needed as well
while(endNeuron > endIndex){
RealMatrix const& weights = m_backpropMatrix[layer];
std::size_t beginNeuron = endNeuron - weights.size1();//first neuron of the current layer
//get the delta and response values of this layer
RealSubMatrix layerDelta = rows(delta,beginNeuron,endNeuron);
RealSubMatrix layerDeltaInput = rows(delta,endNeuron,endNeuron+weights.size2()); //deltas flowing in from the layers above
ConstRealSubMatrix layerResponse = rows(s.responses,beginNeuron,endNeuron);
//each layer's delta is the delta coming from the layers above multiplied by the connecting weights, times the derivative of this layer's activation
//(more precisely: sum the next layers' deltas weighted by their connection weights, then multiply by this layer's activation derivative; that is exactly what the backprop matrices compute here)
noalias(layerDelta) += prod(weights,layerDeltaInput);//add the values to the maybe non-empty delta part
if(layer != 0){
noalias(layerDelta) *= m_hiddenNeuron.derivative(layerResponse);
}
endNeuron=beginNeuron;
--layer;
}
//add the shortcut deltas if necessary
if(inputOutputShortcut().size1() != 0)
noalias(rows(delta,0,inputSize())) += prod(trans(inputOutputShortcut()),outputDelta);
}
//compute the gradient of the error: the bias gradients need only the delta values, while the weight gradients combine the deltas with the stored activations; the layer offsets must line up between activations, deltas and gradients
void computeParameterDerivative(RealMatrix const& delta, State const& state, RealVector& gradient)const{
SIZE_CHECK(delta.size1() == numberOfNeurons());
InternalState const& s = state.toState<InternalState>();
gradient.resize(numberOfParameters());
std::size_t pos = 0;
std::size_t layerStart = inputSize();
for(std::size_t layer = 0; layer != layerMatrices().size(); ++layer){
std::size_t layerRows = layerMatrices()[layer].size1();
std::size_t layerColumns = layerMatrices()[layer].size2();
std::size_t params = layerRows*layerColumns;
//the weight gradient of a layer is its delta times the transposed outputs of the layer(s) feeding into it
axpy_prod(
rows(delta,layerStart,layerStart+layerRows),
trans(rows(s.responses,layerStart-layerColumns,layerStart)),
to_matrix(subrange(gradient,pos,pos+params),layerRows,layerColumns)
);
pos += params;
layerStart += layerRows;
}
//check whether we need the bias derivative
//note that the bias gradients are stored after the weight gradients; they are simply the row sums of the deltas
if(!bias().empty()){
for (std::size_t neuron = m_inputNeurons; neuron < m_numberOfNeurons; neuron++){
gradient(pos) = sum(row(delta,neuron));
pos++;
}
}
//compute shortcut derivative
if(inputOutputShortcut().size1() != 0){
std::size_t params = inputSize()*outputSize();
axpy_prod(
rows(delta,delta.size1()-outputSize(),delta.size1()),
trans(rows(s.responses,0,inputSize())),
to_matrix(subrange(gradient,pos,pos+params),outputSize(),inputSize())
);
}
}
};
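The recursion that computeDelta implements is the standard backpropagation rule. Writing $z^{(l)}$ for the pre-activations of layer $l$, $a^{(l)}$ for its activations and $\odot$ for element-wise multiplication:

$$\delta^{(l)} = \bigl(W^{(l+1)}\bigr)^{T}\delta^{(l+1)} \odot f'\bigl(z^{(l)}\bigr),\qquad \frac{\partial E}{\partial W^{(l)}} = \delta^{(l)}\bigl(a^{(l-1)}\bigr)^{T},\qquad \frac{\partial E}{\partial b^{(l)}} = \sum_{\text{patterns}}\delta^{(l)},$$

which is what computeDelta (with m_backpropMatrix standing in for the transposed weight blocks) and computeParameterDerivative evaluate batch-wise.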
Once the network is defined, a learning algorithm is needed to adjust its parameters. The classic choice is the BP algorithm, but the method Shark uses here is not plain gradient descent: it uses only the sign of the gradient, not its magnitude, to adjust the parameters. What this means will become clear in the concrete algorithm.
This unavoidably involves the objective function: it is the backpropagated gradient of the objective that drives the parameter updates, so optimizing the network parameters ultimately comes down to optimizing the objective function. This also came up in an earlier post.
The parameter-adjustment methods are declared in <shark/Algorithms/GradientDescent/Rprop.h> and implemented in the corresponding Rprop.cpp.
That file contains several such methods; here I only describe one of them, RpropMinus, since the others are very similar in spirit.
The update rule for the step size delta is:

$$\Delta_i^{(t)} = \begin{cases} \min\bigl(\eta^{+}\,\Delta_i^{(t-1)},\,\Delta_{\max}\bigr), & \dfrac{\partial E^{(t)}}{\partial w_i}\cdot\dfrac{\partial E^{(t-1)}}{\partial w_i} > 0\\[1ex] \max\bigl(\eta^{-}\,\Delta_i^{(t-1)},\,\Delta_{\min}\bigr), & \dfrac{\partial E^{(t)}}{\partial w_i}\cdot\dfrac{\partial E^{(t-1)}}{\partial w_i} < 0\\[1ex] \Delta_i^{(t-1)}, & \text{otherwise} \end{cases}$$

To keep the updates stable, the step size is clamped to $[\Delta_{\min}, \Delta_{\max}]$. The factors $\eta^{+} > 1$ and $0 < \eta^{-} < 1$ control how quickly it grows and shrinks; as the formula shows, the update depends only on the signs of the gradients from two consecutive steps.

The parameter update rule is:

$$w_i^{(t+1)} = w_i^{(t)} - \operatorname{sign}\!\left(\frac{\partial E^{(t)}}{\partial w_i}\right)\Delta_i^{(t)}$$
class RpropMinus : public AbstractSingleObjectiveOptimizer<RealVector>
{
public:
SHARK_EXPORT_SYMBOL RpropMinus(){
m_features |= REQUIRES_FIRST_DERIVATIVE;
m_features |= CAN_SOLVE_CONSTRAINED;
m_increaseFactor = 1.2;
m_decreaseFactor = 0.5;
m_maxDelta = 1e100;
m_minDelta = 0.0;
}
std::string name() const
{ return "RpropMinus"; }
SHARK_EXPORT_SYMBOL void init(ObjectiveFunctionType& objectiveFunction, SearchPointType const& startingPoint){
init(objectiveFunction,startingPoint,0.01);
}
//initDelta is the initial step size
SHARK_EXPORT_SYMBOL virtual void init(ObjectiveFunctionType& objectiveFunction, SearchPointType const& startingPoint, double initDelta){
checkFeatures(objectiveFunction);
objectiveFunction.init();
m_parameterSize = startingPoint.size();
m_delta.resize(m_parameterSize);
m_oldDerivative.resize(m_parameterSize);
std::fill(m_delta.begin(),m_delta.end(),initDelta);
m_oldDerivative.clear();
m_best.point = startingPoint;
//evaluate initial point
//compute the initial gradient; the returned value is the overall error of the network
//call order: evalDerivative on the objective function, which in turn triggers the layer-wise gradient computation of the model
m_best.value = objectiveFunction.evalDerivative(m_best.point,m_derivative);
}
using AbstractSingleObjectiveOptimizer<RealVector>::init;
SHARK_EXPORT_SYMBOL void step(ObjectiveFunctionType const& objectiveFunction){
for (size_t i = 0; i < m_parameterSize; i++)
{
double p = m_best.point(i);
if (m_derivative(i) * m_oldDerivative(i) > 0)
{
m_delta(i) = std::min(m_maxDelta, m_increaseFactor * m_delta(i));
}
else if (m_derivative(i) * m_oldDerivative(i) < 0)
{
m_delta(i) = std::max(m_minDelta, m_decreaseFactor * m_delta(i));
}
m_best.point(i) -= m_delta(i) * boost::math::sign(m_derivative(i));
//if the new point is not feasible, revert this parameter, shrink its step size and reset its old derivative
if (! objectiveFunction.isFeasible(m_best.point))
{
m_best.point(i) = p;
m_delta(i) *= m_decreaseFactor;
m_oldDerivative(i) = 0.0;
}
else
{
m_oldDerivative(i) = m_derivative(i);
}
}
//evaluate the new point
m_best.value = objectiveFunction.evalDerivative(m_best.point,m_derivative);
}
SHARK_EXPORT_SYMBOL virtual void read( InArchive & archive );
SHARK_EXPORT_SYMBOL virtual void write( OutArchive & archive ) const;
void setEtaMinus(double etaMinus) {
RANGE_CHECK( etaMinus < 1 );
RANGE_CHECK( etaMinus > 0 );
m_decreaseFactor = etaMinus;
}
void setEtaPlus(double etaPlus) {
RANGE_CHECK( etaPlus > 1 );
m_increaseFactor = etaPlus;
}
void setMaxDelta(double d) {
RANGE_CHECK( d > 0 );
m_maxDelta = d;
}
void setMinDelta(double d) {
RANGE_CHECK( d >= 0 );
m_minDelta = d;
}
double maxDelta() const {
return *std::max_element(m_delta.begin(),m_delta.end());
}
protected:
ObjectiveFunctionType::FirstOrderDerivative m_derivative;//the current gradient
double m_increaseFactor;//$\eta^+$, default 1.2
double m_decreaseFactor;//$\eta^-$, default 0.5
double m_maxDelta;
double m_minDelta;
size_t m_parameterSize;//number of parameters
RealVector m_oldDerivative;//the gradient of the previous step
RealVector m_delta;//the current step sizes
};
Having covered all of this, here is finally a concrete example that ties everything together.
#include <shark/Data/Dataset.h>
#include <shark/Models/FFNet.h>
#include <shark/Algorithms/GradientDescent/Rprop.h>
#include <shark/ObjectiveFunctions/ErrorFunction.h>
#include <shark/ObjectiveFunctions/Loss/CrossEntropy.h>
#include <shark/ObjectiveFunctions/Loss/ZeroOneLoss.h>
#include <shark/Data/DataDistribution.h>
#include <shark/Rng/GlobalRng.h>
using namespace shark;
using namespace std;
// data generating distribution for our toy
// multi-category classification problem
class Problem : public LabeledDataDistribution<RealVector, unsigned int>
{
private:
double m_noise;
public:
Problem(double noise):m_noise(noise){}
void draw(RealVector& input, unsigned int& label)const
{
label = Rng::discrete(0, 4);
input.resize(2);
input(0) = m_noise * Rng::gauss() + 3.0 * std::cos((double)label);
input(1) = m_noise * Rng::gauss() + 3.0 * std::sin((double)label);
}
};
int main(){
//get problem data
Problem problem(1.0);
LabeledData<RealVector, unsigned int> training = problem.generateDataset(1000);
LabeledData<RealVector, unsigned int> test = problem.generateDataset(100);
std::size_t inputs=inputDimension(training);
std::size_t outputs = numberOfClasses(training);
std::size_t hiddens = 10;
unsigned numberOfSteps = 1000;
//create network and initialize weights random uniform
FFNet<LogisticNeuron, LinearNeuron> network;
network.setStructure(inputs,hiddens,outputs);
initRandomUniform(network,-0.1,0.1);
//create error function
CrossEntropy loss;
ErrorFunction error(training,&network,&loss);
// loss for evaluation
// The zeroOneLoss for multiclass problems assigns the class to the highest output
ZeroOneLoss<unsigned int, RealVector> loss01;
// evaluate initial network
Data<RealVector> prediction = network(training.inputs());
cout << "classification error before learning:\t" << loss01.eval(training.labels(), prediction) << endl;
//initialize Rprop
IRpropPlus optimizer;
optimizer.init(error);
for(unsigned step = 0; step != numberOfSteps; ++step)
optimizer.step(error);
// evaluate solution found by training
network.setParameterVector(optimizer.solution().point); // set weights to weights found by learning
prediction = network(training.inputs());
cout << "classification error after learning:\t" << loss01(training.labels(), prediction) << endl;
}
The example first builds the network structure, a three-layer one here. It then defines the objective function, using the cross entropy as the loss, and chooses IRpropPlus as the optimizer, which works much like the RpropMinus method described above (it additionally backtracks a step when the gradient changes sign). Finally, the best parameters found during learning are written back into the model.