主要构造函数头文件:
/**
* Implementation of a standard feed forward network.
*
* @tparam OutputLayerType The output layer type used to evaluate the network.
* @tparam InitializationRuleType Rule used to initialize the weight matrix.
* @tparam CustomLayers Any set of custom layers that could be a part of the
* feed forward network.
*/
template<
typename OutputLayerType = NegativeLogLikelihood<>,
typename InitializationRuleType = RandomInitialization,
typename... CustomLayers
>
class FFN
{
public:
//! Convenience typedef for the internal model construction.
using NetworkType = FFN<OutputLayerType, InitializationRuleType>;
/**
* Create the FFN object.
*
* Optionally, specify which initialize rule and performance function should
* be used.
*
* If you want to pass in a parameter and discard the original parameter
* object, be sure to use std::move to avoid unnecessary copy.
*
* @param outputLayer Output layer used to evaluate the network.
* @param initializeRule Optional instantiated InitializationRule object
* for initializing the network parameter.
*/
FFN(OutputLayerType outputLayer = OutputLayerType(),
InitializationRuleType initializeRule = InitializationRuleType());
实现:
/**
* Construct an FFN from an output layer and an initialization rule.
*
* Both arguments are received by value and moved into the matching members.
* The remaining members visible here start out as 0 / false; in particular
* `reset` and `deterministic` are both false after construction.
*
* @param outputLayer Output layer used to evaluate the network.
* @param initializeRule Rule object used to initialize the network parameter.
*/
template<typename OutputLayerType, typename InitializationRuleType,
typename... CustomLayers>
FFN<OutputLayerType, InitializationRuleType, CustomLayers...>::FFN(
OutputLayerType outputLayer, InitializationRuleType initializeRule) :
outputLayer(std::move(outputLayer)),
initializeRule(std::move(initializeRule)),
width(0),
height(0),
reset(false),
numFunctions(0),
deterministic(false)
{
/* Nothing to do here: all state is set in the initializer list. */
}
构造函数有两个主要的模板参数: OutputLayerType 和 InitializationRuleType,去看一下它们的默认实现
NegativeLogLikelihood 头文件:
/**
 * Implementation of the negative log likelihood layer. The negative log
 * likelihood layer expects that the input contains log-probabilities for each
 * class. The layer also expects a class index, in the range between 1 and the
 * number of classes, as target when calling the Forward function.
 *
 * @tparam InputDataType Type of the input data (arma::colvec, arma::mat,
 *     arma::sp_mat or arma::cube).
 * @tparam OutputDataType Type of the output data (arma::colvec, arma::mat,
 *     arma::sp_mat or arma::cube).
 */
template <
    typename InputDataType = arma::mat,
    typename OutputDataType = arma::mat
>
class NegativeLogLikelihood
{
 public:
  /**
   * Create the NegativeLogLikelihoodLayer object.
   */
  NegativeLogLikelihood();

  /**
   * Computes the negative log likelihood.
   *
   * @param input Input data used for evaluating the specified function.
   * @param target The target vector, that contains the class index in the
   *     range between 1 and the number of classes.
   * @return The accumulated negative log likelihood over all columns of input.
   */
  template<typename InputType, typename TargetType>
  typename InputType::elem_type Forward(const InputType& input,
                                        const TargetType& target);

  /**
   * Ordinary feed backward pass of a neural network. The negative log
   * likelihood layer expects that the input contains log-probabilities for
   * each class. The layer also expects a class index, in the range between 1
   * and the number of classes, as target when calling the Forward function.
   *
   * @param input The propagated input activation.
   * @param target The target vector, that contains the class index in the
   *     range between 1 and the number of classes.
   * @param output The calculated error.
   */
  template<typename InputType, typename TargetType, typename OutputType>
  void Backward(const InputType& input,
                const TargetType& target,
                OutputType& output);

  // The const accessors below return const references: handing out a mutable
  // reference to a member from a const member function does not compile once
  // the accessor is actually instantiated.

  //! Get the input parameter.
  const InputDataType& InputParameter() const { return inputParameter; }
  //! Modify the input parameter.
  InputDataType& InputParameter() { return inputParameter; }

  //! Get the output parameter.
  const OutputDataType& OutputParameter() const { return outputParameter; }
  //! Modify the output parameter.
  OutputDataType& OutputParameter() { return outputParameter; }

  //! Get the delta.
  const OutputDataType& Delta() const { return delta; }
  //! Modify the delta.
  OutputDataType& Delta() { return delta; }

  /**
   * Serialize the layer.
   */
  template<typename Archive>
  void serialize(Archive& /* ar */, const unsigned int /* version */);

 private:
  //! Locally-stored delta object.
  OutputDataType delta;

  //! Locally-stored input parameter object.
  InputDataType inputParameter;

  //! Locally-stored output parameter object.
  OutputDataType outputParameter;
}; // class NegativeLogLikelihood
实现:
//! Default constructor: the members are default-constructed; no extra setup.
template<typename InputDataType, typename OutputDataType>
NegativeLogLikelihood<InputDataType, OutputDataType>::NegativeLogLikelihood()
{
// Nothing to do here.
}
/**
 * Sum the negated input entry selected by each column's target class.
 *
 * Column i of `input` contributes -input(target(i) - 1, i); targets are
 * 1-based class indices and are checked against the number of input rows.
 */
template<typename InputDataType, typename OutputDataType>
template<typename InputType, typename TargetType>
typename InputType::elem_type
NegativeLogLikelihood<InputDataType, OutputDataType>::Forward(
    const InputType& input,
    const TargetType& target)
{
  using ElemType = typename InputType::elem_type;

  ElemType loss = 0;
  for (size_t col = 0; col < input.n_cols; ++col)
  {
    // Convert the 1-based class label into a 0-based row index.
    const size_t classRow = target(col) - 1;
    Log::Assert(classRow < input.n_rows,
        "Target class out of range.");

    loss -= input(classRow, col);
  }

  return loss;
}
/**
 * Build the error matrix for the negative log likelihood.
 *
 * `output` gets the shape of `input`, is filled with zeros, and receives -1 at
 * (target(i) - 1, i) for every column i.
 */
template<typename InputDataType, typename OutputDataType>
template<typename InputType, typename TargetType, typename OutputType>
void NegativeLogLikelihood<InputDataType, OutputDataType>::Backward(
    const InputType& input,
    const TargetType& target,
    OutputType& output)
{
  // Start from an all-zero matrix with the same shape as the input.
  output = arma::zeros<OutputType>(input.n_rows, input.n_cols);

  for (size_t col = 0; col < input.n_cols; ++col)
  {
    // Convert the 1-based class label into a 0-based row index.
    const size_t classRow = target(col) - 1;
    Log::Assert(classRow < input.n_rows,
        "Target class out of range.");

    output(classRow, col) = -1;
  }
}
//! Serialize the layer. Both arguments are unused and the body is empty, so
//! nothing is written to or read from the archive.
template<typename InputDataType, typename OutputDataType>
template<typename Archive>
void NegativeLogLikelihood<InputDataType, OutputDataType>::serialize(
Archive& /* ar */,
const unsigned int /* version */)
{
// Nothing to do here.
}
负对数似然损失中重要的就是那两个 Forward , Backward 方法,我们不妨引入一些记号:
$$
\begin{aligned}
input &: (X_1, \cdots, X_N), \quad X_i \in \mathbb{R}^n \ \ \forall \, i \in [1, N] \\[6pt]
&\Rightarrow \begin{bmatrix} x_{11} & x_{12} & \cdots & x_{1N} \\ \vdots & \vdots & & \vdots \\ x_{n1} & x_{n2} & \cdots & x_{nN} \end{bmatrix} \\[6pt]
target &: (y_1, \cdots, y_N), \quad y_i \in [1, m]
\end{aligned}
$$
因此:
Forward:
$$ output = - \sum_{i=1}^{N} x_{(y_i, i)}, \quad y_i \leqslant n $$
Backward:
$$ (n \times N): \quad output_{(j, i)} = \begin{cases} -1, & j = y_i \ \ (y_i \leqslant n) \\ 0, & \text{otherwise} \end{cases} $$
RandomInitialization :
/**
 * This class is used to initialize randomly the weight matrix.
 *
 * Weights are drawn with randu() and then scaled and shifted by the stored
 * bounds: W * (upperBound - lowerBound) + lowerBound.
 */
class RandomInitialization
{
 public:
  /**
   * Initialize the random initialization rule with the default bounds
   * [-1, 1].
   */
  RandomInitialization() : lowerBound(-1), upperBound(1) { }

  /**
   * Initialize the random initialization rule with the given lower bound and
   * upper bound.
   *
   * Neither parameter has a default value: a defaulted upper bound would make
   * every one-argument construction ambiguous with the single-bound
   * constructor below.
   *
   * @param lowerBound The number used as lower bound.
   * @param upperBound The number used as upper bound.
   */
  RandomInitialization(const double lowerBound, const double upperBound) :
      lowerBound(lowerBound), upperBound(upperBound) { }

  /**
   * Initialize the random initialization rule with the given bound.
   * Using the negative of the bound as lower bound and the positive bound as
   * upper bound.
   *
   * @param bound The number whose absolute value is used as upper bound and
   *     whose negated absolute value is used as lower bound.
   */
  explicit RandomInitialization(const double bound) :
      lowerBound(-std::abs(bound)), upperBound(std::abs(bound)) { }

  /**
   * Initialize randomly the elements of the specified weight matrix.
   *
   * The matrix is resized to rows x cols only when it is empty; a non-empty
   * matrix keeps its current size and rows / cols are ignored.
   *
   * @param W Weight matrix to initialize.
   * @param rows Number of rows.
   * @param cols Number of columns.
   */
  template<typename eT>
  void Initialize(arma::Mat<eT>& W, const size_t rows, const size_t cols)
  {
    if (W.is_empty())
      W.set_size(rows, cols);

    W.randu();
    W *= (upperBound - lowerBound);
    W += lowerBound;
  }

  /**
   * Initialize randomly the elements of the specified weight matrix.
   *
   * An empty matrix is reported through Log::Fatal.
   *
   * @param W Weight matrix to initialize.
   */
  template<typename eT>
  void Initialize(arma::Mat<eT>& W)
  {
    if (W.is_empty())
      Log::Fatal << "Cannot initialize an empty matrix." << std::endl;

    W.randu();
    W *= (upperBound - lowerBound);
    W += lowerBound;
  }

  /**
   * Initialize randomly the elements of the specified weight 3rd order tensor.
   *
   * The cube is resized only when it is empty. The loop runs over the `slices`
   * argument, so a non-empty cube is presumably expected to hold at least
   * that many slices -- verify against callers.
   *
   * @param W Weight cube to initialize.
   * @param rows Number of rows.
   * @param cols Number of columns.
   * @param slices Number of slices.
   */
  template<typename eT>
  void Initialize(arma::Cube<eT>& W,
                  const size_t rows,
                  const size_t cols,
                  const size_t slices)
  {
    if (W.is_empty())
      W.set_size(rows, cols, slices);

    for (size_t i = 0; i < slices; ++i)
      Initialize(W.slice(i), rows, cols);
  }

  /**
   * Initialize randomly the elements of the specified weight 3rd order tensor.
   *
   * An empty cube is reported through Log::Fatal.
   *
   * @param W Weight cube to initialize.
   */
  template<typename eT>
  void Initialize(arma::Cube<eT>& W)
  {
    if (W.is_empty())
      Log::Fatal << "Cannot initialize an empty cube." << std::endl;

    for (size_t i = 0; i < W.n_slices; ++i)
      Initialize(W.slice(i));
  }

 private:
  //! The number used as lower bound.
  double lowerBound;

  //! The number used as upper bound.
  double upperBound;
}; // class RandomInitialization
.randu() 在官方中的说明:
.randu() uses a uniform distribution in the [0,1] interval
因此,该初始化方法先产生服从 $U(0, 1)$ 的初始值,再乘以 $(upperBound - lowerBound)$,最后加上 $lowerBound$,即 $W \sim U(lowerBound,\ upperBound)$
有:
$$ E(W) = \dfrac{upperBound + lowerBound}{2}, \qquad D(W) = \dfrac{(upperBound - lowerBound)^2}{12} $$
Train 头文件:
/**
* Train the feedforward network on the given input data using the given
* optimizer.
*
* This will use the existing model parameters as a starting point for the
* optimization. If this is not what you want, then you should access the
* parameters vector directly with Parameters() and modify it as desired.
*
* Both data matrices are taken by value. If you want to pass in a matrix and
* discard the original object, be sure to use std::move to avoid an
* unnecessary copy.
*
* @tparam OptimizerType Type of optimizer to use to train the model.
* @tparam CallbackTypes Types of Callback Functions.
* @param predictors Input training variables.
* @param responses Outputs results from input training variables.
* @param optimizer Instantiated optimizer used to train the model.
* @param callbacks Callback function for ensmallen optimizer `OptimizerType`.
* See https://www.ensmallen.org/docs.html#callback-documentation.
* @return The final objective of the trained model (NaN or Inf on error).
*/
template<typename OptimizerType, typename... CallbackTypes>
double Train(arma::mat predictors,
arma::mat responses,
OptimizerType& optimizer,
CallbackTypes&&... callbacks);
实现:
/**
 * Train the network with the given optimizer.
 *
 * The data matrices are moved into the model via ResetData(), the optimizer is
 * run with this object as the function to optimize and `parameter` as the
 * iterate, and the optimizer's return value is logged and returned.
 *
 * @param predictors Input training variables (moved into the model).
 * @param responses Outputs results from input training variables (moved into
 *     the model).
 * @param optimizer Instantiated optimizer used to train the model.
 * @param callbacks Callbacks passed on to the optimizer.
 * @return The value returned by optimizer.Optimize().
 */
template<typename OutputLayerType, typename InitializationRuleType,
         typename... CustomLayers>
template<typename OptimizerType, typename... CallbackTypes>
double FFN<OutputLayerType, InitializationRuleType, CustomLayers...>::Train(
    arma::mat predictors,
    arma::mat responses,
    OptimizerType& optimizer,
    CallbackTypes&&... callbacks)
{
  ResetData(std::move(predictors), std::move(responses));

  // The arguments were moved from above, so read the column count from the
  // member, not from the parameter.
  WarnMessageMaxIterations<OptimizerType>(optimizer, this->predictors.n_cols);

  // Train the model.
  Timer::Start("ffn_optimization");
  const double out = optimizer.Optimize(*this, parameter, callbacks...);
  Timer::Stop("ffn_optimization");

  // Name the function that actually produced the message.
  Log::Info << "FFN::Train(): final objective of trained model is " << out
      << "." << std::endl;
  return out;
}
构造完模型后,就是利用给定的数据集和标签来进行训练,从实现来看,这不难理解:
利用 ensmallen 里的优化器,将自身作为待优化的函数传入,将参数 parameter 传入
参照之前介绍的 Adam 优化算法,可以猜到,该模型一定封装有 Evaluate 和 Gradient 函数
果不其然:
Evaluate 头文件:
/**
* Evaluate the feedforward network with the given parameters. This function
* is usually called by the optimizer to train the model.
*
* @param parameters Matrix model parameters.
* @return The objective accumulated over all columns of the stored predictors.
*/
double Evaluate(const arma::mat& parameters);
/**
* Evaluate the feedforward network with the given parameters, but using only
* a number of data points. This is useful for optimizers such as SGD, which
* require a separable objective function.
*
* The points used are columns begin through begin + batchSize - 1
* (inclusive) of the stored predictors and responses.
*
* @param parameters Matrix model parameters. NOTE(review): the implementation
* leaves this argument unnamed and works on the stored parameters instead.
* @param begin Index of the starting point to use for objective function
* evaluation.
* @param batchSize Number of points to be passed at a time to use for
* objective function evaluation.
* @param deterministic Whether or not to train or test the model. Note some
* layer act differently in training or testing mode.
* @return The output layer's objective for the batch plus the loss reported by
* every layer of the network.
*/
double Evaluate(const arma::mat& parameters,
const size_t begin,
const size_t batchSize,
const bool deterministic);
实现:
/**
 * Evaluate the objective on every stored data point.
 *
 * Each column of `predictors` is evaluated on its own (batch size 1) in
 * deterministic mode and the results are summed.
 */
template<typename OutputLayerType, typename InitializationRuleType,
         typename... CustomLayers>
double FFN<OutputLayerType, InitializationRuleType, CustomLayers...>::Evaluate(
    const arma::mat& parameters)
{
  double objective = 0;

  size_t point = 0;
  while (point < predictors.n_cols)
  {
    objective += Evaluate(parameters, point, 1, true);
    ++point;
  }

  return objective;
}
/**
 * Evaluate the objective on columns [begin, begin + batchSize - 1] of the
 * stored data.
 *
 * The parameters are initialized first if they are still empty, and the layers
 * are switched to the requested deterministic mode if it differs from the
 * current one. The result is the output layer's value plus the loss reported
 * by every layer. The `parameters` argument itself is not used.
 */
template<typename OutputLayerType, typename InitializationRuleType,
         typename... CustomLayers>
double FFN<OutputLayerType, InitializationRuleType, CustomLayers...>::Evaluate(
    const arma::mat& /* parameters */,
    const size_t begin,
    const size_t batchSize,
    const bool deterministic)
{
  if (parameter.is_empty())
    ResetParameters();

  if (this->deterministic != deterministic)
  {
    this->deterministic = deterministic;
    ResetDeterministic();
  }

  // Index of the last column in the batch (inclusive).
  const size_t last = begin + batchSize - 1;

  Forward(predictors.cols(begin, last));

  double objective = outputLayer.Forward(
      boost::apply_visitor(outputParameterVisitor, network.back()),
      responses.cols(begin, last));

  // Add the loss reported by each layer.
  for (auto& layer : network)
    objective += boost::apply_visitor(lossVisitor, layer);

  return objective;
}
先看一下两个 Reset 方法:
ResetDeterministic
/**
 * Apply the model's current deterministic flag to every layer of the network.
 */
template<typename OutputLayerType, typename InitializationRuleType,
         typename... CustomLayers>
void FFN<OutputLayerType, InitializationRuleType,
         CustomLayers...>::ResetDeterministic()
{
  DeterministicSetVisitor visitor(deterministic);

  for (auto& layer : network)
    boost::apply_visitor(visitor, layer);
}
里面用到了两个库函数:标准库的 std::for_each 和 Boost 的 boost::apply_visitor。首先是 std::for_each,其函数原型:
UnaryProc for_each ( InputIterator beg, InputIterator end, UnaryProc op)
可以猜想,boost::apply_visitor 一定是个函数对象了:
boost::apply_visitor — Allows compile-time checked type-safe application of the given visitor to the content of the given variant, ensuring that all types are handled by the visitor.
apply_visitor 有多个重载,在这里应该是使用它作为一元函数对象,即,将 deterministicSetVisitor 依次作用到 network 的每一个元素上
继续去看看
DeterministicSetVisitor 头文件:
/**
* DeterministicSetVisitor sets the deterministic parameter of the visited
* layer, and of the layers it holds in Model(), to the value given at
* construction.
*
* The private LayerDeterministic() overloads are selected at compile time from
* two traits of the layer type: whether it has a Deterministic() member and
* whether it has a Model() member.
*/
class DeterministicSetVisitor : public boost::static_visitor<void>
{
public:
//! Store the deterministic value that will be written to visited layers.
DeterministicSetVisitor(const bool deterministic = true);
//! Set the deterministic parameter of the given layer by forwarding to the
//! matching LayerDeterministic() overload.
template<typename LayerType>
void operator()(LayerType* layer) const;
//! Forward this visitor to layer.apply_visitor().
void operator()(MoreTypes layer) const;
private:
//! The deterministic value to write.
const bool deterministic;
//! Set the deterministic parameter if the module implements the
//! Deterministic() and Model() function.
template<typename T>
typename std::enable_if<
HasDeterministicCheck<T, bool&(T::*)(void)>::value &&
HasModelCheck<T>::value, void>::type
LayerDeterministic(T* layer) const;
//! Set the deterministic parameter of the inner modules if the module only
//! implements the Model() function.
template<typename T>
typename std::enable_if<
!HasDeterministicCheck<T, bool&(T::*)(void)>::value &&
HasModelCheck<T>::value, void>::type
LayerDeterministic(T* layer) const;
//! Set the deterministic parameter if the module only implements the
//! Deterministic() function.
template<typename T>
typename std::enable_if<
HasDeterministicCheck<T, bool&(T::*)(void)>::value &&
!HasModelCheck<T>::value, void>::type
LayerDeterministic(T* layer) const;
//! Do not set the deterministic parameter if the module doesn't implement the
//! Deterministic() or Model() function.
template<typename T>
typename std::enable_if<
!HasDeterministicCheck<T, bool&(T::*)(void)>::value &&
!HasModelCheck<T>::value, void>::type
LayerDeterministic(T* layer) const;
};
实现:
//! DeterministicSetVisitor visitor class.
//! Constructor: remember the deterministic value to write to visited layers.
inline DeterministicSetVisitor::DeterministicSetVisitor(
const bool deterministic) : deterministic(deterministic)
{
/* Nothing to do here. */
}
//! Visit a layer pointer: forward to the LayerDeterministic() overload that
//! is enabled for LayerType.
template<typename LayerType>
inline void DeterministicSetVisitor::operator()(LayerType* layer) const
{
LayerDeterministic(layer);
}
//! Visit a MoreTypes value: re-apply this same visitor through its
//! apply_visitor() member.
inline void DeterministicSetVisitor::operator()(MoreTypes layer) const
{
layer.apply_visitor(*this);
}
//! Layer has both Deterministic() and Model(): set the layer's own flag, then
//! visit every entry of its Model().
template<typename T>
inline typename std::enable_if<
    HasDeterministicCheck<T, bool&(T::*)(void)>::value &&
    HasModelCheck<T>::value, void>::type
DeterministicSetVisitor::LayerDeterministic(T* layer) const
{
  layer->Deterministic() = deterministic;

  // This visitor already carries the value to write, so reuse it for the
  // inner modules.
  for (size_t child = 0; child < layer->Model().size(); ++child)
    boost::apply_visitor(*this, layer->Model()[child]);
}
//! Layer has Model() but no Deterministic(): only visit every entry of its
//! Model().
template<typename T>
inline typename std::enable_if<
    !HasDeterministicCheck<T, bool&(T::*)(void)>::value &&
    HasModelCheck<T>::value, void>::type
DeterministicSetVisitor::LayerDeterministic(T* layer) const
{
  // This visitor already carries the value to write, so reuse it for the
  // inner modules.
  for (size_t child = 0; child < layer->Model().size(); ++child)
    boost::apply_visitor(*this, layer->Model()[child]);
}
//! Layer has Deterministic() but no Model(): just write the stored value
//! through the reference returned by Deterministic().
template<typename T>
inline typename std::enable_if<
HasDeterministicCheck<T, bool&(T::*)(void)>::value &&
!HasModelCheck<T>::value, void>::type
DeterministicSetVisitor::LayerDeterministic(T* layer) const
{
layer->Deterministic() = deterministic;
}
//! Layer has neither Deterministic() nor Model(): there is nothing to set.
template<typename T>
inline typename std::enable_if<
!HasDeterministicCheck<T, bool&(T::*)(void)>::value &&
!HasModelCheck<T>::value, void>::type
DeterministicSetVisitor::LayerDeterministic(T* /* input */) const
{
/* Nothing to do here. */
}
总体上,就是继续将函数 DeterministicSetVisitor 作用于 layer->Model 每一个元素上
其具体行为依据 Layer 类型的不同而不同
ResetParameters
/**
 * Re-apply the deterministic flag to all layers, then initialize the network
 * parameter with the model's initialization rule.
 */
template<typename OutputLayerType, typename InitializationRuleType,
         typename... CustomLayers>
void FFN<OutputLayerType, InitializationRuleType,
         CustomLayers...>::ResetParameters()
{
  ResetDeterministic();

  // Hand the stored rule to a NetworkInitialization helper, which fills
  // `parameter` for the whole `network`.
  using Initializer = NetworkInitialization<InitializationRuleType,
                                            CustomLayers...>;
  Initializer initializer(initializeRule);
  initializer.Initialize(network, parameter);
}
先前介绍的 RandomInitialization 在这里派上了用场,不过,还得先去看一下 NetworkInitialization
network_init
/**
* This class is used to initialize the network with the given initialization
* rule.
*/
template<typename InitializationRuleType, typename... CustomLayers>
class NetworkInitialization
{
public:
/**
* Use the given initialization rule to initialize the specified network.
*
* @param initializeRule Rule to initialize the given network.
*/
NetworkInitialization(
const InitializationRuleType& initializeRule = InitializationRuleType()) :
initializeRule(initializeRule)
{
// Nothing to do here.
}
/**
* Initialize the specified network and store the results in the given
* parameter.
*
* @param network Network that should be initialized.
* @param parameter The network parameter.
* @param parameterOffset Offset for network paramater, default 0.
*/
template <typename eT>
void Initialize(const std::vector<LayerTypes<CustomLayers...> >& network,
arma::Mat<eT>& parameter, size_t parameterOffset = 0)
{
// Determine the number of parameter/weights of the given network.
if (parameter.is_empty())
{
size_t weights = 0;
for (size_t i = 0; i < network.size(); ++i)
weights += boost::apply_visitor(weightSizeVisitor, network[i]);
parameter.set_size(weights, 1);
}
// Initialize the network layer by layer or the complete network.
if (ann::InitTraits<InitializationRuleType>::UseLayer)
{
for (size_t i = 0, offset = parameterOffset; i < network.size(); ++i)
{
// Initialize the layer with the specified parameter/weight
// initialization rule.
const size_t weight = boost::apply_visitor(weightSizeVisitor,
network[i]);
arma::Mat<eT> tmp = arma::mat(parameter.memptr() + offset,
weight, 1, false, false);
initializeRule.Initialize(tmp, tmp.n_elem, 1);
// Increase the parameter/weight offset for the next layer.
offset += weight;
}
}
else
{
initializeRule.Initialize(parameter, parameter.n_elem, 1);
}
// Note: We can't merge the for loop into the for loop above because
// WeightSetVisitor also sets the parameter/weights of the inner modules.
// Inner Modules are held by the parent module e.g. the concat module can
// hold various other modules.
for (size_t i = 0, offset = parameterOffset; i < network.size(); ++i)
{
offset += boost::apply_visitor(WeightSetVisitor(parameter, offset),
network[i]);
boost::apply_visitor(resetVisitor, network[i]);
}
}
private:
//! Instantiated InitializationRule object for initializing the network
//! parameter.
InitializationRuleType initializeRule;
//! Locally-stored reset visitor.
ResetVisitor resetVisitor;
//! Locally-stored weight size visitor.
WeightSizeVisitor weightSizeVisitor;
}; // class NetworkInitialization
首先是对 network 每一个元素调用 weightSizeVisitor 取得 parameter 的形状
WeightSizeVisitor 头文件:
/**
* WeightSizeVisitor returns the number of weights of the given module.
*
* The private LayerSize() overloads are selected at compile time from two
* traits of the layer type: whether it has a Parameters() member and whether
* it has a Model() member.
*/
class WeightSizeVisitor : public boost::static_visitor<size_t>
{
public:
//! Return the number of weights of the given layer by forwarding to the
//! matching LayerSize() overload.
template<typename LayerType>
size_t operator()(LayerType* layer) const;
//! Forward this visitor to layer.apply_visitor() and return its result.
size_t operator()(MoreTypes layer) const;
private:
//! If the module doesn't implement the Parameters() or Model() function
//! return 0.
template<typename T, typename P>
typename std::enable_if<
!HasParametersCheck<T, P&(T::*)()>::value &&
!HasModelCheck<T>::value, size_t>::type
LayerSize(T* layer, P& output) const;
//! Return the accumulated number of weights of the inner modules if the
//! module only implements the Model() function.
template<typename T, typename P>
typename std::enable_if<
!HasParametersCheck<T, P&(T::*)()>::value &&
HasModelCheck<T>::value, size_t>::type
LayerSize(T* layer, P& output) const;
//! Return the number of parameters if the module only implements the
//! Parameters() function.
template<typename T, typename P>
typename std::enable_if<
HasParametersCheck<T, P&(T::*)()>::value &&
!HasModelCheck<T>::value, size_t>::type
LayerSize(T* layer, P& output) const;
//! Return the accumulated number of parameters if the module implements the
//! Parameters() and Model() function.
template<typename T, typename P>
typename std::enable_if<
HasParametersCheck<T, P&(T::*)()>::value &&
HasModelCheck<T>::value, size_t>::type
LayerSize(T* layer, P& output) const;
};
实现:
//! WeightSizeVisitor visitor class.
//! Visit a layer pointer. The layer's OutputParameter() is passed along only
//! so that its type P can take part in selecting the LayerSize() overload.
template<typename LayerType>
inline size_t WeightSizeVisitor::operator()(LayerType* layer) const
{
return LayerSize(layer, layer->OutputParameter());
}
//! Visit a MoreTypes value: re-apply this same visitor through its
//! apply_visitor() member and return the result.
inline size_t WeightSizeVisitor::operator()(MoreTypes layer) const
{
return layer.apply_visitor(*this);
}
//! Layer has neither Parameters() nor Model(): it contributes no weights.
template<typename T, typename P>
inline typename std::enable_if<
!HasParametersCheck<T, P&(T::*)()>::value &&
!HasModelCheck<T>::value, size_t>::type
WeightSizeVisitor::LayerSize(T* /* layer */, P& /* output */) const
{
return 0;
}
//! Layer has Model() but no Parameters(): its size is the sum of the sizes of
//! the entries of its Model().
template<typename T, typename P>
inline typename std::enable_if<
    !HasParametersCheck<T, P&(T::*)()>::value &&
    HasModelCheck<T>::value, size_t>::type
WeightSizeVisitor::LayerSize(T* layer, P& /* output */) const
{
  size_t total = 0;

  // The visitor holds no state, so the current instance can be reused for the
  // inner modules.
  for (size_t child = 0; child < layer->Model().size(); ++child)
    total += boost::apply_visitor(*this, layer->Model()[child]);

  return total;
}
//! Layer has Parameters() but no Model(): its size is the element count of
//! Parameters().
template<typename T, typename P>
inline typename std::enable_if<
HasParametersCheck<T, P&(T::*)()>::value &&
!HasModelCheck<T>::value, size_t>::type
WeightSizeVisitor::LayerSize(T* layer, P& /* output */) const
{
return layer->Parameters().n_elem;
}
//! Layer has both Parameters() and Model(): its size is the element count of
//! Parameters() plus the sizes of the entries of its Model().
template<typename T, typename P>
inline typename std::enable_if<
    HasParametersCheck<T, P&(T::*)()>::value &&
    HasModelCheck<T>::value, size_t>::type
WeightSizeVisitor::LayerSize(T* layer, P& /* output */) const
{
  size_t total = layer->Parameters().n_elem;

  // The visitor holds no state, so the current instance can be reused for the
  // inner modules.
  for (size_t child = 0; child < layer->Model().size(); ++child)
    total += boost::apply_visitor(*this, layer->Model()[child]);

  return total;
}
大体上,会返回 layer 的 Parameters 的元素个数(有的话)加上对 layer->Model 每一元素继续调用 WeightSizeVisitor 的结果(有的话)
将网络中每一层得到的参数个数相加,得到总数 weights;parameter 就是一个 weights × 1 的列向量
InitTraits
/**
* This is a template class that can provide information about various
* initialization methods. By default, this class will provide the weakest
* possible assumptions on the initialization method, and each initialization
* method should override values as necessary. If an initialization method
* doesn't need to override a value, then there's no need to write an
* InitTraits specialization for that class.
*/
template<typename InitRuleType>
class InitTraits
{
public:
/**
* This is true if the initialization method is used for a single layer.
* The default, for rules without a specialization, is true.
*/
static const bool UseLayer = true;
};
根据 UseLayer 决定是一层一层还是整个网络一起初始化,主要的初始化过程就是先前介绍的模板参数
除此之外,还对网络的每一层调用了一个 resetVisitor
resetVisitor 头文件:
/**
* ResetVisitor executes the Reset() function.
*
* The private ResetParameter() overloads are selected at compile time from
* two traits of the layer type: whether it has a Reset() member and whether it
* has a Model() member.
*/
class ResetVisitor : public boost::static_visitor<void>
{
public:
//! Execute the Reset() function by forwarding to the matching
//! ResetParameter() overload.
template<typename LayerType>
void operator()(LayerType* layer) const;
//! Forward this visitor to layer.apply_visitor().
void operator()(MoreTypes layer) const;
private:
//! Execute the Reset() function for a module which implements the Reset()
//! function.
template<typename T>
typename std::enable_if<
HasResetCheck<T, void(T::*)()>::value &&
!HasModelCheck<T>::value, void>::type
ResetParameter(T* layer) const;
//! Visit the inner modules of a module which implements the Model() function
//! but not the Reset() function.
template<typename T>
typename std::enable_if<
!HasResetCheck<T, void(T::*)()>::value &&
HasModelCheck<T>::value, void>::type
ResetParameter(T* layer) const;
//! Execute the Reset() function for a module which implements the Reset()
//! and Model() function.
template<typename T>
typename std::enable_if<
HasResetCheck<T, void(T::*)()>::value &&
HasModelCheck<T>::value, void>::type
ResetParameter(T* layer) const;
//! Do not execute the Reset() function for a module which doesn't implement
//! the Reset() or Model() function.
template<typename T>
typename std::enable_if<
!HasResetCheck<T, void(T::*)()>::value &&
!HasModelCheck<T>::value, void>::type
ResetParameter(T* layer) const;
};
实现:
//! ResetVisitor visitor class.
//! Visit a layer pointer: forward to the ResetParameter() overload that is
//! enabled for LayerType.
template<typename LayerType>
inline void ResetVisitor::operator()(LayerType* layer) const
{
ResetParameter(layer);
}
//! Visit a MoreTypes value: re-apply this same visitor through its
//! apply_visitor() member.
inline void ResetVisitor::operator()(MoreTypes layer) const
{
layer.apply_visitor(*this);
}
//! Layer has Reset() but no Model(): just call Reset() on it.
template<typename T>
inline typename std::enable_if<
HasResetCheck<T, void(T::*)()>::value &&
!HasModelCheck<T>::value, void>::type
ResetVisitor::ResetParameter(T* layer) const
{
layer->Reset();
}
//! Layer has Model() but no Reset(): visit every entry of its Model().
template<typename T>
inline typename std::enable_if<
    !HasResetCheck<T, void(T::*)()>::value &&
    HasModelCheck<T>::value, void>::type
ResetVisitor::ResetParameter(T* layer) const
{
  // The visitor holds no state, so the current instance can be reused for the
  // inner modules.
  for (size_t child = 0; child < layer->Model().size(); ++child)
    boost::apply_visitor(*this, layer->Model()[child]);
}
//! Layer has both Reset() and Model(): visit every entry of its Model()
//! first, then call Reset() on the layer itself.
template<typename T>
inline typename std::enable_if<
    HasResetCheck<T, void(T::*)()>::value &&
    HasModelCheck<T>::value, void>::type
ResetVisitor::ResetParameter(T* layer) const
{
  // The visitor holds no state, so the current instance can be reused for the
  // inner modules.
  for (size_t child = 0; child < layer->Model().size(); ++child)
    boost::apply_visitor(*this, layer->Model()[child]);

  // The inner modules are handled before the layer that holds them.
  layer->Reset();
}
//! Layer has neither Reset() nor Model(): there is nothing to reset.
template<typename T>
inline typename std::enable_if<
!HasResetCheck<T, void(T::*)()>::value &&
!HasModelCheck<T>::value, void>::type
ResetVisitor::ResetParameter(T* /* layer */) const
{
/* Nothing to do here. */
}
和以往遇到的 Visitor 类差不多,调用 layer->Model 中的 ResetVisitor ,以及调用 layer 的 Reset 函数
如此一来,两个 Reset 方法就看完了,下面进入 Forward 函数:
一般情况下,因为 batchSize 为 1 , 所以对于 predictors 的每一列(每一数据点)调用了 Forward 函数
Forward 头文件:
// Helper functions.
/**
* Run a forward pass of the given input through every layer of the network,
* in order. The first layer receives `input`; every later layer receives the
* output parameter of the layer before it.
*
* @param input Data to feed into the first layer.
*/
template<typename InputType>
void Forward(const InputType& input);
实现:
/**
* Run a forward pass through every layer of the network.
*
* The first layer is fed `input`; each later layer is fed the output parameter
* of the layer before it. While `reset` is false, the output width / height
* reported by each layer (when non-zero) are remembered in `width` / `height`
* and pushed into the next layer as its input width / height. `reset` is set
* to true at the end, so that propagation is skipped on later calls for as
* long as `reset` stays true.
*
* @param input Data to feed into the first layer.
*/
template<typename OutputLayerType, typename InitializationRuleType,
typename... CustomLayers>
template<typename InputType>
void FFN<OutputLayerType, InitializationRuleType,
CustomLayers...>::Forward(const InputType& input)
{
// First layer: its input is the data, its output is its own output parameter.
boost::apply_visitor(ForwardVisitor(input,
boost::apply_visitor(outputParameterVisitor, network.front())),
network.front());
if (!reset)
{
// Remember the first layer's output size; a reported 0 leaves the current
// value untouched.
if (boost::apply_visitor(outputWidthVisitor, network.front()) != 0)
{
width = boost::apply_visitor(outputWidthVisitor, network.front());
}
if (boost::apply_visitor(outputHeightVisitor, network.front()) != 0)
{
height = boost::apply_visitor(outputHeightVisitor, network.front());
}
}
// Remaining layers: layer i consumes the output parameter of layer i - 1.
for (size_t i = 1; i < network.size(); ++i)
{
if (!reset)
{
// Set the input width.
boost::apply_visitor(SetInputWidthVisitor(width), network[i]);
// Set the input height.
boost::apply_visitor(SetInputHeightVisitor(height), network[i]);
}
boost::apply_visitor(ForwardVisitor(boost::apply_visitor(
outputParameterVisitor, network[i - 1]),
boost::apply_visitor(outputParameterVisitor, network[i])), network[i]);
if (!reset)
{
// Get the output width.
if (boost::apply_visitor(outputWidthVisitor, network[i]) != 0)
{
width = boost::apply_visitor(outputWidthVisitor, network[i]);
}
// Get the output height.
if (boost::apply_visitor(outputHeightVisitor, network[i]) != 0)
{
height = boost::apply_visitor(outputHeightVisitor, network[i]);
}
}
}
// Mark the size propagation as done.
if (!reset)
reset = true;
}
Visitor 类我就不展示了,因为它无非是调用某 layer 的对应的函数
比如说第一个语句,就是调用 network 的第一层 layer 的 Forward 函数,并将 input 以及 network 第一层的 outputParameter 函数结果作为参数
接着,如果是第一次调用该函数,reset 应该为 false (初始化的结果),就取得网络第一层输出的宽度及高度(不为零的话),接着设置第二层输入的形状与第一层输出进行对接,再次调用 Visitor 类的函数,即:调用这层的 Forward 函数,并将上一层的 outputParameter,以及这层的 outputParameter 作为参数,然后在整个网络中依次进行这个过程
最后 reset 置位
然后又调用了 outputLayer 的 Forward 函数,默认情况下就是之前介绍的负对数似然损失函数
将网络最后一层的 outputParameter 以及相应的 responses 列作为参数
返回的结果再加上每一层网络的 loss 函数结果,得到 Evaluate 的最终结果
Gradient 头文件:
/**
* Evaluate the gradient of the feedforward network with the given parameters,
* and with respect to only a number of points in the dataset. This is useful
* for optimizers such as SGD, which require a separable objective function.
*
* The implementation forwards to EvaluateWithGradient() and discards the
* objective value it returns.
*
* @param parameters Matrix of the model parameters to be optimized.
* @param begin Index of the starting point to use for objective function
* gradient evaluation.
* @param gradient Matrix to output gradient into.
* @param batchSize Number of points to be processed as a batch for objective
* function gradient evaluation.
*/
void Gradient(const arma::mat& parameters,
const size_t begin,
arma::mat& gradient,
const size_t batchSize);
实现:
//! Compute the gradient for a batch by delegating to the batched
//! EvaluateWithGradient(); the objective value it returns is not used.
template<typename OutputLayerType, typename InitializationRuleType,
typename... CustomLayers>
void FFN<OutputLayerType, InitializationRuleType, CustomLayers...>::Gradient(
const arma::mat& parameters,
const size_t begin,
arma::mat& gradient,
const size_t batchSize)
{
this->EvaluateWithGradient(parameters, begin, gradient, batchSize);
}
没什么好说的,下一个
EvaluateWithGradient 头文件:
/**
* Evaluate the feedforward network with the given parameters.
* This function is usually called by the optimizer to train the model.
* It delegates to the batched overload of EvaluateWithGradient().
*
* @param parameters Matrix model parameters.
* @param gradient Matrix to output gradient into.
* @return The objective over all stored data points.
*/
template<typename GradType>
double EvaluateWithGradient(const arma::mat& parameters, GradType& gradient);
/**
* Evaluate the feedforward network with the given parameters, but using only
* a number of data points. This is useful for optimizers such as SGD, which
* require a separable objective function.
*
* The points used are columns begin through begin + batchSize - 1
* (inclusive) of the stored predictors and responses. `gradient` is zeroed
* (or allocated as zeros when empty) before the batch gradient is computed.
*
* @param parameters Matrix model parameters. NOTE(review): the implementation
* leaves this argument unnamed and works on the stored parameters instead.
* @param begin Index of the starting point to use for objective function
* evaluation.
* @param gradient Matrix to output gradient into.
* @param batchSize Number of points to be passed at a time to use for
* objective function evaluation.
* @return The output layer's objective for the batch plus the loss reported by
* every layer of the network.
*/
template<typename GradType>
double EvaluateWithGradient(const arma::mat& parameters,
const size_t begin,
GradType& gradient,
const size_t batchSize);
实现:
/**
 * Evaluate the objective and its gradient over every stored training point,
 * processing one point per call to the batched overload.
 *
 * The batched overload clears `gradient` at the start of every call, so the
 * per-point gradients are summed in a separate accumulator; otherwise only the
 * gradient of the last point would survive while the returned loss covers all
 * points.
 *
 * @param parameters Matrix model parameters (forwarded unchanged).
 * @param gradient Receives the sum of the per-point gradients. Left untouched
 *     when there are no training points.
 * @return Sum of the per-point objective values.
 */
template<typename OutputLayerType, typename InitializationRuleType,
         typename... CustomLayers>
template<typename GradType>
double FFN<OutputLayerType, InitializationRuleType, CustomLayers...>::
EvaluateWithGradient(const arma::mat& parameters, GradType& gradient)
{
  double res = 0;
  GradType accumulated;

  for (size_t i = 0; i < predictors.n_cols; ++i)
  {
    // `gradient` is reset inside this call and then holds point i's gradient.
    res += EvaluateWithGradient(parameters, i, gradient, 1);

    if (i == 0)
      accumulated = gradient;
    else
      accumulated += gradient;
  }

  // Copy (not move) the total back, so the caller's object keeps its own
  // storage and only its contents change.
  if (predictors.n_cols > 0)
    gradient = accumulated;

  return res;
}
/**
 * Batched objective/gradient evaluation: runs a forward pass over the columns
 * [begin, begin + batchSize - 1], collects the loss, back-propagates the
 * output-layer error and writes the parameter gradient into `gradient`.
 */
template<typename OutputLayerType, typename InitializationRuleType,
         typename... CustomLayers>
template<typename GradType>
double FFN<OutputLayerType, InitializationRuleType, CustomLayers...>::
EvaluateWithGradient(const arma::mat& /* parameters */,
                     const size_t begin,
                     GradType& gradient,
                     const size_t batchSize)
{
  // Always start from an all-zero gradient of the right shape.
  if (!gradient.is_empty())
  {
    gradient.zeros();
  }
  else
  {
    if (parameter.is_empty())
      ResetParameters();

    gradient = arma::zeros<arma::mat>(parameter.n_rows, parameter.n_cols);
  }

  // Leave deterministic mode if a previous call switched it on.
  if (deterministic)
  {
    deterministic = false;
    ResetDeterministic();
  }

  // Index of the last column that belongs to this batch.
  const size_t last = begin + batchSize - 1;

  Forward(predictors.cols(begin, last));
  double objective = outputLayer.Forward(
      boost::apply_visitor(outputParameterVisitor, network.back()),
      responses.cols(begin, last));

  // Add whatever loss each layer reports on top of the output-layer loss.
  for (auto& layer : network)
    objective += boost::apply_visitor(lossVisitor, layer);

  // Seed back-propagation with the output-layer error, then walk the layers.
  outputLayer.Backward(
      boost::apply_visitor(outputParameterVisitor, network.back()),
      responses.cols(begin, last),
      error);
  Backward();

  ResetGradients(gradient);
  Gradient(predictors.cols(begin, last));

  return objective;
}
开始是一些变量初始化,之前已经介绍过了
然后是 Forward 函数以及构造损失函数,和 Evaluate 中的一样
不一样的在于之后,调用了 outputLayer 的 Backward 函数,并将最后一层的 outputParameter 函数结果,相应的 responses 列,以及 error 作为参数传入,默认情况下就是先前介绍的负对数似然损失
接着又调用了无参的 Backward 函数:
Backward
/**
 * Back-propagate the stored output error through the network, from the last
 * layer towards the front. Layer 0 is not visited by this pass.
 */
template<typename OutputLayerType, typename InitializationRuleType,
         typename... CustomLayers>
void FFN<OutputLayerType, InitializationRuleType, CustomLayers...>::Backward()
{
  // Last layer: its incoming error is the network-level `error`.
  auto& lastLayer = network.back();
  boost::apply_visitor(
      BackwardVisitor(
          boost::apply_visitor(outputParameterVisitor, lastLayer),
          error,
          boost::apply_visitor(deltaVisitor, lastLayer)),
      lastLayer);

  // Remaining layers, walking from index size - 2 down to index 1: each one
  // receives the delta of the layer after it and fills in its own delta.
  for (size_t step = 2; step < network.size(); ++step)
  {
    const size_t current = network.size() - step;
    boost::apply_visitor(
        BackwardVisitor(
            boost::apply_visitor(outputParameterVisitor, network[current]),
            boost::apply_visitor(deltaVisitor, network[current + 1]),
            boost::apply_visitor(deltaVisitor, network[current])),
        network[current]);
  }
}
Backward 和 Forward 是成双入对的,调用第 i 层的 Backward 函数,将第 i 层的 outputParameter 函数结果,第 i + 1 层的 delta 函数结果,以及第 i 层 delta 函数结果作为参数,整个过程从后向前依次进行,最后一层单独处理
下一步是 ResetGradients 函数:
ResetGradients
/**
 * Visit every layer with a GradientSetVisitor built from `gradient` and a
 * running offset; the offset advances by the value each visit returns.
 */
template<typename OutputLayerType, typename InitializationRuleType,
         typename... CustomLayers>
void FFN<OutputLayerType, InitializationRuleType,
         CustomLayers...>::ResetGradients(arma::mat& gradient)
{
  size_t position = 0;
  for (auto& layer : network)
  {
    const size_t consumed = boost::apply_visitor(
        GradientSetVisitor(gradient, position), layer);
    position += consumed;
  }
}
这个无非是针对网络中的每一层调用了自己的 GradientSet 函数
Gradient 的最后一步,是调用了一个接受一个矩阵的 Gradient 函数:
Gradient 头文件:
/**
* Iterate through all layer modules and update the gradient using the
* layer defined optimizer.
*
* @tparam InputType Type of the data handed to the first layer.
* @param input Input used for the gradient computation of the first layer.
*/
template<typename InputType>
void Gradient(const InputType& input);
实现:
/**
 * Run the per-layer gradient computation. Every layer is visited with the
 * activation that fed it and the error signal coming from behind it.
 */
template<typename OutputLayerType, typename InitializationRuleType,
         typename... CustomLayers>
template<typename InputType>
void FFN<OutputLayerType, InitializationRuleType,
         CustomLayers...>::Gradient(const InputType& input)
{
  const size_t last = network.size() - 1;

  // First layer: fed by the raw input, error taken from layer 1's delta.
  boost::apply_visitor(
      GradientVisitor(input, boost::apply_visitor(deltaVisitor, network[1])),
      network.front());

  // Interior layers: fed by the previous layer's output, error taken from
  // the next layer's delta.
  for (size_t layer = 1; layer < last; ++layer)
  {
    boost::apply_visitor(
        GradientVisitor(
            boost::apply_visitor(outputParameterVisitor, network[layer - 1]),
            boost::apply_visitor(deltaVisitor, network[layer + 1])),
        network[layer]);
  }

  // Last layer: error is the network-level `error` from the output layer.
  boost::apply_visitor(
      GradientVisitor(
          boost::apply_visitor(outputParameterVisitor, network[last - 1]),
          error),
      network[last]);
}
有了先前的经验,再来看这段代码就容易理解了:
先调用网络第一层的 Gradient 函数,将 input 和第二层的 delta 函数结果作为参数传入
接着进入循环,调用第 i 层的 Gradient 函数,并将 i - 1 层的 outputParameter 函数结果以及 i + 1 层的 delta 函数结果作为参数
最后调用网络最后一层的 Gradient 函数,将前一层的 outputParameter 函数结果以及 error 作为参数传入
Predict 头文件:
/**
* Predict the responses to a given set of predictors. The responses will
* reflect the output of the given output layer as returned by the
* output layer function.
*
* Each column of predictors is passed through the network on its own; the
* matching column of results receives the first column of the network output.
*
* If you want to pass in a parameter and discard the original parameter
* object, be sure to use std::move to avoid unnecessary copy.
*
* @param predictors Input predictors.
* @param results Matrix to put output predictions of responses into.
*/
void Predict(arma::mat predictors, arma::mat& results);
实现:
/**
 * Forward every column of `predictors` through the network, one at a time,
 * and store the first column of each network output in `results`.
 */
template<typename OutputLayerType, typename InitializationRuleType,
         typename... CustomLayers>
void FFN<OutputLayerType, InitializationRuleType, CustomLayers...>::Predict(
    arma::mat predictors, arma::mat& results)
{
  if (parameter.is_empty())
    ResetParameters();

  // Switch the layers to deterministic mode if that is not already the case.
  if (!deterministic)
  {
    deterministic = true;
    ResetDeterministic();
  }

  const size_t dimension = predictors.n_rows;
  const size_t numPoints = predictors.n_cols;

  // The first point is handled separately: the size of its output determines
  // the number of rows of `results`.
  Forward(arma::mat(predictors.colptr(0), dimension, 1, false, true));
  arma::mat pointOutput = boost::apply_visitor(outputParameterVisitor,
      network.back()).col(0);

  results = arma::mat(pointOutput.n_elem, numPoints);
  results.col(0) = pointOutput.col(0);

  for (size_t point = 1; point < numPoints; ++point)
  {
    // Wrap the column in place (no copy) and push it through the network.
    Forward(arma::mat(predictors.colptr(point), dimension, 1, false, true));

    pointOutput = boost::apply_visitor(outputParameterVisitor,
        network.back());
    results.col(point) = pointOutput.col(0);
  }
}
整个过程大概就是将 predictors 的每一列依次进行 Forward ,取出网络最后的输出作为结果矩阵,只是将第一步分开进行,以便确认结果矩阵的形状
头文件:
/**
* Implementation of the Linear layer class. The Linear class represents a
* single layer of a neural network.
*
* @tparam InputDataType Type of the input data (arma::colvec, arma::mat,
* arma::sp_mat or arma::cube).
* @tparam OutputDataType Type of the output data (arma::colvec, arma::mat,
* arma::sp_mat or arma::cube).
* @tparam RegularizerType Type of the regularizer that is evaluated on the
* weights at the end of Gradient().
*/
template <
typename InputDataType = arma::mat,
typename OutputDataType = arma::mat,
typename RegularizerType = NoRegularizer
>
class Linear
{
public:
//! Create the Linear object with zero input and output units.
Linear();
/**
* Create the Linear layer object using the specified number of units.
*
* @param inSize The number of input units.
* @param outSize The number of output units.
* @param regularizer The regularizer to use, optional.
*/
Linear(const size_t inSize,
const size_t outSize,
RegularizerType regularizer = RegularizerType());
实现:
// Default constructor: both unit counts start at zero and the parameter
// vector is not sized here.
template<typename InputDataType, typename OutputDataType,
typename RegularizerType>
Linear<InputDataType, OutputDataType, RegularizerType>::Linear() :
inSize(0),
outSize(0)
{
// Nothing to do here.
}
/**
 * Construct a Linear layer with the given unit counts and size the flat
 * parameter vector accordingly.
 */
template<typename InputDataType, typename OutputDataType,
         typename RegularizerType>
Linear<InputDataType, OutputDataType, RegularizerType>::Linear(
    const size_t inSize,
    const size_t outSize,
    RegularizerType regularizer) :
    inSize(inSize),
    outSize(outSize),
    regularizer(regularizer)
{
  // A single column holding the outSize x inSize weight entries followed by
  // one bias entry per output unit.
  const size_t numWeightEntries = outSize * inSize;
  const size_t numBiasEntries = outSize;
  weights.set_size(numWeightEntries + numBiasEntries, 1);
}
NoRegularizer 顾名思义,就是没有 Regularizer
这里构造函数主要就是设置 weights 向量的形状 :
$$outSize \times inSize = weight.Size \\[5pt] outSize \times 1 = bias.Size$$
这点从 Reset 函数中也可看出:
Reset
/**
 * Re-create `weight` and `bias` as views onto the flat `weights` vector:
 * the first outSize * inSize entries form the weight matrix, the following
 * outSize entries form the bias column. No data is copied.
 */
template<typename InputDataType, typename OutputDataType,
         typename RegularizerType>
void Linear<InputDataType, OutputDataType, RegularizerType>::Reset()
{
  auto* const storage = weights.memptr();

  weight = arma::mat(storage, outSize, inSize, false, false);

  // The bias starts right after the last weight entry.
  bias = arma::mat(storage + weight.n_elem, outSize, 1, false, false);
}
头文件:
/**
* Ordinary feed forward pass of a neural network, evaluating the function
* f(x) by propagating the activity forward through f.
*
* For this layer: output = weight * input, with bias added to every column.
*
* @tparam eT Element type of the input and output matrices.
* @param input Input data used for evaluating the specified function.
* @param output Resulting output activation.
*/
template<typename eT>
void Forward(const arma::Mat<eT>& input, arma::Mat<eT>& output);
实现:
// Affine map applied to every column of input: output = weight * input + bias.
template<typename InputDataType, typename OutputDataType,
typename RegularizerType>
template<typename eT>
void Linear<InputDataType, OutputDataType, RegularizerType>::Forward(
const arma::Mat<eT>& input, arma::Mat<eT>& output)
{
output = weight * input;
// Add the bias column to each column of the result.
output.each_col() += bias;
}
正如 linear 头文件的注释中所说,线性层主要作为全连接层或者仿射变换:
$$output = weight \cdot input + bias$$
回顾整个网络的 Forward 函数,可知,input 要么是输入的数据点(第一层),要么是上一层的输出(除第一层外),而 output 都是这一层的 outputParameter
另外,所有参数传递都是引用类型
头文件:
/**
* Ordinary feed backward pass of a neural network, calculating the function
* f(x) by propagating x backwards through f. Using the results from the feed
* forward pass.
*
* For this layer: g = weight^T * gy.
*
* @tparam eT Element type of the matrices.
* @param * (input) The propagated input activation.
* @param gy The backpropagated error.
* @param g The calculated gradient.
*/
template<typename eT>
void Backward(const arma::Mat<eT>& /* input */,
const arma::Mat<eT>& gy,
arma::Mat<eT>& g);
实现:
// Propagate the error through the layer: g = weight^T * gy. The input
// activation is not needed and therefore left unnamed.
template<typename InputDataType, typename OutputDataType,
typename RegularizerType>
template<typename eT>
void Linear<InputDataType, OutputDataType, RegularizerType>::Backward(
const arma::Mat<eT>& /* input */, const arma::Mat<eT>& gy, arma::Mat<eT>& g)
{
g = weight.t() * gy;
}
回顾整个网络的 Backward 函数,可知,gy(The backpropagated error)要么是 error (最后一层),要么是下一层的 delta (除最后一层),而 g(The calculated gradient)都是这一层的 delta
另外,所有参数传递都是引用类型
头文件:
/**
* Calculate the gradient using the output delta and the input activation.
*
* The weight part of gradient receives vectorise(error * input^T); the bias
* part receives the row-wise sum of error.
*
* @tparam eT Element type of the matrices.
* @param input The input parameter used for calculating the gradient.
* @param error The calculated error.
* @param gradient The calculated gradient.
*/
template<typename eT>
void Gradient(const arma::Mat<eT>& input,
const arma::Mat<eT>& error,
arma::Mat<eT>& gradient);
实现:
/**
 * Fill `gradient` (laid out like the flat parameter vector) from the error
 * signal and the activation that fed this layer, then let the regularizer
 * process the result.
 */
template<typename InputDataType, typename OutputDataType,
         typename RegularizerType>
template<typename eT>
void Linear<InputDataType, OutputDataType, RegularizerType>::Gradient(
    const arma::Mat<eT>& input,
    const arma::Mat<eT>& error,
    arma::Mat<eT>& gradient)
{
  // Rows [0, biasBegin) belong to the weights, the rest to the bias.
  const size_t biasBegin = weight.n_elem;

  // Weight part: error * input^T, flattened column by column.
  gradient.submat(0, 0, biasBegin - 1, 0) =
      arma::vectorise(error * input.t());

  // Bias part: error summed over all columns.
  gradient.submat(biasBegin, 0, gradient.n_elem - 1, 0) =
      arma::sum(error, 1);

  regularizer.Evaluate(weights, gradient);
}
同样道理,这里的 input 要么是 input (第一层),要么是上一层的 outputParameter ,而 error 要么是下一层的 delta,要么是 error(最后一层),而 gradient 就是这层的 gradient 矩阵
另外,所有参数传递都是引用类型
.submat 原型:
X.submat ( first_row, first_col, last_row, last_col )
.vectorise(X, dim) 官方解释:
Generate a flattened version of matrix X or cube Q
The argument dim is optional; by default dim=0 is used
因此,这里的 gradient 矩阵的更新应该是分为 weights 部分和 bias 部分,weights 部分更新为 $error \cdot input^{\mathsf{T}}$(按列展开成向量后写入),bias 部分更新为 error 相应整行的和
最后再调用 regularizer 的 Evaluate 函数,这里没有就不讨论了
头文件:
/**
* Implementation of the Convolution class. The Convolution class represents a
* single layer of a neural network.
*
* @tparam ForwardConvolutionRule Convolution to perform forward process.
* @tparam BackwardConvolutionRule Convolution to perform backward process.
* @tparam GradientConvolutionRule Convolution to calculate gradient.
* @tparam InputDataType Type of the input data (arma::colvec, arma::mat,
* arma::sp_mat or arma::cube).
* @tparam OutputDataType Type of the output data (arma::colvec, arma::mat,
* arma::sp_mat or arma::cube).
*/
template <
typename ForwardConvolutionRule = NaiveConvolution<ValidConvolution>,
typename BackwardConvolutionRule = NaiveConvolution<FullConvolution>,
typename GradientConvolutionRule = NaiveConvolution<ValidConvolution>,
typename InputDataType = arma::mat,
typename OutputDataType = arma::mat
>
class Convolution
{
public:
//! Create the Convolution object.
Convolution();
/**
* Create the Convolution object using the specified number of input maps,
* output maps, filter size, stride and padding parameter.
*
* This overload uses the same padding on both sides: padW is used for left
* and right, padH for top and bottom.
*
* @param inSize The number of input maps.
* @param outSize The number of output maps.
* @param kernelWidth Width of the filter/kernel.
* @param kernelHeight Height of the filter/kernel.
* @param strideWidth Stride of filter application in the x direction.
* @param strideHeight Stride of filter application in the y direction.
* @param padW Padding width of the input.
* @param padH Padding height of the input.
* @param inputWidth The width of the input data.
* @param inputHeight The height of the input data.
* @param paddingType The type of padding (Valid or Same). Defaults to None.
*/
Convolution(const size_t inSize,
const size_t outSize,
const size_t kernelWidth,
const size_t kernelHeight,
const size_t strideWidth = 1,
const size_t strideHeight = 1,
const size_t padW = 0,
const size_t padH = 0,
const size_t inputWidth = 0,
const size_t inputHeight = 0,
const std::string& paddingType = "None");
/**
* Create the Convolution object using the specified number of input maps,
* output maps, filter size, stride and padding parameter.
*
* This overload accepts separate padding amounts for each side.
*
* @param inSize The number of input maps.
* @param outSize The number of output maps.
* @param kernelWidth Width of the filter/kernel.
* @param kernelHeight Height of the filter/kernel.
* @param strideWidth Stride of filter application in the x direction.
* @param strideHeight Stride of filter application in the y direction.
* @param padW A two-value tuple indicating padding widths of the input.
* First value is padding at left side. Second value is padding on
* right side.
* @param padH A two-value tuple indicating padding heights of the input.
* First value is padding at top. Second value is padding on
* bottom.
* @param inputWidth The width of the input data.
* @param inputHeight The height of the input data.
* @param paddingType The type of padding (Valid or Same). Defaults to None.
*/
Convolution(const size_t inSize,
const size_t outSize,
const size_t kernelWidth,
const size_t kernelHeight,
const size_t strideWidth,
const size_t strideHeight,
const std::tuple<size_t, size_t>& padW,
const std::tuple<size_t, size_t>& padH,
const size_t inputWidth = 0,
const size_t inputHeight = 0,
const std::string& paddingType = "None");
实现:
// Default constructor. It has no initializer list and an empty body.
// NOTE(review): unless the size members carry in-class initializers (their
// declarations are not visible here), they are left uninitialized by this
// constructor - confirm against the class definition.
template<
typename ForwardConvolutionRule,
typename BackwardConvolutionRule,
typename GradientConvolutionRule,
typename InputDataType,
typename OutputDataType
>
Convolution<
ForwardConvolutionRule,
BackwardConvolutionRule,
GradientConvolutionRule,
InputDataType,
OutputDataType
>::Convolution()
{
// Nothing to do here.
}
/**
 * Symmetric-padding constructor: expands the scalar padW / padH into
 * (left, right) and (top, bottom) pairs with equal entries and delegates to
 * the tuple-based constructor.
 */
template<
    typename ForwardConvolutionRule,
    typename BackwardConvolutionRule,
    typename GradientConvolutionRule,
    typename InputDataType,
    typename OutputDataType
>
Convolution<
    ForwardConvolutionRule,
    BackwardConvolutionRule,
    GradientConvolutionRule,
    InputDataType,
    OutputDataType
>::Convolution(
    const size_t inSize,
    const size_t outSize,
    const size_t kernelWidth,
    const size_t kernelHeight,
    const size_t strideWidth,
    const size_t strideHeight,
    const size_t padW,
    const size_t padH,
    const size_t inputWidth,
    const size_t inputHeight,
    const std::string& paddingType) :
    Convolution(inSize, outSize,
                kernelWidth, kernelHeight,
                strideWidth, strideHeight,
                std::make_tuple(padW, padW),
                std::make_tuple(padH, padH),
                inputWidth, inputHeight,
                paddingType)
{
  // All work happens in the delegated-to constructor.
}
// Main constructor: stores the layer geometry, sizes the flat parameter
// vector, resolves the padding type and builds the internal Padding object.
template<
typename ForwardConvolutionRule,
typename BackwardConvolutionRule,
typename GradientConvolutionRule,
typename InputDataType,
typename OutputDataType
>
Convolution<
ForwardConvolutionRule,
BackwardConvolutionRule,
GradientConvolutionRule,
InputDataType,
OutputDataType
>::Convolution(
const size_t inSize,
const size_t outSize,
const size_t kernelWidth,
const size_t kernelHeight,
const size_t strideWidth,
const size_t strideHeight,
const std::tuple<size_t, size_t>& padW,
const std::tuple<size_t, size_t>& padH,
const size_t inputWidth,
const size_t inputHeight,
const std::string& paddingType) :
inSize(inSize),
outSize(outSize),
kernelWidth(kernelWidth),
kernelHeight(kernelHeight),
strideWidth(strideWidth),
strideHeight(strideHeight),
// Tuple layout: <0> is left / top, <1> is right / bottom.
padWLeft(std::get<0>(padW)),
padWRight(std::get<1>(padW)),
padHBottom(std::get<1>(padH)),
padHTop(std::get<0>(padH)),
inputWidth(inputWidth),
inputHeight(inputHeight),
outputWidth(0),
outputHeight(0)
{
// One column holding all kernel entries plus one bias per output map.
weights.set_size(WeightSize(), 1);
// Transform paddingType to lowercase.
// The destination starts as a copy of the source, so it already has the
// right length. NOTE(review): presumably util::ToLower writes into the
// existing characters of its second argument - confirm before changing this.
std::string paddingTypeLow = paddingType;
util::ToLower(paddingType, paddingTypeLow);
if (paddingTypeLow == "valid")
{
// "valid": discard any padding that was passed in.
padWLeft = 0;
padWRight = 0;
padHTop = 0;
padHBottom = 0;
}
else if (paddingTypeLow == "same")
{
// "same": let the helper choose the padding amounts.
InitializeSamePadding();
}
// Any other value (including the default "None") keeps the given padding.
padding = ann::Padding<>(padWLeft, padWRight, padHTop, padHBottom);
}
主要分析一下第三个构造函数
weights 矩阵的行数为:
//! Number of entries in the flat parameter vector: every kernel element of
//! every (output map, input map) pair plus one bias per output map.
size_t WeightSize() const
{
  const size_t kernelEntries = outSize * inSize * kernelWidth * kernelHeight;
  return kernelEntries + outSize;
}
看一下 Reset 函数:
/**
 * Re-create `weight` and `bias` as views onto the flat `weights` vector:
 * first a cube of outSize * inSize kernels, each kernelWidth x kernelHeight,
 * then a column of outSize bias values. No data is copied.
 */
template<
    typename ForwardConvolutionRule,
    typename BackwardConvolutionRule,
    typename GradientConvolutionRule,
    typename InputDataType,
    typename OutputDataType
>
void Convolution<
    ForwardConvolutionRule,
    BackwardConvolutionRule,
    GradientConvolutionRule,
    InputDataType,
    OutputDataType
>::Reset()
{
  auto* const storage = weights.memptr();
  const size_t numKernels = outSize * inSize;

  weight = arma::cube(storage, kernelWidth, kernelHeight, numKernels,
      false, false);

  // The bias values start right after the last kernel entry.
  bias = arma::mat(storage + weight.n_elem, outSize, 1, false, false);
}
weight 是一个 Cube ,拥有 $outSize \times inSize$ 个 Slice,每个 Slice 是一个 $kernelWidth \times kernelHeight$ 的矩阵
而 bias 是一个列矩阵,有 outSize 行
由此就不难解释 weights 的行数了:
$$outSize \times inSize \times kernelWidth \times kernelHeight + outSize$$
接着是 padding ,我们先按照默认的 None 来
Padding 头文件:
/**
* Implementation of the Padding module class. The Padding module surrounds
* the incoming data with a border of zeros; the backward pass cuts the
* matching region back out.
*
* @tparam InputDataType Type of the input data (arma::colvec, arma::mat,
* arma::sp_mat or arma::cube).
* @tparam OutputDataType Type of the output data (arma::colvec, arma::mat,
* arma::sp_mat or arma::cube).
*/
template <
typename InputDataType = arma::mat,
typename OutputDataType = arma::mat
>
class Padding
{
public:
/**
* Create the Padding object using the specified padding amounts.
*
* @param padWLeft Left padding width of the input.
* @param padWRight Right padding width of the input.
* @param padHTop Top padding height of the input.
* @param padHBottom Bottom padding height of the input.
*/
Padding(const size_t padWLeft = 0,
const size_t padWRight = 0,
const size_t padHTop = 0,
const size_t padHBottom = 0);
/**
* Ordinary feed forward pass of a neural network, evaluating the function
* f(x) by propagating the activity forward through f.
*
* Here: output is a zero matrix enlarged by the padding amounts, with input
* copied into it at offset (padWLeft, padHTop).
*
* @param input Input data used for evaluating the specified function.
* @param output Resulting output activation.
*/
template<typename eT>
void Forward(const arma::Mat<eT>& input, arma::Mat<eT>& output);
/**
* Ordinary feed backward pass of a neural network, calculating the function
* f(x) by propagating x backwards through f. Using the results from the feed
* forward pass.
*
* Here: g is the block of gy that corresponds to the unpadded input of the
* most recent Forward() call.
*
* @param * (input) The propagated input activation.
* @param gy The backpropagated error.
* @param g The calculated gradient.
*/
template<typename eT>
void Backward(const arma::Mat<eT>& /* input */,
const arma::Mat<eT>& gy,
arma::Mat<eT>& g);
//! Get the output parameter.
OutputDataType const& OutputParameter() const { return outputParameter; }
//! Modify the output parameter.
OutputDataType& OutputParameter() { return outputParameter; }
//! Get the delta.
OutputDataType const& Delta() const { return delta; }
//! Modify the delta.
OutputDataType& Delta() { return delta; }
//! Get the left padding width.
size_t PadWLeft() const { return padWLeft; }
//! Modify the left padding width.
size_t& PadWLeft() { return padWLeft; }
//! Get the right padding width.
size_t PadWRight() const { return padWRight; }
//! Modify the right padding width.
size_t& PadWRight() { return padWRight; }
//! Get the top padding height.
size_t PadHTop() const { return padHTop; }
//! Modify the top padding height.
size_t& PadHTop() { return padHTop; }
//! Get the bottom padding height.
size_t PadHBottom() const { return padHBottom; }
//! Modify the bottom padding height.
size_t& PadHBottom() { return padHBottom; }
/**
* Serialize the layer.
*/
template<typename Archive>
void serialize(Archive& ar, const unsigned int /* version */);
private:
//! Locally-stored left padding width.
size_t padWLeft;
//! Locally-stored right padding width.
size_t padWRight;
//! Locally-stored top padding height.
size_t padHTop;
//! Locally-stored bottom padding height.
size_t padHBottom;
//! Locally-stored number of rows and columns of input (recorded by
//! Forward(), read by Backward()).
size_t nRows, nCols;
//! Locally-stored delta object.
OutputDataType delta;
//! Locally-stored output parameter object.
OutputDataType outputParameter;
}; // class Padding
实现:
// Store the four padding amounts; the remembered input shape starts at 0 x 0
// and is only filled in by Forward().
template<typename InputDataType, typename OutputDataType>
Padding<InputDataType, OutputDataType>::Padding(
const size_t padWLeft,
const size_t padWRight,
const size_t padHTop,
const size_t padHBottom) :
padWLeft(padWLeft),
padWRight(padWRight),
padHTop(padHTop),
padHBottom(padHBottom),
nRows(0),
nCols(0)
{
// Nothing to do here.
}
/**
 * Zero-pad `input`: `output` becomes a zero matrix enlarged by the configured
 * padding, with `input` copied in at offset (padWLeft, padHTop). The input
 * shape is remembered so that Backward() can cut the same region back out.
 *
 * Note that the "width" padding is applied along the rows and the "height"
 * padding along the columns of the matrix.
 *
 * @param input Matrix to pad.
 * @param output Receives the padded matrix.
 */
template<typename InputDataType, typename OutputDataType>
template<typename eT>
void Padding<InputDataType, OutputDataType>::Forward(
    const arma::Mat<eT>& input, arma::Mat<eT>& output)
{
  nRows = input.n_rows;
  nCols = input.n_cols;

  // Generate the zeros with element type eT; a bare arma::zeros(r, c) always
  // produces a double matrix, which does not match Mat<eT> for other eT.
  output = arma::zeros<arma::Mat<eT> >(nRows + padWLeft + padWRight,
      nCols + padHTop + padHBottom);

  output.submat(padWLeft, padHTop, padWLeft + nRows - 1,
      padHTop + nCols - 1) = input;
}
/**
 * Undo the padding on the way back: `g` is the block of `gy` that lines up
 * with the unpadded input seen by the most recent Forward() call.
 */
template<typename InputDataType, typename OutputDataType>
template<typename eT>
void Padding<InputDataType, OutputDataType>::Backward(
    const arma::Mat<eT>& /* input */,
    const arma::Mat<eT>& gy,
    arma::Mat<eT>& g)
{
  // Top-left corner of the original data inside the padded matrix.
  const size_t firstRow = padWLeft;
  const size_t firstCol = padHTop;

  g = gy.submat(firstRow, firstCol,
                firstRow + nRows - 1, firstCol + nCols - 1);
}
// Serialize the four padding amounts. The remembered input shape
// (nRows / nCols) is not part of the archive.
template<typename InputDataType, typename OutputDataType>
template<typename Archive>
void Padding<InputDataType, OutputDataType>::serialize(
Archive& ar, const unsigned int /* version */)
{
ar & BOOST_SERIALIZATION_NVP(padWLeft);
ar & BOOST_SERIALIZATION_NVP(padWRight);
ar & BOOST_SERIALIZATION_NVP(padHTop);
ar & BOOST_SERIALIZATION_NVP(padHBottom);
}
可以看到 padding 层的 Forward 函数只是单纯地用零把输入矩阵向四周扩充(默认的填充数值为零),而 Backward 函数则相反,从 gy 中裁剪出与原始输入大小相同的那一块
头文件:
/**
* Ordinary feed forward pass of a neural network, evaluating the function
* f(x) by propagating the activity forward through f.
*
* @tparam eT Element type of the input and output matrices.
* @param input Input data used for evaluating the specified function; the
* number of columns is taken as the batch size.
* @param output Resulting output activation.
*/
template<typename eT>
void Forward(const arma::Mat<eT>& input, arma::Mat<eT>& output);
实现:
// Forward pass of the convolution layer: every output map of every batch
// item is the sum over all input maps of (input map convolved with the
// matching kernel), plus the bias of that output map.
template<
typename ForwardConvolutionRule,
typename BackwardConvolutionRule,
typename GradientConvolutionRule,
typename InputDataType,
typename OutputDataType
>
template<typename eT>
void Convolution<
ForwardConvolutionRule,
BackwardConvolutionRule,
GradientConvolutionRule,
InputDataType,
OutputDataType
>::Forward(const arma::Mat<eT>& input, arma::Mat<eT>& output)
{
// One column of input per batch item.
batchSize = input.n_cols;
// View the input memory as inSize * batchSize slices of size
// inputWidth x inputHeight, without copying.
// NOTE(review): the view is a plain arma::cube (double) although input is
// Mat<eT> - looks like this only works for eT == double; confirm.
arma::cube inputTemp(const_cast<arma::Mat<eT>&>(input).memptr(),
inputWidth, inputHeight, inSize * batchSize, false, false);
// Zero-pad every slice if any padding amount is non-zero.
if (padWLeft != 0 || padWRight != 0 || padHTop != 0 || padHBottom != 0)
{
inputPaddedTemp.set_size(inputTemp.n_rows + padWLeft + padWRight,
inputTemp.n_cols + padHTop + padHBottom, inputTemp.n_slices);
for (size_t i = 0; i < inputTemp.n_slices; ++i)
{
padding.Forward(inputTemp.slice(i), inputPaddedTemp.slice(i));
}
}
// Size of a single output map.
size_t wConv = ConvOutSize(inputWidth, kernelWidth, strideWidth, padWLeft,
padWRight);
size_t hConv = ConvOutSize(inputHeight, kernelHeight, strideHeight, padHTop,
padHBottom);
// outputTemp is a cube view onto the memory of output, so writing the
// slices below fills output directly.
output.set_size(wConv * hConv * outSize, batchSize);
outputTemp = arma::Cube<eT>(output.memptr(), wConv, hConv,
outSize * batchSize, false, false);
outputTemp.zeros();
// outMap    : index of the output slice (all batch items, all output maps).
// outMapIdx : index of the kernel slice; advances by one per input map and
//             is reset at the start of every batch item.
// batchCount: index of the batch item the current output slice belongs to.
for (size_t outMap = 0, outMapIdx = 0, batchCount = 0; outMap <
outSize * batchSize; outMap++)
{
// A multiple of outSize marks the first output map of the next batch item.
if (outMap != 0 && outMap % outSize == 0)
{
batchCount++;
outMapIdx = 0;
}
// Accumulate the contribution of every input map of this batch item.
for (size_t inMap = 0; inMap < inSize; inMap++, outMapIdx++)
{
arma::Mat<eT> convOutput;
if (padWLeft != 0 || padWRight != 0 || padHTop != 0 || padHBottom != 0)
{
ForwardConvolutionRule::Convolution(inputPaddedTemp.slice(inMap +
batchCount * inSize), weight.slice(outMapIdx), convOutput,
strideWidth, strideHeight);
}
else
{
ForwardConvolutionRule::Convolution(inputTemp.slice(inMap +
batchCount * inSize), weight.slice(outMapIdx), convOutput,
strideWidth, strideHeight);
}
outputTemp.slice(outMap) += convOutput;
}
// Add the scalar bias of this output map to every element of the slice.
outputTemp.slice(outMap) += bias(outMap % outSize);
}
// Remember the size of a single output map.
outputWidth = outputTemp.n_rows;
outputHeight = outputTemp.n_cols;
}
先构建一个 Cube ,其 Slice 为 inSize * input.n_cols,每个 Slice 是一个 inputWidth * inputHeight 的矩阵
接着,如果需要扩充的话,就按照 pad 定的尺寸进行扩充,并将 inputTemp 的每一个 Slice 填充到 inputPaddedTemp 的每一个 Slice 上
然后计算:
$$wConv = \left\lfloor \dfrac{inputWidth + padWLeft + padWRight - kernelWidth}{strideWidth} \right\rfloor + 1 \\[6pt] hConv = \left\lfloor \dfrac{inputHeight + padHTop + padHBottom - kernelHeight}{strideHeight} \right\rfloor + 1$$
接着按照这个尺寸设置 output 和 outputTemp
接下来的双重循环用到了 ForwardConvolutionRule 默认是 NaiveConvolution
其实现:
/**
* Computes the two-dimensional convolution. This class allows specification of
* the type of the border type. The convolution can be computed with the valid
* border type or the full border type (default).
*
* FullConvolution: returns the full two-dimensional convolution.
* ValidConvolution: returns only those parts of the convolution that are
* computed without the zero-padded edges.
*
* @tparam BorderMode Type of the border mode (FullConvolution or
* ValidConvolution).
*/
template<typename BorderMode = FullConvolution>
class NaiveConvolution
{
public:
/**
 * Perform a convolution (valid mode).
 *
 * The output has
 *   (input.n_rows - (filter.n_rows - 1) * dilationW - 1) / dW + 1  rows and
 *   (input.n_cols - (filter.n_cols - 1) * dilationH - 1) / dH + 1  columns,
 * i.e. dW / dilationW act along the rows and dH / dilationH along the
 * columns. The traversal below uses the same assignment, so every input
 * access stays inside `input` even when the two strides or the two dilation
 * factors differ.
 *
 * @param input Input used to perform the convolution.
 * @param filter Filter used to perform the convolution.
 * @param output Output data that contains the results of the convolution.
 * @param dW Stride of filter application in the x direction.
 * @param dH Stride of filter application in the y direction.
 * @param dilationW The dilation factor in x direction.
 * @param dilationH The dilation factor in y direction.
 */
template<typename eT, typename Border = BorderMode>
static typename std::enable_if<
    std::is_same<Border, ValidConvolution>::value, void>::type
Convolution(const arma::Mat<eT>& input,
            const arma::Mat<eT>& filter,
            arma::Mat<eT>& output,
            const size_t dW = 1,
            const size_t dH = 1,
            const size_t dilationW = 1,
            const size_t dilationH = 1)
{
  output = arma::zeros<arma::Mat<eT> >(
      (input.n_rows - (filter.n_rows - 1) * dilationW - 1) / dW + 1,
      (input.n_cols - (filter.n_cols - 1) * dilationH - 1) / dH + 1);

  // It seems to be about 3.5 times faster to use pointers instead of
  // filter(ki, kj) * input(leftInput + ki, topInput + kj) and output(i, j).
  // outputPtr walks the output in column-major order, matching the loop nest.
  eT* outputPtr = output.memptr();

  for (size_t j = 0; j < output.n_cols; ++j)
  {
    for (size_t i = 0; i < output.n_rows; ++i, outputPtr++)
    {
      // kernelPtr walks the filter in column-major order as well.
      const eT* kernelPtr = filter.memptr();
      for (size_t kj = 0; kj < filter.n_cols; ++kj)
      {
        // Column selection uses the column stride / dilation (dH, dilationH);
        // the offset inside the column uses the row stride (dW).
        const eT* inputPtr = input.colptr(kj * dilationH + j * dH) + i * dW;

        // Step down the rows of the window with the row dilation.
        for (size_t ki = 0; ki < filter.n_rows; ++ki, ++kernelPtr,
            inputPtr += dilationW)
          *outputPtr += *kernelPtr * (*inputPtr);
      }
    }
  }
}
/**
* Perform a convolution (full mode).
*
* The input is copied into a larger zero matrix and the valid-mode
* convolution is then run on that matrix with both strides fixed to 1.
*
* @param input Input used to perform the convolution.
* @param filter Filter used to perform the convolution.
* @param output Output data that contains the results of the convolution.
* @param dW Stride of filter application in the x direction.
* @param dH Stride of filter application in the y direction.
* @param dilationW The dilation factor in x direction.
* @param dilationH The dilation factor in y direction.
*/
template<typename eT, typename Border = BorderMode>
static typename std::enable_if<
std::is_same<Border, FullConvolution>::value, void>::type
Convolution(const arma::Mat<eT>& input,
const arma::Mat<eT>& filter,
arma::Mat<eT>& output,
const size_t dW = 1,
const size_t dH = 1,
const size_t dilationW = 1,
const size_t dilationH = 1)
{
// Size of the zero-padded working matrix: a border of
// (filter size - 1) * dilation on each side of the strided input extent.
size_t outputRows = (input.n_rows - 1) * dW + 2 * (filter.n_rows - 1)
* dilationW + 1;
size_t outputCols = (input.n_cols - 1) * dH + 2 * (filter.n_cols - 1)
* dilationH + 1;
// Stride-alignment adjustment of the working size.
// NOTE(review): with outputRows as computed above the tested value equals
// i + (input.n_rows - 1) * dW, so for non-empty input the condition already
// holds at i == 0 and nothing is added - confirm whether this loop (and the
// one for the columns below) is still needed.
for (size_t i = 0; i < dW; ++i)
{
if (((((i + outputRows - 2 * (filter.n_rows - 1) * dilationW - 1) % dW)
+ dW) % dW) == i){
outputRows += i;
break;
}
}
for (size_t i = 0; i < dH; ++i)
{
if (((((i + outputCols - 2 * (filter.n_cols - 1) * dilationH - 1) % dH)
+ dH) % dH) == i){
outputCols += i;
break;
}
}
// Pad filter and input to the working output shape.
arma::Mat<eT> inputPadded = arma::zeros<arma::Mat<eT> >(outputRows,
outputCols);
// The input is placed as one contiguous block, offset by the border size.
inputPadded.submat((filter.n_rows - 1) * dilationW, (filter.n_cols - 1)
* dilationH, (filter.n_rows - 1) * dilationW + input.n_rows - 1,
(filter.n_cols - 1) * dilationH + input.n_cols - 1) = input;
// Valid convolution over the padded matrix; dW and dH are not forwarded.
NaiveConvolution<ValidConvolution>::Convolution(inputPadded, filter,
output, 1, 1, dilationW, dilationH);
}
/**
 * Slice-wise convolution of two 3rd order tensors: slice s of `input` is
 * convolved with slice s of `filter` and stored in slice s of `output`.
 *
 * @param input Input used to perform the convolution.
 * @param filter Filter used to perform the convolution.
 * @param output Output data that contains the results of the convolution.
 * @param dW Stride of filter application in the x direction.
 * @param dH Stride of filter application in the y direction.
 * @param dilationW The dilation factor in x direction.
 * @param dilationH The dilation factor in y direction.
 */
template<typename eT>
static void Convolution(const arma::Cube<eT>& input,
                        const arma::Cube<eT>& filter,
                        arma::Cube<eT>& output,
                        const size_t dW = 1,
                        const size_t dH = 1,
                        const size_t dilationW = 1,
                        const size_t dilationH = 1)
{
  // The first pair is convolved into a temporary: its shape decides the
  // slice shape of the output cube.
  arma::Mat<eT> firstResult;
  NaiveConvolution<BorderMode>::Convolution(input.slice(0), filter.slice(0),
      firstResult, dW, dH, dilationW, dilationH);

  output = arma::Cube<eT>(firstResult.n_rows, firstResult.n_cols,
      input.n_slices);
  output.slice(0) = firstResult;

  // Remaining pairs are written straight into their output slice.
  for (size_t s = 1; s < input.n_slices; ++s)
  {
    NaiveConvolution<BorderMode>::Convolution(input.slice(s),
        filter.slice(s), output.slice(s), dW, dH, dilationW, dilationH);
  }
}
/**
 * Convolve one dense matrix with every slice of a filter tensor: slice s of
 * `output` holds `input` convolved with slice s of `filter`.
 *
 * @param input Input used to perform the convolution.
 * @param filter Filter used to perform the convolution.
 * @param output Output data that contains the results of the convolution.
 * @param dW Stride of filter application in the x direction.
 * @param dH Stride of filter application in the y direction.
 * @param dilationW The dilation factor in x direction.
 * @param dilationH The dilation factor in y direction.
 */
template<typename eT>
static void Convolution(const arma::Mat<eT>& input,
                        const arma::Cube<eT>& filter,
                        arma::Cube<eT>& output,
                        const size_t dW = 1,
                        const size_t dH = 1,
                        const size_t dilationW = 1,
                        const size_t dilationH = 1)
{
  // The first filter slice is applied into a temporary: its result decides
  // the slice shape of the output cube.
  arma::Mat<eT> firstResult;
  NaiveConvolution<BorderMode>::Convolution(input, filter.slice(0),
      firstResult, dW, dH, dilationW, dilationH);

  output = arma::Cube<eT>(firstResult.n_rows, firstResult.n_cols,
      filter.n_slices);
  output.slice(0) = firstResult;

  // Remaining filter slices are written straight into their output slice.
  for (size_t s = 1; s < filter.n_slices; ++s)
  {
    NaiveConvolution<BorderMode>::Convolution(input, filter.slice(s),
        output.slice(s), dW, dH, dilationW, dilationH);
  }
}
/**
 * Convolve every slice of an input tensor with one dense filter: slice s of
 * `output` holds slice s of `input` convolved with `filter`.
 *
 * @param input Input used to perform the convolution.
 * @param filter Filter used to perform the convolution.
 * @param output Output data that contains the results of the convolution.
 * @param dW Stride of filter application in the x direction.
 * @param dH Stride of filter application in the y direction.
 * @param dilationW The dilation factor in x direction.
 * @param dilationH The dilation factor in y direction.
 */
template<typename eT>
static void Convolution(const arma::Cube<eT>& input,
                        const arma::Mat<eT>& filter,
                        arma::Cube<eT>& output,
                        const size_t dW = 1,
                        const size_t dH = 1,
                        const size_t dilationW = 1,
                        const size_t dilationH = 1)
{
  // The first input slice is convolved into a temporary: its result decides
  // the slice shape of the output cube.
  arma::Mat<eT> firstResult;
  NaiveConvolution<BorderMode>::Convolution(input.slice(0), filter,
      firstResult, dW, dH, dilationW, dilationH);

  output = arma::Cube<eT>(firstResult.n_rows, firstResult.n_cols,
      input.n_slices);
  output.slice(0) = firstResult;

  // Remaining input slices are written straight into their output slice.
  for (size_t s = 1; s < input.n_slices; ++s)
  {
    NaiveConvolution<BorderMode>::Convolution(input.slice(s), filter,
        output.slice(s), dW, dH, dilationW, dilationH);
  }
}
}; // class NaiveConvolution
Forward 默认使用的是 Valid 模式,简单看一下其实现(默认 dilationW = dilationH = 1):
$$ input : (m \times n) \\[5pt] filter : (p \times q) \\[5pt] output : (a \times b) = \left( \dfrac{m-p}{dW} + 1 ,\ \dfrac{n-q}{dH} + 1 \right) \\[6pt] \Rightarrow output_{(i , j)} = \sum_{k_j = 0}^{q-1} \sum_{k_i = 0}^{p-1} input_{(k_i + i \cdot dH \ , \ k_j + j \cdot dW)} \times kernel_{(k_i \ , \ k_j)} $$
(注意 Armadillo 中的矩阵以列为主序)
官方说明:
.memptr()
Data for matrices is stored in a column-by-column order
Data for cubes is stored in a slice-by-slice (matrix-by-matrix) order
第一重循环是 outMap 的循环,而 outMap 是在遍历 outputTemp 的每一个 Slice ,其中,每到一个 outSize 的倍数时(除了零),递增 batchCount 以及置零 outMapIdx
outMapIdx 是在遍历 weight 的 Slice (weight 的 Slice 个数是:$outSize \times inSize$)
第二重循环是 inMap 的循环,inMap 在其中遍历 inSize ,同时 outMapIdx 也在每次循环中递增(因此,outMapIdx 的递增将循环 weights 的全部 Slice),循环内部则使用上面介绍的卷积操作,将 inputTemp (或相应填充过的)与 weight 进行卷积,结果加到 outputTemp 中
这里的 batchCount 就起到了统一每个 batch 的作用,因为 inputTemp 的 Slice 个数是 $inSize \times batchSize$
$batchCount \times inSize$ 相当于是基准,inMap 就是每次的偏移
当一个 batch 卷积结束后,还要再加上相应的 bias
最后调整 outputWidth 和 outputHeight
头文件:
/**
 * Backward pass of the convolution layer: propagate the error gy, which
 * arrives at the layer's output, back through the layer and store the
 * resulting error with respect to the layer's input in g.
 *
 * The first parameter exists only to match the layer interface; its name is
 * commented out because the implementation does not read it.
 *
 * @param * (input) The propagated input activation (unused).
 * @param gy The backpropagated error.
 * @param g The calculated gradient; the implementation resizes it to
 *     (inputWidth * inputHeight * inSize) rows and batchSize columns.
 */
template<typename eT>
void Backward(const arma::Mat<eT>& /* input */,
const arma::Mat<eT>& gy,
arma::Mat<eT>& g);
实现:
template<
    typename ForwardConvolutionRule,
    typename BackwardConvolutionRule,
    typename GradientConvolutionRule,
    typename InputDataType,
    typename OutputDataType
>
template<typename eT>
void Convolution<
    ForwardConvolutionRule,
    BackwardConvolutionRule,
    GradientConvolutionRule,
    InputDataType,
    OutputDataType
>::Backward(
    const arma::Mat<eT>& /* input */,
    const arma::Mat<eT>& gy,
    arma::Mat<eT>& g)
{
  // Reinterpret the incoming error as a cube (one slice per output map and
  // batch element) on top of gy's own memory, without copying.
  arma::cube mappedError(const_cast<arma::Mat<eT>&>(gy).memptr(), outputWidth,
      outputHeight, outSize * batchSize, false, false);

  // g gets one column per batch element; gTemp is a cube view over the same
  // memory (one slice per input map and batch element), cleared before the
  // contributions are accumulated.
  g.set_size(inputWidth * inputHeight * inSize, batchSize);
  gTemp = arma::Cube<eT>(g.memptr(), inputWidth, inputHeight,
      inSize * batchSize, false, false);
  gTemp.zeros();

  // The padding configuration does not change inside the loops.
  const bool hasPadding =
      !(padWLeft == 0 && padWRight == 0 && padHTop == 0 && padHBottom == 0);

  for (size_t batch = 0; batch < batchSize; ++batch)
  {
    for (size_t outChannel = 0; outChannel < outSize; ++outChannel)
    {
      // Error slices are laid out batch-major: all output maps of the first
      // batch element, then all of the second, and so on.
      const size_t errorSlice = batch * outSize + outChannel;

      for (size_t inChannel = 0; inChannel < inSize; ++inChannel)
      {
        // The weight cube restarts for every batch element and advances by
        // one slice per (output map, input map) pair.
        const size_t weightSlice = outChannel * inSize + inChannel;
        const size_t targetSlice = batch * inSize + inChannel;

        arma::Mat<eT> output, rotatedFilter;
        Rotate180(weight.slice(weightSlice), rotatedFilter);
        BackwardConvolutionRule::Convolution(mappedError.slice(errorSlice),
            rotatedFilter, output, strideWidth, strideHeight);

        if (hasPadding)
        {
          // Keep only the window that corresponds to the unpadded input.
          gTemp.slice(targetSlice) += output.submat(padWLeft, padHTop,
              padWLeft + gTemp.n_rows - 1, padHTop + gTemp.n_cols - 1);
        }
        else
        {
          gTemp.slice(targetSlice) += output;
        }
      }
    }
  }
}
双重循环里用到了 BackwardConvolutionRule::Convolution 其默认是 NaiveConvolution
/*
 * Perform a convolution (full mode). The input is embedded in a zero matrix
 * large enough for the filter to slide over every partial overlap, and the
 * result is then computed with a stride-1 valid convolution.
 *
 * @param input Input used to perform the convolution.
 * @param filter Filter used to perform the convolution.
 * @param output Output data that contains the results of the convolution.
 * @param dW Stride of filter application in the x direction.
 * @param dH Stride of filter application in the y direction.
 * @param dilationW The dilation factor in x direction.
 * @param dilationH The dilation factor in y direction.
 */
template<typename eT, typename Border = BorderMode>
static typename std::enable_if<
    std::is_same<Border, FullConvolution>::value, void>::type
Convolution(const arma::Mat<eT>& input,
            const arma::Mat<eT>& filter,
            arma::Mat<eT>& output,
            const size_t dW = 1,
            const size_t dH = 1,
            const size_t dilationW = 1,
            const size_t dilationH = 1)
{
  // Width of the zero border added on each side of the input.
  const size_t rowBorder = (filter.n_rows - 1) * dilationW;
  const size_t colBorder = (filter.n_cols - 1) * dilationH;

  // Smallest offset in [0, stride) that satisfies the stride-alignment test
  // for the given extent, or 0 when no offset does.
  const auto strideOffset = [](const size_t extent,
                               const size_t border,
                               const size_t stride) -> size_t
  {
    for (size_t r = 0; r < stride; ++r)
    {
      if (((((r + extent - 2 * border - 1) % stride) + stride) % stride) == r)
        return r;
    }
    return 0;
  };

  size_t outputRows = (input.n_rows - 1) * dW + 2 * rowBorder + 1;
  size_t outputCols = (input.n_cols - 1) * dH + 2 * colBorder + 1;
  outputRows += strideOffset(outputRows, rowBorder, dW);
  outputCols += strideOffset(outputCols, colBorder, dH);

  // Pad filter and input to the working output shape.
  arma::Mat<eT> inputPadded = arma::zeros<arma::Mat<eT> >(outputRows,
      outputCols);
  inputPadded.submat(rowBorder, colBorder, rowBorder + input.n_rows - 1,
      colBorder + input.n_cols - 1) = input;

  NaiveConvolution<ValidConvolution>::Convolution(inputPadded, filter,
      output, 1, 1, dilationW, dilationH);
}
FullConvolution 就是在 ValidConvolution 之前找到适当 outputRows 和 outputCols 构造出 padded 后的 input ,再调用 ValidConvolution 的卷积
回到 Backward ,大概的过程和 Forward 差不多,根据误差 (gy) 和梯度 (g) 分别构造出临时的 cube ,接着将逆时针旋转了 $180^{\circ}$ 的 weight 与误差进行卷积并加入到梯度矩阵里
头文件:
/*
 * Calculate the gradient of the layer's parameters using the output delta
 * (error) and the input activation.
 *
 * Note: the parameter name `input` is commented out in this declaration, but
 * the definition does read the input.
 *
 * @param input The input parameter used for calculating the gradient.
 * @param error The calculated error.
 * @param gradient The calculated gradient; the implementation resizes it to
 *     a column vector with weights.n_elem entries.
 */
template<typename eT>
void Gradient(const arma::Mat<eT>& /* input */,
const arma::Mat<eT>& error,
arma::Mat<eT>& gradient);
实现:
template<
    typename ForwardConvolutionRule,
    typename BackwardConvolutionRule,
    typename GradientConvolutionRule,
    typename InputDataType,
    typename OutputDataType
>
template<typename eT>
void Convolution<
    ForwardConvolutionRule,
    BackwardConvolutionRule,
    GradientConvolutionRule,
    InputDataType,
    OutputDataType
>::Gradient(
    const arma::Mat<eT>& input,
    const arma::Mat<eT>& error,
    arma::Mat<eT>& gradient)
{
  // Cube views (no copy) over the error and the input: one slice per map and
  // batch element.
  arma::cube mappedError(const_cast<arma::Mat<eT>&>(error).memptr(),
      outputWidth, outputHeight, outSize * batchSize, false, false);
  arma::cube inputTemp(const_cast<arma::Mat<eT>&>(input).memptr(), inputWidth,
      inputHeight, inSize * batchSize, false, false);

  // Size AND zero the full gradient vector. The kernel entries and the bias
  // entries (which follow the first weight.n_elem elements) are both
  // accumulated over the batch below, so both must start at zero.
  gradient.zeros(weights.n_elem, 1);
  gradientTemp = arma::Cube<eT>(gradient.memptr(), weight.n_rows,
      weight.n_cols, weight.n_slices, false, false);

  // The padding configuration does not change inside the loops.
  const bool hasPadding =
      padWLeft != 0 || padWRight != 0 || padHTop != 0 || padHBottom != 0;

  for (size_t outMap = 0, outMapIdx = 0, batchCount = 0; outMap <
      outSize * batchSize; outMap++)
  {
    // A new batch element starts every outSize output maps: move to the next
    // group of input slices and restart at the first kernel slice.
    if (outMap != 0 && outMap % outSize == 0)
    {
      batchCount++;
      outMapIdx = 0;
    }

    for (size_t inMap = 0; inMap < inSize; inMap++, outMapIdx++)
    {
      // Use the padded copy of the input when the layer pads its input.
      arma::Mat<eT> inputSlice;
      if (hasPadding)
        inputSlice = inputPaddedTemp.slice(inMap + batchCount * inSize);
      else
        inputSlice = inputTemp.slice(inMap + batchCount * inSize);

      arma::Mat<eT> deltaSlice = mappedError.slice(outMap);
      arma::Mat<eT> output;
      GradientConvolutionRule::Convolution(inputSlice, deltaSlice,
          output, strideWidth, strideHeight);

      // Add the result to the kernel gradient, clipped to whichever of the
      // two shapes is smaller so neither side is indexed out of bounds.
      if (gradientTemp.n_rows < output.n_rows ||
          gradientTemp.n_cols < output.n_cols)
      {
        gradientTemp.slice(outMapIdx) += output.submat(0, 0,
            gradientTemp.n_rows - 1, gradientTemp.n_cols - 1);
      }
      else if (gradientTemp.n_rows > output.n_rows ||
          gradientTemp.n_cols > output.n_cols)
      {
        gradientTemp.slice(outMapIdx).submat(0, 0, output.n_rows - 1,
            output.n_cols - 1) += output;
      }
      else
      {
        gradientTemp.slice(outMapIdx) += output;
      }
    }

    // Bias gradient of this output map: the sum of its error slice,
    // ACCUMULATED over the batch. Plain assignment here would keep only the
    // contribution of the last batch element, unlike the kernel gradient
    // above, which is summed over all batch elements.
    gradient(weight.n_elem + (outMap % outSize), 0) +=
        arma::accu(mappedError.slice(outMap));
  }
}
首先是 input 和 error 进行卷积
接着将这个卷积的结果以不出界的形式加到 gradientTemp 中
一轮 batch 后,gradient 再加上 mappedError 相应 Slice 的元素之和
iris
#include
#include
#include
#include
using namespace arma;
using namespace mlpack;
using namespace mlpack::ann;
// Train a small feed forward network on the iris data set and print the
// classification accuracy on the test split.
void ffn_test()
{
  // Load the data. The third argument (`fatal`) is set so that a missing or
  // unreadable file stops the program with an error instead of silently
  // leaving the matrix empty.
  mat train_data;
  mat train_labels;
  mat test_data;
  mat test_labels;
  mlpack::data::Load("/home/aurainting/下载/mlpack-3.4.2/build/iris_train.csv", train_data, true);
  mlpack::data::Load("/home/aurainting/下载/mlpack-3.4.2/build/iris_train_labels.csv", train_labels, true);
  mlpack::data::Load("/home/aurainting/下载/mlpack-3.4.2/build/iris_test.csv", test_data, true);
  mlpack::data::Load("/home/aurainting/下载/mlpack-3.4.2/build/iris_test_labels.csv", test_labels, true);

  // Build the model: features -> 6 -> 4 -> 3 classes, with log-softmax
  // output for the default negative log likelihood loss.
  FFN<> model;
  model.Add<Linear<>>(train_data.n_rows, 6);
  model.Add<ReLULayer<>>();
  model.Add<Linear<>>(6, 4);
  model.Add<ReLULayer<>>();
  model.Add<Linear<>>(4, 3);
  model.Add<LogSoftMax<>>();

  // Train. The loss expects class indices starting at 1, hence the `+ 1` on
  // the stored labels.
  model.Train<ens::Adam>(train_data, train_labels + 1, ens::ProgressBar());

  // Predict: the predicted class of a point is the row index of the largest
  // output value in its column.
  mat res;
  model.Predict(test_data, res);
  mat pred(1, test_labels.n_cols);
  for (size_t i = 0; i < res.n_cols; ++i)
    pred(0, i) = arma::index_max(res.col(i));

  cout << "accuracy: "
       << static_cast<double>(arma::accu(pred == test_labels)) / test_labels.n_cols << endl;
}
// Program entry point: run the iris example.
int main()
{
  ffn_test();
  return 0;
}
mnist
#include
#include
#include
#include
#include
#include
using namespace std;
using namespace arma;
using namespace mlpack;
using namespace mlpack::ann;
// Reverse the byte order of a 32-bit integer (big-endian <-> little-endian).
// The MNIST file headers store their integers most-significant byte first.
int reverseInt(int i)
{
  const unsigned int bits = static_cast<unsigned int>(i);
  const unsigned int swapped =
      ((bits & 0x000000FFu) << 24) |
      ((bits & 0x0000FF00u) << 8) |
      ((bits >> 8) & 0x0000FF00u) |
      ((bits >> 24) & 0x000000FFu);
  return static_cast<int>(swapped);
}
// Read an MNIST label file into `labels` as a single row with one entry per
// item. If the file cannot be opened, `labels` is left untouched.
void read_mnist_labels(const string& filepath, mat& labels)
{
  ifstream file(filepath, ios::binary);
  if (!file.is_open())
    return;

  // Header: two 32-bit integers stored most-significant byte first. The
  // magic number is read only to advance past it.
  int magic_number = 0;
  int number_of_items = 0;
  file.read(reinterpret_cast<char*>(&magic_number), sizeof (magic_number));
  file.read(reinterpret_cast<char*>(&number_of_items),
      sizeof (number_of_items));
  number_of_items = reverseInt(number_of_items);

  // Payload: one unsigned byte per label.
  labels.resize(1, number_of_items);
  for (int item = 0; item < number_of_items; ++item)
  {
    unsigned char value = 0;
    file.read(reinterpret_cast<char*>(&value), sizeof (value));
    labels(0, item) = value;
  }
}
// Read an MNIST image file into `images`, one image per column and one pixel
// per row, in the order the pixels appear in the file. If the file cannot be
// opened, `images` is left untouched.
void read_mnist_images(const string& filepath, mat& images)
{
  ifstream file(filepath, ios::binary);
  if (!file.is_open())
    return;

  // Header: four 32-bit integers stored most-significant byte first. The
  // magic number is read only to advance past it.
  int magic_number = 0;
  int number_of_images = 0;
  int n_rows = 0;
  int n_cols = 0;
  file.read(reinterpret_cast<char*>(&magic_number), sizeof (magic_number));
  file.read(reinterpret_cast<char*>(&number_of_images),
      sizeof (number_of_images));
  file.read(reinterpret_cast<char*>(&n_rows), sizeof (n_rows));
  file.read(reinterpret_cast<char*>(&n_cols), sizeof (n_cols));
  number_of_images = reverseInt(number_of_images);
  n_rows = reverseInt(n_rows);
  n_cols = reverseInt(n_cols);

  // Payload: one unsigned byte per pixel, image after image.
  const int pixels_per_image = n_rows * n_cols;
  images.reshape(pixels_per_image, number_of_images);
  for (int img = 0; img < number_of_images; ++img)
  {
    for (int px = 0; px < pixels_per_image; ++px)
    {
      unsigned char value = 0;
      file.read(reinterpret_cast<char*>(&value), sizeof (value));
      images(px, img) = value;
    }
  }
}
void ffn_test()
{
// load data
string train_labels_path = "/home/aurainting/文档/data/mnist/train-labels-idx1-ubyte";
string train_images_path = "/home/aurainting/文档/data/mnist/train-images-idx3-ubyte";
string test_labels_path = "/home/aurainting/文档/data/mnist/t10k-labels-idx1-ubyte";
string test_images_path = "/home/aurainting/文档/data/mnist/t10k-images-idx3-ubyte";
mat train_labels;
mat test_labels;
mat train_images;
mat test_images;
read_mnist_labels(train_labels_path, train_labels);
read_mnist_labels(test_labels_path, test_labels);
read_mnist_images(train_images_path, train_images);
read_mnist_images(test_images_path, test_images);
// normalize
uword nPoints = train_images.n_cols;
for (uword i = 0; i < nPoints; ++i)
train_images.col(i) /= norm(train_images.col(i), 2);
nPoints = test_images.n_cols;
for (uword i = 0; i < nPoints; ++i)
test_images.col(i) /= norm(test_images.col(i), 2);
// build model
FFN<> model;
model.Add<Convolution<>>(1, 8, 5, 5, 1, 1, 0, 0, 28, 28);
model.Add<ReLULayer<>>();
model.Add<MaxPooling<>>(8, 8, 2, 2);
model.Add<Convolution<>>(8, 12, 2, 2);
model.Add<ReLULayer<>>();
model.Add<MaxPooling<>>(2, 2, 2, 2);
model.Add<Linear<>>(192, 32);
model.Add<ReLULayer<>>();
model.Add<Linear<>>(32, 10);
model.Add<LogSoftMax<>>();
// train
ens::Adam opt(0.001, 8, 0.9, 0.999, 1e-8, 8 * train_images.n_cols);
model.Train<ens::Adam>(train_images, train_labels + 1, opt, ens::ProgressBar());
// predict
mat results;
model.Predict(test_images, results);
mat pred(1, results.n_cols);
for (size_t i = 0; i < results.n_cols; ++i)
pred(0, i) = arma::index_max(results.col(i));
cout << "accuracy: "
<< static_cast<double>(arma::accu(pred == test_labels)) / test_labels.n_cols << endl;
}
// Program entry point: run the MNIST example.
int main()
{
  ffn_test();
  return 0;
}
Artificial Neural Network
Armadillo