FFN(mlpack)

Feedforward neural network

  • FFN
    • Constructor
    • Train
    • Evaluate
      • Reset
      • Forward
      • Loss
    • Gradient
    • EvaluateWithGradient
    • Predict
  • Layer
    • Linear
      • Constructor
      • Forward
      • Backward
      • Gradient
    • Convolution
      • Constructor
      • Forward
      • Backward
      • Gradient
  • Test
  • Reference

FFN

Constructor

Main constructor header:

/**
 * Implementation of a standard feed forward network.
 *
 * @tparam OutputLayerType The output layer type used to evaluate the network.
 * @tparam InitializationRuleType Rule used to initialize the weight matrix.
 * @tparam CustomLayers Any set of custom layers that could be a part of the
 *         feed forward network.
 */
template<
  typename OutputLayerType = NegativeLogLikelihood<>,
  typename InitializationRuleType = RandomInitialization,
  typename... CustomLayers
>
class FFN
{
 public:
  //! Convenience typedef for the internal model construction.
  using NetworkType = FFN<OutputLayerType, InitializationRuleType>;

  /**
   * Create the FFN object.
   *
   * Optionally, specify which initialize rule and performance function should
   * be used.
   *
   * If you want to pass in a parameter and discard the original parameter
   * object, be sure to use std::move to avoid unnecessary copy.
   *
   * @param outputLayer Output layer used to evaluate the network.
   * @param initializeRule Optional instantiated InitializationRule object
   *        for initializing the network parameter.
   */
  FFN(OutputLayerType outputLayer = OutputLayerType(),
      InitializationRuleType initializeRule = InitializationRuleType());

Implementation:

template<typename OutputLayerType, typename InitializationRuleType,
         typename... CustomLayers>
FFN<OutputLayerType, InitializationRuleType, CustomLayers...>::FFN(
    OutputLayerType outputLayer, InitializationRuleType initializeRule) :
    outputLayer(std::move(outputLayer)),
    initializeRule(std::move(initializeRule)),
    width(0),
    height(0),
    reset(false),
    numFunctions(0),
    deterministic(false)
{
  /* Nothing to do here. */
}

The constructor has two main template parameters, OutputLayerType and InitializationRuleType; let's look at their default implementations.

NegativeLogLikelihood header:

/**
 * Implementation of the negative log likelihood layer. The negative log
 * likelihood layer expectes that the input contains log-probabilities for each
 * class. The layer also expects a class index, in the range between 1 and the
 * number of classes, as target when calling the Forward function.
 *
 * @tparam InputDataType Type of the input data (arma::colvec, arma::mat,
 *         arma::sp_mat or arma::cube).
 * @tparam OutputDataType Type of the output data (arma::colvec, arma::mat,
 *         arma::sp_mat or arma::cube).
 */
template <
    typename InputDataType = arma::mat,
    typename OutputDataType = arma::mat
>
class NegativeLogLikelihood
{
 public:
  /**
   * Create the NegativeLogLikelihoodLayer object.
   */
  NegativeLogLikelihood();

  /**
   * Computes the Negative log likelihood.
   *
   * @param input Input data used for evaluating the specified function.
   * @param target The target vector, that contains the class index in the range
   *        between 1 and the number of classes.
   */
  template<typename InputType, typename TargetType>
  typename InputType::elem_type Forward(const InputType& input,
                                        const TargetType& target);

  /**
   * Ordinary feed backward pass of a neural network. The negative log
   * likelihood layer expects that the input contains log-probabilities for
   * each class. The layer also expects a class index, in the range between 1
   * and the number of classes, as target when calling the Forward function.
   *
   * @param input The propagated input activation.
   * @param target The target vector, that contains the class index in the range
   *        between 1 and the number of classes.
   * @param output The calculated error.
   */
  template<typename InputType, typename TargetType, typename OutputType>
  void Backward(const InputType& input,
                const TargetType& target,
                OutputType& output);

  //! Get the input parameter.
  InputDataType& InputParameter() const { return inputParameter; }
  //! Modify the input parameter.
  InputDataType& InputParameter() { return inputParameter; }

  //! Get the output parameter.
  OutputDataType& OutputParameter() const { return outputParameter; }
  //! Modify the output parameter.
  OutputDataType& OutputParameter() { return outputParameter; }

  //! Get the delta.
  OutputDataType& Delta() const { return delta; }
  //! Modify the delta.
  OutputDataType& Delta() { return delta; }

  /**
   * Serialize the layer
   */
  template<typename Archive>
  void serialize(Archive& /* ar */, const unsigned int /* version */);

 private:
  //! Locally-stored delta object.
  OutputDataType delta;

  //! Locally-stored input parameter object.
  InputDataType inputParameter;

  //! Locally-stored output parameter object.
  OutputDataType outputParameter;
}; // class NegativeLogLikelihood

Implementation:

template<typename InputDataType, typename OutputDataType>
NegativeLogLikelihood<InputDataType, OutputDataType>::NegativeLogLikelihood()
{
  // Nothing to do here.
}

template<typename InputDataType, typename OutputDataType>
template<typename InputType, typename TargetType>
typename InputType::elem_type
NegativeLogLikelihood<InputDataType, OutputDataType>::Forward(
    const InputType& input,
    const TargetType& target)
{
  typedef typename InputType::elem_type ElemType;
  ElemType output = 0;
  for (size_t i = 0; i < input.n_cols; ++i)
  {
    size_t currentTarget = target(i) - 1;
    Log::Assert(currentTarget < input.n_rows,
        "Target class out of range.");

    output -= input(currentTarget, i);
  }

  return output;
}

template<typename InputDataType, typename OutputDataType>
template<typename InputType, typename TargetType, typename OutputType>
void NegativeLogLikelihood<InputDataType, OutputDataType>::Backward(
      const InputType& input,
      const TargetType& target,
      OutputType& output)
{
  output = arma::zeros<OutputType>(input.n_rows, input.n_cols);
  for (size_t i = 0; i < input.n_cols; ++i)
  {
    size_t currentTarget = target(i) - 1;
    Log::Assert(currentTarget < input.n_rows,
        "Target class out of range.");

    output(currentTarget, i) = -1;
  }
}

template<typename InputDataType, typename OutputDataType>
template<typename Archive>
void NegativeLogLikelihood<InputDataType, OutputDataType>::serialize(
    Archive& /* ar */,
    const unsigned int /* version */)
{
  // Nothing to do here.
}

The important parts of the negative log likelihood loss are the two methods Forward and Backward. Let's introduce some notation:

$$
input: (X_1, \cdots, X_N), \quad X_i \in \mathbb{R}^n \;\; \forall\, i \in [1, N]
\Rightarrow
\begin{bmatrix}
x_{11} & x_{12} & \cdots & x_{1N} \\
& \vdots & \\
x_{n1} & x_{n2} & \cdots & x_{nN}
\end{bmatrix}
$$

$$
target: (y_1, \cdots, y_N), \quad y_i \in [1, m]
$$

Therefore:
Forward:

$$
output = -\sum_{i=1}^N x_{(y_i, i)}, \quad y_i \leqslant n
$$

Backward:

$$
(n \times N): \quad output_{(j, i)} =
\begin{cases}
-1, & j = y_i \;\; (y_i \leqslant n) \\
0, & otherwise
\end{cases}
$$
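
To make the two formulas concrete, here is a minimal sketch; the numbers are invented for illustration (n = 3 classes, N = 2 data points):

// Hypothetical values: inputs are log-probabilities, targets are 1-based
// class indices.
arma::mat input = { { -0.2, -1.5 },
                    { -1.7, -0.3 },
                    { -2.3, -2.0 } };
arma::rowvec target("1 2");

mlpack::ann::NegativeLogLikelihood<> nll;
// Forward: -input(0, 0) - input(1, 1) = 0.2 + 0.3 = 0.5.
double loss = nll.Forward(input, target);

arma::mat grad;
// Backward: all zeros except grad(0, 0) = grad(1, 1) = -1.
nll.Backward(input, target, grad);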

RandomInitialization:

/**
 * This class is used to initialize randomly the weight matrix.
 */
class RandomInitialization
{
 public:
  /**
   * Initialize the random initialization rule with the given lower bound and
   * upper bound.
   *
   * @param lowerBound The number used as lower bound.
   * @param upperBound The number used as upper bound.
   */
  RandomInitialization(const double lowerBound = -1,
                       const double upperBound = 1) :
      lowerBound(lowerBound), upperBound(upperBound) { }

  /**
   * Initialize the random initialization rule with the given bound.
   * Using the negative of the bound as lower bound and the positive bound as
   * upper bound.
   *
   * @param bound The number used as lower bound
   */
  RandomInitialization(const double bound) :
      lowerBound(-std::abs(bound)), upperBound(std::abs(bound)) { }

  /**
   * Initialize randomly the elements of the specified weight matrix.
   *
   * @param W Weight matrix to initialize.
   * @param rows Number of rows.
   * @param cols Number of columns.
   */
  template<typename eT>
  void Initialize(arma::Mat<eT>& W, const size_t rows, const size_t cols)
  {
    if (W.is_empty())
      W.set_size(rows, cols);

    W.randu();
    W *= (upperBound - lowerBound);
    W += lowerBound;
  }

  /**
   * Initialize randomly the elements of the specified weight matrix.
   *
   * @param W Weight matrix to initialize.
   */
  template<typename eT>
  void Initialize(arma::Mat<eT>& W)
  {
    if (W.is_empty())
      Log::Fatal << "Cannot initialize an empty matrix." << std::endl;

    W.randu();
    W *= (upperBound - lowerBound);
    W += lowerBound;
  }

  /**
   * Initialize randomly the elements of the specified weight 3rd order tensor.
   *
   * @param W Weight matrix to initialize.
   * @param rows Number of rows.
   * @param cols Number of columns.
   * @param slices Number of slices.
   */
  template<typename eT>
  void Initialize(arma::Cube<eT>& W,
                  const size_t rows,
                  const size_t cols,
                  const size_t slices)
  {
    if (W.is_empty())
      W.set_size(rows, cols, slices);

    for (size_t i = 0; i < slices; ++i)
      Initialize(W.slice(i), rows, cols);
  }

  /**
   * Initialize randomly the elements of the specified weight 3rd order tensor.
   *
   * @param W Weight matrix to initialize.
   */
  template<typename eT>
  void Initialize(arma::Cube<eT>& W)
  {
    if (W.is_empty())
      Log::Fatal << "Cannot initialize an empty cube." << std::endl;

    for (size_t i = 0; i < W.n_slices; ++i)
      Initialize(W.slice(i));
  }

 private:
  //! The number used as lower bound.
  double lowerBound;

  //! The number used as upper bound.
  double upperBound;
}; // class RandomInitialization

The official documentation describes .randu() as:

.randu() uses a uniform distribution in the [0,1] interval

So this initialization rule first draws values from $U(0, 1)$, multiplies them by $(upperBound - lowerBound)$, and adds $lowerBound$.
Hence:

$$
E(W) = \dfrac{upperBound + lowerBound}{2}, \qquad
D(W) = \dfrac{(upperBound - lowerBound)^2}{12}
$$
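
A minimal usage sketch (the bounds and shape are illustrative):

arma::mat W;
mlpack::ann::RandomInitialization init(-0.5, 0.5);
init.Initialize(W, 3, 2);  // W is 3 x 2, entries uniform on [-0.5, 0.5]
// Per the formulas above: E(W) = 0, D(W) = 1/12.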

Train

Train header:

  /**
   * Train the feedforward network on the given input data using the given
   * optimizer.
   *
   * This will use the existing model parameters as a starting point for the
   * optimization. If this is not what you want, then you should access the
   * parameters vector directly with Parameters() and modify it as desired.
   *
   * If you want to pass in a parameter and discard the original parameter
   * object, be sure to use std::move to avoid unnecessary copy.
   *
   * @tparam OptimizerType Type of optimizer to use to train the model.
   * @tparam CallbackTypes Types of Callback Functions.
   * @param predictors Input training variables.
   * @param responses Outputs results from input training variables.
   * @param optimizer Instantiated optimizer used to train the model.
   * @param callbacks Callback function for ensmallen optimizer `OptimizerType`.
   *      See https://www.ensmallen.org/docs.html#callback-documentation.
   * @return The final objective of the trained model (NaN or Inf on error).
   */
  template<typename OptimizerType, typename... CallbackTypes>
  double Train(arma::mat predictors,
               arma::mat responses,
               OptimizerType& optimizer,
               CallbackTypes&&... callbacks);

Implementation:

template<typename OutputLayerType, typename InitializationRuleType,
         typename... CustomLayers>
template<typename OptimizerType, typename... CallbackTypes>
double FFN<OutputLayerType, InitializationRuleType, CustomLayers...>::Train(
      arma::mat predictors,
      arma::mat responses,
      OptimizerType& optimizer,
      CallbackTypes&&... callbacks)
{
  ResetData(std::move(predictors), std::move(responses));

  WarnMessageMaxIterations<OptimizerType>(optimizer, this->predictors.n_cols);

  // Train the model.
  Timer::Start("ffn_optimization");
  const double out = optimizer.Optimize(*this, parameter, callbacks...);
  Timer::Stop("ffn_optimization");

  Log::Info << "FFN::FFN(): final objective of trained model is " << out
      << "." << std::endl;
  return out;
}

After the model is constructed, it is trained on the given dataset and labels. The implementation is easy to follow:
it hands itself to an optimizer from ensmallen as the objective function to optimize, along with the parameter matrix parameter.
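
For context, a hedged usage sketch of this workflow; the layer sizes, optimizer settings, and the trainX/trainY matrices are illustrative, not from the source:

#include <mlpack/core.hpp>
#include <mlpack/methods/ann/ffn.hpp>
#include <mlpack/methods/ann/layer/layer.hpp>
#include <ensmallen.hpp>

using namespace mlpack::ann;

// trainX: 10 x N predictors; trainY: 1 x N labels in 1..3 (assumed given).
FFN<NegativeLogLikelihood<>, RandomInitialization> model;
model.Add<Linear<>>(10, 5);
model.Add<ReLULayer<>>();
model.Add<Linear<>>(5, 3);
model.Add<LogSoftMax<>>();

ens::Adam optimizer(0.01, 32);           // step size, batch size
model.Train(trainX, trainY, optimizer);  // returns the final objective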

Recalling the Adam optimization algorithm introduced earlier, we can guess that this model must implement Evaluate and Gradient functions.

Sure enough:

Evaluate

Evaluate header:

  /**
   * Evaluate the feedforward network with the given parameters. This function
   * is usually called by the optimizer to train the model.
   *
   * @param parameters Matrix model parameters.
   */
  double Evaluate(const arma::mat& parameters);

  /**
   * Evaluate the feedforward network with the given parameters, but using only
   * a number of data points. This is useful for optimizers such as SGD, which
   * require a separable objective function.
   *
   * @param parameters Matrix model parameters.
   * @param begin Index of the starting point to use for objective function
   *        evaluation.
   * @param batchSize Number of points to be passed at a time to use for
   *        objective function evaluation.
   * @param deterministic Whether or not to train or test the model. Note some
   *        layer act differently in training or testing mode.
   */
  double Evaluate(const arma::mat& parameters,
                  const size_t begin,
                  const size_t batchSize,
                  const bool deterministic);

Implementation:

template<typename OutputLayerType, typename InitializationRuleType,
         typename... CustomLayers>
double FFN<OutputLayerType, InitializationRuleType, CustomLayers...>::Evaluate(
    const arma::mat& parameters)
{
  double res = 0;
  for (size_t i = 0; i < predictors.n_cols; ++i)
    res += Evaluate(parameters, i, 1, true);

  return res;
}

template<typename OutputLayerType, typename InitializationRuleType,
         typename... CustomLayers>
double FFN<OutputLayerType, InitializationRuleType, CustomLayers...>::Evaluate(
    const arma::mat& /* parameters */,
    const size_t begin,
    const size_t batchSize,
    const bool deterministic)
{
  if (parameter.is_empty())
    ResetParameters();

  if (deterministic != this->deterministic)
  {
    this->deterministic = deterministic;
    ResetDeterministic();
  }

  Forward(predictors.cols(begin, begin + batchSize - 1));
  double res = outputLayer.Forward(
      boost::apply_visitor(outputParameterVisitor, network.back()),
      responses.cols(begin, begin + batchSize - 1));

  for (size_t i = 0; i < network.size(); ++i)
  {
    res += boost::apply_visitor(lossVisitor, network[i]);
  }

  return res;
}

First, let's look at the two Reset methods:

Reset

ResetDeterministic

template<typename OutputLayerType, typename InitializationRuleType,
         typename... CustomLayers>
void FFN<OutputLayerType, InitializationRuleType,
         CustomLayers...>::ResetDeterministic()
{
  DeterministicSetVisitor deterministicSetVisitor(deterministic);
  std::for_each(network.begin(), network.end(),
      boost::apply_visitor(deterministicSetVisitor));
}

Two library facilities are used here. The first is std::for_each, whose prototype is:

UnaryProc for_each ( InputIterator beg, InputIterator end, UnaryProc op)

From this we can guess that boost::apply_visitor must act as a function object here:

boost::apply_visitor — Allows compile-time checked type-safe application of the given visitor to the content of the given variant, ensuring that all types are handled by the visitor.

apply_visitor has several overloads; here it is used as a unary function object, i.e. deterministicSetVisitor is applied to each element of network in turn, as the standalone sketch below illustrates.
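
A minimal standalone sketch of the pattern, independent of mlpack; the one-argument overload of boost::apply_visitor returns a unary function object suitable for std::for_each:

#include <algorithm>
#include <iostream>
#include <string>
#include <vector>
#include <boost/variant.hpp>

// Print whichever type the variant currently holds.
struct PrintVisitor : public boost::static_visitor<void>
{
  void operator()(int v) const { std::cout << v << '\n'; }
  void operator()(const std::string& v) const { std::cout << v << '\n'; }
};

int main()
{
  std::vector<boost::variant<int, std::string>> items{ 1, std::string("two") };

  // apply_visitor with only a visitor argument yields a unary function
  // object; for_each applies it to every variant in turn, just as
  // ResetDeterministic does with deterministicSetVisitor over `network`.
  std::for_each(items.begin(), items.end(),
      boost::apply_visitor(PrintVisitor()));
}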

Digging further, here is the DeterministicSetVisitor header:

/**
 * DeterministicSetVisitor set the deterministic parameter given the
 * deterministic value.
 */
class DeterministicSetVisitor : public boost::static_visitor<void>
{
 public:
  //! Set the deterministic parameter given the current deterministic value.
  DeterministicSetVisitor(const bool deterministic = true);

  //! Set the deterministic parameter.
  template<typename LayerType>
  void operator()(LayerType* layer) const;

  void operator()(MoreTypes layer) const;

 private:
  //! The deterministic parameter.
  const bool deterministic;

  //! Set the deterministic parameter if the module implements the
  //! Deterministic() and Model() function.
  template<typename T>
  typename std::enable_if<
      HasDeterministicCheck<T, bool&(T::*)(void)>::value &&
      HasModelCheck<T>::value, void>::type
  LayerDeterministic(T* layer) const;

  //! Set the deterministic parameter if the module implements the
  //! Model() function.
  template<typename T>
  typename std::enable_if<
      !HasDeterministicCheck<T, bool&(T::*)(void)>::value &&
      HasModelCheck<T>::value, void>::type
  LayerDeterministic(T* layer) const;

  //! Set the deterministic parameter if the module implements the
  //! Deterministic() function.
  template<typename T>
  typename std::enable_if<
      HasDeterministicCheck<T, bool&(T::*)(void)>::value &&
      !HasModelCheck<T>::value, void>::type
  LayerDeterministic(T* layer) const;

  //! Do not set the deterministic parameter if the module doesn't implement the
  //! Deterministic() or Model() function.
  template<typename T>
  typename std::enable_if<
      !HasDeterministicCheck<T, bool&(T::*)(void)>::value &&
      !HasModelCheck<T>::value, void>::type
  LayerDeterministic(T* layer) const;
};

Implementation:

//! DeterministicSetVisitor visitor class.
inline DeterministicSetVisitor::DeterministicSetVisitor(
    const bool deterministic) : deterministic(deterministic)
{
  /* Nothing to do here. */
}

template<typename LayerType>
inline void DeterministicSetVisitor::operator()(LayerType* layer) const
{
  LayerDeterministic(layer);
}

inline void DeterministicSetVisitor::operator()(MoreTypes layer) const
{
  layer.apply_visitor(*this);
}

template<typename T>
inline typename std::enable_if<
    HasDeterministicCheck<T, bool&(T::*)(void)>::value &&
    HasModelCheck<T>::value, void>::type
DeterministicSetVisitor::LayerDeterministic(T* layer) const
{
  layer->Deterministic() = deterministic;

  for (size_t i = 0; i < layer->Model().size(); ++i)
  {
    boost::apply_visitor(DeterministicSetVisitor(deterministic),
        layer->Model()[i]);
  }
}

template<typename T>
inline typename std::enable_if<
    !HasDeterministicCheck<T, bool&(T::*)(void)>::value &&
    HasModelCheck<T>::value, void>::type
DeterministicSetVisitor::LayerDeterministic(T* layer) const
{
  for (size_t i = 0; i < layer->Model().size(); ++i)
  {
    boost::apply_visitor(DeterministicSetVisitor(deterministic),
        layer->Model()[i]);
  }
}

template<typename T>
inline typename std::enable_if<
    HasDeterministicCheck<T, bool&(T::*)(void)>::value &&
    !HasModelCheck<T>::value, void>::type
DeterministicSetVisitor::LayerDeterministic(T* layer) const
{
  layer->Deterministic() = deterministic;
}

template<typename T>
inline typename std::enable_if<
    !HasDeterministicCheck<T, bool&(T::*)(void)>::value &&
    !HasModelCheck<T>::value, void>::type
DeterministicSetVisitor::LayerDeterministic(T* /* input */) const
{
  /* Nothing to do here. */
}

Overall, it keeps applying DeterministicSetVisitor to every element of layer->Model(), with the exact behavior depending on the layer type (whether it implements Deterministic() and/or Model()).

ResetParameters

template<typename OutputLayerType, typename InitializationRuleType,
         typename... CustomLayers>
void FFN<OutputLayerType, InitializationRuleType,
         CustomLayers...>::ResetParameters()
{
  ResetDeterministic();

  // Reset the network parameter with the given initialization rule.
  NetworkInitialization<InitializationRuleType,
                        CustomLayers...> networkInit(initializeRule);
  networkInit.Initialize(network, parameter);
}

The RandomInitialization rule introduced earlier comes into play here; but first, let's look at NetworkInitialization.

network_init

/**
 * This class is used to initialize the network with the given initialization
 * rule.
 */
template<typename InitializationRuleType, typename... CustomLayers>
class NetworkInitialization
{
 public:
  /**
   * Use the given initialization rule to initialize the specified network.
   *
   * @param initializeRule Rule to initialize the given network.
   */
  NetworkInitialization(
      const InitializationRuleType& initializeRule = InitializationRuleType()) :
      initializeRule(initializeRule)
  {
    // Nothing to do here.
  }

  /**
   * Initialize the specified network and store the results in the given
   * parameter.
   *
   * @param network Network that should be initialized.
   * @param parameter The network parameter.
   * @param parameterOffset Offset for network paramater, default 0.
   */
  template <typename eT>
  void Initialize(const std::vector<LayerTypes<CustomLayers...> >& network,
                  arma::Mat<eT>& parameter, size_t parameterOffset = 0)
  {
    // Determine the number of parameter/weights of the given network.
    if (parameter.is_empty())
    {
      size_t weights = 0;
      for (size_t i = 0; i < network.size(); ++i)
        weights += boost::apply_visitor(weightSizeVisitor, network[i]);
      parameter.set_size(weights, 1);
    }

    // Initialize the network layer by layer or the complete network.
    if (ann::InitTraits<InitializationRuleType>::UseLayer)
    {
      for (size_t i = 0, offset = parameterOffset; i < network.size(); ++i)
      {
        // Initialize the layer with the specified parameter/weight
        // initialization rule.
        const size_t weight = boost::apply_visitor(weightSizeVisitor,
            network[i]);
        arma::Mat<eT> tmp = arma::mat(parameter.memptr() + offset,
            weight, 1, false, false);
        initializeRule.Initialize(tmp, tmp.n_elem, 1);

        // Increase the parameter/weight offset for the next layer.
        offset += weight;
      }
    }
    else
    {
      initializeRule.Initialize(parameter, parameter.n_elem, 1);
    }

    // Note: We can't merge the for loop into the for loop above because
    // WeightSetVisitor also sets the parameter/weights of the inner modules.
    // Inner Modules are held by the parent module e.g. the concat module can
    // hold various other modules.
    for (size_t i = 0, offset = parameterOffset; i < network.size(); ++i)
    {
      offset += boost::apply_visitor(WeightSetVisitor(parameter, offset),
          network[i]);

      boost::apply_visitor(resetVisitor, network[i]);
    }
  }

 private:
  //! Instantiated InitializationRule object for initializing the network
  //! parameter.
  InitializationRuleType initializeRule;

  //! Locally-stored reset visitor.
  ResetVisitor resetVisitor;

  //! Locally-stored weight size visitor.
  WeightSizeVisitor weightSizeVisitor;
}; // class NetworkInitialization

First, weightSizeVisitor is applied to every element of network to determine the shape of parameter.

WeightSizeVisitor header:

/**
 * WeightSizeVisitor returns the number of weights of the given module.
 */
class WeightSizeVisitor : public boost::static_visitor<size_t>
{
 public:
  //! Return the number of weights.
  template<typename LayerType>
  size_t operator()(LayerType* layer) const;

  size_t operator()(MoreTypes layer) const;

 private:
  //! If the module doesn't implement the Parameters() or Model() function
  //! return 0.
  template<typename T, typename P>
  typename std::enable_if<
      !HasParametersCheck<T, P&(T::*)()>::value &&
      !HasModelCheck<T>::value, size_t>::type
  LayerSize(T* layer, P& output) const;

  //! Return the number of parameters if the module implements the Model()
  //! function.
  template<typename T, typename P>
  typename std::enable_if<
      !HasParametersCheck<T, P&(T::*)()>::value &&
      HasModelCheck<T>::value, size_t>::type
  LayerSize(T* layer, P& output) const;

  //! Return the number of parameters if the module implements the Parameters()
  //! function.
  template<typename T, typename P>
  typename std::enable_if<
      HasParametersCheck<T, P&(T::*)()>::value &&
      !HasModelCheck<T>::value, size_t>::type
  LayerSize(T* layer, P& output) const;

  //! Return the accumulated number of parameters if the module implements the
  //! Parameters() and Model() function.
  template<typename T, typename P>
  typename std::enable_if<
      HasParametersCheck<T, P&(T::*)()>::value &&
      HasModelCheck<T>::value, size_t>::type
  LayerSize(T* layer, P& output) const;
};

Implementation:

//! WeightSizeVisitor visitor class.
template<typename LayerType>
inline size_t WeightSizeVisitor::operator()(LayerType* layer) const
{
  return LayerSize(layer, layer->OutputParameter());
}

inline size_t WeightSizeVisitor::operator()(MoreTypes layer) const
{
  return layer.apply_visitor(*this);
}

template<typename T, typename P>
inline typename std::enable_if<
    !HasParametersCheck<T, P&(T::*)()>::value &&
    !HasModelCheck<T>::value, size_t>::type
WeightSizeVisitor::LayerSize(T* /* layer */, P& /* output */) const
{
  return 0;
}

template<typename T, typename P>
inline typename std::enable_if<
    !HasParametersCheck<T, P&(T::*)()>::value &&
    HasModelCheck<T>::value, size_t>::type
WeightSizeVisitor::LayerSize(T* layer, P& /* output */) const
{
  size_t weights = 0;
  for (size_t i = 0; i < layer->Model().size(); ++i)
  {
    weights += boost::apply_visitor(WeightSizeVisitor(), layer->Model()[i]);
  }

  return weights;
}

template<typename T, typename P>
inline typename std::enable_if<
    HasParametersCheck<T, P&(T::*)()>::value &&
    !HasModelCheck<T>::value, size_t>::type
WeightSizeVisitor::LayerSize(T* layer, P& /* output */) const
{
  return layer->Parameters().n_elem;
}

template<typename T, typename P>
inline typename std::enable_if<
    HasParametersCheck<T, P&(T::*)()>::value &&
    HasModelCheck<T>::value, size_t>::type
WeightSizeVisitor::LayerSize(T* layer, P& /* output */) const
{
  size_t weights = layer->Parameters().n_elem;
  for (size_t i = 0; i < layer->Model().size(); ++i)
  {
    weights += boost::apply_visitor(WeightSizeVisitor(), layer->Model()[i]);
  }

  return weights;
}

Roughly speaking, it returns the number of elements of the layer's Parameters() (if it has one), plus the result of recursively applying WeightSizeVisitor to every element of layer->Model() (if it has one).

Summing these counts over all layers gives the length of the column vector parameter.

InitTraits

/**
 * This is a template class that can provide information about various
 * initialization methods. By default, this class will provide the weakest
 * possible assumptions on the initialization method, and each initialization
 * method should override values as necessary. If a initialization method
 * doesn't need to override a value, then there's no need to write a InitTraits
 * specialization for that class.
 */
template<typename InitRuleType>
class InitTraits
{
 public:
  /**
   * This is true if the initialization method is used for a single layer.
   */
  static const bool UseLayer = true;
};

UseLayer decides whether the network is initialized layer by layer or all at once; the actual initialization work is done by the rule passed as the template parameter introduced earlier.
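
For example, a hypothetical rule that must see the whole parameter vector at once (WholeNetworkInitialization is a made-up name) would specialize the trait like this:

template<>
class InitTraits<WholeNetworkInitialization>
{
 public:
  //! Initialize the whole network at once rather than layer by layer.
  static const bool UseLayer = false;
};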

Beyond that, a resetVisitor is also applied to every layer of the network.

ResetVisitor header:

/**
 * ResetVisitor executes the Reset() function.
 */
class ResetVisitor : public boost::static_visitor<void>
{
 public:
  //! Execute the Reset() function.
  template<typename LayerType>
  void operator()(LayerType* layer) const;

  void operator()(MoreTypes layer) const;

 private:
  //! Execute the Reset() function for a module which implements the Reset()
  //! function.
  template<typename T>
  typename std::enable_if<
      HasResetCheck<T, void(T::*)()>::value &&
      !HasModelCheck<T>::value, void>::type
  ResetParameter(T* layer) const;

  //! Execute the Reset() function for a module which implements the Model()
  //! function.
  template<typename T>
  typename std::enable_if<
      !HasResetCheck<T, void(T::*)()>::value &&
      HasModelCheck<T>::value, void>::type
  ResetParameter(T* layer) const;

  //! Execute the Reset() function for a module which implements the Reset()
  //! and Model() function.
  template<typename T>
  typename std::enable_if<
      HasResetCheck<T, void(T::*)()>::value &&
      HasModelCheck<T>::value, void>::type
  ResetParameter(T* layer) const;

  //! Do not execute the Reset() function for a module which doesn't implement
  // the Reset() or Model() function.
  template<typename T>
  typename std::enable_if<
      !HasResetCheck<T, void(T::*)()>::value &&
      !HasModelCheck<T>::value, void>::type
  ResetParameter(T* layer) const;
};

Implementation:

//! ResetVisitor visitor class.
template<typename LayerType>
inline void ResetVisitor::operator()(LayerType* layer) const
{
  ResetParameter(layer);
}

inline void ResetVisitor::operator()(MoreTypes layer) const
{
  layer.apply_visitor(*this);
}

template<typename T>
inline typename std::enable_if<
    HasResetCheck<T, void(T::*)()>::value &&
    !HasModelCheck<T>::value, void>::type
ResetVisitor::ResetParameter(T* layer) const
{
  layer->Reset();
}

template<typename T>
inline typename std::enable_if<
    !HasResetCheck<T, void(T::*)()>::value &&
    HasModelCheck<T>::value, void>::type
ResetVisitor::ResetParameter(T* layer) const
{
  for (size_t i = 0; i < layer->Model().size(); ++i)
  {
    boost::apply_visitor(ResetVisitor(), layer->Model()[i]);
  }
}

template<typename T>
inline typename std::enable_if<
    HasResetCheck<T, void(T::*)()>::value &&
    HasModelCheck<T>::value, void>::type
ResetVisitor::ResetParameter(T* layer) const
{
  for (size_t i = 0; i < layer->Model().size(); ++i)
  {
    boost::apply_visitor(ResetVisitor(), layer->Model()[i]);
  }

  layer->Reset();
}

template<typename T>
inline typename std::enable_if<
    !HasResetCheck<T, void(T::*)()>::value &&
    !HasModelCheck<T>::value, void>::type
ResetVisitor::ResetParameter(T* /* layer */) const
{
  /* Nothing to do here. */
}

Much like the visitor classes we have seen before, it applies ResetVisitor to the elements of layer->Model() and calls the layer's Reset() function, depending on what the layer implements.

With that, both Reset methods are covered; next comes the Forward function:

Forward

In the common case batchSize is 1 (as in the single-parameter Evaluate above), so Forward is called on each column of predictors, i.e. on each data point.

Forward header:

  // Helper functions.
  /**
   * The Forward algorithm (part of the Forward-Backward algorithm).  Computes
   * forward probabilities for each module.
   *
   * @param input Data sequence to compute probabilities for.
   */
  template<typename InputType>
  void Forward(const InputType& input);

Implementation:

template<typename OutputLayerType, typename InitializationRuleType,
         typename... CustomLayers>
template<typename InputType>
void FFN<OutputLayerType, InitializationRuleType,
         CustomLayers...>::Forward(const InputType& input)
{
  boost::apply_visitor(ForwardVisitor(input,
      boost::apply_visitor(outputParameterVisitor, network.front())),
      network.front());

  if (!reset)
  {
    if (boost::apply_visitor(outputWidthVisitor, network.front()) != 0)
    {
      width = boost::apply_visitor(outputWidthVisitor, network.front());
    }

    if (boost::apply_visitor(outputHeightVisitor, network.front()) != 0)
    {
      height = boost::apply_visitor(outputHeightVisitor, network.front());
    }
  }

  for (size_t i = 1; i < network.size(); ++i)
  {
    if (!reset)
    {
      // Set the input width.
      boost::apply_visitor(SetInputWidthVisitor(width), network[i]);

      // Set the input height.
      boost::apply_visitor(SetInputHeightVisitor(height), network[i]);
    }

    boost::apply_visitor(ForwardVisitor(boost::apply_visitor(
        outputParameterVisitor, network[i - 1]),
        boost::apply_visitor(outputParameterVisitor, network[i])), network[i]);

    if (!reset)
    {
      // Get the output width.
      if (boost::apply_visitor(outputWidthVisitor, network[i]) != 0)
      {
        width = boost::apply_visitor(outputWidthVisitor, network[i]);
      }

      // Get the output height.
      if (boost::apply_visitor(outputHeightVisitor, network[i]) != 0)
      {
        height = boost::apply_visitor(outputHeightVisitor, network[i]);
      }
    }
  }

  if (!reset)
    reset = true;
}

I won't show these visitor classes, since each one simply calls the corresponding function on a layer.

For example, the first statement calls the Forward function of the first layer of network, passing input and the first layer's outputParameter as arguments.

Next, if this is the first call (reset is false after initialization), the width and height of the first layer's output are recorded (when nonzero). For each subsequent layer, the input shape is first set to match the previous layer's output; then the visitor call invokes that layer's Forward function, with the previous layer's outputParameter and this layer's outputParameter as arguments. This process repeats through the whole network.

Finally, reset is set to true.
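
Stripped of the visitor machinery and the shape bookkeeping, the forward pass amounts to this conceptual sketch (layer[i] and outputParameter[i] are shorthand, not real members):

// Conceptual sketch only.
layer[0].Forward(input, outputParameter[0]);
for (size_t i = 1; i < network.size(); ++i)
  layer[i].Forward(outputParameter[i - 1], outputParameter[i]);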

Loss

Back in Evaluate, the Forward function of outputLayer is then called (by default, the negative log likelihood loss introduced earlier), with the last layer's outputParameter and the corresponding columns of responses as arguments.

Adding every layer's own loss value (gathered via lossVisitor) to the returned result gives the final result of Evaluate.

Gradient

Gradient header:

  /**
   * Evaluate the gradient of the feedforward network with the given parameters,
   * and with respect to only a number of points in the dataset. This is useful
   * for optimizers such as SGD, which require a separable objective function.
   *
   * @param parameters Matrix of the model parameters to be optimized.
   * @param begin Index of the starting point to use for objective function
   *        gradient evaluation.
   * @param gradient Matrix to output gradient into.
   * @param batchSize Number of points to be processed as a batch for objective
   *        function gradient evaluation.
   */
  void Gradient(const arma::mat& parameters,
                const size_t begin,
                arma::mat& gradient,
                const size_t batchSize);

Implementation:

template<typename OutputLayerType, typename InitializationRuleType,
         typename... CustomLayers>
void FFN<OutputLayerType, InitializationRuleType, CustomLayers...>::Gradient(
    const arma::mat& parameters,
    const size_t begin,
    arma::mat& gradient,
    const size_t batchSize)
{
  this->EvaluateWithGradient(parameters, begin, gradient, batchSize);
}

Nothing much to say here; on to the next one.

EvaluateWithGradient

EvaluateWithGradient header:

  /**
   * Evaluate the feedforward network with the given parameters.
   * This function is usually called by the optimizer to train the model.
   * This just calls the overload of EvaluateWithGradient() with batchSize = 1.
   *
   * @param parameters Matrix model parameters.
   * @param gradient Matrix to output gradient into.
   */
  template<typename GradType>
  double EvaluateWithGradient(const arma::mat& parameters, GradType& gradient);

  /**
   * Evaluate the feedforward network with the given parameters, but using only
   * a number of data points. This is useful for optimizers such as SGD, which
   * require a separable objective function.
   *
   * @param parameters Matrix model parameters.
   * @param begin Index of the starting point to use for objective function
   *        evaluation.
   * @param gradient Matrix to output gradient into.
   * @param batchSize Number of points to be passed at a time to use for
   *        objective function evaluation.
   */
  template<typename GradType>
  double EvaluateWithGradient(const arma::mat& parameters,
                              const size_t begin,
                              GradType& gradient,
                              const size_t batchSize);

Implementation:

template<typename OutputLayerType, typename InitializationRuleType,
         typename... CustomLayers>
template<typename GradType>
double FFN<OutputLayerType, InitializationRuleType, CustomLayers...>::
EvaluateWithGradient(const arma::mat& parameters, GradType& gradient)
{
  double res = 0;
  for (size_t i = 0; i < predictors.n_cols; ++i)
    res += EvaluateWithGradient(parameters, i, gradient, 1);

  return res;
}

template<typename OutputLayerType, typename InitializationRuleType,
         typename... CustomLayers>
template<typename GradType>
double FFN<OutputLayerType, InitializationRuleType, CustomLayers...>::
EvaluateWithGradient(const arma::mat& /* parameters */,
                     const size_t begin,
                     GradType& gradient,
                     const size_t batchSize)
{
  if (gradient.is_empty())
  {
    if (parameter.is_empty())
      ResetParameters();

    gradient = arma::zeros<arma::mat>(parameter.n_rows, parameter.n_cols);
  }
  else
  {
    gradient.zeros();
  }

  if (this->deterministic)
  {
    this->deterministic = false;
    ResetDeterministic();
  }

  Forward(predictors.cols(begin, begin + batchSize - 1));
  double res = outputLayer.Forward(
      boost::apply_visitor(outputParameterVisitor, network.back()),
      responses.cols(begin, begin + batchSize - 1));

  for (size_t i = 0; i < network.size(); ++i)
  {
    res += boost::apply_visitor(lossVisitor, network[i]);
  }

  outputLayer.Backward(
      boost::apply_visitor(outputParameterVisitor, network.back()),
      responses.cols(begin, begin + batchSize - 1),
      error);

  Backward();
  ResetGradients(gradient);
  Gradient(predictors.cols(begin, begin + batchSize - 1));

  return res;
}

It starts with some variable initialization, which was covered earlier.

Then comes the Forward call and the loss computation, exactly as in Evaluate.

The difference comes afterwards: the Backward function of outputLayer (by default, the negative log likelihood loss introduced earlier) is called with the last layer's outputParameter, the corresponding columns of responses, and error as arguments.

Then the parameterless Backward function is called:

Backward

template<typename OutputLayerType, typename InitializationRuleType,
         typename... CustomLayers>
void FFN<OutputLayerType, InitializationRuleType, CustomLayers...>::Backward()
{
  boost::apply_visitor(BackwardVisitor(boost::apply_visitor(
      outputParameterVisitor, network.back()), error,
      boost::apply_visitor(deltaVisitor, network.back())), network.back());

  for (size_t i = 2; i < network.size(); ++i)
  {
    boost::apply_visitor(BackwardVisitor(boost::apply_visitor(
        outputParameterVisitor, network[network.size() - i]),
        boost::apply_visitor(deltaVisitor, network[network.size() - i + 1]),
        boost::apply_visitor(deltaVisitor, network[network.size() - i])),
        network[network.size() - i]);
  }
}

Backward is the mirror image of Forward: it calls the i-th layer's Backward function with the i-th layer's outputParameter, the (i+1)-th layer's delta, and the i-th layer's delta as arguments, walking from the back of the network to the front, with the last layer handled separately, as the sketch below shows.
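
Substituting k = network.size() - i makes the indexing plain (a conceptual sketch with the same shorthand as before):

const size_t last = network.size() - 1;
// Last layer: backpropagate the network-level error.
layer[last].Backward(outputParameter[last], error, delta[last]);
// Hidden layers, from second-to-last down to layer 1; layer 0's delta is
// never needed (Gradient uses the raw input there), so the loop skips it.
for (size_t k = last - 1; k >= 1; --k)
  layer[k].Backward(outputParameter[k], delta[k + 1], delta[k]);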

The next step is the ResetGradients function:

ResetGradients

template<typename OutputLayerType, typename InitializationRuleType,
         typename... CustomLayers>
void FFN<OutputLayerType, InitializationRuleType,
         CustomLayers...>::ResetGradients(arma::mat& gradient)
{
  size_t offset = 0;
  for (size_t i = 0; i < network.size(); ++i)
  {
    offset += boost::apply_visitor(GradientSetVisitor(gradient, offset),
        network[i]);
  }
}

This simply applies GradientSetVisitor to every layer in the network, wiring each layer's gradient into the shared gradient matrix at the appropriate offset.

The last step of Gradient is a call to the Gradient overload that takes a single matrix:

Gradient header:

  /**
   * Iterate through all layer modules and update the the gradient using the
   * layer defined optimizer.
   */
  template<typename InputType>
  void Gradient(const InputType& input);

Implementation:

template<typename OutputLayerType, typename InitializationRuleType,
         typename... CustomLayers>
template<typename InputType>
void FFN<OutputLayerType, InitializationRuleType,
         CustomLayers...>::Gradient(const InputType& input)
{
  boost::apply_visitor(GradientVisitor(input,
      boost::apply_visitor(deltaVisitor, network[1])), network.front());

  for (size_t i = 1; i < network.size() - 1; ++i)
  {
    boost::apply_visitor(GradientVisitor(boost::apply_visitor(
        outputParameterVisitor, network[i - 1]),
        boost::apply_visitor(deltaVisitor, network[i + 1])), network[i]);
  }

  boost::apply_visitor(GradientVisitor(boost::apply_visitor(
      outputParameterVisitor, network[network.size() - 2]), error),
      network[network.size() - 1]);
}

With the earlier groundwork, this code is easy to follow:

First, the first layer's Gradient function is called, with input and the second layer's delta as arguments.
Then the loop calls the i-th layer's Gradient function, with the (i-1)-th layer's outputParameter and the (i+1)-th layer's delta as arguments.
Finally, the last layer's Gradient function is called, with the second-to-last layer's outputParameter and error as arguments.

Predict

Predict header:

  /**
   * Predict the responses to a given set of predictors. The responses will
   * reflect the output of the given output layer as returned by the
   * output layer function.
   *
   * If you want to pass in a parameter and discard the original parameter
   * object, be sure to use std::move to avoid unnecessary copy.
   *
   * @param predictors Input predictors.
   * @param results Matrix to put output predictions of responses into.
   */
  void Predict(arma::mat predictors, arma::mat& results);

Implementation:

template<typename OutputLayerType, typename InitializationRuleType,
         typename... CustomLayers>
void FFN<OutputLayerType, InitializationRuleType, CustomLayers...>::Predict(
    arma::mat predictors, arma::mat& results)
{
  if (parameter.is_empty())
    ResetParameters();

  if (!deterministic)
  {
    deterministic = true;
    ResetDeterministic();
  }

  arma::mat resultsTemp;
  Forward(arma::mat(predictors.colptr(0), predictors.n_rows, 1, false, true));
  resultsTemp = boost::apply_visitor(outputParameterVisitor,
      network.back()).col(0);

  results = arma::mat(resultsTemp.n_elem, predictors.n_cols);
  results.col(0) = resultsTemp.col(0);

  for (size_t i = 1; i < predictors.n_cols; ++i)
  {
    Forward(arma::mat(predictors.colptr(i), predictors.n_rows, 1, false, true));

    resultsTemp = boost::apply_visitor(outputParameterVisitor,
        network.back());
    results.col(i) = resultsTemp.col(0);
  }
}

The whole process simply runs Forward on each column of predictors in turn and collects the network's final output into the results matrix; the first column is handled separately so that the shape of the results matrix can be determined.
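
A hedged usage sketch, continuing the classifier example above: with LogSoftMax as the last layer, the predicted class of each test point is the row with the largest output (testX is assumed given; the +1 matches NegativeLogLikelihood's 1-based class indices):

arma::mat predictions;
model.Predict(testX, predictions);  // one column of outputs per test point

arma::Row<size_t> labels(predictions.n_cols);
for (size_t i = 0; i < predictions.n_cols; ++i)
  labels(i) = arma::index_max(predictions.col(i)) + 1;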

Layer

Linear

Constructor

Header:

/**
 * Implementation of the Linear layer class. The Linear class represents a
 * single layer of a neural network.
 *
 * @tparam InputDataType Type of the input data (arma::colvec, arma::mat,
 *         arma::sp_mat or arma::cube).
 * @tparam OutputDataType Type of the output data (arma::colvec, arma::mat,
 *         arma::sp_mat or arma::cube).
 */
template <
    typename InputDataType = arma::mat,
    typename OutputDataType = arma::mat,
    typename RegularizerType = NoRegularizer
>
class Linear
{
 public:
  //! Create the Linear object.
  Linear();

  /**
   * Create the Linear layer object using the specified number of units.
   *
   * @param inSize The number of input units.
   * @param outSize The number of output units.
   * @param regularizer The regularizer to use, optional.
   */
  Linear(const size_t inSize,
         const size_t outSize,
         RegularizerType regularizer = RegularizerType());

Implementation:

template<typename InputDataType, typename OutputDataType,
    typename RegularizerType>
Linear<InputDataType, OutputDataType, RegularizerType>::Linear() :
    inSize(0),
    outSize(0)
{
  // Nothing to do here.
}

template<typename InputDataType, typename OutputDataType,
    typename RegularizerType>
Linear<InputDataType, OutputDataType, RegularizerType>::Linear(
    const size_t inSize,
    const size_t outSize,
    RegularizerType regularizer) :
    inSize(inSize),
    outSize(outSize),
    regularizer(regularizer)
{
  weights.set_size(outSize * inSize + outSize, 1);
}

NoRegularizer, as the name suggests, is a regularizer that does nothing.

The constructor here mainly sets the shape of the weights vector:

$$
outSize \times inSize = weight.Size \\[5pt]
outSize \times 1 = bias.Size
$$

This can also be seen from the Reset function:

Reset

template<typename InputDataType, typename OutputDataType,
    typename RegularizerType>
void Linear<InputDataType, OutputDataType, RegularizerType>::Reset()
{
  weight = arma::mat(weights.memptr(), outSize, inSize, false, false);
  bias = arma::mat(weights.memptr() + weight.n_elem,
      outSize, 1, false, false);
}

Forward

Header:

  /**
   * Ordinary feed forward pass of a neural network, evaluating the function
   * f(x) by propagating the activity forward through f.
   *
   * @param input Input data used for evaluating the specified function.
   * @param output Resulting output activation.
   */
  template<typename eT>
  void Forward(const arma::Mat<eT>& input, arma::Mat<eT>& output);

Implementation:

template<typename InputDataType, typename OutputDataType,
    typename RegularizerType>
template<typename eT>
void Linear<InputDataType, OutputDataType, RegularizerType>::Forward(
    const arma::Mat<eT>& input, arma::Mat<eT>& output)
{
  output = weight * input;
  output.each_col() += bias;
}

As the comments in the linear header say, this layer serves as a fully-connected layer, i.e. an affine transformation:

$$
output = weight \cdot input + bias
$$

Recalling the network-level Forward function: input is either an input data point (for the first layer) or the previous layer's output (for every other layer), while output is always this layer's outputParameter.

Also, all arguments are passed by reference.
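
A minimal standalone sketch of this affine map (shapes are illustrative):

arma::mat weight(4, 3, arma::fill::randu);  // outSize x inSize
arma::mat bias(4, 1, arma::fill::randu);    // outSize x 1
arma::mat input(3, 2, arma::fill::randu);   // inSize x batchSize

arma::mat output = weight * input;  // 4 x 2
output.each_col() += bias;          // add the bias to every data point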

Backward

Header:

  /**
   * Ordinary feed backward pass of a neural network, calculating the function
   * f(x) by propagating x backwards trough f. Using the results from the feed
   * forward pass.
   *
   * @param * (input) The propagated input activation.
   * @param gy The backpropagated error.
   * @param g The calculated gradient.
   */
  template<typename eT>
  void Backward(const arma::Mat<eT>& /* input */,
                const arma::Mat<eT>& gy,
                arma::Mat<eT>& g);

Implementation:

template<typename InputDataType, typename OutputDataType,
    typename RegularizerType>
template<typename eT>
void Linear<InputDataType, OutputDataType, RegularizerType>::Backward(
    const arma::Mat<eT>& /* input */, const arma::Mat<eT>& gy, arma::Mat<eT>& g)
{
  g = weight.t() * gy;
}

Recalling the network-level Backward function: gy (the backpropagated error) is either error (for the last layer) or the next layer's delta (for every other layer), while g (the calculated gradient) is always this layer's delta.

Again, all arguments are passed by reference.
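
This is just the chain rule applied to the affine map from Forward:

$$
output = weight \cdot input + bias
\;\Rightarrow\;
\frac{\partial output_j}{\partial input_k} = weight_{jk}
\;\Rightarrow\;
g = \frac{\partial L}{\partial input} = weight^{\mathsf{T}} \cdot gy
$$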

Gradient

Header:

  /*
   * Calculate the gradient using the output delta and the input activation.
   *
   * @param input The input parameter used for calculating the gradient.
   * @param error The calculated error.
   * @param gradient The calculated gradient.
   */
  template<typename eT>
  void Gradient(const arma::Mat<eT>& input,
                const arma::Mat<eT>& error,
                arma::Mat<eT>& gradient);

Implementation:

template<typename InputDataType, typename OutputDataType,
    typename RegularizerType>
template<typename eT>
void Linear<InputDataType, OutputDataType, RegularizerType>::Gradient(
    const arma::Mat<eT>& input,
    const arma::Mat<eT>& error,
    arma::Mat<eT>& gradient)
{
  gradient.submat(0, 0, weight.n_elem - 1, 0) = arma::vectorise(
      error * input.t());
  gradient.submat(weight.n_elem, 0, gradient.n_elem - 1, 0) =
      arma::sum(error, 1);
  regularizer.Evaluate(weights, gradient);
}

By the same reasoning, input here is either the network input (for the first layer) or the previous layer's outputParameter; error is either the next layer's delta or error (for the last layer); and gradient is this layer's gradient matrix.

Again, all arguments are passed by reference.

The .submat prototype:

X.submat ( first_row, first_col, last_row, last_col )

The official description of vectorise(X, dim):

Generate a flattened version of matrix X or cube Q
The argument dim is optional; by default dim=0 is used

So the gradient matrix is updated in two parts, a weight part and a bias part: the weight part becomes $error \cdot input^{\mathsf{T}}$ (vectorised into a column), and the bias part becomes the row-wise sum of error.

Finally, the regularizer's Evaluate function is called; with NoRegularizer this does nothing, so we won't discuss it.
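
For reference, the chain rule summed over the batch yields exactly these two pieces:

$$
\frac{\partial L}{\partial weight} = error \cdot input^{\mathsf{T}}, \qquad
\frac{\partial L}{\partial bias} = \sum_{i=1}^{N} error_{(:, i)}
$$

which is what vectorise(error * input.t()) and sum(error, 1) compute for the weight and bias segments of the gradient column.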

Convolution

Constructor

Header:

/**
 * Implementation of the Convolution class. The Convolution class represents a
 * single layer of a neural network.
 *
 * @tparam ForwardConvolutionRule Convolution to perform forward process.
 * @tparam BackwardConvolutionRule Convolution to perform backward process.
 * @tparam GradientConvolutionRule Convolution to calculate gradient.
 * @tparam InputDataType Type of the input data (arma::colvec, arma::mat,
 *         arma::sp_mat or arma::cube).
 * @tparam OutputDataType Type of the output data (arma::colvec, arma::mat,
 *         arma::sp_mat or arma::cube).
 */
template <
    typename ForwardConvolutionRule = NaiveConvolution<ValidConvolution>,
    typename BackwardConvolutionRule = NaiveConvolution<FullConvolution>,
    typename GradientConvolutionRule = NaiveConvolution<ValidConvolution>,
    typename InputDataType = arma::mat,
    typename OutputDataType = arma::mat
>
class Convolution
{
 public:
  //! Create the Convolution object.
  Convolution();

  /**
   * Create the Convolution object using the specified number of input maps,
   * output maps, filter size, stride and padding parameter.
   *
   * @param inSize The number of input maps.
   * @param outSize The number of output maps.
   * @param kernelWidth Width of the filter/kernel.
   * @param kernelHeight Height of the filter/kernel.
   * @param strideWidth Stride of filter application in the x direction.
   * @param strideHeight Stride of filter application in the y direction.
   * @param padW Padding width of the input.
   * @param padH Padding height of the input.
   * @param inputWidth The width of the input data.
   * @param inputHeight The height of the input data.
   * @param paddingType The type of padding (Valid or Same). Defaults to None.
   */
  Convolution(const size_t inSize,
              const size_t outSize,
              const size_t kernelWidth,
              const size_t kernelHeight,
              const size_t strideWidth = 1,
              const size_t strideHeight = 1,
              const size_t padW = 0,
              const size_t padH = 0,
              const size_t inputWidth = 0,
              const size_t inputHeight = 0,
              const std::string& paddingType = "None");

  /**
   * Create the Convolution object using the specified number of input maps,
   * output maps, filter size, stride and padding parameter.
   *
   * @param inSize The number of input maps.
   * @param outSize The number of output maps.
   * @param kernelWidth Width of the filter/kernel.
   * @param kernelHeight Height of the filter/kernel.
   * @param strideWidth Stride of filter application in the x direction.
   * @param strideHeight Stride of filter application in the y direction.
   * @param padW A two-value tuple indicating padding widths of the input.
   *             First value is padding at left side. Second value is padding on
   *             right side.
   * @param padH A two-value tuple indicating padding heights of the input.
   *             First value is padding at top. Second value is padding on
   *             bottom.
   * @param inputWidth The width of the input data.
   * @param inputHeight The height of the input data.
   * @param paddingType The type of padding (Valid or Same). Defaults to None.
   */
  Convolution(const size_t inSize,
              const size_t outSize,
              const size_t kernelWidth,
              const size_t kernelHeight,
              const size_t strideWidth,
              const size_t strideHeight,
              const std::tuple<size_t, size_t>& padW,
              const std::tuple<size_t, size_t>& padH,
              const size_t inputWidth = 0,
              const size_t inputHeight = 0,
              const std::string& paddingType = "None");

Implementation:

template<
    typename ForwardConvolutionRule,
    typename BackwardConvolutionRule,
    typename GradientConvolutionRule,
    typename InputDataType,
    typename OutputDataType
>
Convolution<
    ForwardConvolutionRule,
    BackwardConvolutionRule,
    GradientConvolutionRule,
    InputDataType,
    OutputDataType
>::Convolution()
{
  // Nothing to do here.
}

template<
    typename ForwardConvolutionRule,
    typename BackwardConvolutionRule,
    typename GradientConvolutionRule,
    typename InputDataType,
    typename OutputDataType
>
Convolution<
    ForwardConvolutionRule,
    BackwardConvolutionRule,
    GradientConvolutionRule,
    InputDataType,
    OutputDataType
>::Convolution(
    const size_t inSize,
    const size_t outSize,
    const size_t kernelWidth,
    const size_t kernelHeight,
    const size_t strideWidth,
    const size_t strideHeight,
    const size_t padW,
    const size_t padH,
    const size_t inputWidth,
    const size_t inputHeight,
    const std::string& paddingType) :
    Convolution(
      inSize,
      outSize,
      kernelWidth,
      kernelHeight,
      strideWidth,
      strideHeight,
      std::tuple<size_t, size_t>(padW, padW),
      std::tuple<size_t, size_t>(padH, padH),
      inputWidth,
      inputHeight,
      paddingType)
{
  // Nothing to do here.
}

template<
    typename ForwardConvolutionRule,
    typename BackwardConvolutionRule,
    typename GradientConvolutionRule,
    typename InputDataType,
    typename OutputDataType
>
Convolution<
    ForwardConvolutionRule,
    BackwardConvolutionRule,
    GradientConvolutionRule,
    InputDataType,
    OutputDataType
>::Convolution(
    const size_t inSize,
    const size_t outSize,
    const size_t kernelWidth,
    const size_t kernelHeight,
    const size_t strideWidth,
    const size_t strideHeight,
    const std::tuple<size_t, size_t>& padW,
    const std::tuple<size_t, size_t>& padH,
    const size_t inputWidth,
    const size_t inputHeight,
    const std::string& paddingType) :
    inSize(inSize),
    outSize(outSize),
    kernelWidth(kernelWidth),
    kernelHeight(kernelHeight),
    strideWidth(strideWidth),
    strideHeight(strideHeight),
    padWLeft(std::get<0>(padW)),
    padWRight(std::get<1>(padW)),
    padHBottom(std::get<1>(padH)),
    padHTop(std::get<0>(padH)),
    inputWidth(inputWidth),
    inputHeight(inputHeight),
    outputWidth(0),
    outputHeight(0)
{
  weights.set_size(WeightSize(), 1);

  // Transform paddingType to lowercase.
  std::string paddingTypeLow = paddingType;
  util::ToLower(paddingType, paddingTypeLow);

  if (paddingTypeLow == "valid")
  {
    padWLeft = 0;
    padWRight = 0;
    padHTop = 0;
    padHBottom = 0;
  }
  else if (paddingTypeLow == "same")
  {
    InitializeSamePadding();
  }

  padding = ann::Padding<>(padWLeft, padWRight, padHTop, padHBottom);
}

Let's focus on the third constructor (the one taking tuples).

The number of rows of the weights matrix is given by:

  //! Get size of weights for the layer.
  size_t WeightSize() const
  {
    return (outSize * inSize * kernelWidth * kernelHeight) + outSize;
  }

Take a look at the Reset function:

template<
    typename ForwardConvolutionRule,
    typename BackwardConvolutionRule,
    typename GradientConvolutionRule,
    typename InputDataType,
    typename OutputDataType
>
void Convolution<
    ForwardConvolutionRule,
    BackwardConvolutionRule,
    GradientConvolutionRule,
    InputDataType,
    OutputDataType
>::Reset()
{
    weight = arma::cube(weights.memptr(), kernelWidth, kernelHeight,
        outSize * inSize, false, false);
    bias = arma::mat(weights.memptr() + weight.n_elem,
        outSize, 1, false, false);
}

weight is a Cube with $outSize \times inSize$ slices, each slice holding a $kernelWidth \times kernelHeight$ matrix,
while bias is a column matrix with outSize rows.

This explains the number of rows of weights:

$$
outSize \times inSize \times kernelWidth \times kernelHeight + outSize
$$

Next comes padding; let's follow the default, "None".

Padding header:

/**
 * Implementation of the Padding module class. The Padding module applies a bias term
 * to the incoming data.
 *
 * @tparam InputDataType Type of the input data (arma::colvec, arma::mat,
 *         arma::sp_mat or arma::cube).
 * @tparam OutputDataType Type of the output data (arma::colvec, arma::mat,
 *         arma::sp_mat or arma::cube).
 */
template <
    typename InputDataType = arma::mat,
    typename OutputDataType = arma::mat
>
class Padding
{
 public:
  /**
   * Create the Padding object using the specified number of output units.
   *
   * @param padWLeft Left padding width of the input.
   * @param padWRight Right padding width of the input.
   * @param padHTop Top padding height of the input.
   * @param padHBottom Bottom padding height of the input.
   */
  Padding(const size_t padWLeft = 0,
          const size_t padWRight = 0,
          const size_t padHTop = 0,
          const size_t padHBottom = 0);

  /**
   * Ordinary feed forward pass of a neural network, evaluating the function
   * f(x) by propagating the activity forward through f.
   *
   * @param input Input data used for evaluating the specified function.
   * @param output Resulting output activation.
   */
  template<typename eT>
  void Forward(const arma::Mat<eT>& input, arma::Mat<eT>& output);

  /**
   * Ordinary feed backward pass of a neural network, calculating the function
   * f(x) by propagating x backwards trough f. Using the results from the feed
   * forward pass.
   *
   * @param * (input) The propagated input activation.
   * @param gy The backpropagated error.
   * @param g The calculated gradient.
   */
  template<typename eT>
  void Backward(const arma::Mat<eT>& /* input */,
                const arma::Mat<eT>& gy,
                arma::Mat<eT>& g);

  //! Get the output parameter.
  OutputDataType const& OutputParameter() const { return outputParameter; }
  //! Modify the output parameter.
  OutputDataType& OutputParameter() { return outputParameter; }

  //! Get the delta.
  OutputDataType const& Delta() const { return delta; }
  //! Modify the delta.
  OutputDataType& Delta() { return delta; }

  //! Get the left padding width.
  size_t PadWLeft() const { return padWLeft; }
  //! Modify the left padding width.
  size_t& PadWLeft() { return padWLeft; }

  //! Get the right padding width.
  size_t PadWRight() const { return padWRight; }
  //! Modify the right padding width.
  size_t& PadWRight() { return padWRight; }

  //! Get the top padding width.
  size_t PadHTop() const { return padHTop; }
  //! Modify the top padding width.
  size_t& PadHTop() { return padHTop; }

  //! Get the bottom padding width.
  size_t PadHBottom() const { return padHBottom; }
  //! Modify the bottom padding width.
  size_t& PadHBottom() { return padHBottom; }

  /**
   * Serialize the layer.
   */
  template<typename Archive>
  void serialize(Archive& ar, const unsigned int /* version */);

 private:
  //! Locally-stored left padding width.
  size_t padWLeft;

  //! Locally-stored right padding width.
  size_t padWRight;

  //! Locally-stored top padding height.
  size_t padHTop;

  //! Locally-stored bottom padding height.
  size_t padHBottom;

  //! Locally-stored number of rows and columns of input.
  size_t nRows, nCols;

  //! Locally-stored delta object.
  OutputDataType delta;

  //! Locally-stored output parameter object.
  OutputDataType outputParameter;
}; // class Padding

Implementation:

template<typename InputDataType, typename OutputDataType>
Padding<InputDataType, OutputDataType>::Padding(
    const size_t padWLeft,
    const size_t padWRight,
    const size_t padHTop,
    const size_t padHBottom) :
    padWLeft(padWLeft),
    padWRight(padWRight),
    padHTop(padHTop),
    padHBottom(padHBottom),
    nRows(0),
    nCols(0)
{
  // Nothing to do here.
}

template<typename InputDataType, typename OutputDataType>
template<typename eT>
void Padding<InputDataType, OutputDataType>::Forward(
    const arma::Mat<eT>& input, arma::Mat<eT>& output)
{
  nRows = input.n_rows;
  nCols = input.n_cols;
  output = arma::zeros(nRows + padWLeft + padWRight,
      nCols + padHTop + padHBottom);
  output.submat(padWLeft, padHTop, padWLeft + nRows - 1,
      padHTop + nCols - 1) = input;
}

template<typename InputDataType, typename OutputDataType>
template<typename eT>
void Padding<InputDataType, OutputDataType>::Backward(
    const arma::Mat<eT>& /* input */,
    const arma::Mat<eT>& gy,
    arma::Mat<eT>& g)
{
  g = gy.submat(padWLeft, padHTop, padWLeft + nRows - 1,
      padHTop + nCols - 1);
}

template<typename InputDataType, typename OutputDataType>
template<typename Archive>
void Padding<InputDataType, OutputDataType>::serialize(
    Archive& ar, const unsigned int /* version */)
{
  ar & BOOST_SERIALIZATION_NVP(padWLeft);
  ar & BOOST_SERIALIZATION_NVP(padWRight);
  ar & BOOST_SERIALIZATION_NVP(padHTop);
  ar & BOOST_SERIALIZATION_NVP(padHBottom);
}

As you can see, the padding layer's Forward simply embeds the input into a larger zero-filled matrix, and Backward extracts the matching submatrix of the backpropagated error; the fill value is always zero.
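
A minimal sketch of the layer used standalone (assuming mlpack 3.x's layer API as quoted above):

#include <mlpack/core.hpp>
#include <mlpack/methods/ann/layer/padding.hpp>

int main()
{
  arma::mat input(3, 3, arma::fill::ones);
  arma::mat padded, g;

  // One zero on every side: rows grow by padWLeft + padWRight, columns by
  // padHTop + padHBottom (note the row/column convention of Forward above).
  mlpack::ann::Padding<> pad(1, 1, 1, 1);
  pad.Forward(input, padded);      // padded is 5x5, ones surrounded by zeros
  pad.Backward(input, padded, g);  // g recovers the original 3x3 block
  g.print("g:");
}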

Forward

Header:

  /**
   * Ordinary feed forward pass of a neural network, evaluating the function
   * f(x) by propagating the activity forward through f.
   *
   * @param input Input data used for evaluating the specified function.
   * @param output Resulting output activation.
   */
  template<typename eT>
  void Forward(const arma::Mat<eT>& input, arma::Mat<eT>& output);

Implementation:

template<
    typename ForwardConvolutionRule,
    typename BackwardConvolutionRule,
    typename GradientConvolutionRule,
    typename InputDataType,
    typename OutputDataType
>
template<typename eT>
void Convolution<
    ForwardConvolutionRule,
    BackwardConvolutionRule,
    GradientConvolutionRule,
    InputDataType,
    OutputDataType
>::Forward(const arma::Mat<eT>& input, arma::Mat<eT>& output)
{
  batchSize = input.n_cols;
  arma::cube inputTemp(const_cast<arma::Mat<eT>&>(input).memptr(),
      inputWidth, inputHeight, inSize * batchSize, false, false);

  if (padWLeft != 0 || padWRight != 0 || padHTop != 0 || padHBottom != 0)
  {
    inputPaddedTemp.set_size(inputTemp.n_rows + padWLeft + padWRight,
        inputTemp.n_cols + padHTop + padHBottom, inputTemp.n_slices);

    for (size_t i = 0; i < inputTemp.n_slices; ++i)
    {
      padding.Forward(inputTemp.slice(i), inputPaddedTemp.slice(i));
    }
  }

  size_t wConv = ConvOutSize(inputWidth, kernelWidth, strideWidth, padWLeft,
      padWRight);
  size_t hConv = ConvOutSize(inputHeight, kernelHeight, strideHeight, padHTop,
      padHBottom);

  output.set_size(wConv * hConv * outSize, batchSize);
  outputTemp = arma::Cube<eT>(output.memptr(), wConv, hConv,
      outSize * batchSize, false, false);
  outputTemp.zeros();

  for (size_t outMap = 0, outMapIdx = 0, batchCount = 0; outMap <
      outSize * batchSize; outMap++)
  {
    if (outMap != 0 && outMap % outSize == 0)
    {
      batchCount++;
      outMapIdx = 0;
    }

    for (size_t inMap = 0; inMap < inSize; inMap++, outMapIdx++)
    {
      arma::Mat<eT> convOutput;

      if (padWLeft != 0 || padWRight != 0 || padHTop != 0 || padHBottom != 0)
      {
        ForwardConvolutionRule::Convolution(inputPaddedTemp.slice(inMap +
            batchCount * inSize), weight.slice(outMapIdx), convOutput,
            strideWidth, strideHeight);
      }
      else
      {
        ForwardConvolutionRule::Convolution(inputTemp.slice(inMap +
            batchCount * inSize), weight.slice(outMapIdx), convOutput,
            strideWidth, strideHeight);
      }

      outputTemp.slice(outMap) += convOutput;
    }

    outputTemp.slice(outMap) += bias(outMap % outSize);
  }

  outputWidth = outputTemp.n_rows;
  outputHeight = outputTemp.n_cols;
}

First a cube is constructed over the input, with inSize * input.n_cols slices, each slice an inputWidth × inputHeight matrix.

Then, if any padding is requested, each slice of inputTemp is expanded to the padded size and written into the corresponding slice of inputPaddedTemp.

Then the output dimensions are computed:

$$wConv = \left\lfloor \frac{inputWidth + padWLeft + padWRight - kernelWidth}{strideWidth} \right\rfloor + 1$$

$$hConv = \left\lfloor \frac{inputHeight + padHTop + padHBottom - kernelHeight}{strideHeight} \right\rfloor + 1$$
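
A sketch of the helper implementing this formula (integer division supplies the floor; parameter names follow the source, but treat the exact signature as an assumption):

#include <cstddef>

// Output size along one dimension of the convolution.
std::size_t ConvOutSize(const std::size_t size,      // input width or height
                        const std::size_t k,         // kernel size
                        const std::size_t s,         // stride
                        const std::size_t pSideOne,  // padding before
                        const std::size_t pSideTwo)  // padding after
{
  return (size + pSideOne + pSideTwo - k) / s + 1;
}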

output and outputTemp are then sized to these dimensions.

The double loop that follows uses ForwardConvolutionRule, which defaults to NaiveConvolution.

Its implementation:

/**
 * Computes the two-dimensional convolution. This class allows specification of
 * the type of the border type. The convolution can be compute with the valid
 * border type of the full border type (default).
 *
 * FullConvolution: returns the full two-dimensional convolution.
 * ValidConvolution: returns only those parts of the convolution that are
 * computed without the zero-padded edges.
 *
 * @tparam BorderMode Type of the border mode (FullConvolution or
 * ValidConvolution).
 */
template<typename BorderMode = FullConvolution>
class NaiveConvolution
{
 public:
  /*
   * Perform a convolution (valid mode).
   *
   * @param input Input used to perform the convolution.
   * @param filter Filter used to perform the convolution.
   * @param output Output data that contains the results of the convolution.
   * @param dW Stride of filter application in the x direction.
   * @param dH Stride of filter application in the y direction.
   * @param dilationW The dilation factor in x direction.
   * @param dilationH The dilation factor in y direction.
   */
  template<typename eT, typename Border = BorderMode>
  static typename std::enable_if<
      std::is_same<Border, ValidConvolution>::value, void>::type
  Convolution(const arma::Mat<eT>& input,
              const arma::Mat<eT>& filter,
              arma::Mat<eT>& output,
              const size_t dW = 1,
              const size_t dH = 1,
              const size_t dilationW = 1,
              const size_t dilationH = 1)
  {
    output = arma::zeros<arma::Mat<eT> >(
        (input.n_rows - (filter.n_rows - 1) * dilationW - 1) / dW + 1,
        (input.n_cols - (filter.n_cols - 1) * dilationH -  1) / dH + 1);

    // It seems to be about 3.5 times faster to use pointers instead of
    // filter(ki, kj) * input(leftInput + ki, topInput + kj) and output(i, j).
    eT* outputPtr = output.memptr();

    for (size_t j = 0; j < output.n_cols; ++j)
    {
      for (size_t i = 0; i < output.n_rows; ++i, outputPtr++)
      {
        const eT* kernelPtr = filter.memptr();
        for (size_t kj = 0; kj < filter.n_cols; ++kj)
        {
          const eT* inputPtr = input.colptr(kj * dilationW + j * dW) + i * dH;
          for (size_t ki = 0; ki < filter.n_rows; ++ki, ++kernelPtr,
              inputPtr += dilationH)
            *outputPtr += *kernelPtr * (*inputPtr);
        }
      }
    }
  }

  /*
   * Perform a convolution (full mode).
   *
   * @param input Input used to perform the convolution.
   * @param filter Filter used to perform the convolution.
   * @param output Output data that contains the results of the convolution.
   * @param dW Stride of filter application in the x direction.
   * @param dH Stride of filter application in the y direction.
   * @param dilationW The dilation factor in x direction.
   * @param dilationH The dilation factor in y direction.
   */
  template<typename eT, typename Border = BorderMode>
  static typename std::enable_if<
      std::is_same<Border, FullConvolution>::value, void>::type
  Convolution(const arma::Mat<eT>& input,
              const arma::Mat<eT>& filter,
              arma::Mat<eT>& output,
              const size_t dW = 1,
              const size_t dH = 1,
              const size_t dilationW = 1,
              const size_t dilationH = 1)
  {
    size_t outputRows = (input.n_rows - 1) * dW + 2 * (filter.n_rows - 1)
        * dilationW + 1;
    size_t outputCols = (input.n_cols - 1) * dH + 2 * (filter.n_cols - 1)
        * dilationH + 1;

    for (size_t i = 0; i < dW; ++i)
    {
      if (((((i + outputRows - 2 * (filter.n_rows - 1) * dilationW - 1) % dW)
          + dW) % dW) == i){
        outputRows += i;
        break;
      }
    }
    for (size_t i = 0; i < dH; ++i)
    {
      if (((((i + outputCols - 2 * (filter.n_cols - 1) * dilationH - 1) % dH)
          + dH) % dH) == i){
        outputCols += i;
        break;
      }
    }

    // Pad filter and input to the working output shape.
    arma::Mat<eT> inputPadded = arma::zeros<arma::Mat<eT> >(outputRows,
        outputCols);
    inputPadded.submat((filter.n_rows - 1) * dilationW, (filter.n_cols - 1)
        * dilationH, (filter.n_rows - 1) * dilationW + input.n_rows - 1,
        (filter.n_cols - 1) * dilationH + input.n_cols - 1) = input;

    NaiveConvolution<ValidConvolution>::Convolution(inputPadded, filter,
        output, 1, 1, dilationW, dilationH);
  }

  /*
   * Perform a convolution using 3rd order tensors.
   *
   * @param input Input used to perform the convolution.
   * @param filter Filter used to perform the convolution.
   * @param output Output data that contains the results of the convolution.
   * @param dW Stride of filter application in the x direction.
   * @param dH Stride of filter application in the y direction.
   * @param dilationW The dilation factor in x direction.
   * @param dilationH The dilation factor in y direction.
   */
  template<typename eT>
  static void Convolution(const arma::Cube<eT>& input,
                          const arma::Cube<eT>& filter,
                          arma::Cube<eT>& output,
                          const size_t dW = 1,
                          const size_t dH = 1,
                          const size_t dilationW = 1,
                          const size_t dilationH = 1)
  {
    arma::Mat<eT> convOutput;
    NaiveConvolution<BorderMode>::Convolution(input.slice(0), filter.slice(0),
        convOutput, dW, dH, dilationW, dilationH);

    output = arma::Cube<eT>(convOutput.n_rows, convOutput.n_cols,
        input.n_slices);
    output.slice(0) = convOutput;

    for (size_t i = 1; i < input.n_slices; ++i)
    {
      NaiveConvolution<BorderMode>::Convolution(input.slice(i), filter.slice(i),
          output.slice(i), dW, dH, dilationW, dilationH);
    }
  }

  /*
   * Perform a convolution using dense matrix as input and a 3rd order tensors
   * as filter and output.
   *
   * @param input Input used to perform the convolution.
   * @param filter Filter used to perform the convolution.
   * @param output Output data that contains the results of the convolution.
   * @param dW Stride of filter application in the x direction.
   * @param dH Stride of filter application in the y direction.
   * @param dilationW The dilation factor in x direction.
   * @param dilationH The dilation factor in y direction.
   */
  template<typename eT>
  static void Convolution(const arma::Mat<eT>& input,
                          const arma::Cube<eT>& filter,
                          arma::Cube<eT>& output,
                          const size_t dW = 1,
                          const size_t dH = 1,
                          const size_t dilationW = 1,
                          const size_t dilationH = 1)
  {
    arma::Mat<eT> convOutput;
    NaiveConvolution<BorderMode>::Convolution(input, filter.slice(0),
        convOutput, dW, dH, dilationW, dilationH);

    output = arma::Cube<eT>(convOutput.n_rows, convOutput.n_cols,
        filter.n_slices);
    output.slice(0) = convOutput;

    for (size_t i = 1; i < filter.n_slices; ++i)
    {
      NaiveConvolution<BorderMode>::Convolution(input, filter.slice(i),
          output.slice(i), dW, dH, dilationW, dilationH);
    }
  }

  /*
   * Perform a convolution using a 3rd order tensors as input and output and a
   * dense matrix as filter.
   *
   * @param input Input used to perform the convolution.
   * @param filter Filter used to perform the convolution.
   * @param output Output data that contains the results of the convolution.
   * @param dW Stride of filter application in the x direction.
   * @param dH Stride of filter application in the y direction.
   * @param dilationW The dilation factor in x direction.
   * @param dilationH The dilation factor in y direction.
   */
  template<typename eT>
  static void Convolution(const arma::Cube<eT>& input,
                          const arma::Mat<eT>& filter,
                          arma::Cube<eT>& output,
                          const size_t dW = 1,
                          const size_t dH = 1,
                          const size_t dilationW = 1,
                          const size_t dilationH = 1)
  {
    arma::Mat<eT> convOutput;
    NaiveConvolution<BorderMode>::Convolution(input.slice(0), filter,
        convOutput, dW, dH, dilationW, dilationH);

    output = arma::Cube<eT>(convOutput.n_rows, convOutput.n_cols,
        input.n_slices);
    output.slice(0) = convOutput;

    for (size_t i = 1; i < input.n_slices; ++i)
    {
      NaiveConvolution<BorderMode>::Convolution(input.slice(i), filter,
          output.slice(i), dW, dH, dilationW, dilationH);
    }
  }
};  // class NaiveConvolution

Forward uses valid mode by default; with dilationW = dilationH = 1 it works as follows:

$$input: (m \times n), \quad filter: (p \times q)$$

$$output: (a \times b) = \left( \frac{m - p}{dW} + 1,\ \frac{n - q}{dH} + 1 \right)$$

$$\Rightarrow output_{(i, j)} = \sum_{k_j = 0}^{q - 1} \sum_{k_i = 0}^{p - 1} input_{(k_i + i \cdot dH,\ k_j + j \cdot dW)} \times kernel_{(k_i,\ k_j)}$$

(Note that matrices in Armadillo are stored in column-major order.)
From the official documentation:

.memptr()
Data for matrices is stored in a column-by-column order
Data for cubes is stored in a slice-by-slice (matrix-by-matrix) order
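
A minimal sketch exercising the valid-mode overload directly:

#include <iostream>
#include <mlpack/core.hpp>
#include <mlpack/methods/ann/convolution_rules/naive_convolution.hpp>

using namespace mlpack::ann;

int main()
{
  arma::mat input(4, 4, arma::fill::randu);   // m = n = 4
  arma::mat filter(3, 3, arma::fill::randu);  // p = q = 3
  arma::mat output;

  // (4 - 3)/1 + 1 = 2 along each dimension, so the result is 2x2.
  NaiveConvolution<ValidConvolution>::Convolution(input, filter, output);
  std::cout << output.n_rows << " x " << output.n_cols << std::endl;  // 2 x 2
}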

The outer loop runs outMap over every slice of outputTemp; each time outMap reaches a nonzero multiple of outSize, batchCount is incremented and outMapIdx is reset to zero.

outMapIdx walks the slices of weight (of which there are $outSize \times inSize$).

The inner loop runs inMap over inSize, incrementing outMapIdx on every iteration, so across one batch item outMapIdx sweeps all slices of weight. Inside the loop, the convolution described above is applied between the appropriate slice of inputTemp (or of its padded counterpart) and the weight slice, and the result is accumulated into outputTemp.

batchCount keeps the batch items aligned: inputTemp has $inSize \times batchSize$ slices, so $batchCount \times inSize$ serves as the base offset and inMap as the per-map offset.

Once all input maps have been accumulated into an output map, the corresponding bias is added.

Finally, outputWidth and outputHeight are updated.
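
The index bookkeeping is easier to see with the loop unrolled over explicit batch/map indices. A runnable restatement (a sketch under simplified assumptions: stride 1, no padding; this is not the mlpack code itself):

#include <mlpack/core.hpp>
#include <mlpack/methods/ann/convolution_rules/naive_convolution.hpp>

using namespace mlpack::ann;

int main()
{
  const size_t inSize = 2, outSize = 3, batchSize = 2;
  const size_t kW = 3, kH = 3;

  arma::cube inputTemp(6, 6, inSize * batchSize, arma::fill::randu);
  arma::cube weight(kW, kH, outSize * inSize, arma::fill::randu);
  arma::vec bias(outSize, arma::fill::randu);
  arma::cube outputTemp(4, 4, outSize * batchSize, arma::fill::zeros);

  for (size_t batch = 0; batch < batchSize; ++batch)
  {
    for (size_t out = 0; out < outSize; ++out)
    {
      for (size_t in = 0; in < inSize; ++in)
      {
        // In the original loop: outMap    == batch * outSize + out,
        //                       outMapIdx == out * inSize + in,
        //                       input slice index == batch * inSize + in.
        arma::mat convOutput;
        NaiveConvolution<ValidConvolution>::Convolution(
            inputTemp.slice(batch * inSize + in),
            weight.slice(out * inSize + in), convOutput);
        outputTemp.slice(batch * outSize + out) += convOutput;
      }
      // Bias is added once per output map.
      outputTemp.slice(batch * outSize + out) += bias(out);
    }
  }
}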

Backward

Header:

  /**
   * Ordinary feed backward pass of a neural network, calculating the function
   * f(x) by propagating x backwards through f. Using the results from the feed
   * forward pass.
   *
   * @param * (input) The propagated input activation.
   * @param gy The backpropagated error.
   * @param g The calculated gradient.
   */
  template<typename eT>
  void Backward(const arma::Mat<eT>& /* input */,
                const arma::Mat<eT>& gy,
                arma::Mat<eT>& g);

Implementation:

template<
    typename ForwardConvolutionRule,
    typename BackwardConvolutionRule,
    typename GradientConvolutionRule,
    typename InputDataType,
    typename OutputDataType
>
template<typename eT>
void Convolution<
    ForwardConvolutionRule,
    BackwardConvolutionRule,
    GradientConvolutionRule,
    InputDataType,
    OutputDataType
>::Backward(
    const arma::Mat<eT>& /* input */, const arma::Mat<eT>& gy, arma::Mat<eT>& g)
{
  arma::cube mappedError(((arma::Mat<eT>&) gy).memptr(), outputWidth,
      outputHeight, outSize * batchSize, false, false);

  g.set_size(inputWidth * inputHeight * inSize, batchSize);
  gTemp = arma::Cube<eT>(g.memptr(), inputWidth, inputHeight,
      inSize * batchSize, false, false);
  gTemp.zeros();

  for (size_t outMap = 0, outMapIdx = 0, batchCount = 0; outMap <
      outSize * batchSize; outMap++)
  {
    if (outMap != 0 && outMap % outSize == 0)
    {
      batchCount++;
      outMapIdx = 0;
    }

    for (size_t inMap = 0; inMap < inSize; inMap++, outMapIdx++)
    {
      arma::Mat<eT> output, rotatedFilter;
      Rotate180(weight.slice(outMapIdx), rotatedFilter);

      BackwardConvolutionRule::Convolution(mappedError.slice(outMap),
          rotatedFilter, output, strideWidth, strideHeight);

      if (padWLeft != 0 || padWRight != 0 || padHTop != 0 || padHBottom != 0)
      {
        gTemp.slice(inMap + batchCount * inSize) += output.submat(padWLeft,
            padHTop, padWLeft + gTemp.n_rows - 1, padHTop + gTemp.n_cols - 1);
      }
      else
      {
        gTemp.slice(inMap + batchCount * inSize) += output;
      }
    }
  }
}

The double loop uses BackwardConvolutionRule::Convolution, which again defaults to NaiveConvolution; this time the full-mode overload, already listed in full above, is the one that runs.


FullConvolution simply computes suitable outputRows and outputCols, builds a zero-padded copy of the input at that size, and then delegates to the ValidConvolution overload.
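
A minimal sketch of the full-mode overload (with stride 1, a full convolution of an $m \times n$ input with a $p \times q$ filter yields $(m + p - 1) \times (n + q - 1)$):

#include <iostream>
#include <mlpack/core.hpp>
#include <mlpack/methods/ann/convolution_rules/naive_convolution.hpp>

using namespace mlpack::ann;

int main()
{
  arma::mat input(2, 2, arma::fill::randu);
  arma::mat filter(3, 3, arma::fill::randu);
  arma::mat output;

  // 2 + 3 - 1 = 4, so the result is 4x4.
  NaiveConvolution<FullConvolution>::Convolution(input, filter, output);
  std::cout << output.n_rows << " x " << output.n_cols << std::endl;  // 4 x 4
}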

Back in Backward, the overall flow mirrors Forward: temporary cubes are mapped over the error (gy) and the gradient (g), each weight slice is rotated by $180^{\circ}$, convolved (full mode) with the error, and the result is accumulated into the gradient cube, cropping away the padding region when padding was used.
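
The Rotate180 helper used here is, in essence, a flip along both axes; an equivalent sketch in plain Armadillo:

#include <armadillo>

template<typename eT>
void Rotate180(const arma::Mat<eT>& input, arma::Mat<eT>& output)
{
  // Flipping up-down and then left-right rotates the matrix by 180 degrees.
  output = arma::fliplr(arma::flipud(input));
}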

Gradient

Header:

  /*
   * Calculate the gradient using the output delta and the input activation.
   *
   * @param input The input parameter used for calculating the gradient.
   * @param error The calculated error.
   * @param gradient The calculated gradient.
   */
  template<typename eT>
  void Gradient(const arma::Mat<eT>& /* input */,
                const arma::Mat<eT>& error,
                arma::Mat<eT>& gradient);

Implementation:

template<
    typename ForwardConvolutionRule,
    typename BackwardConvolutionRule,
    typename GradientConvolutionRule,
    typename InputDataType,
    typename OutputDataType
>
template<typename eT>
void Convolution<
    ForwardConvolutionRule,
    BackwardConvolutionRule,
    GradientConvolutionRule,
    InputDataType,
    OutputDataType
>::Gradient(
    const arma::Mat<eT>& input,
    const arma::Mat<eT>& error,
    arma::Mat<eT>& gradient)
{
  arma::cube mappedError(((arma::Mat<eT>&) error).memptr(), outputWidth,
      outputHeight, outSize * batchSize, false, false);
  arma::cube inputTemp(((arma::Mat<eT>&) input).memptr(), inputWidth,
      inputHeight, inSize * batchSize, false, false);

  gradient.set_size(weights.n_elem, 1);
  gradientTemp = arma::Cube<eT>(gradient.memptr(), weight.n_rows,
      weight.n_cols, weight.n_slices, false, false);
  gradientTemp.zeros();

  for (size_t outMap = 0, outMapIdx = 0, batchCount = 0; outMap <
      outSize * batchSize; outMap++)
  {
    if (outMap != 0 && outMap % outSize == 0)
    {
      batchCount++;
      outMapIdx = 0;
    }

    for (size_t inMap = 0; inMap < inSize; inMap++, outMapIdx++)
    {
      arma::Mat<eT> inputSlice;
      if (padWLeft != 0 || padWRight != 0 || padHTop != 0 || padHBottom != 0)
      {
        inputSlice = inputPaddedTemp.slice(inMap + batchCount * inSize);
      }
      else
      {
        inputSlice = inputTemp.slice(inMap + batchCount * inSize);
      }

      arma::Mat<eT> deltaSlice = mappedError.slice(outMap);

      arma::Mat<eT> output;
      GradientConvolutionRule::Convolution(inputSlice, deltaSlice,
          output, strideWidth, strideHeight);

      if (gradientTemp.n_rows < output.n_rows ||
          gradientTemp.n_cols < output.n_cols)
      {
        gradientTemp.slice(outMapIdx) += output.submat(0, 0,
            gradientTemp.n_rows - 1, gradientTemp.n_cols - 1);
      }
      else if (gradientTemp.n_rows > output.n_rows ||
          gradientTemp.n_cols > output.n_cols)
      {
        gradientTemp.slice(outMapIdx).submat(0, 0, output.n_rows - 1,
            output.n_cols - 1) += output;
      }
      else
      {
        gradientTemp.slice(outMapIdx) += output;
      }
    }

    gradient.submat(weight.n_elem + (outMap % outSize), 0, weight.n_elem +
        (outMap % outSize), 0) = arma::accu(mappedError.slice(outMap));
  }
}

First the input slice and the error slice are convolved.
The result is then accumulated into gradientTemp, offset or cropped as needed so that no index goes out of bounds.
After each output map, the bias entry of gradient is set (note the assignment, not accumulation) to the sum of the elements of the corresponding mappedError slice.
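
Putting the three passes together on a standalone layer (a minimal sketch; Parameters() and Reset() are the accessors quoted earlier, and the final check reflects the weights-then-bias layout of the flat gradient):

#include <iostream>
#include <mlpack/core.hpp>
#include <mlpack/methods/ann/layer/convolution.hpp>

using namespace mlpack::ann;

int main()
{
  // 1 input map, 2 output maps, 3x3 kernels, stride 1, no padding, 8x8 input.
  Convolution<> conv(1, 2, 3, 3, 1, 1, 0, 0, 8, 8);
  conv.Parameters().randu();
  conv.Reset();  // alias weight and bias onto the flat weights blob

  arma::mat input(8 * 8, 1, arma::fill::randu), output;
  conv.Forward(input, output);  // (8 - 3)/1 + 1 = 6, so output is (6*6*2) x 1

  arma::mat error(output.n_rows, 1, arma::fill::randu), gradient;
  conv.Gradient(input, error, gradient);

  // First 1*2*3*3 = 18 rows: kernel gradients; last 2 rows: bias gradients.
  std::cout << gradient.n_rows << std::endl;  // prints 20
}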

Test

iris

#include <iostream>
#include <mlpack/core.hpp>
#include <mlpack/methods/ann/layer/layer.hpp>
#include <mlpack/methods/ann/ffn.hpp>
#include <ensmallen.hpp>

using namespace std;
using namespace arma;
using namespace mlpack;
using namespace mlpack::ann;

void ffn_test()
{
    // load data
    mat train_data;
    mat train_labels;
    mat test_data;
    mat test_labels;

    mlpack::data::Load("/home/aurainting/下载/mlpack-3.4.2/build/iris_train.csv", train_data);
    mlpack::data::Load("/home/aurainting/下载/mlpack-3.4.2/build/iris_train_labels.csv", train_labels);
    mlpack::data::Load("/home/aurainting/下载/mlpack-3.4.2/build/iris_test.csv", test_data);
    mlpack::data::Load("/home/aurainting/下载/mlpack-3.4.2/build/iris_test_labels.csv", test_labels);

    // build model
    FFN<> model;
    model.Add<Linear<>>(train_data.n_rows, 6);
    model.Add<ReLULayer<>>();
    model.Add<Linear<>>(6, 4);
    model.Add<ReLULayer<>>();
    model.Add<Linear<>>(4, 3);
    model.Add<LogSoftMax<>>();

    // train
    model.Train<ens::Adam>(train_data, train_labels + 1, ens::ProgressBar());

    // predict
    mat res;
    model.Predict(test_data, res);
    mat pred(1, test_labels.n_cols);
    for (size_t i = 0; i < res.n_cols; ++i)
        pred(0, i) = arma::index_max(res.col(i));
    cout << "accuracy: "
         << static_cast<double>(arma::accu(pred == test_labels)) / test_labels.n_cols << endl;
}


int main()
{
    ffn_test();
}

Result:
[Screenshot: console output with the final iris test accuracy.]

mnist

#include <iostream>
#include <fstream>
#include <string>
#include <mlpack/core.hpp>
#include <mlpack/methods/ann/layer/layer.hpp>
#include <mlpack/methods/ann/ffn.hpp>
#include <ensmallen.hpp>

using namespace std;
using namespace arma;
using namespace mlpack;
using namespace mlpack::ann;

// MNIST stores its header integers big-endian; reverseInt swaps the bytes so
// they read correctly on a little-endian machine.
int reverseInt(int i)
{
    unsigned char ch1, ch2, ch3, ch4;
    ch1 = i & 255;
    ch2 = (i >> 8) & 255;
    ch3 = (i >> 16) & 255;
    ch4 = (i >> 24) & 255;
    return ((int)ch1 << 24) + ((int)ch2 << 16) + ((int)ch3 << 8) + ch4;
}

void read_mnist_labels(const string& filepath, mat& labels)
{
    ifstream file(filepath, ios::binary);
    if (file.is_open()) {
        int magic_number = 0;
        int number_of_items = 0;

        file.read((char*)&magic_number, sizeof (magic_number));
        file.read((char*)&number_of_items, sizeof (number_of_items));

        magic_number = reverseInt(magic_number);
        number_of_items = reverseInt(number_of_items);

        labels.resize(1, number_of_items);
        for (int i = 0; i < number_of_items; ++i) {
            unsigned char label = 0;
            file.read((char*)&label, sizeof (label));
            labels(0, i) = label;
        }
    }
}

void read_mnist_images(const string& filepath, mat& images)
{
    ifstream file(filepath, ios::binary);
    if (file.is_open()) {
        int magic_number = 0;
        int number_of_images = 0;
        int n_rows = 0;
        int n_cols = 0;

        file.read((char*)&magic_number, sizeof (magic_number));
        file.read((char*)&number_of_images, sizeof (number_of_images));
        file.read((char*)&n_rows, sizeof (n_rows));
        file.read((char*)&n_cols, sizeof (n_cols));

        magic_number = reverseInt(magic_number);
        number_of_images = reverseInt(number_of_images);
        n_rows = reverseInt(n_rows);
        n_cols = reverseInt(n_cols);

        images.reshape(n_rows * n_cols, number_of_images);
        for (int i = 0; i < number_of_images; ++i)
            for (int j = 0; j < n_rows * n_cols; ++j) {
                unsigned char pixel = 0;
                file.read((char*)&pixel, sizeof (pixel));
                images(j, i) = pixel;
            }
    }
}

void ffn_test()
{
    // load data
    string train_labels_path = "/home/aurainting/文档/data/mnist/train-labels-idx1-ubyte";
    string train_images_path = "/home/aurainting/文档/data/mnist/train-images-idx3-ubyte";
    string test_labels_path = "/home/aurainting/文档/data/mnist/t10k-labels-idx1-ubyte";
    string test_images_path = "/home/aurainting/文档/data/mnist/t10k-images-idx3-ubyte";

    mat train_labels;
    mat test_labels;
    mat train_images;
    mat test_images;

    read_mnist_labels(train_labels_path, train_labels);
    read_mnist_labels(test_labels_path, test_labels);
    read_mnist_images(train_images_path, train_images);
    read_mnist_images(test_images_path, test_images);

    // normalize
    uword nPoints = train_images.n_cols;
    for (uword i = 0; i < nPoints; ++i)
        train_images.col(i) /= norm(train_images.col(i), 2);
    nPoints = test_images.n_cols;
    for (uword i = 0; i < nPoints; ++i)
        test_images.col(i) /= norm(test_images.col(i), 2);

    // build model
    FFN<> model;
    model.Add<Convolution<>>(1, 8, 5, 5, 1, 1, 0, 0, 28, 28);
    model.Add<ReLULayer<>>();
    model.Add<MaxPooling<>>(8, 8, 2, 2);
    model.Add<Convolution<>>(8, 12, 2, 2);
    model.Add<ReLULayer<>>();
    model.Add<MaxPooling<>>(2, 2, 2, 2);
    model.Add<Linear<>>(192, 32);
    model.Add<ReLULayer<>>();
    model.Add<Linear<>>(32, 10);
    model.Add<LogSoftMax<>>();

    // train
    ens::Adam opt(0.001, 8, 0.9, 0.999, 1e-8, 8 * train_images.n_cols);
    model.Train<ens::Adam>(train_images, train_labels + 1, opt, ens::ProgressBar());

    // predict
    mat results;
    model.Predict(test_images, results);
    mat pred(1, results.n_cols);
    for (size_t i = 0; i < results.n_cols; ++i)
        pred(0, i) = arma::index_max(results.col(i));
    cout << "accuracy: "
         << static_cast<double>(arma::accu(pred == test_labels)) / test_labels.n_cols << endl;
}


int main()
{
    ffn_test();
}

Result:
[Screenshot: console output with the final MNIST test accuracy.]

Reference

Artificial Neural Network
Armadillo
