今天来看一下全连接层的代码。首先,我们要知道全连接层在做什么。先来看一下caffe的官方文档,介绍如下:
可以看到,输入为n c h w,输出为n c_o 1 1
那么,它究竟做了什么呢?
假设conv2的输入是256*27*27,那么conv2的输出即50*22*22,conv2的输出即
pool2的输入,pool2的输出为50*11*11,即ip1的输入,ip1的输出为500*1*1,那
么pool2->ip1的参数个数是多少呢?这里就要理解好什么是fully_connected了,
即$w^T x$,x为列向量,w的长度与x相同。在本文的例子中x的维度为50*11*11,那么
pool2->ip1的参数个数为500*50*11*11。50*11*11即是一个有50个通道、大小为
11*11的图片,那么在做全连接(可以看作覆盖整张图的卷积)的时候,需要对所有通道一起作运算,即把图
片转化成一个长度为50*11*11的向量
上段话转自http://www.cnblogs.com/dupuleng/articles/4312149.html
我们再来看一下它的头文件,其中有以下参数:
同大部分layer层一样,它也必须实现setup、reshape、Forward_cpu、Backward_cpu。
* 其中,setup定义参数:M_样本个数、K_单个样本特征长度、N_全连接之后神经元的个数。
* Forward_cpu,主要是计算 y = x*W' + b,x表示输入,y表示输出。x的维度为 M_*K_,y的维度为 M_*N_,W为权重,维度 N_*K_,W_diff(权重的梯度)维度也为 N_*K_,b为偏置,维度 N_*1。
* Backward_cpu:反向传播计算权重和偏置的梯度(W_diff、b_diff),以及向下层传递的梯度bottom_diff(即delta)。
我们来具体看一下代码:
#include <vector>

#include "caffe/blob.hpp"
#include "caffe/common.hpp"
#include "caffe/filler.hpp"
#include "caffe/layer.hpp"
#include "caffe/util/math_functions.hpp"
#include "caffe/vision_layers.hpp"
namespace caffe {
/*
输入层:(M_, K_, 1, 1);
输出层: (M_, N_, 1, 1);
W矩阵:(N_,K_,1,1);
b矩阵:(N_,1,1,1);
M_样本个数,K_单个样本特征长度,N_全连接之后神经元的个数。
*/
template <typename Dtype>
void InnerProductLayer::LayerSetUp(const vector *>& bottom,
const vector *>& top) {
//通过读取配置proto文件获得输出神经元的个数及是否使用偏置项
const int num_output = this->layer_param_.inner_product_param().num_output();
bias_term_ = this->layer_param_.inner_product_param().bias_term();
//全连接之后输出的神经元的个数
N_ = num_output;
//全连接层输出的Blob维数为样本的个数*输出神经元的个数*1*1(M*N)
//这里axis=1,即从C开始展开,即,CHW
//输出:n_1 * (c_1 + c_2 + ... + c_K) * h * w
const int axis = bottom[0]->CanonicalAxisIndex(
this->layer_param_.inner_product_param().axis());
// Dimensions starting from "axis" are "flattened" into a single
// length K_ vector. For example, if bottom[0]'s shape is (N, C, H, W),
// and axis == 1, N inner products with dimension CHW are performed.
//表示单个样本的特征长度,
K_ = bottom[0]->count(axis);
// Check if we need to set up the weights
if (this->blobs_.size() > 0) {
LOG(INFO) << "Skipping parameter initialization";
} else {
//如果配置文件使用偏置项,则开辟2个Blob类智能指针,否则开辟一个
if (bias_term_) {
this->blobs_.resize(2);
} else {
this->blobs_.resize(1);
}
// Intialize the weight
vector<int> weight_shape(2);
//lobs_[0]指向权重矩阵,blobs_[1]指向偏置矩阵 ,全连接层,形状为N_*K_*1*1
weight_shape[0] = N_;
weight_shape[1] = K_;
//新开辟一个Blob,指针返回给blobs_[0],weight_shape[2]为刚刚初始化的;
this->blobs_[0].reset(new Blob(weight_shape));
// fill the weights
//根据配置文件中的权重核( weight_filler )的类型初始化填充权重矩阵blobs_[0];
shared_ptr > weight_filler(GetFiller(
this->layer_param_.inner_product_param().weight_filler()));
weight_filler->Fill(this->blobs_[0].get());
// If necessary, intiialize and fill the bias term
//填充偏置矩阵blobs_[1],每个输出单元对应一个偏置,共N_个
if (bias_term_) {
vector<int> bias_shape(1, N_);
this->blobs_[1].reset(new Blob(bias_shape));
shared_ptr > bias_filler(GetFiller(
this->layer_param_.inner_product_param().bias_filler()));
bias_filler->Fill(this->blobs_[1].get());
}
} // parameter initialization
this->param_propagate_down_.resize(this->blobs_.size(), true);
}
template <typename Dtype>
//一批次处理多个样本,在每一批次中权重矩阵与偏置矩阵是不变的
void InnerProductLayer::Reshape(const vector *>& bottom,
const vector *>& top) {
// Figure out the dimensions
const int axis = bottom[0]->CanonicalAxisIndex(
this->layer_param_.inner_product_param().axis());
const int new_K = bottom[0]->count(axis);
CHECK_EQ(K_, new_K)
<< "Input size incompatible with inner product parameters.";
// The first "axis" dimensions are independent inner products; the total
// number of these is M_, the product over these dimensions.
M_ = bottom[0]->count(0, axis);
// The top shape will be the bottom shape with the flattened axes dropped,
// and replaced by a single axis with dimension num_output (N_).
vector<int> top_shape = bottom[0]->shape();
top_shape.resize(axis + 1);
top_shape[axis] = N_;
top[0]->Reshape(top_shape);
// Set up the bias multiplier
if (bias_term_) {
vector<int> bias_shape(1, M_);
bias_multiplier_.Reshape(bias_shape);
caffe_set(M_, Dtype(1), bias_multiplier_.mutable_cpu_data());
}
}
// 计算y=W'*x + b, X表示输入,y表示输出
// x为输入,维度 M_*K_
// y为输出,维度 M_*N_
// W为权重,维度 N_*K_, W_diff权重的梯度维度也为N_*K_
// b为偏置,维度 N_*1_
template <typename Dtype>
void InnerProductLayer::Forward_cpu(const vector *>& bottom,
const vector *>& top) {
const Dtype* bottom_data = bottom[0]->cpu_data();
Dtype* top_data = top[0]->mutable_cpu_data();
const Dtype* weight = this->blobs_[0]->cpu_data();//内存中的权重矩阵是N*K
//caffe_cpu_gemm, C←αA×B+βC,前两个参数控制A,B是否转置
//其中A(bottom_data)维度是M_xK_,B(weight')维度是K_xN_,C(top_data)维度为M_xN_
//最终 y = X*W', 维度为 M_xN_
caffe_cpu_gemm(CblasNoTrans, CblasTrans, M_, N_, K_, (Dtype)1.,
bottom_data, weight, (Dtype)0., top_data);
// 表示y= y + b (bias_multiplier维度为M_*1, b为1*N_(b实际上是N_*1,但是存储方式与1*N_等价,
// top_data为M_*N_)
// 实际是相当于将b复制成了M_*N_的矩阵,类似matlab的repmat(b, [M_, 1]),然后和top_data相加
if (bias_term_) {
caffe_cpu_gemm(CblasNoTrans, CblasNoTrans, M_, N_, 1, (Dtype)1.,
bias_multiplier_.cpu_data(),
this->blobs_[1]->cpu_data(), (Dtype)1., top_data);
}
}
template <typename Dtype>
void InnerProductLayer::Backward_cpu(const vector *>& top,
const vector<bool>& propagate_down,
const vector *>& bottom) {
//data传递的是数据,diff传递的是梯度,top_diff的维度是N*M,每一列代表一个样本的error term
if (this->param_propagate_down_[0]) {
const Dtype* top_diff = top[0]->cpu_diff();
const Dtype* bottom_data = bottom[0]->cpu_data();
// Gradient with respect to weight
//A(top_diff'):N_*M_, B(bottom_data):M_*K_, C(W_diff):N_*K_
//W_diff = top_diff' * bottom_data
caffe_cpu_gemm(CblasTrans, CblasNoTrans, N_, K_, M_, (Dtype)1.,
top_diff, bottom_data, (Dtype)0., this->blobs_[0]->mutable_cpu_diff());
}
if (bias_term_ && this->param_propagate_down_[1]) {
const Dtype* top_diff = top[0]->cpu_diff();
// Gradient with respect to bias
// top_diff(M_*N_), bias_multiplier(M_*1), b_diff(N_1)
// b_diff = top_diff' * bias_multiplier, 注意和gemm接口的区别
caffe_cpu_gemv(CblasTrans, M_, N_, (Dtype)1., top_diff,
bias_multiplier_.cpu_data(), (Dtype)0.,
this->blobs_[1]->mutable_cpu_diff());
}
if (propagate_down[0]) {
const Dtype* top_diff = top[0]->cpu_diff();
// Gradient with respect to bottom data
// A(top_diff) M_*N_ , B(weight) N_*K_, C(bottom_diff) M_*K_
// bottom_diff = top_diff * weight
caffe_cpu_gemm(CblasNoTrans, CblasNoTrans, M_, K_, N_, (Dtype)1.,
top_diff, this->blobs_[0]->cpu_data(), (Dtype)0.,
bottom[0]->mutable_cpu_diff());
}
}
// Only when built with CPU_ONLY defined: STUB_GPU presumably supplies
// placeholder GPU entry points for the layer, since this file defines only the
// CPU paths -- TODO confirm against the macro definition.
#ifdef CPU_ONLY
STUB_GPU(InnerProductLayer);
#endif
// NOTE(review): by their names, these macros instantiate the class template
// and register the layer under the name "InnerProduct"; their definitions are
// not in this file -- confirm there.
INSTANTIATE_CLASS(InnerProductLayer);
REGISTER_LAYER_CLASS(InnerProduct);
} // namespace caffe
这里有三点要介绍一下:
(1)axis的理解,这里,在官方文档中介绍如下:
在这里,axis=1,也就是说,我们从c开始展开,这里n代表的是样本个数。
(2)这里涉及到一个函数,caffe_cpu_gemm,它的实现如下:
/**
 * @brief Single-precision matrix multiply: C <- alpha * op(A) * op(B) + beta * C.
 *
 * All matrices are row-major. op(A) is M x K, op(B) is K x N and C is M x N,
 * where op(X) is X or X^T depending on TransA / TransB.
 *
 * @param TransA whether A is transposed before the multiply.
 * @param TransB whether B is transposed before the multiply.
 * @param M      rows of op(A) and of C.
 * @param N      columns of op(B) and of C.
 * @param K      columns of op(A) and rows of op(B).
 * @param alpha  scale applied to the product op(A) * op(B).
 * @param A      left operand.
 * @param B      right operand.
 * @param beta   scale applied to the existing contents of C.
 * @param C      result matrix, updated in place.
 */
template <>
void caffe_cpu_gemm<float>(const CBLAS_TRANSPOSE TransA,
    const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
    const float alpha, const float* A, const float* B, const float beta,
    float* C) {
  // Leading dimension = row length of the matrix as stored:
  // A is stored M x K when not transposed, K x M otherwise (likewise for B).
  int lda = (TransA == CblasNoTrans) ? K : M;
  int ldb = (TransB == CblasNoTrans) ? N : K;
  // C is always stored M x N, so its leading dimension is N.
  cblas_sgemm(CblasRowMajor, TransA, TransB, M, N, K, alpha, A, lda, B,
      ldb, beta, C, N);
}
也就是矩阵相乘后相加,功能: C ← alpha*A*B + beta*C。This function multiplies A * B (after transposing A and/or B, if requested) and multiplies the resulting matrix by alpha. It then multiplies matrix C by beta. It stores the sum of these two products in matrix C.
(3)caffe_cpu_gemv,是矩阵与向量相乘,功能: y ← alpha*A*x + beta*y。This function multiplies A * x (after transposing A, if needed) and multiplies the resulting vector by alpha. It then multiplies vector y by beta. It stores the sum of these two products in vector y.