提示:文章写完后,目录可以自动生成,如何生成可参考右边的帮助文档
我在onnx模型转caffe模型时发现,caffe缺少一些onnx的算子。比如矩阵乘法算子,caffe默认是通过InnerProductLayer来实现。但是,InnerProductLayer的实现方式为输入一个矩阵X,全连接层提供权重矩阵W(这个权重矩阵W我们不能输入,为算子内部自动提供的parameter blob),之后输出Y(Y = WX),这与我们希望的矩阵乘法的实现方式有所不同。我们所希望的矩阵乘法应该满足两个输入X与W,通过算子得到输出Y,与Y = np.multiply(X,W)相似。
总言而之,caffe默认的矩阵乘法算子InnerProductLayer与我们所需要的乘法算子不同。不点在于:
假设损失函数(loss function)为 I \Iota I : R M × N R^{M×N} RM×N → R. 此损失函数的输入为 Z \Zeta Z: ∈ \in ∈ R M × N R^{M×N} RM×N, Z \Zeta Z = XY, 其中X与Y为矩阵乘法的输入,X ∈ \in ∈ R M × K R^{M×K} RM×K, Y ∈ \in ∈ R K × N R^{K×N} RK×N, Z \Zeta Z 就自然对应为矩阵乘法的输出。当神经网络做向后传递时(backward propagation), 损失函数的偏导数(partial derivatives)为 ∂ l ( Z ) ∂ X {∂ l(Z)}\over{∂X} ∂X∂l(Z) + ∂ l ( Z ) ∂ Y {∂ l(Z)}\over{∂Y} ∂Y∂l(Z)。所以,我们只要能够求解 ∂ l ( Z ) ∂ X {∂ l(Z)}\over{∂X} ∂X∂l(Z) ∈ \in ∈ R M × K R^{M×K} RM×K与 ∂ l ( Z ) ∂ Y {∂ l(Z)}\over{∂Y} ∂Y∂l(Z) ∈ \in ∈ R K × N R^{K×N} RK×N,那么就能够成功推导出反向传递的公式。
首先看 ∂ l ( Z ) ∂ X {∂ l(Z)}\over{∂X} ∂X∂l(Z), 通过链式法则(chain rule)可得: ∂ l ( Z ) ∂ X {∂ l(Z)}\over{∂X} ∂X∂l(Z) = ∂ l ( Z ) ∂ Z {∂ l(Z)}\over{∂Z} ∂Z∂l(Z) ∂ Z ∂ X {∂Z}\over{∂X} ∂X∂Z = ∂ l ( Z ) ∂ Z {∂ l(Z)}\over{∂Z} ∂Z∂l(Z) ∂ ( X Y ) ∂ X {∂(XY)}\over{∂X} ∂X∂(XY) = ∂ l ( Z ) ∂ Z {∂ l(Z)}\over{∂Z} ∂Z∂l(Z) Y T Y^{T} YT。 其中 ∂ l ( Z ) ∂ X {∂ l(Z)}\over{∂X} ∂X∂l(Z) ∈ \in ∈ R M × K R^{M×K} RM×K, ∂ l ( Z ) ∂ Z {∂ l(Z)}\over{∂Z} ∂Z∂l(Z) ∈ \in ∈ R M × N R^{M×N} RM×N, Y T Y^{T} YT ∈ \in ∈ R N × K R^{N×K} RN×K,上述等式结构上成立。如果将这个公式翻译为caffe代码,就是X.differential = Top.differential * Y.data.transpose。
同理, ∂ l ( Z ) ∂ Y {∂ l(Z)}\over{∂Y} ∂Y∂l(Z) = ∂ l ( Z ) ∂ Z {∂ l(Z)}\over{∂Z} ∂Z∂l(Z) ∂ Z ∂ Y {∂Z}\over{∂Y} ∂Y∂Z = ∂ l ( Z ) ∂ Z {∂ l(Z)}\over{∂Z} ∂Z∂l(Z) ∂ ( X Y ) ∂ Y {∂(XY)}\over{∂Y} ∂Y∂(XY) = X T X^{T} XT ∂ l ( Z ) ∂ Z {∂ l(Z)}\over{∂Z} ∂Z∂l(Z)。其中 ∂ l ( Z ) ∂ Y {∂ l(Z)}\over{∂Y} ∂Y∂l(Z) ∈ \in ∈ R K × N R^{K×N} RK×N, X T X^{T} XT ∈ \in ∈ R K × M R^{K×M} RK×M, ∂ l ( Z ) ∂ Z {∂ l(Z)}\over{∂Z} ∂Z∂l(Z) ∈ \in ∈ R M × N R^{M×N} RM×N,上述等式结构上成立。接着翻译为caffe代码,为Y.differential = X.data.transpose * Top.differential。
void MatMulLayer::LayerSetUp(const vector
const vector
LOG_IF(INFO, Caffe::root_solver()) << "begin setting up";
// use axis to check two bottom block if satisfied standard
const int xaxes = bottom[0]->num_axes();
LOG_IF(INFO, Caffe::root_solver()) << "xaxes" << xaxes;
const int yaxes = bottom[1]->num_axes();
LOG_IF(INFO, Caffe::root_solver()) << "yaxes" << yaxes;
CHECK(xaxes>=2)
<< "X blob must be of shape (B1xB2...xBn,M,K) or (M,K)!";
CHECK(yaxes>=2)
<< "y blob must be of shape (B1xB2...xBn,K,N) or (K,N)!";
CHECK(xaxes==yaxes)
<< "X blob must hold same shape as yaxes";
// get M, K and N
const int xK_ = bottom[0]->shape(-1);
const int yK_ = bottom[1]->shape(-2);
CHECK(xK_ == yK_)
<< "demension of X is incomplicatiable with y, not match formula of mat mul ";
M_ = bottom[0]->shape(-2);
K_ = xK_;
N_ = bottom[1]->shape(-1);
}
void MatMulLayer::Reshape(const vector
const vector
// create top block
const int end_index = bottom[0]->num_axes() - 1;
vector top_shape = bottom[0]->shape();
LOG_IF(INFO, Caffe::root_solver()) << "bottom shape end" << top_shape[end_index];
LOG_IF(INFO, Caffe::root_solver()) << "bottom shape end+1" << top_shape[end_index-1];
N_ = bottom[1]->shape(-1);
top_shape[end_index] = N_;
LOG_IF(INFO, Caffe::root_solver()) << "top shape end" << top_shape[end_index];
LOG_IF(INFO, Caffe::root_solver()) << "top shape end+1" << top_shape[end_index-1];
top[0]->Reshape(top_shape);
}
void MatMulLayer::Forward_cpu(const vector
const vector
// get matrix1, matrix2 and output
LOG_IF(INFO, Caffe::root_solver()) << "begin forward";
const Dtype* bottom_matrix1 = bottom[0]->cpu_data();
const Dtype* bottom_matrix2 = bottom[1]->cpu_data();
Dtype* top_data = top[0]->mutable_cpu_data();
// address to skip when passing each 2 * 2matrix
const int botm1_stride = M_ * K_;
const int botm2_stride = K_ * N_;
const int top_stride = M_ * N_;
int batch_size = 1;
// number of batch size if exist
const int xaxes = bottom[0]->num_axes();
if(xaxes > 2){
batch_size = top[0]->count(0, top[0]->CanonicalAxisIndex(-2));
}
for(int batch=0; batch(CblasNoTrans,
CblasNoTrans,
M_, N_, K_,
(Dtype)1.,
bottom_matrix1 + botm1_stride * batch,
bottom_matrix2 + botm2_stride * batch,
(Dtype)0.,
top_data + top_stride * batch );
}
}
void MatMulLayer::Backward_cpu(const vector
const vector& propagate_down,
const vector
LOG_IF(INFO, Caffe::root_solver()) << "begin backward";
// address to skip when passing each 2 * 2matrix
const int botm1_stride = M_ * K_;
const int botm2_stride = K_ * N_;
const int top_stride = M_ * N_;
int batch_size = 1;
// number of batch size if exist
const int xaxes = bottom[0]->num_axes();
if(xaxes > 2){
batch_size = top[0]->count(0, top[0]->CanonicalAxisIndex(-2));
}
const Dtype* top_diff = top[0]->cpu_diff();
const Dtype* bottom_1 = bottom[0]->cpu_data();
const Dtype* bottom_2 = bottom[1]->cpu_data();
Dtype* botm_1_diff = bottom[0]->mutable_cpu_diff();
Dtype* botm_2_diff = bottom[1]->mutable_cpu_diff();
if(propagate_down[0]){
// X.diff = Z.diff∗Y.data.transpose()
for(int batch=0; batch(CblasNoTrans,
CblasTrans,
M_, K_, N_,
(Dtype)1.,
top_diff + top_stride * batch,
bottom_2 + botm2_stride * batch,
(Dtype)0.,
botm_1_diff + botm1_stride * batch );
}
}
if(propagate_down[1]){
// Y.diff = X.data.transpose()∗Z.diff
for(int batch=0; batch(CblasTrans,
CblasNoTrans,
K_, N_, M_,
(Dtype)1.,
bottom_1 + botm1_stride * batch,
top_diff + top_stride * batch,
(Dtype)0.,
botm_2_diff + botm2_stride * batch );
}
}
}
#include
#include “caffe/filler.hpp”
#include “caffe/layers/mat_mul_layer.hpp”
#include “caffe/util/math_functions.hpp”
namespace caffe {
template
void MatMulLayer::LayerSetUp(const vector
const vector
LOG_IF(INFO, Caffe::root_solver()) << "begin setting up";
// use axis to check two bottom block if satisfied standard
const int xaxes = bottom[0]->num_axes();
LOG_IF(INFO, Caffe::root_solver()) << "xaxes" << xaxes;
const int yaxes = bottom[1]->num_axes();
LOG_IF(INFO, Caffe::root_solver()) << "yaxes" << yaxes;
CHECK(xaxes>=2)
<< "X blob must be of shape (B1xB2...xBn,M,K) or (M,K)!";
CHECK(yaxes>=2)
<< "y blob must be of shape (B1xB2...xBn,K,N) or (K,N)!";
CHECK(xaxes==yaxes)
<< "X blob must hold same shape as yaxes";
// get M, K and N
const int xK_ = bottom[0]->shape(-1);
const int yK_ = bottom[1]->shape(-2);
CHECK(xK_ == yK_)
<< "demension of X is incomplicatiable with y, not match formula of mat mul ";
M_ = bottom[0]->shape(-2);
K_ = xK_;
N_ = bottom[1]->shape(-1);
}
template
void MatMulLayer::Reshape(const vector
const vector
// create top block
const int end_index = bottom[0]->num_axes() - 1;
vector top_shape = bottom[0]->shape();
LOG_IF(INFO, Caffe::root_solver()) << "bottom shape end" << top_shape[end_index];
LOG_IF(INFO, Caffe::root_solver()) << "bottom shape end+1" << top_shape[end_index-1];
N_ = bottom[1]->shape(-1);
top_shape[end_index] = N_;
LOG_IF(INFO, Caffe::root_solver()) << "top shape end" << top_shape[end_index];
LOG_IF(INFO, Caffe::root_solver()) << "top shape end+1" << top_shape[end_index-1];
top[0]->Reshape(top_shape);
}
template
void MatMulLayer::Forward_cpu(const vector
const vector
// get matrix1, matrix2 and output
LOG_IF(INFO, Caffe::root_solver()) << "begin forward";
const Dtype* bottom_matrix1 = bottom[0]->cpu_data();
const Dtype* bottom_matrix2 = bottom[1]->cpu_data();
Dtype* top_data = top[0]->mutable_cpu_data();
// address to skip when passing each 2 * 2matrix
const int botm1_stride = M_ * K_;
const int botm2_stride = K_ * N_;
const int top_stride = M_ * N_;
int batch_size = 1;
// number of batch size if exist
const int xaxes = bottom[0]->num_axes();
if(xaxes > 2){
batch_size = top[0]->count(0, top[0]->CanonicalAxisIndex(-2));
}
for(int batch=0; batch(CblasNoTrans,
CblasNoTrans,
M_, N_, K_,
(Dtype)1.,
bottom_matrix1 + botm1_stride * batch,
bottom_matrix2 + botm2_stride * batch,
(Dtype)0.,
top_data + top_stride * batch );
}
}
template
void MatMulLayer::Backward_cpu(const vector
const vector& propagate_down,
const vector
LOG_IF(INFO, Caffe::root_solver()) << "begin backward";
// address to skip when passing each 2 * 2matrix
const int botm1_stride = M_ * K_;
const int botm2_stride = K_ * N_;
const int top_stride = M_ * N_;
int batch_size = 1;
// number of batch size if exist
const int xaxes = bottom[0]->num_axes();
if(xaxes > 2){
batch_size = top[0]->count(0, top[0]->CanonicalAxisIndex(-2));
}
const Dtype* top_diff = top[0]->cpu_diff();
const Dtype* bottom_1 = bottom[0]->cpu_data();
const Dtype* bottom_2 = bottom[1]->cpu_data();
Dtype* botm_1_diff = bottom[0]->mutable_cpu_diff();
Dtype* botm_2_diff = bottom[1]->mutable_cpu_diff();
if(propagate_down[0]){
// X.diff = Z.diff∗Y.data.transpose()
for(int batch=0; batch(CblasNoTrans,
CblasTrans,
M_, K_, N_,
(Dtype)1.,
top_diff + top_stride * batch,
bottom_2 + botm2_stride * batch,
(Dtype)0.,
botm_1_diff + botm1_stride * batch );
}
}
if(propagate_down[1]){
// Y.diff = X.data.transpose()∗Z.diff
for(int batch=0; batch(CblasTrans,
CblasNoTrans,
K_, N_, M_,
(Dtype)1.,
bottom_1 + botm1_stride * batch,
top_diff + top_stride * batch,
(Dtype)0.,
botm_2_diff + botm2_stride * batch );
}
}
}
#ifdef CPU_ONLY
STUB_GPU(MatMulLayer);
#endif
INSTANTIATE_CLASS(MatMulLayer);
REGISTER_LAYER_CLASS(MatMul);
}
以上就caffe矩阵乘法算子的是全部内容了。gpu的实现和cpu一致,只是把一些函数名称的cpu换成gpu因此不再过多赘述。