- InfogainLossLayer类
- EuclideanLossLayer类
- HingeLossLayer类
- ContrastiveLossLayer类
InfogainLossLayer类简介
InfogainLossLayer与SoftmaxWithLossLayer类似,只不过增加了一个信息增益矩阵\(H\),用于指定某真实类别的数据被预测为某一类别时的权重,常用于类间样本数不均衡的情况。当矩阵\(H\)为单位矩阵时,等同于SoftmaxWithLossLayer。
- 第一个输入blob为网络的预测值,大小\(\tilde{N} \times C \times \tilde H \times \tilde W\),范围\(x_{n,k} \in [-\infty, +\infty]\)。计算loss时使用softmax函数值作为其概率,\(\hat{p}_{n,k} = \frac{e^{x_{n,k}}}{\sum\limits_{k'=1}^{K} e^{x_{n,k'}}}\)。
- 后续假设计算softmax时是沿着第1维(维度\(C\))进行的,则维度\(C\)的大小即为类别总数\(K\),数据的总个数为外部个数(对应代码中的`outer_num_`)乘上内部个数(对应代码中的`inner_num_`),即\(N=\tilde N * \tilde H * \tilde W\)。
- 第二个输入blob为标签值,大小\(\tilde{N} \times 1 \times \tilde H \times \tilde W\),也即\((N \times 1 \times 1 \times 1)\),范围\(l_n \in [0, 1, 2, ..., K - 1]\)之间的整数,第\(n\)个数据的真实类别为\(l_n\)。
- 与SoftmaxWithLossLayer类似,caffe代码中并没有严格限制标签blob的形状,只要求预测blob与标签blob的第0维相等(LossLayer中要求),和标签blob的总个数等于\(N\)。
- 前向计算时,loss的计算公式为: \(E = -\frac{1}{N} \sum\limits_{n=1}^N \sum\limits_{k=1}^{K} H_{l_n,k} \log(\hat{p}_{n,k})\)
- \(H_{l_n,k}\)为信息增益矩阵\(H\)中的元素,表示真实类别为\(l_n\),预测类别为\(k\)的值。矩阵\(H\)的大小为\(K \times K\)
- 反向计算时,预测blob的梯度的计算过程如下:
- \(\frac{{\partial {{\hat p}_{n,k'}}}}{{\partial {x_{n,k}}}}{\rm{ = }}{\left( {\frac{{{e^{{x_{n,k'}}}}}}{{{e^{{x_{n,1}}}}{\rm{ + }}{e^{{x_{n,{\rm{2}}}}}}{\rm{ + }}...{\rm{ + }}{e^{{x_{n,K}}}}}}} \right)_{{x_{n,k}}}}^\prime {\rm{ = }}\left\{ {\begin{array}{*{20}{c}} {{{\hat p}_{n,k'}} - {{\hat p}_{n,k'}}*{{\hat p}_{n,k}},k = {k'}}\\ { - {{\hat p}_{n,k'}}*{{\hat p}_{n,k}},k \ne {k'}} \end{array}} \right.\)
- \(E = - \frac{1}{N}\sum\limits_{n = 1}^N {\sum\limits_{k = 1}^K {{H_{{l_n},k}}} } \log \left( {{{\hat p}_{n,k}}} \right) = - \frac{1}{N}\sum\limits_{n = 1}^N {\left( {{H_{{l_n},1}}\log {{\hat p}_{n,1}} + {H_{{l_n},2}}\log {{\hat p}_{n,2}} + ... + {H_{{l_n},K}}\log {{\hat p}_{n,K}}} \right)}\)
- \(\frac{{\partial E}}{{\partial {{\hat p}_{n,k'}}}} = - \frac{1}{N}{H_{{l_n},k'}}\frac{1}{{{{\hat p}_{n,k'}}}}\)
- \(\frac{{\partial E}}{{\partial {x_{n,k}}}} = \sum\limits_{k' = 1}^K {\frac{{\partial E}}{{\partial {{\hat p}_{n,k'}}}}\frac{{\partial {{\hat p}_{n,k'}}}}{{\partial {x_{n,k}}}}} = \frac{{\partial E}}{{\partial {{\hat p}_{n,1}}}}\frac{{\partial {{\hat p}_{n,1}}}}{{\partial {x_{n,k}}}} + \frac{{\partial E}}{{\partial {{\hat p}_{n,2}}}}\frac{{\partial {{\hat p}_{n,2}}}}{{\partial {x_{n,k}}}} + ... + \frac{{\partial E}}{{\partial {{\hat p}_{n,k}}}}\frac{{\partial {{\hat p}_{n,k}}}}{{\partial {x_{n,k}}}} + ... + \frac{{\partial E}}{{\partial {{\hat p}_{n,K}}}}\frac{{\partial {{\hat p}_{n,K}}}}{{\partial {x_{n,k}}}}\)
- \(= - \frac{1}{N}\left( {{H_{l_n,1}}\left( { - {{\hat p}_{n,k}}} \right) + {H_{l_n,2}}\left( { - {{\hat p}_{n,k}}} \right) + ... + {H_{{l_n},k}}\left( {{\rm{1}} - {{\hat p}_{n,k}}} \right){\rm{ + }}...{\rm{ + }}{H_{{l_n},K}}\left( { - {{\hat p}_{n,k}}} \right)} \right)\)
- \(= \frac{1}{N}\left( {{{\hat p}_{n,k}}\sum\limits_{k' = 1}^K {{H_{{l_n},k'}}} - {H_{{l_n},k}}} \right)\)
- 最后可计算:\(\frac{\partial J}{\partial {x_{n,k}}} = \frac{\partial J}{\partial E}*\frac{\partial E}{\partial {x_{n,k}}}\)
infogain_loss_layer.cpp源码
// Sets up the layer: creates and initializes the internal softmax layer, reads
// the ignore-label and normalization settings, and loads the infogain matrix H
// from a file when it is not supplied as a third bottom blob.
// NOTE(review): template argument lists (`<...>`) appear to have been stripped
// from this listing; restore them from the upstream source before compiling.
template
void InfogainLossLayer::LayerSetUp(
const vector*>& bottom, const vector*>& top) {
LossLayer::LayerSetUp(bottom, top); // base-class setup
// internal softmax layer
LayerParameter softmax_layer_param(this->layer_param_); // copy of this layer's parameters, used to create the softmax layer
SoftmaxParameter* softmax_param = softmax_layer_param.mutable_softmax_param(); // softmax sub-parameters of that copy
softmax_param->set_axis(this->layer_param_.infogain_loss_param().axis()); // softmax runs along the configured infogain axis
softmax_layer_param.set_type("Softmax"); // layer type to create
softmax_layer_param.clear_loss_weight(); // drop the loss weights copied from this layer ...
softmax_layer_param.add_loss_weight(1); // ... and set a single loss weight of 1
softmax_layer_ = LayerRegistry::CreateLayer(softmax_layer_param); // create the softmax layer from those parameters
softmax_bottom_vec_.clear();
softmax_bottom_vec_.push_back(bottom[0]); // the softmax layer reads the prediction blob
softmax_top_vec_.clear();
softmax_top_vec_.push_back(&prob_); // the softmax layer writes into prob_
softmax_layer_->SetUp(softmax_bottom_vec_, softmax_top_vec_); // initialize the softmax layer
// ignore label
has_ignore_label_ = this->layer_param_.loss_param().has_ignore_label(); // whether an ignore label is configured
if (has_ignore_label_) {
ignore_label_ = this->layer_param_.loss_param().ignore_label(); // cache it on this layer
}
// normalization
CHECK(!this->layer_param_.loss_param().has_normalize())
<< "normalize is deprecated. use \"normalization\""; // the legacy `normalize` field is rejected
normalization_ = this->layer_param_.loss_param().normalization(); // normalization mode used by get_normalizer()
// matrix H
if (bottom.size() < 3) { // fewer than 3 bottoms: H is not supplied as an input blob
CHECK(this->layer_param_.infogain_loss_param().has_source())
<< "Infogain matrix source must be specified."; // the layer parameters must then name a source file for H
BlobProto blob_proto;
// read the binary proto file into blob_proto
ReadProtoFromBinaryFile(this->layer_param_.infogain_loss_param().source(), &blob_proto);
infogain_.FromProto(blob_proto); // load blob_proto into infogain_ (the matrix H)
}
}
// Recomputes shapes: reshapes the base layer and the internal softmax, derives
// outer_num_, inner_num_ and num_labels_ from the infogain axis, validates the
// label count and the size of H, and sizes the row-sum buffer and optional top[1].
// NOTE(review): template argument lists (`<...>`) appear to have been stripped
// from this listing; restore them from the upstream source before compiling.
template
void InfogainLossLayer::Reshape(
const vector*>& bottom, const vector*>& top) {
LossLayer::Reshape(bottom, top); // base-class reshape
softmax_layer_->Reshape(softmax_bottom_vec_, softmax_top_vec_); // reshape the internal softmax layer
// Resolve the configured axis into infogain_axis_; the softmax is computed along this axis.
infogain_axis_ = bottom[0]->CanonicalAxisIndex(this->layer_param_.infogain_loss_param().axis());
outer_num_ = bottom[0]->count(0, infogain_axis_); // outer count: product of dims [0, infogain_axis_)
inner_num_ = bottom[0]->count(infogain_axis_ + 1); // inner count: product of dims [infogain_axis_ + 1, end)
CHECK_EQ(outer_num_ * inner_num_, bottom[1]->count()) // the label blob must hold exactly outer_num_ * inner_num_ values
<< "Number of labels must match number of predictions; "
<< "e.g., if infogain axis == 1 and prediction shape is (N, C, H, W), "
<< "label count (number of labels) must be N*H*W, "
<< "with integer values in {0, 1, ..., C-1}.";
// Example: with infogain_axis_ == 1, outer_num_ = N, inner_num_ = H*W and the class count K = C.
num_labels_ = bottom[0]->shape(infogain_axis_); // number of classes K
Blob* infogain = NULL; // infogain matrix H
if (bottom.size() < 3) {
infogain = &infogain_; // H comes from the layer parameters
} else {
infogain = bottom[2]; // H comes from the third bottom blob
}
CHECK_EQ(infogain->count(), num_labels_*num_labels_); // H must hold K*K values, K being the class count
sum_rows_H_.Reshape(vector(1, num_labels_)); // buffer for the per-row sums of H
if (bottom.size() == 2) {
// H is provided as a parameter and will not change. sum rows once
sum_rows_of_H(infogain); // row sums are fixed in this case, so compute them here
}
if (top.size() >= 2) {
// softmax output
top[1]->ReshapeLike(*bottom[0]); // optional second top carries the softmax output; same shape as bottom[0]
}
}
// Returns the divisor applied to the summed loss for the given normalization
// mode. valid_count is the number of non-ignored items; -1 is treated as
// "no count available" in VALID mode. The result is never less than 1.
// NOTE(review): template argument lists (`<...>`) appear to have been stripped
// from this listing; restore them from the upstream source before compiling.
template
Dtype InfogainLossLayer::get_normalizer(
LossParameter_NormalizationMode normalization_mode, int valid_count) { // pick the normalizer for the given mode
Dtype normalizer;
switch (normalization_mode) {
case LossParameter_NormalizationMode_FULL:
normalizer = Dtype(outer_num_ * inner_num_); // FULL: total number of items
break;
case LossParameter_NormalizationMode_VALID:
if (valid_count == -1) {
normalizer = Dtype(outer_num_ * inner_num_); // VALID with valid_count == -1: same as FULL
} else {
normalizer = Dtype(valid_count); // VALID: number of items actually counted
}
break;
case LossParameter_NormalizationMode_BATCH_SIZE: // BATCH_SIZE: the outer count
normalizer = Dtype(outer_num_);
break;
case LossParameter_NormalizationMode_NONE: // NONE: no normalization, divisor is 1
normalizer = Dtype(1);
break;
default:
LOG(FATAL) << "Unknown normalization mode: "
<< LossParameter_NormalizationMode_Name(normalization_mode);
}
// Some users will have no labels for some examples in order to 'turn off' a
// particular loss in a multi-task setup. The max prevents NaNs in that case.
return std::max(Dtype(1.0), normalizer); // clamp to at least 1 so a zero count cannot cause a division by zero
}
// Computes the sum of each row of the K x K matrix H (row-major, K = num_labels_)
// and stores the K sums in sum_rows_H_.
// NOTE(review): template argument lists (`<...>`) appear to have been stripped
// from this listing; restore them from the upstream source before compiling.
template
void InfogainLossLayer::sum_rows_of_H(const Blob* H) { // writes the row sums of H into sum_rows_H_
CHECK_EQ(H->count(), num_labels_*num_labels_)
<< "H must be " << num_labels_ << "x" << num_labels_; // H must hold K*K values
const Dtype* infogain_mat = H->cpu_data(); // read pointer to H
Dtype* sum = sum_rows_H_.mutable_cpu_data(); // write pointer to the row-sum buffer
for ( int row = 0; row < num_labels_ ; row++ ) {
sum[row] = 0;
for ( int col = 0; col < num_labels_ ; col++ ) {
sum[row] += infogain_mat[row*num_labels_+col]; // accumulate the entries of this row
}
}
}
// Forward pass: runs the internal softmax, then accumulates
// -H[label][l] * log(prob[l]) over all classes l for every non-ignored item,
// and writes the normalized sum to top[0]. When there are exactly two tops,
// top[1] shares the softmax output.
// NOTE(review): template argument lists (`<...>`) appear to have been stripped
// from this listing; restore them from the upstream source before compiling.
template
void InfogainLossLayer::Forward_cpu(const vector*>& bottom,
const vector*>& top) {
// The forward pass computes the softmax prob values.
softmax_layer_->Forward(softmax_bottom_vec_, softmax_top_vec_); // run the internal softmax first
const Dtype* prob_data = prob_.cpu_data(); // softmax output
const Dtype* bottom_label = bottom[1]->cpu_data(); // labels
const Dtype* infogain_mat = NULL; // infogain matrix data
if (bottom.size() < 3) {
infogain_mat = infogain_.cpu_data(); // H loaded from the layer parameters
} else {
infogain_mat = bottom[2]->cpu_data(); // H given as the third bottom blob
}
int count = 0;
Dtype loss = 0;
for (int i = 0; i < outer_num_; ++i) { // outer index (N for axis 1 on an N x C x H x W blob)
for (int j = 0; j < inner_num_; j++) { // inner index (H*W in that case)
// labels are indexed as i * inner_num_ + j; fetch the true label at (i, j)
const int label_value = static_cast(bottom_label[i * inner_num_ + j]);
if (has_ignore_label_ && label_value == ignore_label_) {
continue; // an ignore label is configured and this item carries it: skip
}
DCHECK_GE(label_value, 0); // the label must lie in [0, num_labels_)
DCHECK_LT(label_value, num_labels_);
for (int l = 0; l < num_labels_; l++) {
// infogain_mat[label_value * num_labels_ + l] is the weight for true label label_value and predicted label l
// prob_data[i * inner_num_*num_labels_ + l * inner_num_ + j] is the softmax probability of class l at (i, j)
// the probability is clamped from below by kLOG_THRESHOLD before taking the log
loss -= infogain_mat[label_value * num_labels_ + l] *
log(std::max(prob_data[i * inner_num_*num_labels_ + l * inner_num_ + j],
Dtype(kLOG_THRESHOLD)));
}
++count; // number of items that were not skipped
}
}
top[0]->mutable_cpu_data()[0] = loss / get_normalizer(normalization_, count); // divide by the normalizer
if (top.size() == 2) {
top[1]->ShareData(prob_); // two top blobs: expose the softmax output as the second top
}
}
// Backward pass: refuses to propagate to the label or infogain inputs, and for
// the prediction blob writes prob * rowsum(H[label]) - H[label][l] per class l
// (zero for ignored items), then scales by top diff / normalizer.
// NOTE(review): template argument lists (`<...>`) appear to have been stripped
// from this listing; restore them from the upstream source before compiling.
template
void InfogainLossLayer::Backward_cpu(const vector*>& top,
const vector& propagate_down,
const vector*>& bottom) {
if (propagate_down[1]) { // gradients w.r.t. the label blob are not supported
LOG(FATAL) << this->type() << " Layer cannot backpropagate to label inputs.";
}
if (propagate_down.size() > 2 && propagate_down[2]) { // nor w.r.t. the infogain matrix H
LOG(FATAL) << this->type() << " Layer cannot backpropagate to infogain inputs.";
}
if (propagate_down[0]) { // the prediction blob needs a gradient
const Dtype* prob_data = prob_.cpu_data(); // softmax output
const Dtype* bottom_label = bottom[1]->cpu_data(); // labels
const Dtype* infogain_mat = NULL;
if (bottom.size() < 3) {
infogain_mat = infogain_.cpu_data(); // H from the layer parameters (row sums were computed in Reshape())
} else {
infogain_mat = bottom[2]->cpu_data(); // H from the third bottom blob
// H is provided as a "bottom" and might change. sum rows every time.
sum_rows_of_H(bottom[2]); // recompute the row sums
}
const Dtype* sum_rows_H = sum_rows_H_.cpu_data(); // per-row sums of H
Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); // gradient buffer of the prediction blob
const int dim = bottom[0]->count() / outer_num_; // elements per outer index (C*H*W for axis 1 on an N x C x H x W blob)
int count = 0;
for (int i = 0; i < outer_num_; ++i) { // outer index
for (int j = 0; j < inner_num_; ++j) { // inner index
const int label_value = static_cast(bottom_label[i * inner_num_ + j]); // true label at (i, j)
// NOTE(review): these range checks run before the ignore-label test below,
// so an ignore label outside [0, num_labels_) would trip them wherever
// DCHECK is active — confirm this is intended.
DCHECK_GE(label_value, 0); // the label must lie in [0, num_labels_)
DCHECK_LT(label_value, num_labels_);
if (has_ignore_label_ && label_value == ignore_label_) { // this item carries the ignore label
for (int l = 0; l < num_labels_; ++l) {
bottom_diff[i * dim + l * inner_num_ + j] = 0; // zero the gradient for every class at (i, j)
}
} else {
for (int l = 0; l < num_labels_; ++l) {
// prob_data[i*dim + l*inner_num_ + j] is the predicted probability of class l at (i, j)
// sum_rows_H[label_value] is the sum of the row of H that belongs to the true label
// infogain_mat[label_value * num_labels_ + l] is the weight for true label label_value and predicted label l
bottom_diff[i * dim + l * inner_num_ + j] =
prob_data[i*dim + l*inner_num_ + j]*sum_rows_H[label_value]
- infogain_mat[label_value * num_labels_ + l];
}
++count; // number of non-ignored items
}
}
}
// Scale gradient
Dtype loss_weight = top[0]->cpu_diff()[0] / get_normalizer(normalization_, count); // top diff divided by the normalizer
caffe_scal(bottom[0]->count(), loss_weight, bottom_diff); // bottom_diff *= loss_weight
}
}
EuclideanLossLayer类简介
EuclideanLossLayer类用于计算预测值与真实值的欧式距离损失,用于回归任务中。
- 第一个输入blob为网络的预测值,大小\(N \times C \times H \times W\),范围\(\hat{y}_n \in [-\infty, +\infty]\)
- 第二个输入blob为标签值,大小\(N \times C \times H \times W\),范围\(y_{n} \in [-\infty, +\infty]\)
- 注意实际预测值与标签值的位置可互换,并且反向传播时允许计算两个输入blob的梯度。
- 前向计算时,loss的计算公式为:\(E = \frac{1}{2N} \sum\limits_{n=1}^N \|{\hat{y}_n - y_n}\|_2^2\)
- 反向计算时,第一个输入blob的梯度为:\(\frac{\partial J}{\partial {\hat{y}_n}} = \frac{\partial J}{\partial E}*\frac{\partial E}{\partial {\hat{y}_n}} = \frac{\partial J}{\partial E} * \frac{1}{2N}*2(\hat{y}_n-y_n)=\frac{\partial J}{\partial E}*\frac{\hat{y}_n-y_n}{N}\)
- 反向计算时,第二个输入blob的梯度为:\(\frac{\partial J}{\partial {y_n}} = \frac{\partial J}{\partial E}*\frac{\partial E}{\partial {y_n}} =-\frac{\partial J}{\partial E}*\frac{\hat{y}_n-y_n}{N}\)
euclidean_loss_layer.cpp源码
// Reshapes the base layer, checks that both bottoms have the same element
// count from axis 1 onward, and sizes diff_ like bottom[0].
// NOTE(review): template argument lists (`<...>`) appear to have been stripped
// from this listing; restore them from the upstream source before compiling.
template
void EuclideanLossLayer::Reshape(
const vector*>& bottom, const vector*>& top) {
LossLayer::Reshape(bottom, top); // base-class reshape
CHECK_EQ(bottom[0]->count(1), bottom[1]->count(1))
<< "Inputs must have the same dimension."; // element counts from axis 1 onward (C*H*W) must match
diff_.ReshapeLike(*bottom[0]); // diff_ takes the shape of bottom[0]
}
// Forward pass: stores bottom[0] - bottom[1] in diff_ and writes
// dot(diff_, diff_) / num / 2 to top[0].
// NOTE(review): template argument lists (`<...>`) appear to have been stripped
// from this listing; restore them from the upstream source before compiling.
template
void EuclideanLossLayer::Forward_cpu(const vector*>& bottom,
const vector*>& top) {
int count = bottom[0]->count(); // total number of elements (N*C*H*W)
// diff_ = bottom[0] - bottom[1], i.e. a - b
caffe_sub(count, bottom[0]->cpu_data(), bottom[1]->cpu_data(), diff_.mutable_cpu_data());
Dtype dot = caffe_cpu_dot(count, diff_.cpu_data(), diff_.cpu_data()); // inner product: dot = diff_ . diff_
Dtype loss = dot / bottom[0]->num() / Dtype(2); // loss = dot / N / 2, i.e. E = 1 / (2N) * (a - b) . (a - b)
top[0]->mutable_cpu_data()[0] = loss;
}
// EuclideanLossLayer does not fix which bottom is the prediction and which is
// the label; a gradient is computed for either bottom whose propagate_down flag is set.
// Backward pass: writes (+/-) top_diff / num * diff_ into the diff of each such
// bottom, with + for bottom[0] and - for bottom[1].
// NOTE(review): template argument lists (`<...>`) appear to have been stripped
// from this listing; restore them from the upstream source before compiling.
template
void EuclideanLossLayer::Backward_cpu(const vector*>& top,
const vector& propagate_down, const vector*>& bottom) {
for (int i = 0; i < 2; ++i) {
if (propagate_down[i]) { // this bottom wants a gradient
const Dtype sign = (i == 0) ? 1 : -1;
const Dtype alpha = sign * top[0]->cpu_diff()[0] / bottom[i]->num();
// bottom[i] diff = alpha * diff_ + 0 * (bottom[i] diff)
// a_diff = 1 * λ / N * (a - b), with λ = top[0] diff
// b_diff = -1 * λ / N * (a - b)
caffe_cpu_axpby(bottom[i]->count(), alpha, diff_.cpu_data(), Dtype(0), bottom[i]->mutable_cpu_diff());
}
}
}
HingeLossLayer类简介
HingeLossLayer类用于计算合页损失,用于一对多的分类任务中。hinge loss用于SVM中,也正是hinge loss的特性使得SVM中的超平面仅依赖少数样本。
- 第一个输入blob为网络的预测值,大小\(N \times C \times H \times W\),范围\(t_{n,k} \in [-\infty, +\infty]\)。其中数据总个数为\(N\),数据的类别总数为\(K = CHW\)
- 第二个输入blob为标签值,大小\(N \times 1 \times 1 \times 1\),范围\(l_n \in [0, 1, 2, ..., K - 1]\)之间的整数,第\(n\)个数据的真实类别为\(l_n\)。
- 前向计算时,loss的计算公式为:\(E = \frac{1}{N} \sum\limits_{n=1}^N \sum\limits_{k=1}^K [\max(0, 1 - \delta * t_{n,k})] ^ p\)
- 其中,\(\delta=\left\{\begin{matrix} 1 & k=l_n\\ -1 & k \neq l_n \end{matrix}\right.\),\(p\)为范数的阶数,\(p=1\)表示使用L1范数,\(p=2\)表示使用L2范数(对应代码中hinge_loss_param的norm参数)
- 从loss中可以看出,对于真实类别\(k=l_n\)(\(\delta=1\)),当\(t_{n,k} \geqslant 1\)(样本与超平面较远且预测正确)时,\(1 - \delta * t_{n,k} \leqslant 0\),该项对loss无贡献;同理,对于其他类别\(k \neq l_n\)(\(\delta=-1\)),当\(t_{n,k} \leqslant -1\)时该项对loss也无贡献。只有那些超平面附近的数据(\(\delta * t_{n,k} < 1\)),或者预测错误的数据,才会计入loss中。
- 反向计算时,第一个输入blob的梯度计算公式如下。
- \(\frac{\partial J}{\partial {t_{n,k}}} = \frac{\partial J}{\partial E}*\frac{\partial E}{\partial {t_{n,k}}}\)
- \(p=1\)时,\(E = \frac{1}{N} \sum\limits_{n=1}^N \sum\limits_{k=1}^K |\max(0, 1 - \delta * t_{n,k})|\)
- \(\frac{\partial E}{\partial {t_{n,k}}}=\left\{\begin{matrix} 1/N*(-\delta) & 1 - \delta * t_{n,k} > 0 \\ 0 & 1 - \delta * t_{n,k} \leqslant 0 \end{matrix}\right.=\left\{\begin{matrix} -1/N & 1 - \delta * t_{n,k} > 0,k=l_n \\ 1/N & 1 - \delta * t_{n,k} > 0,k \neq l_n \\ 0 & 1 - \delta * t_{n,k} \leqslant 0 \end{matrix}\right.\)
- \(p=2\)时,\(E = \frac{1}{N} \sum\limits_{n=1}^N \sum\limits_{k=1}^K |\max(0, 1 - \delta * t_{n,k})|^2\)
- \(\frac{\partial E}{\partial {t_{n,k}}}=2/N*\max(0, 1 - \delta * t_{n,k})*(-\delta)=\left\{\begin{matrix} -2/N*\max(0, 1 - \delta * t_{n,k}) & k=l_n\\ 2/N*\max(0, 1 - \delta * t_{n,k}) & k \neq l_n \end{matrix}\right.\)
hinge_loss_layer.cpp源码
// Forward pass: computes max(0, 1 - δ*t) for every score t (δ = 1 for the true
// class, -1 otherwise) and writes either the sum of these values (L1) or the
// sum of their squares (L2), divided by num, to top[0].
// Side effect: bottom[0]'s diff buffer is used as scratch space and is left
// holding max(0, 1 - δ*t), which Backward_cpu() later reads.
// NOTE(review): template argument lists (`<...>`) appear to have been stripped
// from this listing; restore them from the upstream source before compiling.
template
void HingeLossLayer::Forward_cpu(const vector*>& bottom,
const vector*>& top) {
const Dtype* bottom_data = bottom[0]->cpu_data(); // prediction scores
Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); // diff buffer of the predictions, used as scratch here
const Dtype* label = bottom[1]->cpu_data(); // labels
int num = bottom[0]->num(); // N, the number of items
int count = bottom[0]->count(); // N*C*H*W
int dim = count / num; // C*H*W, treated as the number of classes K
caffe_copy(count, bottom_data, bottom_diff); // bottom_diff = bottom_data
for (int i = 0; i < num; ++i) {
// label[i] is the true label of item i
bottom_diff[i * dim + static_cast(label[i])] *= -1; // buffer now holds -δ*t_nk, with δ = 1 (k = l_n) or -1 (k != l_n)
}
for (int i = 0; i < num; ++i) {
for (int j = 0; j < dim; ++j) {
// entry for class j of item i
bottom_diff[i * dim + j] = std::max(Dtype(0), 1 + bottom_diff[i * dim + j]); // max(0, 1 - δ*t_nk)
}
}
Dtype* loss = top[0]->mutable_cpu_data(); // output loss
switch (this->layer_param_.hinge_loss_param().norm()) { // which norm to use
case HingeLossParameter_Norm_L1:
loss[0] = caffe_cpu_asum(count, bottom_diff) / num; // L1: sum of absolute values, divided by num
break;
case HingeLossParameter_Norm_L2:
loss[0] = caffe_cpu_dot(count, bottom_diff, bottom_diff) / num; // L2: sum of squares, divided by num
break;
default:
LOG(FATAL) << "Unknown Norm";
}
}
// Backward pass: refuses to propagate to the labels. For the predictions it
// starts from the values already in bottom[0]'s diff buffer, flips the sign of
// the true-class entry, and then either takes the sign (L1) or keeps the value
// (L2) before scaling by top diff / num (times 2 for L2).
// NOTE(review): template argument lists (`<...>`) appear to have been stripped
// from this listing; restore them from the upstream source before compiling.
template
void HingeLossLayer::Backward_cpu(const vector*>& top,
const vector& propagate_down, const vector*>& bottom) {
if (propagate_down[1]) { // gradients w.r.t. the label blob are not supported
LOG(FATAL) << this->type() << " Layer cannot backpropagate to label inputs.";
}
if (propagate_down[0]) {
// diff buffer of the predictions; Forward_cpu() left max(0, 1 - δ*t_nk) in it
Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
const Dtype* label = bottom[1]->cpu_data(); // labels
int num = bottom[0]->num(); // N, the number of items
int count = bottom[0]->count(); // N*C*H*W
int dim = count / num; // C*H*W, treated as the number of classes K
for (int i = 0; i < num; ++i) {
// label[i] is the true label of item i; after the sign flip:
// bottom_diff = max(0, 1 - δ*t_nk) for k != l_n,
// -max(0, 1 - δ*t_nk) for k == l_n
bottom_diff[i * dim + static_cast(label[i])] *= -1;
}
// the derivation of the following step is given in the article text preceding this listing
const Dtype loss_weight = top[0]->cpu_diff()[0];
switch (this->layer_param_.hinge_loss_param().norm()) {
case HingeLossParameter_Norm_L1: // L1 norm
// sign(bottom_diff) = 1 for k != l_n and 1 - δ*t_nk > 0,
// 0 for k != l_n and 1 - δ*t_nk <= 0,
// -1 for k == l_n and 1 - δ*t_nk > 0,
// 0 for k == l_n and 1 - δ*t_nk <= 0
caffe_cpu_sign(count, bottom_diff, bottom_diff); // bottom_diff = sign(bottom_diff)
caffe_scal(count, loss_weight / num, bottom_diff); // bottom_diff *= loss_weight / num
break;
case HingeLossParameter_Norm_L2: // L2 norm
caffe_scal(count, loss_weight * 2 / num, bottom_diff); // bottom_diff *= loss_weight * 2 / num
break;
default:
LOG(FATAL) << "Unknown Norm";
}
}
}
ContrastiveLossLayer类简介
ContrastiveLossLayer类用于计算对比损失,该损失函数的思路是同类样本的欧氏距离应尽可能小,非同类样本之间的欧氏距离应该不小于指定阈值,常用于孪生神经网络(siamese network)的训练。
- 第一个输入blob为特征向量\(a\),大小\(N \times C \times 1 \times 1\),范围\(a_{n,k} \in [-\infty, +\infty]\)。其中数据总个数为\(N\),特征向量的长度为\(C\)
- 第二个输入blob为特征向量\(b\),大小\(N \times C \times 1 \times 1\),形状与第一个输入blob完全相同。数据范围\(b_{n,k} \in [-\infty, +\infty]\)。
- 第三个输入blob为二元相似度\(y\),大小\(N \times 1 \times 1 \times 1\),范围\(y_n=1\)(\(a_n\)与\(b_n\)为同类样本)或\(y_n=0\)(\(a_n\)与\(b_n\)非同类样本)
- 前向计算时,loss的计算公式为:\(E = \frac{1}{2N} \sum\limits_{n=1}^N [y_n*d_n^2 + (1-y_n)*\max (margin-d_n, 0)^2]\)(代码中`legacy_version=false`)或\(E = \frac{1}{2N} \sum\limits_{n=1}^N [y_n*d_n^2 + (1-y_n)*\max (margin-d_n^2, 0)]\)(代码中`legacy_version=true`)
- 其中,\(margin\)为一个常数,表示非同类样本的最小欧式距离阈值,小于该值则会计入loss中
- \(d_n\)为两个特征向量的欧氏距离,\(d_n^2=\|{a_n - b_n}\|_2^2=\sum\limits_{k=1}^C{(a_{n,k} - b_{n,k})^2}\),其中\(C\)为特征向量的长度
- 反向计算时,输入blob的梯度计算公式如下。
- \(\frac{\partial J}{\partial {a_{n,k}}} = \frac{\partial J}{\partial E}*\frac{\partial E}{\partial {a_{n,k}}},\frac{\partial J}{\partial {b_{n,k}}} = \frac{\partial J}{\partial E}*\frac{\partial E}{\partial {b_{n,k}}}\)
- 当\(a_n\)与\(b_n\)为同类样本时,则\(y_n=1\),此时
\(\frac{\partial E}{\partial {a_{n,k}}}=\frac{\partial E}{\partial d_n^2}*\frac{\partial d_n^2}{\partial a_{n,k}}=\frac{1}{2N}*2(a_{n,k} - b_{n,k})=\frac{a_{n,k} - b_{n,k}}{N}\)
\(\frac{\partial E}{\partial {b_{n,k}}}=\frac{\partial E}{\partial d_n^2}*\frac{\partial d_n^2}{\partial b_{n,k}}=-\frac{a_{n,k} - b_{n,k}}{N}\)
- 当\(a_n\)与\(b_n\)为非同类样本时,则\(y_n=0\),此时若`legacy_version=false`,则\(E = \frac{1}{2N} \sum\limits_{n=1}^N [y_n*d_n^2 + (1-y_n)*\max (margin-d_n, 0)^2]\)
\(\frac{\partial E}{\partial {a_{n,k}}}=\frac{{\partial E}}{{\partial {d_n}}}\frac{{\partial {d_n}}}{{\partial {a_{n,k}}}}= \left\{\begin{matrix} \frac{1}{2N}*2(margin-d_n)*(-1)*\frac{{\partial {d_n}}}{{\partial {a_{n,k}}}} & margin-d_n > 0 \\ 0 & margin-d_n \leqslant 0 \end{matrix}\right.\)
\(= \left\{ {\begin{array}{*{20}{c}} { - \frac{{(margin - {d_n})}}{N}*\frac{{{a_{n,k}} - {b_{n,k}}}}{{{d_n}}}}&{margin - {d_n} > 0}\\ 0&{margin - {d_n} \leqslant 0} \end{array}} \right.\)
同理有\(\frac{\partial E}{\partial {b_{n,k}}}= \left\{ {\begin{array}{*{20}{c}} {\frac{{(margin - {d_n})}}{N}*\frac{{{a_{n,k}} - {b_{n,k}}}}{{{d_n}}}}&{margin - {d_n} > 0}\\ 0&{margin - {d_n} \leqslant 0} \end{array}} \right.\)
- 当\(a_n\)与\(b_n\)为非同类样本时,则\(y_n=0\),此时若`legacy_version=true`,则\(E = \frac{1}{2N} \sum\limits_{n=1}^N [y_n*d_n^2 + (1-y_n)*\max (margin-d_n^2, 0)]\)
\(\frac{\partial E}{\partial {a_{n,k}}}=\frac{\partial E}{\partial d_n^2}*\frac{\partial d_n^2}{\partial a_{n,k}}= \left\{\begin{matrix} \frac{1}{2N}*(-1)*\frac{\partial d_n^2}{\partial a_{n,k}} & margin - d_n^2 > 0\\ 0 & margin - d_n^2 \leqslant 0 \end{matrix}\right.\)
\(=\left\{\begin{matrix} -\frac{a_{n,k}-b_{n,k}}{N} & margin - d_n^2 > 0 \\ 0 & margin - d_n^2 \leqslant 0 \end{matrix}\right.\)
同理有\(\frac{\partial E}{\partial {b_{n,k}}}=\left\{\begin{matrix} \frac{a_{n,k}-b_{n,k}}{N} & margin - d_n^2 > 0 \\ 0 & margin - d_n^2 \leqslant 0 \end{matrix}\right.\)
contrastive_loss_layer.cpp源码
// Sets up the layer: checks that bottom[0] and bottom[1] have matching channel
// counts with height and width 1, that bottom[2] has channels, height and
// width 1, and sizes the internal buffers (diff_, diff_sq_, dist_sq_,
// summer_vec_), filling summer_vec_ with ones.
// NOTE(review): template argument lists (`<...>`) appear to have been stripped
// from this listing; restore them from the upstream source before compiling.
template
void ContrastiveLossLayer::LayerSetUp(
const vector*>& bottom, const vector*>& top) {
LossLayer::LayerSetUp(bottom, top); // base-class setup
CHECK_EQ(bottom[0]->channels(), bottom[1]->channels()); // both feature blobs must have the same channel count
CHECK_EQ(bottom[0]->height(), 1); // bottom[0] must be N x C x 1 x 1
CHECK_EQ(bottom[0]->width(), 1);
CHECK_EQ(bottom[1]->height(), 1); // bottom[1] must be N x C x 1 x 1
CHECK_EQ(bottom[1]->width(), 1);
CHECK_EQ(bottom[2]->channels(), 1); // bottom[2] must be N x 1 x 1 x 1: a flag telling whether the pair is similar
CHECK_EQ(bottom[2]->height(), 1);
CHECK_EQ(bottom[2]->width(), 1);
diff_.Reshape(bottom[0]->num(), bottom[0]->channels(), 1, 1); // N x C x 1 x 1: element-wise difference of the two feature blobs
diff_sq_.Reshape(bottom[0]->num(), bottom[0]->channels(), 1, 1); // N x C x 1 x 1: not used by the CPU code shown; presumably GPU scratch space — TODO confirm
dist_sq_.Reshape(bottom[0]->num(), 1, 1, 1); // N x 1 x 1 x 1: squared Euclidean distance per pair
// vector of ones used to sum along channels
summer_vec_.Reshape(bottom[0]->channels(), 1, 1, 1); // C x 1 x 1 x 1
for (int i = 0; i < bottom[0]->channels(); ++i)
summer_vec_.mutable_cpu_data()[i] = Dtype(1); // fill with ones
}
// Forward pass: stores a - b in diff_ and the squared distance d^2 of each pair
// in dist_sq_, then accumulates d^2 for similar pairs and either
// max(margin - d^2, 0) (legacy_version) or max(margin - d, 0)^2 for dissimilar
// pairs, and writes the sum divided by 2N to top[0].
// NOTE(review): template argument lists (`<...>`) appear to have been stripped
// from this listing; restore them from the upstream source before compiling.
template
void ContrastiveLossLayer::Forward_cpu(
const vector*>& bottom,
const vector*>& top) {
int count = bottom[0]->count();
// diff_ = bottom[0] - bottom[1], i.e. a_ij - b_ij
caffe_sub(count, bottom[0]->cpu_data(), bottom[1]->cpu_data(), diff_.mutable_cpu_data());
const int channels = bottom[0]->channels(); // feature length of each item
// distance threshold: a dissimilar pair contributes to the loss unless its margin term below is <= 0
Dtype margin = this->layer_param_.contrastive_loss_param().margin();
// legacy_version false selects the (margin - d)^2 form, true selects the (margin - d^2) form
bool legacy_version = this->layer_param_.contrastive_loss_param().legacy_version();
Dtype loss(0.0);
for (int i = 0; i < bottom[0]->num(); ++i) { // each pair
// diff_.cpu_data() + (i*channels) is the start of the difference vector of pair i;
// its inner product with itself gives d^2 = Σ_j (a_ij - b_ij) * (a_ij - b_ij)
dist_sq_.mutable_cpu_data()[i] = caffe_cpu_dot(channels,
diff_.cpu_data() + (i*channels), diff_.cpu_data() + (i*channels));
if (static_cast(bottom[2]->cpu_data()[i])) { // similar pairs
loss += dist_sq_.cpu_data()[i]; // E += y*d^2 (y=1)
} else { // dissimilar pairs
if (legacy_version) {
loss += std::max(margin - dist_sq_.cpu_data()[i], Dtype(0.0)); // E += (1-y)*max(0, margin - d^2) (y=0)
} else {
Dtype dist = std::max(margin - sqrt(dist_sq_.cpu_data()[i]), Dtype(0.0));
loss += dist*dist; // E += (1-y)*max(0, margin - d)^2 (y=0)
}
}
}
loss = loss / static_cast(bottom[0]->num()) / Dtype(2); // E = E / N / 2
top[0]->mutable_cpu_data()[0] = loss; // final loss
}
// Backward pass: for each of the two feature bottoms whose propagate_down flag
// is set, writes the per-pair gradient into its diff, using the diff_ and
// dist_sq_ values cached by Forward_cpu(). The sign is + for bottom[0] and
// - for bottom[1]; dissimilar pairs whose margin term is not positive get zero.
// NOTE(review): template argument lists (`<...>`) appear to have been stripped
// from this listing; restore them from the upstream source before compiling.
template
void ContrastiveLossLayer::Backward_cpu(const vector*>& top,
const vector& propagate_down, const vector*>& bottom) {
Dtype margin = this->layer_param_.contrastive_loss_param().margin(); // distance threshold
bool legacy_version = this->layer_param_.contrastive_loss_param().legacy_version(); // which loss form is in use
for (int i = 0; i < 2; ++i) {
if (propagate_down[i]) {
const Dtype sign = (i == 0) ? 1 : -1; // δ = 1 for a (bottom[0]) or -1 for b (bottom[1])
// alpha = δ * λ / N, with λ = top[0] diff
const Dtype alpha = sign * top[0]->cpu_diff()[0] / static_cast(bottom[i]->num());
int num = bottom[i]->num(); // number of pairs
int channels = bottom[i]->channels(); // feature length
for (int j = 0; j < num; ++j) {
Dtype* bout = bottom[i]->mutable_cpu_diff(); // gradient buffer of this bottom
if (static_cast(bottom[2]->cpu_data()[j])) { // similar pairs
// similar pair: the loss term is y*d^2 (y=1) with d^2 = Σ_k (a_jk - b_jk)^2,
// so the gradient w.r.t. a or b is δ * λ / N * (a_jk - b_jk)
caffe_cpu_axpby(channels, alpha, diff_.cpu_data() + (j*channels),
Dtype(0.0), bout + (j*channels));
} else { // dissimilar pairs
Dtype mdist(0.0);
Dtype beta(0.0);
if (legacy_version) { // loss term (1-y)*max(0, margin - d^2) (y=0)
mdist = margin - dist_sq_.cpu_data()[j]; // mdist = margin - d^2
beta = -alpha; // beta = -δ * λ / N
} else { // loss term (1-y)*max(0, margin - d)^2 (y=0)
Dtype dist = sqrt(dist_sq_.cpu_data()[j]); // d = sqrt(d^2)
mdist = margin - dist; // mdist = margin - d
beta = -alpha * mdist / (dist + Dtype(1e-4)); // beta = -δ * λ / N * (margin - d) / (d + 1e-4); the 1e-4 keeps the denominator nonzero
}
if (mdist > Dtype(0.0)) { // max(0, mdist) selects mdist
// legacy_version true: bout = -δ * λ / N * (a_jk - b_jk)
// legacy_version false: bout = -δ * λ / N * (margin - d) / (d + 1e-4) * (a_jk - b_jk)
caffe_cpu_axpby(channels, beta, diff_.cpu_data() + (j*channels),
Dtype(0.0), bout + (j*channels));
} else { // max(0, mdist) selects 0
caffe_set(channels, Dtype(0), bout + (j*channels)); // zero gradient
}
}
}
}
}
}
Caffe的源码笔者是第一次阅读,一边阅读一边记录,对代码的理解和分析可能会存在错误或遗漏,希望各位读者批评指正,谢谢支持!