Using the weight file produced by training, and following the segnet-tutorial instructions, run the batch normalization statistics step:
python ./Segnet/Scripts/compute_bn_statistics.py ./SegNet/Models/segnet_basic_train.prototxt ./SegNet/Models/Training/segnet_basic_iter_5000.caffemodel ./Segnet/Models/Inference/
This fails with an error:
03 13:48:10.377765 11423 accuracy_layer.cpp:72] Check failed: label_value < num_labels (11 vs. 11)
Why was there no error during training?
One reason: because of how test_iter was set, no test pass was ever performed, so the Accuracy layer was never entered. This error is raised from inside the Accuracy layer.
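To make this concrete, here is a hedged sketch of solver settings (not the tutorial's actual solver.prototxt) under which the TEST phase, and with it the Accuracy layer, never runs:
# hypothetical solver excerpt for illustration only
test_iter: 1                 # batches per test pass
test_interval: 10000         # first test would run at iteration 10000...
max_iter: 5000               # ...but training stops at iteration 5000
test_initialization: false   # also skip the test normally run at iteration 0
Under settings like these, the Accuracy layer's checks are simply never exercised during training.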
First, look at the layer definitions at the end of segnet_basic_train.prototxt:
layer {
  name: "conv_classifier"
  type: "Convolution"
  bottom: "conv_decode1"
  top: "conv_classifier"
  param {
    lr_mult: 1
    decay_mult: 1
  }
  param {
    lr_mult: 2
    decay_mult: 0
  }
  convolution_param {
    num_output: 11
    kernel_size: 1
    weight_filler {
      type: "msra"
    }
    bias_filler {
      type: "constant"
    }
  }
}
layer {
  name: "loss"
  type: "SoftmaxWithLoss"
  bottom: "conv_classifier"
  bottom: "label"
  top: "loss"
  softmax_param { engine: CAFFE }
  loss_param: {
    weight_by_label_freqs: true
    ignore_label: 11
    class_weighting: 0.2595
    class_weighting: 0.1826
    class_weighting: 4.5640
    class_weighting: 0.1417
    class_weighting: 0.9051
    class_weighting: 0.3826
    class_weighting: 9.6446
    class_weighting: 1.8418
    class_weighting: 0.6823
    class_weighting: 6.2478
    class_weighting: 7.3614
  }
}
layer {
  name: "accuracy"
  type: "Accuracy"
  bottom: "conv_classifier"
  bottom: "label"
  top: "accuracy"
  top: "per_class_accuracy"
}
From the number of filters in the conv_classifier layer (num_output: 11), there really are 11 classes, so valid label values should lie in 0-10. Why, then, would the value 11 be read from the label files? Note also this parameter in the SoftmaxWithLoss layer:
ignore_label: 11
In other words, the value 11 is expected to occur in the labels.
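Before touching any C++ code, a quick sanity check is to read one annotation image directly. This is a minimal sketch; the file path is hypothetical, so substitute one of your own label images:
import numpy as np
from PIL import Image

# hypothetical annotation path -- point this at one of your own label files
label = np.array(Image.open('./CamVid/trainannot/0001TP_006690.png'))
values, counts = np.unique(label, return_counts=True)
print(dict(zip(values.tolist(), counts.tolist())))
# if 11 shows up in the printed keys, the label files really do contain it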
To verify the label values inside Caffe itself, I added some logging to softmax_loss_layer.cu:
template <typename Dtype>
void SoftmaxWithLossLayer<Dtype>::Forward_gpu(
    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
  softmax_layer_->Forward(softmax_bottom_vec_, softmax_top_vec_);
  const Dtype* prob_data = prob_.gpu_data();
  const Dtype* label = bottom[1]->gpu_data();
  //--- added code below
  printf("SoftmaxLossForwardGPU has_ignore_label_=%d,bottom[1].0=%d,bottom[1].1=%d,.2=%d,.3=%d\n",
      has_ignore_label_, bottom[1]->shape(0), bottom[1]->shape(1),
      bottom[1]->shape(2), bottom[1]->shape(3));
  const Dtype* bottom_label = bottom[1]->cpu_data();
  // batch size is 1 and each image is 480x360, so one image holds 360*480 labels
  for (int i = 0; i < 1; ++i) {
    for (int j = 0; j < 360*480; ++j) {
      const int label_value =
          static_cast<int>(bottom_label[i * inner_num_ + j]);
      if (has_ignore_label_ && label_value == ignore_label_) {
        printf("ignore SoftmaxLossForwardGPU.label_value=%d\n", label_value);
        continue;
      } else {
        printf("%d ", label_value);
      }
    }
  }
  //--- added code above
The snippet above is modeled on the Accuracy layer's loop. Since my training batch size is 1 and each image is 480x360, inner_num_ = 360 * 480 = 172,800, which is why the loop bounds can be hard-coded this way.
Output:
4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 ignore SoftmaxLossForwardGPU.label_value=11
4 4 4 4 4 4 4 4 4 4 2 2 2 2 2 2 2 2 2 4 4 4 4 4 4 4 4 4 4 4 4 ignore SoftmaxLossForwardGPU.label_value=11
ignore SoftmaxLossForwardGPU.label_value=11
ignore SoftmaxLossForwardGPU.label_value=11
ignore SoftmaxLossForwardGPU.label_value=11
ignore SoftmaxLossForwardGPU.label_value=11
4 4 4 4 4 4 ignore SoftmaxLossForwardGPU.label_value=11
ignore SoftmaxLossForwardGPU.label_value=11
ignore SoftmaxLossForwardGPU.label_value=11
ignore SoftmaxLossForwardGPU.label_value=11
ignore SoftmaxLossForwardGPU.label_value=11
4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 2 2 2 2 2 2 2 2 2 2 4 4 4 4 4 4 4 4 4 4 4 ignore SoftmaxLossForwardGPU.label_value=11
ignore SoftmaxLossForwardGPU.label_value=11
So the value 11 really does exist in the labels, which rules out a problem with the label files themselves. The remaining question is why the SoftmaxWithLoss layer tolerates it while the Accuracy layer crashes.
Looking back at the layer definitions, the SoftmaxWithLoss layer declares ignore_label: 11.
Combined with its CUDA kernel:
template <typename Dtype>
__global__ void SoftmaxLossForwardGPU(const int nthreads,
          const Dtype* prob_data, const Dtype* label,
          const bool weight_by_label_freqs, const float* label_counts,
          Dtype* loss, const int num, const int dim, const int spatial_dim,
          const bool has_ignore_label_, const int ignore_label_,
          Dtype* counts) {
  CUDA_KERNEL_LOOP(index, nthreads) {
    const int n = index / spatial_dim;
    const int s = index % spatial_dim;
    const int label_value = static_cast<int>(label[n * spatial_dim + s]);
    if (has_ignore_label_ && label_value == ignore_label_) {
      // ignored pixels contribute neither loss nor count
      loss[index] = 0;
      counts[index] = 0;
    } else {
      loss[index] = -log(max(prob_data[n * dim + label_value * spatial_dim + s],
                      Dtype(FLT_MIN)));
      if (weight_by_label_freqs) {
        loss[index] *= static_cast<Dtype>(label_counts[label_value]);
      }
      counts[index] = 1;
    }
  }
}
As the kernel shows, pixels whose label equals ignore_label get special treatment: their loss and count are both set to zero, so they affect neither the summed loss nor the normalization.
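The same masking logic can be sketched in a few lines of numpy. This is purely an illustration of the behaviour (my own helper, not Caffe's API), and it omits the class weighting:
import numpy as np

def masked_softmax_loss(prob, label, ignore_label=11):
    # prob: (C, N) softmax probabilities; label: (N,) integer labels
    mask = label != ignore_label                  # drop ignored pixels entirely
    idx = np.arange(label.size)[mask]
    picked = prob[label[mask], idx]               # probability of the true class
    loss = -np.log(np.maximum(picked, np.finfo(np.float32).tiny))
    return loss.sum() / max(mask.sum(), 1)        # normalize by counted pixels only
Exactly as in the kernel, an ignored pixel contributes neither to the summed loss nor to the normalization count.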
The Accuracy layer's code also filters on ignore_label:
template <typename Dtype>
void AccuracyLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
    const vector<Blob<Dtype>*>& top) {
  Dtype accuracy = 0;
  const Dtype* bottom_data = bottom[0]->cpu_data();
  const Dtype* bottom_label = bottom[1]->cpu_data();
  const int dim = bottom[0]->count() / outer_num_;
  const int num_labels = bottom[0]->shape(label_axis_);
  vector<Dtype> maxval(top_k_+1);
  vector<int> max_id(top_k_+1);
  vector<Dtype> accuracies(num_labels, 0);
  vector<int> nums(num_labels, 0);
  int count = 0;
  for (int i = 0; i < outer_num_; ++i) {
    for (int j = 0; j < inner_num_; ++j) {
      const int label_value =
          static_cast<int>(bottom_label[i * inner_num_ + j]);
      if (has_ignore_label_ && label_value == ignore_label_) {
        continue;  // skipped only when ignore_label is set for this layer
      }
      DCHECK_GE(label_value, 0);
      DCHECK_LT(label_value, num_labels);  // this is the check that fails
So the Accuracy layer can skip an ignore label, but only when ignore_label is actually set in its accuracy_param. Our prototxt never set it, so has_ignore_label_ was false, the value 11 was not skipped, and the label_value < num_labels check failed with 11 vs. 11. The direct fix is therefore to add the ignore_label parameter to the Accuracy layer:
layer {
  name: "accuracy"
  type: "Accuracy"
  bottom: "conv_classifier"
  bottom: "label"
  top: "accuracy"
  top: "per_class_accuracy"
  accuracy_param: {
    ignore_label: 11
  }
}
For the exact syntax (for example, the field name accuracy_param), refer to the caffe.proto file.
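For reference, in mainline Caffe that message looks roughly like the following (the SegNet fork may differ slightly, so check its own caffe.proto):
message AccuracyParameter {
  optional uint32 top_k = 1 [default = 1];
  optional int32 axis = 2 [default = 1];
  // if set, instances with this label are skipped when computing accuracy
  optional int32 ignore_label = 3;
}
With the parameter in place, compute_bn_statistics.py now runs to completion: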
gumh@gumh-B85M-DS3H-A:~/OpenSource/SegNet$ python ./Scripts/compute_bn_statistics.py ./Models/segnet_basic_train.prototxt ./Models/Training_better/segnet_basic_iter_5000.caffemodel ./Models/Inference/
Building BN calc net...
Calculate BN stats...
WARNING: Logging before InitGoogleLogging() is written to STDERR
I1103 15:02:45.675106 21206 net.cpp:42] Initializing net from parameters:
name: "segnet"
...
I1103 15:02:46.122388 21210 dense_image_data_layer.cpp:201] label values:
progress: 1/367
I1103 15:02:46.472606 21211 dense_image_data_layer.cpp:201] label values:
progress: 2/367
...
I1103 15:04:54.483098 21602 dense_image_data_layer.cpp:201] label values:
progress: 364/367
I1103 15:04:54.831765 21603 dense_image_data_layer.cpp:201] label values:
progress: 365/367
I1103 15:04:55.179713 21604 dense_image_data_layer.cpp:201] label values:
progress: 366/367
New data:
[u'conv3_bn', u'conv1_bn', u'conv2_bn', u'conv_decode4_bn', u'conv4_bn', u'conv_decode3_bn', u'conv_decode1_bn', u'conv_decode2_bn']
[u'conv3_bn', u'conv1_bn', u'conv2_bn', u'conv_decode4_bn', u'conv4_bn', u'conv_decode3_bn', u'conv_decode1_bn', u'conv_decode2_bn']
Saving test net weights...
done
gumh@gumh-B85M-DS3H-A:~/OpenSource/SegNet$
So why does a label of 11 appear at all? This class represents everything else: objects that do not belong to any of the 11 classes above, which is exactly why the loss layer is configured to ignore it.
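If your data follows the segnet-tutorial's CamVid convention, the class indices are commonly mapped as below. This mapping is an assumption for illustration, not something taken from the logs above, so verify it against your own annotations:
# assumed CamVid class order from the segnet-tutorial convention --
# verify against your own dataset before relying on it
CAMVID_CLASSES = {
    0: 'Sky', 1: 'Building', 2: 'Pole', 3: 'Road', 4: 'Pavement',
    5: 'Tree', 6: 'SignSymbol', 7: 'Fence', 8: 'Car',
    9: 'Pedestrian', 10: 'Bicyclist',
    11: 'Unlabelled',  # the catch-all class that the network ignores
}
Under this convention, the long runs of 4s and 3s in the log output above would correspond to Pavement and Road pixels.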