Ristretto is an automated CNN approximation tool that compresses 32-bit floating-point networks. It is an extension of Caffe and allows networks to be tested, trained and fine-tuned with limited numerical precision.
Ristretto offers three quantization strategies for approximating convolutional neural networks: dynamic fixed point, minifloat, and integer power-of-two parameters. The experiments below use the dynamic fixed point mode.
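As a rough illustration of dynamic fixed point (a minimal numpy sketch of the idea, not Ristretto's actual implementation): every value in a blob is rounded onto a grid of step 2^-fl, where the total bit width bw and the fractional length fl are chosen per layer from the observed value range.
import numpy as np

def dynamic_fixed_point(x, bw=8, fl=None):
    """Round x onto a dynamic fixed point grid: bw total bits (one sign bit),
    fl fractional bits shared by the whole blob (e.g. one layer's weights)."""
    x = np.asarray(x, dtype=np.float32)
    if fl is None:
        # give the integer part just enough bits for the largest magnitude
        int_bits = int(np.ceil(np.log2(np.abs(x).max() + 1e-12)))
        fl = bw - 1 - int_bits
    step = 2.0 ** -fl
    qmin, qmax = -2 ** (bw - 1), 2 ** (bw - 1) - 1
    return np.clip(np.round(x / step), qmin, qmax) * step

The network used in the experiment below is a small CIFAR-10 model. Its train/test definition, examples/cifar10/cifar_small_train_test.prototxt, is as follows: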
name: "CIFAR10_quick"
layer {
name: "cifar_small"
type: "Data"
top: "data"
top: "label"
include {
phase: TRAIN
}
transform_param {
crop_size: 28
mean_file: "examples/cifar10/mean.binaryproto"
}
data_param {
source: "examples/cifar10/cifar10_train_lmdb"
batch_size: 128
backend: LMDB
}
}
layer {
name: "cifar_small"
type: "Data"
top: "data"
top: "label"
include {
phase: TEST
}
transform_param {
crop_size: 28
mean_file: "examples/cifar10/mean.binaryproto"
}
data_param {
source: "examples/cifar10/cifar10_test_lmdb"
batch_size: 128
backend: LMDB
}
}
layer {
name: "conv1"
type: "Convolution"
bottom: "data"
top: "conv1"
convolution_param {
num_output: 32
pad: 1
kernel_size: 3
stride: 1
weight_filler {
type: "gaussian"
std: 0.0001
}
bias_filler {
type: "constant"
}
}
}
layer {
name: "batch_norm1"
type: "BatchNorm"
bottom: "conv1"
top: "conv1"
batch_norm_param {
use_global_stats: false
}
include {
phase: TRAIN
}
}
layer {
name: "batch_norm1"
type: "BatchNorm"
bottom: "conv1"
top: "conv1"
batch_norm_param {
use_global_stats: true
}
include {
phase: TEST
}
}
layer {
name: "scale1"
type: "Scale"
bottom: "conv1"
top: "conv1"
scale_param {
bias_term: true
}
}
layer {
name: "relu1"
type: "ReLU"
bottom: "conv1"
top: "conv1"
relu_param{
negative_slope: 0.1
}
}
layer {
name: "pool1"
type: "Pooling"
bottom: "conv1"
top: "pool1"
pooling_param {
pool: MAX
kernel_size: 2
stride: 2
}
}
layer {
name: "conv2"
type: "Convolution"
bottom: "pool1"
top: "conv2"
convolution_param {
num_output: 16
pad: 0
kernel_size: 1
stride: 1
weight_filler {
type: "gaussian"
std: 0.01
}
bias_filler {
type: "constant"
}
}
}
layer {
name: "batch_norm2"
type: "BatchNorm"
bottom: "conv2"
top: "conv2"
batch_norm_param {
use_global_stats: false
}
include {
phase: TRAIN
}
}
layer {
name: "batch_norm2"
type: "BatchNorm"
bottom: "conv2"
top: "conv2"
batch_norm_param {
use_global_stats: true
}
include {
phase: TEST
}
}
layer {
name: "scale2"
type: "Scale"
bottom: "conv2"
top: "conv2"
scale_param {
bias_term: true
}
}
layer {
name: "relu2"
type: "ReLU"
bottom: "conv2"
top: "conv2"
relu_param{
negative_slope: 0.1
}
}
layer {
name: "conv3"
type: "Convolution"
bottom: "conv2"
top: "conv3"
convolution_param {
num_output: 64
pad: 1
kernel_size: 3
stride: 1
weight_filler {
type: "gaussian"
std: 0.01
}
bias_filler {
type: "constant"
}
}
}
layer {
name: "batch_norm3"
type: "BatchNorm"
bottom: "conv3"
top: "conv3"
batch_norm_param {
use_global_stats: false
}
include {
phase: TRAIN
}
}
layer {
name: "batch_norm3"
type: "BatchNorm"
bottom: "conv3"
top: "conv3"
batch_norm_param {
use_global_stats: true
}
include {
phase: TEST
}
}
layer {
name: "scale3"
type: "Scale"
bottom: "conv3"
top: "conv3"
scale_param {
bias_term: true
}
}
layer {
name: "relu3"
type: "ReLU"
bottom: "conv3"
top: "conv3"
relu_param{
negative_slope: 0.1
}
}
layer {
name: "pool2"
type: "Pooling"
bottom: "conv3"
top: "pool2"
pooling_param {
pool: MAX
kernel_size: 2
stride: 2
}
}
layer {
name: "conv4"
type: "Convolution"
bottom: "pool2"
top: "conv4"
convolution_param {
num_output: 32
pad: 0
kernel_size: 1
stride: 1
weight_filler {
type: "gaussian"
std: 0.01
}
bias_filler {
type: "constant"
}
}
}
layer {
name: "batch_norm4"
type: "BatchNorm"
bottom: "conv4"
top: "conv4"
batch_norm_param {
use_global_stats: false
}
include {
phase: TRAIN
}
}
layer {
name: "batch_norm4"
type: "BatchNorm"
bottom: "conv4"
top: "conv4"
batch_norm_param {
use_global_stats: true
}
include {
phase: TEST
}
}
layer {
name: "scale4"
type: "Scale"
bottom: "conv4"
top: "conv4"
scale_param {
bias_term: true
}
}
layer {
name: "relu4"
type: "ReLU"
bottom: "conv4"
top: "conv4"
relu_param{
negative_slope: 0.1
}
}
layer {
name: "conv5"
type: "Convolution"
bottom: "conv4"
top: "conv5"
convolution_param {
num_output: 128
pad: 1
kernel_size: 3
stride: 1
weight_filler {
type: "gaussian"
std: 0.01
}
bias_filler {
type: "constant"
}
}
}
layer {
name: "batch_norm5"
type: "BatchNorm"
bottom: "conv5"
top: "conv5"
batch_norm_param {
use_global_stats: false
}
include {
phase: TRAIN
}
}
layer {
name: "batch_norm5"
type: "BatchNorm"
bottom: "conv5"
top: "conv5"
batch_norm_param {
use_global_stats: true
}
include {
phase: TEST
}
}
layer {
name: "scale5"
type: "Scale"
bottom: "conv5"
top: "conv5"
scale_param {
bias_term: true
}
}
layer {
name: "relu5"
type: "ReLU"
bottom: "conv5"
top: "conv5"
relu_param{
negative_slope: 0.1
}
}
layer {
name: "conv6"
type: "Convolution"
bottom: "conv5"
top: "conv6"
convolution_param {
num_output: 64
pad: 0
kernel_size: 1
stride: 1
weight_filler {
type: "gaussian"
std: 0.01
}
bias_filler {
type: "constant"
}
}
}
layer {
name: "batch_norm6"
type: "BatchNorm"
bottom: "conv6"
top: "conv6"
batch_norm_param {
use_global_stats: false
}
include {
phase: TRAIN
}
}
layer {
name: "batch_norm6"
type: "BatchNorm"
bottom: "conv6"
top: "conv6"
batch_norm_param {
use_global_stats: true
}
include {
phase: TEST
}
}
layer {
name: "scale6"
type: "Scale"
bottom: "conv6"
top: "conv6"
scale_param {
bias_term: true
}
}
layer {
name: "relu6"
type: "ReLU"
bottom: "conv6"
top: "conv6"
relu_param{
negative_slope: 0.1
}
}
layer {
name: "conv7"
type: "Convolution"
bottom: "conv6"
top: "conv7"
convolution_param {
num_output: 10
pad: 0
kernel_size: 1
stride: 1
weight_filler {
type: "gaussian"
std: 0.01
}
bias_filler {
type: "constant"
}
}
}
layer {
name: "relu7"
type: "ReLU"
bottom: "conv7"
top: "conv7"
relu_param{
negative_slope: 0.1
}
}
layer {
name: "pool3"
type: "Pooling"
bottom: "conv7"
top: "pool3"
pooling_param {
pool: AVE
global_pooling: true
}
}
layer {
name: "accuracy"
type: "Accuracy"
bottom: "pool3"
bottom: "label"
top: "accuracy"
include {
phase: TEST
}
}
layer {
name: "loss"
type: "SoftmaxWithLoss"
bottom: "pool3"
bottom: "label"
top: "loss"
}
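The network is fully convolutional: alternating 3x3 and 1x1 convolutions with BatchNorm, Scale and leaky ReLU, two 2x2 max-pooling stages, and a final 1x1 convolution with 10 outputs followed by global average pooling. A quick feature-map size trace (my own calculation, assuming the 28x28 crop configured above):
# Feature-map size trace for the network above (input is the 28x28 crop).
def out_size(hw, k, p, s):
    return (hw + 2 * p - k) // s + 1

hw = 28
for name, (k, p, s) in [("conv1", (3, 1, 1)), ("pool1", (2, 0, 2)),
                        ("conv2", (1, 0, 1)), ("conv3", (3, 1, 1)),
                        ("pool2", (2, 0, 2)), ("conv4", (1, 0, 1)),
                        ("conv5", (3, 1, 1)), ("conv6", (1, 0, 1)),
                        ("conv7", (1, 0, 1))]:
    hw = out_size(hw, k, p, s)
    print(name, hw)   # 28 -> 14 -> ... -> 7; pool3 then averages the 7x7x10 map to 1x1x10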
The solver, examples/cifar10/cifar_small_solver.prototxt:
# The train/test net protocol buffer definition
net: "examples/cifar10/cifar_small_train_test.prototxt"
# test_iter specifies how many forward passes the test should carry out.
# With a test batch size of 128, 100 iterations cover the full 10,000
# CIFAR-10 test images (with some repetition).
test_iter: 100
# Carry out testing every 500 training iterations.
test_interval: 500
# The base learning rate, momentum and the weight decay of the network.
base_lr: 0.1
momentum: 0.9
weight_decay: 0.0005
# The learning rate policy
lr_policy: "poly"
power: 4
# Display every 100 iterations
display: 100
# The maximum number of iterations
max_iter: 5000
# snapshot intermediate results
snapshot: 5000
snapshot_prefix: "examples/cifar10/cifar_small"
# solver mode: CPU or GPU
solver_mode: CPU
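For orientation, a small bit of arithmetic on the solver settings (my own calculation):
# 5,000 iterations at batch_size 128 over the 50,000 CIFAR-10 training images:
max_iter, batch_size, train_images = 5000, 128, 50000
print(max_iter * batch_size / train_images)   # 12.8 epochs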
Training is started with:
./build/tools/caffe train --solver=examples/cifar10/cifar_small_solver.prototxt
I1220 22:43:28.052917 23562 solver.cpp:239] Iteration 4900 (2.47072 iter/s, 40.474s/100 iters), loss = 0.714026
I1220 22:43:28.053019 23562 solver.cpp:258] Train net output #0: loss = 0.714026 (* 1 = 0.714026 loss)
I1220 22:43:28.053027 23562 sgd_solver.cpp:112] Iteration 4900, lr = 1.59999e-08
I1220 22:44:08.196622 23562 solver.cpp:468] Snapshotting to binary proto file examples/cifar10/cifar_small_iter_5000.caffemodel
I1220 22:44:08.197582 23562 sgd_solver.cpp:280] Snapshotting solver state to binary proto file examples/cifar10/cifar_small_iter_5000.solverstate
I1220 22:44:08.403373 23562 solver.cpp:331] Iteration 5000, loss = 0.651131
I1220 22:44:08.403400 23562 solver.cpp:351] Iteration 5000, Testing net (#0)
I1220 22:44:10.372725 23565 data_layer.cpp:73] Restarting data prefetching from start.
I1220 22:44:23.157145 23565 data_layer.cpp:73] Restarting data prefetching from start.
I1220 22:44:24.784054 23562 solver.cpp:418] Test net output #0: accuracy = 0.731719
I1220 22:44:24.784085 23562 solver.cpp:418] Test net output #1: loss = 0.764806 (* 1 = 0.764806 loss)
I1220 22:44:24.784091 23562 solver.cpp:336] Optimization Done.
I1220 22:44:24.784095 23562 caffe.cpp:250] Optimization Done.
A longer run with max_iter raised to 10,000 reaches a higher accuracy:
I1224 13:35:49.443615 26921 solver.cpp:239] Iteration 9900 (59.9533 iter/s, 1.66796s/100 iters), loss = 0.491564
I1224 13:35:49.443645 26921 solver.cpp:258] Train net output #0: loss = 0.491564 (* 1 = 0.491564 loss)
I1224 13:35:49.443652 26921 sgd_solver.cpp:112] Iteration 9900, lr = 9.99996e-10
I1224 13:35:51.090950 26921 solver.cpp:468] Snapshotting to binary proto file examples/cifar10/cifar_small_iter_10000.caffemodel
I1224 13:35:51.093062 26921 sgd_solver.cpp:280] Snapshotting solver state to binary proto file examples/cifar10/cifar_small_iter_10000.solverstate
I1224 13:35:51.097295 26921 solver.cpp:331] Iteration 10000, loss = 0.559846
I1224 13:35:51.097313 26921 solver.cpp:351] Iteration 10000, Testing net (#0)
I1224 13:35:51.197440 26929 data_layer.cpp:73] Restarting data prefetching from start.
I1224 13:35:51.450372 26921 solver.cpp:418] Test net output #0: accuracy = 0.800234
I1224 13:35:51.450397 26921 solver.cpp:418] Test net output #1: loss = 0.58406 (* 1 = 0.58406 loss)
I1224 13:35:51.450402 26921 solver.cpp:336] Optimization Done.
I1224 13:35:51.450407 26921 caffe.cpp:250] Optimization Done.
vi examples/cifar10/00_test_cifar_small.sh
#!/usr/bin/env sh
./build/tools/caffe test \
--model=examples/cifar10/cifar_small_train_test.prototxt \
--weights=examples/cifar10/cifar_small_iter_5000.caffemodel \
--iterations=320
sh examples/cifar10/00_test_cifar_small.sh
I1221 17:05:35.942559 1010 caffe.cpp:304] Batch 319, accuracy = 0.765625
I1221 17:05:35.942585 1010 caffe.cpp:304] Batch 319, loss = 0.762705
I1221 17:05:35.942592 1010 caffe.cpp:309] Loss: 0.765284
I1221 17:05:35.942605 1010 caffe.cpp:321] accuracy = 0.732983
I1221 17:05:35.942616 1010 caffe.cpp:321] loss = 0.765284 (* 1 = 0.765284 loss)
Next, quantize the trained model with Ristretto:
vi examples/ristretto/00_quantize_cifar_small.sh
#!/usr/bin/env sh
./build/tools/ristretto quantize \
--model=examples/cifar10/cifar_small_train_test.prototxt \
--weights=examples/cifar10/cifar_small_iter_5000.caffemodel \
--model_quantized=models/cifar_small/RistrettoDemo/quantized.prototxt \
--trimming_mode=dynamic_fixed_point --iterations=2000 \
--error_margin=3
The model_quantized argument is the path where the quantized prototxt will be written, so the models/cifar_small/RistrettoDemo directory has to be created beforehand.
sh examples/ristretto/00_quantize_cifar_small.sh
I1221 10:39:21.821512 27270 quantization.cpp:136] Batch 1999, accuracy = 0.0859375
I1221 10:39:21.821568 27270 quantization.cpp:136] Batch 1999, loss = 3.30265
I1221 10:39:21.821573 27270 quantization.cpp:141] Loss: 3.2011
I1221 10:39:21.821578 27270 quantization.cpp:153] accuracy = 0.0999219
I1221 10:39:21.821585 27270 quantization.cpp:153] loss = 3.2011 (* 1 = 3.2011 loss)
I1221 10:39:21.827525 27270 quantization.cpp:276] ------------------------------
I1221 10:39:21.827541 27270 quantization.cpp:277] Network accuracy analysis for
I1221 10:39:21.827545 27270 quantization.cpp:278] Convolutional (CONV) and fully
I1221 10:39:21.827564 27270 quantization.cpp:279] connected (FC) layers.
I1221 10:39:21.827566 27270 quantization.cpp:280] Baseline 32bit float: 0.732895
I1221 10:39:21.827572 27270 quantization.cpp:281] Dynamic fixed point CONV
I1221 10:39:21.827574 27270 quantization.cpp:282] weights:
I1221 10:39:21.827577 27270 quantization.cpp:284] 16bit: 0.732895
I1221 10:39:21.827580 27270 quantization.cpp:284] 8bit: 0.73082
I1221 10:39:21.827596 27270 quantization.cpp:284] 4bit: 0.471715
I1221 10:39:21.827600 27270 quantization.cpp:287] Dynamic fixed point FC
I1221 10:39:21.827602 27270 quantization.cpp:288] weights:
I1221 10:39:21.827620 27270 quantization.cpp:290] 16bit: 0.732895
I1221 10:39:21.827622 27270 quantization.cpp:290] 8bit: 0.732895
I1221 10:39:21.827625 27270 quantization.cpp:290] 4bit: 0.732895
I1221 10:39:21.827630 27270 quantization.cpp:290] 2bit: 0.732895
I1221 10:39:21.827633 27270 quantization.cpp:290] 1bit: 0.732895
I1221 10:39:21.827636 27270 quantization.cpp:292] Dynamic fixed point layer
I1221 10:39:21.827652 27270 quantization.cpp:293] activations:
I1221 10:39:21.827656 27270 quantization.cpp:295] 16bit: 0.0999219
I1221 10:39:21.827674 27270 quantization.cpp:298] Dynamic fixed point net:
I1221 10:39:21.827677 27270 quantization.cpp:299] 8bit CONV weights,
I1221 10:39:21.827680 27270 quantization.cpp:300] 1bit FC weights,
I1221 10:39:21.827683 27270 quantization.cpp:301] 32bit layer activations:
I1221 10:39:21.827687 27270 quantization.cpp:302] Accuracy: 0.0999219
I1221 10:39:21.827689 27270 quantization.cpp:303] Please fine-tune.
Accuracy: 0.0999219 is essentially chance level for 10 classes, so the quantization has clearly failed. To look into the reason, test the quantized model directly:
vi models/cifar_small/00_test_cifar_small_quantized.sh
#!/usr/bin/env sh
./build/tools/caffe test \
--model=models/cifar_small/RistrettoDemo/quantized.prototxt \
--weights=examples/cifar10/cifar_small_iter_5000.caffemodel \
--iterations=2000
sh models/cifar_small/00_test_cifar_small_quantized.sh
I1221 18:09:04.598676 2917 caffe.cpp:304] Batch 54, accuracy = 0.109375
I1221 18:09:04.598700 2917 caffe.cpp:304] Batch 54, loss = 3.19723
I1221 18:09:07.994002 2917 caffe.cpp:304] Batch 55, accuracy = 0.078125
I1221 18:09:07.994027 2917 caffe.cpp:304] Batch 55, loss = 3.24805
I1221 18:09:11.363610 2917 caffe.cpp:304] Batch 56, accuracy = 0.0859375
I1221 18:09:11.363633 2917 caffe.cpp:304] Batch 56, loss = 3.25478
The quantization really has failed, so the test was stopped early. One change that was tried was adding explicit learning-rate multipliers to the layers:
param {
lr_mult: 1
}
param {
lr_mult: 2
}
Running the quantization script again gives the following result; quantization still fails.
I1222 05:15:23.697180 5472 quantization.cpp:276] ------------------------------
I1222 05:15:23.697196 5472 quantization.cpp:277] Network accuracy analysis for
I1222 05:15:23.697201 5472 quantization.cpp:278] Convolutional (CONV) and fully
I1222 05:15:23.697204 5472 quantization.cpp:279] connected (FC) layers.
I1222 05:15:23.697207 5472 quantization.cpp:280] Baseline 32bit float: 0.732895
I1222 05:15:23.697213 5472 quantization.cpp:281] Dynamic fixed point CONV
I1222 05:15:23.697216 5472 quantization.cpp:282] weights:
I1222 05:15:23.697219 5472 quantization.cpp:284] 16bit: 0.732895
I1222 05:15:23.697223 5472 quantization.cpp:284] 8bit: 0.73082
I1222 05:15:23.697227 5472 quantization.cpp:284] 4bit: 0.471715
I1222 05:15:23.697232 5472 quantization.cpp:287] Dynamic fixed point FC
I1222 05:15:23.697234 5472 quantization.cpp:288] weights:
I1222 05:15:23.697237 5472 quantization.cpp:290] 16bit: 0.732895
I1222 05:15:23.697242 5472 quantization.cpp:290] 8bit: 0.732895
I1222 05:15:23.697244 5472 quantization.cpp:290] 4bit: 0.732895
I1222 05:15:23.697249 5472 quantization.cpp:290] 2bit: 0.732895
I1222 05:15:23.697252 5472 quantization.cpp:290] 1bit: 0.732895
I1222 05:15:23.697257 5472 quantization.cpp:292] Dynamic fixed point layer
I1222 05:15:23.697259 5472 quantization.cpp:293] activations:
I1222 05:15:23.697263 5472 quantization.cpp:295] 16bit: 0.0999219
I1222 05:15:23.697266 5472 quantization.cpp:298] Dynamic fixed point net:
I1222 05:15:23.697269 5472 quantization.cpp:299] 8bit CONV weights,
I1222 05:15:23.697273 5472 quantization.cpp:300] 1bit FC weights,
I1222 05:15:23.697276 5472 quantization.cpp:301] 32bit layer activations:
I1222 05:15:23.697279 5472 quantization.cpp:302] Accuracy: 0.0999219
I1222 05:15:23.697283 5472 quantization.cpp:303] Please fine-tune.
After it is generated, the prototxt is slightly modified based on the CIFAR10_quick file, as follows:
name: "CIFAR10_small"
layer {
name: "cifar"
type: "Data"
top: "data"
top: "label"
include {
phase: TRAIN
}
transform_param {
crop_size: 28
mean_file: "examples/cifar10/mean.binaryproto"
}
data_param {
source: "examples/cifar10/cifar10_train_lmdb"
batch_size: 128
backend: LMDB
}
}
layer {
name: "cifar"
type: "Data"
top: "data"
top: "label"
include {
phase: TEST
}
transform_param {
crop_size: 28
mean_file: "examples/cifar10/mean.binaryproto"
}
data_param {
source: "examples/cifar10/cifar10_test_lmdb"
batch_size: 128
backend: LMDB
}
}
layer {
bottom: "data"
top: "layer1-conv"
name: "layer1-conv"
type: "Convolution"
convolution_param {
num_output: 32
kernel_size: 3
pad: 1
stride: 1
bias_term: false
}
}
layer {
bottom: "layer1-conv"
top: "layer1-conv"
name: "layer1-bn"
type: "BatchNorm"
batch_norm_param {
use_global_stats: true
}
}
layer {
bottom: "layer1-conv"
top: "layer1-conv"
name: "layer1-scale"
type: "Scale"
scale_param {
bias_term: true
}
}
layer {
bottom: "layer1-conv"
top: "layer1-conv"
name: "layer1-act"
type: "ReLU"
relu_param {
negative_slope: 0.1
}
}
layer {
bottom: "layer1-conv"
top: "layer2-maxpool"
name: "layer2-maxpool"
type: "Pooling"
pooling_param {
kernel_size: 2
stride: 2
pool: MAX
}
}
layer {
bottom: "layer2-maxpool"
top: "layer3-conv"
name: "layer3-conv"
type: "Convolution"
convolution_param {
num_output: 16
kernel_size: 1
pad: 0
stride: 1
bias_term: false
}
}
layer {
bottom: "layer3-conv"
top: "layer3-conv"
name: "layer3-bn"
type: "BatchNorm"
batch_norm_param {
use_global_stats: true
}
}
layer {
bottom: "layer3-conv"
top: "layer3-conv"
name: "layer3-scale"
type: "Scale"
scale_param {
bias_term: true
}
}
layer {
bottom: "layer3-conv"
top: "layer3-conv"
name: "layer3-act"
type: "ReLU"
relu_param {
negative_slope: 0.1
}
}
layer {
bottom: "layer3-conv"
top: "layer4-conv"
name: "layer4-conv"
type: "Convolution"
convolution_param {
num_output: 64
kernel_size: 3
pad: 1
stride: 1
bias_term: false
}
}
layer {
bottom: "layer4-conv"
top: "layer4-conv"
name: "layer4-bn"
type: "BatchNorm"
batch_norm_param {
use_global_stats: true
}
}
layer {
bottom: "layer4-conv"
top: "layer4-conv"
name: "layer4-scale"
type: "Scale"
scale_param {
bias_term: true
}
}
layer {
bottom: "layer4-conv"
top: "layer4-conv"
name: "layer4-act"
type: "ReLU"
relu_param {
negative_slope: 0.1
}
}
layer {
bottom: "layer4-conv"
top: "layer5-maxpool"
name: "layer5-maxpool"
type: "Pooling"
pooling_param {
kernel_size: 2
stride: 2
pool: MAX
}
}
layer {
bottom: "layer5-maxpool"
top: "layer6-conv"
name: "layer6-conv"
type: "Convolution"
convolution_param {
num_output: 32
kernel_size: 1
pad: 0
stride: 1
bias_term: false
}
}
layer {
bottom: "layer6-conv"
top: "layer6-conv"
name: "layer6-bn"
type: "BatchNorm"
batch_norm_param {
use_global_stats: true
}
}
layer {
bottom: "layer6-conv"
top: "layer6-conv"
name: "layer6-scale"
type: "Scale"
scale_param {
bias_term: true
}
}
layer {
bottom: "layer6-conv"
top: "layer6-conv"
name: "layer6-act"
type: "ReLU"
relu_param {
negative_slope: 0.1
}
}
layer {
bottom: "layer6-conv"
top: "layer7-conv"
name: "layer7-conv"
type: "Convolution"
convolution_param {
num_output: 128
kernel_size: 3
pad: 1
stride: 1
bias_term: false
}
}
layer {
bottom: "layer7-conv"
top: "layer7-conv"
name: "layer7-bn"
type: "BatchNorm"
batch_norm_param {
use_global_stats: true
}
}
layer {
bottom: "layer7-conv"
top: "layer7-conv"
name: "layer7-scale"
type: "Scale"
scale_param {
bias_term: true
}
}
layer {
bottom: "layer7-conv"
top: "layer7-conv"
name: "layer7-act"
type: "ReLU"
relu_param {
negative_slope: 0.1
}
}
layer {
bottom: "layer7-conv"
top: "layer8-conv"
name: "layer8-conv"
type: "Convolution"
convolution_param {
num_output: 64
kernel_size: 1
pad: 0
stride: 1
bias_term: false
}
}
layer {
bottom: "layer8-conv"
top: "layer8-conv"
name: "layer8-bn"
type: "BatchNorm"
batch_norm_param {
use_global_stats: true
}
}
layer {
bottom: "layer8-conv"
top: "layer8-conv"
name: "layer8-scale"
type: "Scale"
scale_param {
bias_term: true
}
}
layer {
bottom: "layer8-conv"
top: "layer8-conv"
name: "layer8-act"
type: "ReLU"
relu_param {
negative_slope: 0.1
}
}
layer {
bottom: "layer8-conv"
top: "layer9-conv"
name: "layer9-conv"
type: "Convolution"
convolution_param {
num_output: 10
kernel_size: 1
pad: 0
stride: 1
bias_term: true
}
}
layer {
bottom: "layer9-conv"
top: "layer9-conv"
name: "layer9-act"
type: "ReLU"
relu_param {
negative_slope: 0.1
}
}
layer {
bottom: "layer9-conv"
top: "layer10-avgpool"
name: "layer10-avgpool"
type: "Pooling"
pooling_param {
kernel_size: 7
stride: 1
pool: AVE
}
}
layer {
name: "accuracy"
type: "Accuracy"
bottom: "layer10-avgpool"
bottom: "label"
top: "accuracy"
include {
phase: TEST
}
}
layer {
name: "loss"
type: "SoftmaxWithLoss"
bottom: "layer10-avgpool"
bottom: "label"
top: "loss"
}
Training this network at first gets stuck at chance-level accuracy:
Iteration 4500, Testing net (#0)
I0111 14:16:08.761826 19467 data_layer.cpp:73] Restarting data prefetching from start.
I0111 14:16:09.313390 19395 solver.cpp:418] Test net output #0: accuracy = 0.100469
I0111 14:16:09.313412 19395 solver.cpp:418] Test net output #1: loss = 2.30259 (* 1 = 2.30259 loss)
The weight_filler parameter must be added:
weight_filler {
type: "gaussian"
std: 0.01
}
Iteration 10000, loss = 87.3365
I0111 14:35:35.405905 1456 solver.cpp:351] Iteration 10000, Testing net (#0)
I0111 14:35:35.612701 1462 data_layer.cpp:73] Restarting data prefetching from start.
I0111 14:35:36.156333 1456 solver.cpp:418] Test net output #0: accuracy = 1
I0111 14:35:36.156356 1456 solver.cpp:418] Test net output #1: loss = 87.3365 (* 1 = 87.3365 loss)
(A loss of 87.3365 is the value Caffe reports when the softmax probabilities collapse, i.e. the run has diverged.) After retraining, the expected result is obtained: accuracy = 0.726797
I0111 15:07:42.618531 552 sgd_solver.cpp:112] Iteration 9900, lr = 9.99996e-10
I0111 15:07:46.647879 552 solver.cpp:468] Snapshotting to binary proto file models/cifar_small/darknet2caffe/cifar_small_solver_iter_10000.caffemodel
I0111 15:07:46.649987 552 sgd_solver.cpp:280] Snapshotting solver state to binary proto file models/cifar_small/darknet2caffe/cifar_small_solver_iter_10000.solverstate
I0111 15:07:46.661846 552 solver.cpp:331] Iteration 10000, loss = 0.704111
I0111 15:07:46.661859 552 solver.cpp:351] Iteration 10000, Testing net (#0)
I0111 15:07:46.947620 558 data_layer.cpp:73] Restarting data prefetching from start.
I0111 15:07:47.472630 552 solver.cpp:418] Test net output #0: accuracy = 0.726797
I0111 15:07:47.472651 552 solver.cpp:418] Test net output #1: loss = 0.77728 (* 1 = 0.77728 loss)
I0111 15:07:47.472656 552 solver.cpp:336] Optimization Done.
I0111 15:07:47.472659 552 caffe.cpp:250] Optimization Done.
However, darknet itself trains this network to a better accuracy, so presumably some details are still not being reproduced. For reference, the darknet training and validation results are listed below (run 1: lr = 0.1; runs 2 and 3: lr = 0.2; each training line roughly reads: iteration, epochs: batch loss, running-average loss, learning rate, seconds per batch, images seen).
#------------------------max_batches = 10000----------------------------------
# train
10000, 25.600: 0.669140, 0.579837 avg, 0.000000 rate, 0.024381 seconds, 1280000 images
# valid
9999: top 1: 0.784900, top 2: 0.907100
# Gpu-train
forward_softmax_layer_gpu cost = 63.9296
10000, 25.600: 0.499450, 0.588345 avg, 0.000000 rate, 0.024301 seconds, 1280000 images
# Gpu-valid
9999: top 1: 0.791900, top 2: 0.908900
#------------------------max_batches = 20000----------------------------------
# train
forward_softmax_layer_gpu cost = 42.4485
20000, 51.200: 0.331629, 0.452693 avg, 0.000000 rate, 0.024326 seconds, 2560000 images
# valid
9999: top 1: 0.820700, top 2: 0.926000
# cpu-train
20000, 51.200: 0.467073, 0.435266 avg, 0.000000 rate, 0.646882 seconds, 2560000 images
# cpu-valid
9999: top 1: 0.823600, top 2: 0.928000
# 1
20000, 51.200: 0.462403, 0.437111 avg, 0.000000 rate, 0.025494 seconds, 2560000 images
9999: top 1: 0.822500, top 2: 0.929900
# 2
20000, 51.200: 0.536238, 0.423880 avg, 0.000000 rate, 0.025418 seconds, 2560000 images
9999: top 1: 0.827000, top 2: 0.933500
# 3
20000, 51.200: 0.450796, 0.427930 avg, 0.000000 rate, 0.025619 seconds, 2560000 images
9999: top 1: 0.825500, top 2: 0.930900
#------------------------max_batches = 30000----------------------------------
# 1
# train
forward_softmax_layer_gpu cost = 38.3523
30000, 76.800: 0.299627, 0.361766 avg, 0.000000 rate, 0.024431 seconds, 3840000 images
# valid
9999: top 1: 0.838800, top 2: 0.936200
# 2
30000, 76.800: 0.424442, 0.374467 avg, 0.000000 rate, 0.025370 seconds, 3840000 images
9999: top 1: 0.845500, top 2: 0.942400
# 3
30000, 76.800: 0.411451, 0.361035 avg, 0.000000 rate, 0.025524 seconds, 3840000 images
9999: top 1: 0.843500, top 2: 0.938600
#------------------------max_batches = 40000----------------------------------
# 1
# train
forward_softmax_layer_gpu cost = 32.8497
40000, 102.400: 0.256639, 0.335073 avg, 0.000000 rate, 0.024441 seconds, 5120000 images
# valid
9999: top 1: 0.845800, top 2: 0.939000
# 2
40000, 102.400: 0.320675, 0.325991 avg, 0.000000 rate, 0.025444 seconds, 5120000 images
9999: top 1: 0.850300, top 2: 0.942700
# 3
40000, 102.400: 0.375832, 0.322458 avg, 0.000000 rate, 0.026347 seconds, 5120000 images
9999: top 1: 0.848900, top 2: 0.941400
#-----------------------max_batches = 50000-----------------------------------
# 1
# train
forward_softmax_layer_gpu cost = 41.0163
50000, 128.000: 0.320440, 0.307838 avg, 0.000000 rate, 0.024387 seconds, 6400000 images
# valid
9999: top 1: 0.847600, top 2: 0.944800
# 2
50000, 128.000: 0.478606, 0.321987 avg, 0.000000 rate, 0.025830 seconds, 6400000 images
9999: top 1: 0.852700, top 2: 0.944700
# 3
50000, 128.000: 0.280483, 0.286663 avg, 0.000000 rate, 0.025281 seconds, 6400000 images
9999: top 1: 0.853600, top 2: 0.942700
// Weight initialization in darknet (make_convolutional_layer() in convolutional_layer.c):
float scale = sqrt(2./(size*size*c));
//scale = .02;
//for(i = 0; i < c*n*size*size; ++i) l.weights[i] = scale*rand_uniform(-1, 1);
// Weights are initialized as scale * standard-normal random numbers, with
// scale = sqrt(2./(size*size*c)). Why this particular value? (It matches the
// He/MSRA initialization, sqrt(2/fan_in), intended for ReLU-family activations.)
// Note that the weights here follow a normal distribution, whereas the fully
// connected layer (make_connected_layer()) initializes its weights uniformly.
// TODO: arguably this should be wrapped in an if(weightfile) check, because when a
// pretrained weight file is loaded this initialization is unnecessary. In fact,
// train_detector() in detector.c checks if(weightfile) right after parse_network_cfg(),
// and loading the file overwrites these values anyway. The initialization here is
// worth rethinking; a dedicated weight (and bias) initialization function might be
// cleaner. Bias initialization does not appear to be handled here at all, although
// make_connected_layer() does handle it.
for(i = 0; i < c*n*size*size; ++i) l.weights[i] = scale*rand_normal();
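A numpy equivalent of the initialization above (my own sketch; the shapes are example values, not taken from darknet):
import numpy as np

n, c, size = 32, 3, 3                                 # filters, input channels, kernel size (example values)
scale = np.sqrt(2.0 / (size * size * c))              # He/MSRA scale, fan_in = size*size*c
weights = scale * np.random.randn(n, c, size, size)   # same distribution as the rand_normal() loop above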
// In forward_convolutional_layer(), batch normalization (when enabled) replaces the plain
// bias add, and the activation is applied afterwards:
if(l.batch_normalize){
forward_batchnorm_layer(l, net);
} else {
add_bias(l.output, l.biases, l.batch, l.n, l.out_h*l.out_w);
}
activate_array(l.output, l.outputs*l.batch, l.activation);
void forward_batchnorm_layer(layer l, network net)
{
if(l.type == BATCHNORM) copy_cpu(l.outputs*l.batch, net.input, 1, l.output, 1);
copy_cpu(l.outputs*l.batch, l.output, 1, l.x, 1);
if(net.train){
mean_cpu(l.output, l.batch, l.out_c, l.out_h*l.out_w, l.mean);
variance_cpu(l.output, l.mean, l.batch, l.out_c, l.out_h*l.out_w, l.variance);
// update the rolling mean/variance as an exponential moving average (momentum 0.99)
scal_cpu(l.out_c, .99, l.rolling_mean, 1);
axpy_cpu(l.out_c, .01, l.mean, 1, l.rolling_mean, 1);
scal_cpu(l.out_c, .99, l.rolling_variance, 1);
axpy_cpu(l.out_c, .01, l.variance, 1, l.rolling_variance, 1);
normalize_cpu(l.output, l.mean, l.variance, l.batch, l.out_c, l.out_h*l.out_w);
copy_cpu(l.outputs*l.batch, l.output, 1, l.x_norm, 1);
} else {
// at test time, normalize with the rolling statistics (what use_global_stats: true selects in Caffe)
normalize_cpu(l.output, l.rolling_mean, l.rolling_variance, l.batch, l.out_c, l.out_h*l.out_w);
}
scale_bias(l.output, l.scales, l.batch, l.out_c, l.out_h*l.out_w);
add_bias(l.output, l.biases, l.batch, l.out_c, l.out_h*l.out_w);
}
If anyone knows the answer, please let me know.
A mentor at the company said that Ristretto does not support BatchNorm, so the remove_batchnorm.py script below is used to fold the BatchNorm and Scale layers into the preceding convolutions.
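The folding itself is the standard conv + BatchNorm + Scale merge: with the convolution bias disabled, gamma * (w*x - mu) / sqrt(var + eps) + beta equals a convolution with rescaled weights w' = w * gamma / sqrt(var + eps) and a new bias b' = beta - gamma * mu / sqrt(var + eps). A minimal numpy check of the per-channel formula (the values are made up; the extra divisions by the BatchNorm scale-factor blob in the script below are Caffe-specific bookkeeping):
import numpy as np

gamma, beta = 1.5, 0.2                   # Scale layer: per-channel scale and bias (made-up values)
mu, var, eps = 0.3, 0.8, 1e-5            # BatchNorm running mean / variance
w = np.random.randn(3, 3)                # one 3x3 kernel of this output channel
x = np.random.randn(3, 3)                # a matching input patch

y_ref = gamma * ((w * x).sum() - mu) / np.sqrt(var + eps) + beta   # conv -> BatchNorm -> Scale
new_scale = gamma / np.sqrt(var + eps)
new_bias = beta - new_scale * mu
y_fold = ((w * new_scale) * x).sum() + new_bias                    # folded convolution
assert np.isclose(y_ref, y_fold)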
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import caffe
import numpy as np
import sys, getopt
import caffe.proto.caffe_pb2 as caffe_pb2
import google.protobuf.text_format as txtf
#################################################################################
def parse_prototxt(model_in, model_out):
# initialize net parameter
net_param = caffe_pb2.NetParameter()
# parse file and link to net parameter
with open(model_in) as f:
s = f.read()
txtf.Merge(s, net_param)
# shortcut to layer and do sanity check
layers = net_param.layer
layer_num = len(layers)
layer_list = []
if len(layers) == 0:
raise NotImplementedError('Convert model def prototxt to use new caffe '
'format (layer not layers) [%s]' % model_in)
# delete batch_norm and scale layers
del_list = []
for i in range(0, layer_num):
l = layers[i]
layer_list.append(l.name)
# remove BatchNorm and Scale layers when seeing 3 consecutive layers are
# Convolution->BatchNorm->Scale
# Then, update next layer's bottom from "Scale layer's top" to "Convolution layer's top"
if l.type.find('Convolution')>=0 or l.type.find('Deconvolution')>=0:
if ((i + 1) < layer_num and layers[i+1].type == 'BatchNorm'):
if ((i + 2) < layer_num and layers[i+2].type == 'Scale'):
del_list.append(i+1)
del_list.append(i+2)
l.convolution_param.bias_term = True
# network graph handling
top_name_old = layers[i+2].top[0]
top_name_new = layers[i].top[0]
if ((i + 3) < layer_num) and (top_name_new!=top_name_old):
for j in range(i + 3, layer_num):
n = layers[j]
for k in range(len(n.bottom)):
if(n.bottom[k]==top_name_old):
n.bottom[k]=top_name_new
break
# check delete list content
#print(del_list)
# remove the layers listed in delete list
# from last to the beginning because the index and size will change immediately
# after delete and that will lead to wrong index mapping
for i in range(len(del_list)-1, -1, -1):
j = del_list[i]
#print ('delete layer: ', j)
del layers[j]
# check results
#for i in range(0, len(layers)):
# print (i, layers[i].type)
# write file
with open(model_out, 'w') as f:
f.write(str(net_param))
#################################################################################
def merge_conv_bn_scale_weights(weight_in, bn_mean, bn_var, bn_scale,
sc_scale, sc_bias, eps=0.00001):
# eps = 0.00001
# new_scale = sc_scale / ((bn_var + eps) ** 0.5)
# print bn_scale
# tmp = (((bn_var/bn_scale) + eps) ** 0.5)
new_scale = sc_scale / (((bn_var/bn_scale) + eps) ** 0.5)
new_bias = sc_bias - new_scale * bn_mean / bn_scale
return new_scale, new_bias
# weight_out = np.zeros(weight_in.shape)
# weight_out = weight_in * new_scale
# bias_out = new_bias
# return weight_out, bias_out
def convert_weight(model, out_model, weight, out_weight):
if 1: # default value
net_in = caffe.Net(model, weight, caffe.TEST)
net_out = caffe.Net(out_model, caffe.TEST)
else:
net_in = caffe.Net(model, weight, caffe.TRAIN)
net_out = caffe.Net(out_model, caffe.TRAIN)
# initialize net parameter
net_param = caffe_pb2.NetParameter()
# parse file and link to net parameter
with open(model) as f:
s = f.read()
txtf.Merge(s, net_param)
layer_list = []
list_size = len(net_param.layer)
for i in range(list_size):
l = net_param.layer[i]
layer_list.append(l.name)
# need params[][].data and net_param to complete the work
param_list = net_in.params.keys()
params = net_in.params
params_out = net_out.params
pr_idx = 0
while pr_idx < len(param_list):
pr = list(param_list)[pr_idx]
lidx = layer_list.index(pr) # use name to find layer index
l = net_param.layer[lidx]
#print (pr_idx, pr, lidx, l.type)
if l.type.find('Convolution')>=0 or l.type.find('Deconvolution')>=0:
update = 0
if ((lidx + 1 < list_size) and (net_param.layer[lidx+1].type == 'BatchNorm')):
if ((lidx + 2 < list_size) and (net_param.layer[lidx+2].type == 'Scale')):
dim = net_in.params[pr][0].data.shape
bn_name = net_param.layer[lidx+1].name
sc_name = net_param.layer[lidx+2].name
update = 1
out_dim = dim[1] if l.type.find('Deconvolution')>=0 else dim[0]
for i in range(0, out_dim): # for each output channel
if l.type.find('Deconvolution')>=0:
weight_in = params[pr][0].data.transpose(1, 0, 2, 3)[i]
weight_out = params_out[pr][0].data.transpose(1, 0, 2, 3)[i]
else:
weight_in = params[pr][0].data[i]
weight_out = params_out[pr][0].data[i]
bn_mean = params[bn_name][0].data[i]
bn_var = params[bn_name][1].data[i]
bn_scale = params[bn_name][2].data[0]
sc_scale = params[sc_name][0].data[i]
sc_bias = params[sc_name][1].data[i]
bn_eps = net_param.layer[lidx+1].batch_norm_param.eps
s, b = merge_conv_bn_scale_weights(weight_in, bn_mean, bn_var, bn_scale,
sc_scale, sc_bias,
bn_eps)
params_out[pr][1].data[i] = b
weight_out[...] = weight_in[...] * s
pr_idx = pr_idx + 2
# for last conv layer which is not paired with bn and scale
if update == 0:
for i in range(len(params_out[pr])):
params_out[pr][i].data[...] = params[pr][i].data[...].copy()
else:
for i in range(len(params_out[pr])):
params_out[pr][i].data[...] = params[pr][i].data[...].copy()
pr_idx = pr_idx + 1
# save caffemodel
net_out.save(out_weight)
#################################################################################
def main(argv):
model = ''
weight = ''
out_model = ''
out_weight = ''
# parse files
try:
opts, args = getopt.getopt(argv, "hm:w:o:c:")
print( opts )
except getopt.GetoptError:
print( 'convert_proto.py -m -w -o -c ' )
sys.exit(2)
for opt, arg in opts:
if opt == '-h':
print( 'convert_proto.py -m -w -o -c ' )
sys.exit()
elif opt == "-m":
model = arg
elif opt == "-w":
weight = arg
elif opt == "-o":
out_model = arg
elif opt == "-c":
out_weight = arg
# cpu mode
#caffe.set_device(0) # if we have multiple GPUs, pick the first one
caffe.set_mode_cpu()
# print filenames
print ("")
print ('model_in = ', model)
print ('weight_in= ', weight)
print ('model_out= ', out_model)
print ('weight_out=', out_weight)
# parse model_in and write model_out (prototxt)
# remove BatchNorm and Scale layers
parse_prototxt(model, out_model)
# load new prototxt, convert old weight to new weight, save new
# caffemodel.
convert_weight(model, out_model, weight, out_weight)
if __name__=='__main__':
main(sys.argv[1:])
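The script is driven by the getopt options defined in main(); a typical invocation looks like the following (the file names here are illustrative only, not taken from the original run):
python remove_batchnorm.py -m cifar_small_train_test.prototxt -w cifar_small_iter_10000.caffemodel -o cifar_small_nobn.prototxt -c cifar_small_nobn.caffemodel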
The same quantization and fine-tuning flow is also applied to the cifar10_quick network, which contains no BatchNorm layers. Its quantized model is tested with:
sh models/cifar10_quick/00_test_cifar_quick_quantized.sh
#!/usr/bin/env sh
./build/tools/caffe test \
--model=models/cifar10_quick/RistrettoDemo/quantized.prototxt \
--weights=examples/cifar10/cifar10_quick_iter_5000.caffemodel.h5 \
--iterations=200
name: "CIFAR10_quick"
layer {
name: "cifar"
type: "Data"
top: "data"
top: "label"
include {
phase: TRAIN
}
transform_param {
mean_file: "examples/cifar10/mean.binaryproto"
}
data_param {
source: "examples/cifar10/cifar10_train_lmdb"
batch_size: 100
backend: LMDB
}
}
layer {
name: "cifar"
type: "Data"
top: "data"
top: "label"
include {
phase: TEST
}
transform_param {
mean_file: "examples/cifar10/mean.binaryproto"
}
data_param {
source: "examples/cifar10/cifar10_test_lmdb"
batch_size: 100
backend: LMDB
}
}
layer {
name: "conv1"
type: "ConvolutionRistretto"
bottom: "data"
top: "conv1"
param {
lr_mult: 1
}
param {
lr_mult: 2
}
convolution_param {
num_output: 32
pad: 2
kernel_size: 5
stride: 1
weight_filler {
type: "gaussian"
std: 0.0001
}
bias_filler {
type: "constant"
}
}
quantization_param {
bw_layer_in: 8
bw_layer_out: 8
bw_params: 4
fl_layer_in: 0
fl_layer_out: -1
fl_params: 5
}
}
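For reading the quantization_param blocks (my interpretation of Ristretto's dynamic fixed point fields, worth checking against the Ristretto documentation): bw_layer_in, bw_layer_out and bw_params are the bit widths of the layer input, layer output and parameters, and the fl_* fields are the corresponding fractional lengths, so a stored integer code k represents the value k * 2^-fl (a negative fl shifts the grid towards larger magnitudes). For the conv1 weights above (bw_params: 4, fl_params: 5) the representable values are:
import numpy as np

bw, fl = 4, 5
codes = np.arange(-2**(bw - 1), 2**(bw - 1))   # 4-bit two's-complement codes: -8 .. 7
print(codes * 2.0**-fl)                        # weight grid: -0.25 .. 0.21875 in steps of 0.03125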
layer {
name: "pool1"
type: "Pooling"
bottom: "conv1"
top: "pool1"
pooling_param {
pool: MAX
kernel_size: 3
stride: 2
}
}
layer {
name: "relu1"
type: "ReLU"
bottom: "pool1"
top: "pool1"
}
layer {
name: "conv2"
type: "ConvolutionRistretto"
bottom: "pool1"
top: "conv2"
param {
lr_mult: 1
}
param {
lr_mult: 2
}
convolution_param {
num_output: 32
pad: 2
kernel_size: 5
stride: 1
weight_filler {
type: "gaussian"
std: 0.01
}
bias_filler {
type: "constant"
}
}
quantization_param {
bw_layer_in: 8
bw_layer_out: 8
bw_params: 4
fl_layer_in: -1
fl_layer_out: -1
fl_params: 6
}
}
layer {
name: "relu2"
type: "ReLU"
bottom: "conv2"
top: "conv2"
}
layer {
name: "pool2"
type: "Pooling"
bottom: "conv2"
top: "pool2"
pooling_param {
pool: AVE
kernel_size: 3
stride: 2
}
}
layer {
name: "conv3"
type: "ConvolutionRistretto"
bottom: "pool2"
top: "conv3"
param {
lr_mult: 1
}
param {
lr_mult: 2
}
convolution_param {
num_output: 64
pad: 2
kernel_size: 5
stride: 1
weight_filler {
type: "gaussian"
std: 0.01
}
bias_filler {
type: "constant"
}
}
quantization_param {
bw_layer_in: 8
bw_layer_out: 8
bw_params: 4
fl_layer_in: 0
fl_layer_out: 2
fl_params: 6
}
}
layer {
name: "relu3"
type: "ReLU"
bottom: "conv3"
top: "conv3"
}
layer {
name: "pool3"
type: "Pooling"
bottom: "conv3"
top: "pool3"
pooling_param {
pool: AVE
kernel_size: 3
stride: 2
}
}
layer {
name: "ip1"
type: "FcRistretto"
bottom: "pool3"
top: "ip1"
param {
lr_mult: 1
}
param {
lr_mult: 2
}
inner_product_param {
num_output: 64
weight_filler {
type: "gaussian"
std: 0.1
}
bias_filler {
type: "constant"
}
}
quantization_param {
bw_layer_in: 8
bw_layer_out: 8
bw_params: 4
fl_layer_in: 2
fl_layer_out: 3
fl_params: 4
}
}
layer {
name: "ip2"
type: "FcRistretto"
bottom: "ip1"
top: "ip2"
param {
lr_mult: 1
}
param {
lr_mult: 2
}
inner_product_param {
num_output: 10
weight_filler {
type: "gaussian"
std: 0.1
}
bias_filler {
type: "constant"
}
}
quantization_param {
bw_layer_in: 8
bw_layer_out: 8
bw_params: 4
fl_layer_in: 3
fl_layer_out: 3
fl_params: 4
}
}
layer {
name: "accuracy"
type: "Accuracy"
bottom: "ip2"
bottom: "label"
top: "accuracy"
include {
phase: TEST
}
}
layer {
name: "loss"
type: "SoftmaxWithLoss"
bottom: "ip2"
bottom: "label"
top: "loss"
}
Testing the quantized network before fine-tuning (the 00_test_cifar_quick_quantized.sh script above, 200 iterations) gives:
I1224 17:24:28.137130 31540 caffe.cpp:304] Batch 199, accuracy = 0.76
I1224 17:24:28.137156 31540 caffe.cpp:304] Batch 199, loss = 0.73176
I1224 17:24:28.137163 31540 caffe.cpp:309] Loss: 0.865315
I1224 17:24:28.137190 31540 caffe.cpp:321] accuracy = 0.7099
I1224 17:24:28.137203 31540 caffe.cpp:321] loss = 0.865315 (* 1 = 0.865315 loss)
For comparison, the original 32-bit floating-point network scores:
sh examples/cifar10/00_test_cifar10_quick.sh
#!/usr/bin/env sh
./build/tools/caffe test \
--model=examples/cifar10/cifar10_quick_train_test.prototxt \
--weights=examples/cifar10/cifar10_quick_iter_5000.caffemodel.h5 \
--iterations=320
I1224 17:54:21.790515 1264 caffe.cpp:304] Batch 319, accuracy = 0.68
I1224 17:54:21.790540 1264 caffe.cpp:304] Batch 319, loss = 0.944417
I1224 17:54:21.790546 1264 caffe.cpp:309] Loss: 0.73186
I1224 17:54:21.790558 1264 caffe.cpp:321] accuracy = 0.758562
I1224 17:54:21.790570 1264 caffe.cpp:321] loss = 0.73186 (* 1 = 0.73186 loss)
Next, fine-tune the 8-bit dynamic fixed point network:
sh examples/ristretto/01_finetune_cifar10_quick.sh
#!/usr/bin/env sh
./build/tools/caffe train \
--solver=models/cifar10_quick/RistrettoDemo/cifar10_quick_solver_finetune.prototxt \
--weights=examples/cifar10/cifar10_quick_iter_5000.caffemodel.h5
The fine-tuning solver, models/cifar10_quick/RistrettoDemo/cifar10_quick_solver_finetune.prototxt:
# Ristretto cifar10_quick example
# Fine-tuning of 8-bit dynamic fixed point network
# The train/test net protocol buffer definition
# test_iter specifies how many forward passes the test should carry out.
test_iter: 2000
# Carry out testing every 100 training iterations.
test_interval: 100
# The base learning rate, momentum and the weight decay of the network.
base_lr: 0.000001
# Display every 100 iterations
display: 100
# The maximum number of iterations
max_iter: 2000
iter_size: 32 #global batch size = batch_size * iter_size
# The learning rate policy
lr_policy: "fixed"
momentum: 0.9
delta: 0.00000001
weight_decay: 0.0002
# snapshot intermediate results
snapshot: 100
snapshot_prefix: "models/cifar10_quick/RistrettoDemo/cifar10_quick"
# solver mode: CPU or GPU
solver_mode: CPU
random_seed: 42
net: "models/cifar10_quick/RistrettoDemo/quantized.prototxt"
average_loss: 40
test_initialization: true
solver_type: ADAM
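The iter_size setting accumulates gradients over several forward/backward passes before each weight update; with the batch size of 100 defined in quantized.prototxt, the effective batch works out to (my own arithmetic):
batch_size, iter_size = 100, 32
print(batch_size * iter_size)   # 3200 images per weight update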
After 2,000 fine-tuning iterations the quantized network reaches:
I1224 18:26:02.715363 29887 solver.cpp:418] Test net output #0: accuracy = 0.716696
I1224 18:26:02.715381 29887 solver.cpp:418] Test net output #1: loss = 0.814206 (* 1 = 0.814206 loss)
I1224 18:26:02.715385 29887 solver.cpp:336] Optimization Done.
I1224 18:26:02.715389 29887 caffe.cpp:250] Optimization Done.
Copy the fine-tuned snapshot to its final name and test it:
cp models/cifar10_quick/RistrettoDemo/cifar10_quick_iter_2000.caffemodel models/cifar10_quick/RistrettoDemo/cifar10_quick_finetuned.caffemodel
#!/usr/bin/env sh
./build/tools/caffe test \
--model=models/cifar10_quick/RistrettoDemo/quantized.prototxt \
--weights=models/cifar10_quick/RistrettoDemo/cifar10_quick_finetuned.caffemodel \
--gpu=0 --iterations=2000
For reference, here is how a darknet convolutional block maps onto Caffe layers.
# darknet .cfg file
[convolutional]
batch_normalize=1
filters=32
size=3
stride=1
pad=1
activation=leaky
This corresponds to the following Caffe prototxt (note: with batch_normalize=1, darknet adds its bias only inside the batch-norm step, so strictly the convolution should use bias_term: false, as in the corrected network earlier):
# Caffe prototxt
layer {
name: "conv1"
type: "Convolution"
bottom: "data"
top: "conv1"
convolution_param {
num_output: 32
pad: 1
kernel_size: 3
stride: 1
weight_filler {
type: "gaussian"
std: 0.0001
}
bias_filler {
type: "constant"
}
}
}
layer {
name: "batch_norm1"
type: "BatchNorm"
bottom: "conv1"
top: "conv1"
batch_norm_param {
use_global_stats: false
}
include {
phase: TRAIN
}
}
layer {
name: "batch_norm1"
type: "BatchNorm"
bottom: "conv1"
top: "conv1"
batch_norm_param {
use_global_stats: true
}
include {
phase: TEST
}
}
layer {
name: "scale1"
type: "Scale"
bottom: "conv1"
top: "conv1"
scale_param {
bias_term: true
}
}
layer {
name: "relu1"
type: "ReLU"
bottom: "conv1"
top: "conv1"
relu_param{
negative_slope: 0.1
}
}
use_global_stats: if true, the stored global mean and variance are used; otherwise new statistics are computed with a moving average. When this parameter is omitted, it defaults to true in the TEST phase and false in the TRAIN phase.