Weight Compression: Ternary Weight Networks
Background
Convolutional neural networks (CNNs) are especially well suited to computer-vision tasks such as object recognition, classification, detection, and image segmentation. Typical models have millions of parameters and are computationally heavy; AlexNet, for example, has 61 million parameters (the floating-point weights alone occupy 249 MB of storage) and needs 1.5 billion high-precision operations to classify a single image. To shrink CNNs and reduce their resource usage, two approaches are common: model pruning (removing weights with small magnitudes) and weight compression (quantizing weights down to a few bits).
Weight Compression: Binary Neural Networks
Ternary Weight Networks (TWN)
Key idea: the weights used in the network are constrained to the three values {+1, 0, -1}, scaled by a positive factor, and are chosen to minimize the Euclidean distance between the full-precision weights $W$ and the scaled ternary weights $\alpha W^t$.
Principle:
- Original problem: constrain the weights used in propagation to ternary values, approximating the full-precision weights $W$ with scaled ternary weights $\alpha W^t$. The objective is

$$\alpha^*, W^{t*} = \arg\min_{\alpha \ge 0,\; W^t_i \in \{-1,0,+1\}} J(\alpha, W^t) = \lVert W - \alpha W^t \rVert_2^2 \tag{1}$$
Propagation:
- Use a threshold $\Delta$ to map each weight to $\{+1, 0, -1\}$:

$$W^t_i = \begin{cases} +1, & W_i > \Delta \\ 0, & |W_i| \le \Delta \\ -1, & W_i < -\Delta \end{cases}$$

and look for a threshold $\Delta$ and scaling factor $\alpha$ that fit problem (1). Determining the weights from $\Delta$ and $\alpha$ greatly reduces the search; moreover, with weights in $\{+1, -1, 0\}$ the multiplications of a forward pass turn into additions and subtractions (multiplying by $\pm 1$ is just adding or subtracting the operand itself). Note that both $\Delta$ and $\alpha$ are positive.
With the ternarization rule above, the optimization problem of Eq. (1) reduces to

$$\alpha^*, \Delta^* = \arg\min_{\alpha \ge 0,\, \Delta > 0}\; |I_\Delta|\,\alpha^2 - 2\Big(\sum_{i \in I_\Delta} |W_i|\Big)\alpha + c$$

where $I_\Delta = \{\, i : |W_i| > \Delta \,\}$, $|I_\Delta|$ is its cardinality, and $c = \sum_i W_i^2$ is a constant. For a fixed $\Delta$ this is a quadratic in $\alpha$, which gives the solution:

$$\alpha^*_\Delta = \frac{1}{|I_\Delta|} \sum_{i \in I_\Delta} |W_i|, \qquad \Delta^* = \arg\max_{\Delta > 0} \frac{1}{|I_\Delta|}\Big(\sum_{i \in I_\Delta} |W_i|\Big)^2.$$

In other words, the optimal scale is the mean absolute value of the weights above the threshold, and $\Delta^*$ maximizes the squared above-threshold mass per retained weight; an exhaustive search over candidate thresholds, as sketched below, solves this exactly.
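Since $\Delta^*$ has no closed form for an arbitrary weight tensor, it can be found exactly by sorting the absolute weights and scoring each candidate threshold. A minimal standalone C++ sketch (illustrative code, not from the paper or its repo; solve_ternary is a made-up name):

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

// Exact solution of Eq. (1): test a candidate threshold between each pair of
// consecutive sorted |W_i| values and keep the one maximizing
// (sum over I_Delta of |W_i|)^2 / |I_Delta|.
void solve_ternary(const std::vector<double>& W, double* delta, double* alpha) {
  std::vector<double> a(W.size());
  for (size_t i = 0; i < W.size(); ++i) a[i] = std::fabs(W[i]);
  std::sort(a.begin(), a.end());  // ascending absolute values

  // suf[k] = sum of the (n - k) largest absolute values
  std::vector<double> suf(a.size() + 1, 0.0);
  for (int k = (int)a.size() - 1; k >= 0; --k) suf[k] = suf[k + 1] + a[k];

  double best = -1.0;
  for (size_t k = 0; k < a.size(); ++k) {
    const size_t n_above = a.size() - k;             // |I_Delta| for Delta just below a[k]
    const double score = suf[k] * suf[k] / n_above;  // objective to maximize
    if (score > best) {
      best   = score;
      *delta = (k == 0) ? 0.0 : 0.5 * (a[k - 1] + a[k]);  // any Delta in [a[k-1], a[k])
      *alpha = suf[k] / n_above;                          // mean |W_i| over I_Delta
    }
  }
}

int main() {
  const std::vector<double> W = {0.6, -1.0, 0.2, -0.4};
  double delta = 0, alpha = 0;
  solve_ternary(W, &delta, &alpha);
  std::printf("Delta* = %.3f, alpha* = %.3f\n", delta, alpha);  // 0.300, 0.667
}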
Finally, a threshold $\Delta^*$ and scale $\alpha^*$ are fixed to build the ternary network. $\Delta^*$ depends on how $W$ is distributed:
Uniform distribution: if the $W_i$ are uniform on $[-a, a]$ and $\Delta \in (0, a]$, then $\sum_{i \in I_\Delta} |W_i|$ is approximately the conditional mean of the above-threshold weights ($\tfrac{a+\Delta}{2}$) times their count ($|I_\Delta|$); substituting into the objective and maximizing gives $\Delta^* = a/3$.
Normal distribution: for $W_i \sim N(0, \sigma^2)$ the same computation gives $\Delta^* \approx 0.6\sigma$.
In practice the paper's authors adopt an empirical rule that ties the threshold of the ternary network to the expectation of $|W|$:
Uniform distribution: $\Delta^* = a/3 = \tfrac{2}{3}E(|W|)$ (note: for $W$ uniform on $[-a, a]$, $E(|W|) = a/2$).
Normal distribution: $\Delta^* \approx 0.6\sigma = 0.75\,E(|W|)$ (note: for $W \sim N(0, \sigma^2)$, $E(|W|) = \sqrt{2/\pi}\,\sigma \approx 0.8\sigma$).
Both cases are summarized by the single rule $\Delta^* \approx 0.7\,E(|W|) \approx \tfrac{0.7}{n}\sum_i |W_i|$, which is what the released code uses (see set_delta below).
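As a quick worked example (illustrative numbers): for $W = (0.6, -1.0, 0.2, -0.4)$ we get $E(|W|) = 0.55$ and $\Delta \approx 0.7 \times 0.55 = 0.385$, so $W^t = (+1, -1, 0, -1)$ and $\alpha = (0.6 + 1.0 + 0.4)/3 \approx 0.667$, which is the same solution the exact search above finds.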
Algorithm:
Ternarization is applied only in the forward and backward passes; the weight update itself still acts on the continuous, full-precision weights.
Optimization: stochastic gradient descent (SGD with momentum), batch normalization, and learning-rate decay.
Momentum: $v_{t+1} = \mu v_t - \eta \nabla L(W_t), \quad W_{t+1} = W_t + v_{t+1}$ (the standard momentum update, with momentum coefficient $\mu$ and learning rate $\eta$); a minimal training-step sketch follows.
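A minimal sketch of one training step under these rules (illustrative C++, not the paper's or the repo's code; ternarize uses the 0.7 rule of thumb, and the network is stood in for by a dummy quadratic loss):

#include <cmath>
#include <cstdio>
#include <vector>

// Ternarize with the rule of thumb: Delta = 0.7 * mean|W|,
// alpha = mean of |W_i| over the above-threshold set.
std::vector<double> ternarize(const std::vector<double>& W) {
  double mean_abs = 0;
  for (double w : W) mean_abs += std::fabs(w);
  mean_abs /= W.size();
  const double delta = 0.7 * mean_abs;
  double sum = 0; int cnt = 0;
  for (double w : W) if (std::fabs(w) > delta) { sum += std::fabs(w); ++cnt; }
  const double alpha = cnt ? sum / cnt : 0;
  std::vector<double> Wt(W.size());
  for (size_t i = 0; i < W.size(); ++i)
    Wt[i] = alpha * ((W[i] > delta) - (W[i] < -delta));
  return Wt;
}

// One SGD+momentum step: the gradient is taken at the ternarized weights,
// but momentum and the update act on the full-precision weights.
template <class GradFn>
void train_step(std::vector<double>& W, std::vector<double>& v,
                double mu, double eta, GradFn grad_at) {
  const std::vector<double> grad = grad_at(ternarize(W));
  for (size_t i = 0; i < W.size(); ++i) {
    v[i] = mu * v[i] - eta * grad[i];
    W[i] += v[i];
  }
}

int main() {
  // Dummy quadratic loss L = 0.5 * ||Wt - target||^2 standing in for a network.
  const std::vector<double> target = {0.5, -0.5};
  auto grad_at = [&](const std::vector<double>& Wt) {
    std::vector<double> g(Wt.size());
    for (size_t i = 0; i < Wt.size(); ++i) g[i] = Wt[i] - target[i];
    return g;
  };
  std::vector<double> W = {0.9, -0.1}, v = {0, 0};
  for (int step = 0; step < 3; ++step) train_step(W, v, 0.9, 0.1, grad_at);
  std::printf("W = (%.3f, %.3f)\n", W[0], W[1]);
}

The point to notice is that the gradient is evaluated at the ternary weights while momentum and the update accumulate on the latent full-precision weights, so small gradients are not erased by quantization.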
Experiments:
The experiments compare binary, ternary, and full-precision networks on MNIST, CIFAR-10, and ImageNet, using three architectures (LeNet-5, VGG-7, and ResNet-18(B)).
Results:
On these benchmarks TWNs perform only slightly worse than full-precision networks, while achieving a model compression ratio of 16x to 32x (ternary weights need at most 2 bits, vs. 32-bit floats).
Source code:
The forward/backward changes are in conv_layer.cpp, lines 30-100 (Forward_cpu / Backward_cpu). When binary or ternary mode is enabled, the layer reads the quantized copy of the weight blob instead of the full-precision data:

const Dtype* weight = (BINARY || TERNARY) ? this->blobs_[0]->cpu_binary() : this->blobs_[0]->cpu_data();
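For context, a condensed sketch of the surrounding function in stock Caffe (details vary across Caffe versions; the TWN patch touches only the weight line):

template <typename Dtype>
void ConvolutionLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
    const vector<Blob<Dtype>*>& top) {
  // TWN change: pick the ternarized weight buffer when enabled.
  const Dtype* weight = (BINARY || TERNARY) ?
      this->blobs_[0]->cpu_binary() : this->blobs_[0]->cpu_data();
  for (int i = 0; i < bottom.size(); ++i) {
    const Dtype* bottom_data = bottom[i]->cpu_data();
    Dtype* top_data = top[i]->mutable_cpu_data();
    for (int n = 0; n < this->num_; ++n) {
      // im2col + GEMM; the weights enter the convolution only via `weight`,
      // so swapping the pointer is enough to ternarize the whole layer.
      this->forward_cpu_gemm(bottom_data + n * this->bottom_dim_, weight,
          top_data + n * this->top_dim_);
      if (this->bias_term_) {
        const Dtype* bias = this->blobs_[1]->cpu_data();
        this->forward_cpu_bias(top_data + n * this->top_dim_, bias);
      }
    }
  }
}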
caffe_cpu_ternary:

// Ternarize the weights to {+1, 0, -1}
// math_functions.cpp line 294
template<>
void caffe_cpu_ternary<double>(const int N, const double delta, const double* X, double* Y){
  for(int i = 0; i < N; i++){
    const double x = X[i];
    Y[i] = (x > delta) - (x < -delta);
  }
}
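For example (hypothetical driver code, reusing the numbers from the worked example above):

double X[4] = {0.6, -1.0, 0.2, -0.4};
double Y[4];
caffe_cpu_ternary<double>(4, 0.385, X, Y);  // Y = {1, -1, 0, -1}
// The scaling by alpha is applied afterwards, in Blob::ternarize_data().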
delta:

// Set delta to 0.7 x the current mean absolute weight, clamped to [-100, 100]
template <typename Dtype>
void Blob<Dtype>::set_delta(){
  float scale_factor = TERNARY_DELTA * 1.0 / 10;  // TERNARY_DELTA = 7 -> scale_factor = 0.7
  Dtype delta = (Dtype) scale_factor * this->asum_data() / this->count();  // 0.7 * E(|W|)
  delta = (delta <= 100) ? delta : 100;
  delta = (delta >= -100) ? delta : -100;
  this->delta_ = delta;
}
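Note that asum_data() / count() is exactly the mean absolute value $E(|W|)$, so this reproduces the paper's rule $\Delta^* \approx 0.7\,E(|W|)$. The clamp to $[-100, 100]$ is just a safety bound; the lower clamp can never fire, since the mean absolute value is non-negative.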
ternarize_data:

// blob.cpp line 146
// On each weight update, quantize the weights to ternary values
// revised 2016-3-21
template <typename Dtype>
void Blob<Dtype>::ternarize_data(Phase phase){
if(phase == RUN){
// if(DEBUG) print_head();
//LOG(INFO) << "RUN phase...";
// caffe_sleep(3);
return; // do nothing for the running phase
}else if(phase == TRAIN){
//LOG(INFO) << "TRAIN phase ...";
// caffe_sleep(3);
}else{
//LOG(INFO) << "TEST phase ...";
// caffe_sleep(3);
}
// const Dtype delta = 0; // default value;
// const Dtype delta = (Dtype) 0.8 * this->asum_data() / this->count();
this->set_delta();
const Dtype delta = this->get_delta();
Dtype alpha = 1;
if (!data_) { return; }
switch (data_->head()) {
case SyncedMemory::HEAD_AT_CPU:
{
caffe_cpu_ternary(this->count(), delta, this->cpu_data(), this->mutable_cpu_binary());
// alpha = (W . W^t) / (W^t . W^t); since W^t is in {-1, 0, +1}, this equals the
// mean of |W_i| over the above-threshold set, i.e. the optimal scale alpha*_delta.
alpha = caffe_cpu_dot(this->count(), this->cpu_binary(), this->cpu_data());
alpha /= caffe_cpu_dot(this->count(), this->cpu_binary(), this->cpu_binary());
// Overwrite the binary buffer with alpha * W^t, ready for the conv layers.
caffe_cpu_scale(this->count(), alpha, this->cpu_binary(), this->mutable_cpu_binary());
// this->set_alpha(alpha);
}
return;
case SyncedMemory::HEAD_AT_GPU:
case SyncedMemory::SYNCED:
#ifndef CPU_ONLY
{
caffe_gpu_ternary(this->count(), delta, this->gpu_data(), this->mutable_gpu_binary());
// Same alpha computation as the CPU path, with a 1e-6 guard against an
// all-zero ternary tensor.
Dtype* pa = new Dtype(0);
caffe_gpu_dot(this->count(), this->gpu_binary(), this->gpu_data(), pa);
Dtype* pb = new Dtype(0);
caffe_gpu_dot(this->count(), this->gpu_binary(), this->gpu_binary(), pb);
alpha = (*pa) / ((*pb) + 1e-6);
delete pa;  // free the scalar temporaries
delete pb;
this->set_alpha(alpha);
caffe_gpu_scale(this->count(), alpha, this->gpu_binary(), this->mutable_gpu_binary());
// this->set_alpha((Dtype)1);
// LOG(INFO) << "alpha = " << alpha;
// caffe_sleep(3);
}
return;
#else
NO_GPU;
#endif
case SyncedMemory::UNINITIALIZED:
return;
default:
LOG(FATAL) << "Unknown SyncedMemory head state: " << data_->head();
}
}
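quantize_data:
(quantize_data, acmean_data, and scale_binary below are companion utilities shipped in the same patch; judging from the conv-layer change above, only ternarize_data sits on the TWN forward/backward path.)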
// Implemented 2016-3-16
template <typename Dtype>
void Blob<Dtype>::quantize_data(const Dtype left, const Dtype right){
if (!data_) { return; }
switch (data_->head()) {
case SyncedMemory::HEAD_AT_CPU:
// cpu codes
if(DEBUG){
LOG(INFO) << "CPU codes.";
caffe_sleep(3);
}
caffe_quantize(this->count(), left, right, this->cpu_data(), this->mutable_cpu_quantum());
return;
case SyncedMemory::HEAD_AT_GPU:
case SyncedMemory::SYNCED:
#ifndef CPU_ONLY
// gpu codes
if(DEBUG){
LOG(INFO) << "GPU codes.";
caffe_sleep(3);
}
caffe_gpu_quantize(this->count(), left, right, this->gpu_data(), this->mutable_gpu_quantum());
// DEBUG caffe_gpu_quantize
if(DEBUG){
std::cout << "--------------- data ----------------" << std::endl;
for(int i=0; i<10; i++){
std::cout << this->cpu_data()[i] << " ";
}
std::cout << std::endl;
std::cout << "-------------- quantum ----------------" << std::endl;
for(int i=0; i<10; i++){
std::cout << this->cpu_quantum()[i] << " ";
}
std::cout << std::endl;
caffe_sleep(10);
}
return;
#else
NO_GPU;
#endif
case SyncedMemory::UNINITIALIZED:
return;
default:
LOG(FATAL) << "Unknown SyncedMemory head state: " << data_->head();
}
}
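acmean_data:
(Judging from the code, acmean_data fills cpu_acmean() with the mean absolute value of each outer slice of the blob; only the GPU branch has a body.)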
// Implemented @ 2016-3-18
template <typename Dtype>
void Blob<Dtype>::acmean_data(){
if (!data_) { return; }
switch (data_->head()) {
case SyncedMemory::HEAD_AT_CPU:
// CPU codes
return;
case SyncedMemory::HEAD_AT_GPU:
case SyncedMemory::SYNCED:
#ifndef CPU_ONLY
// GPU codes
{
  Dtype* X = mutable_cpu_acmean();
  // LOG(INFO) << "shape.size() = " << shape().size();
  const int n = this->count() / this->shape(0);  // elements per outer slice
  for(int i = 0; i < this->shape(0); i++){
    // caffe_gpu_asum(n, this->gpu_data() + offset(i), &X[i]);
    // NOTE: can't take this form of operation on GPU!
    X[i] = caffe_cpu_asum(n, this->cpu_data() + offset(i)) / n;
    // X[i] /= n;
  }
}
return;
#else
NO_GPU;
#endif
case SyncedMemory::UNINITIALIZED:
return;
default:
LOG(FATAL) << "Unknown SyncedMemory head state: " << data_->head();
}
}
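scale_binary:
(Judging from the commented-out body, scale_binary was meant to rescale each slice of the binary weights by the corresponding per-slice mean from acmean_data; in this revision both branches are effectively no-ops.)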
// Implemented @ 2016-3-18
template <typename Dtype>
void Blob<Dtype>::scale_binary(){
if (!data_) { return; }
switch (data_->head()) {
case SyncedMemory::HEAD_AT_CPU:
// CPU codes
return;
case SyncedMemory::HEAD_AT_GPU:
case SyncedMemory::SYNCED:
#ifndef CPU_ONLY
// GPU codes
/*
for(int i = 0; i < this->shape(0); i++){
  caffe_gpu_scal<Dtype>(n, this->gpu_acmean()[i], this->mutable_gpu_binary() + offset(i));
}
*/
return;
#else
NO_GPU;
#endif
case SyncedMemory::UNINITIALIZED:
return;
default:
LOG(FATAL) << "Unknown SyncedMemory head state: " << data_->head();
}
}