There is a good blog post describing gradient descent methods, and I introduced the solver optimization methods in an earlier post.
This post covers the five solver implementations in the new version of Caffe:
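All five are registered via REGISTER_SOLVER_CLASS under the names you will see at the bottom of each file, so you pick one through the type field of solver.prototxt. A minimal illustrative snippet (the numeric values are placeholders, not recommendations):

# solver.prototxt
type: "Adam"         # or "AdaGrad", "AdaDelta", "Nesterov", "RMSProp"
base_lr: 0.001
momentum: 0.9        # beta1 for Adam; the usual momentum elsewhere
momentum2: 0.999     # beta2 (Adam only)
delta: 1e-8          # numerical-stability term used by these solvers
# rms_decay: 0.98    # RMSProp only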
adagrad_solver.cpp:
#include <vector>

#include "caffe/sgd_solvers.hpp"

namespace caffe {

#ifndef CPU_ONLY
template <typename Dtype>
void adagrad_update_gpu(int N, Dtype* g, Dtype* h, Dtype delta,
    Dtype local_rate);
#endif

template <typename Dtype>
void AdaGradSolver<Dtype>::ComputeUpdateValue(int param_id, Dtype rate) {
  CHECK(Caffe::root_solver());
  const vector<Blob<Dtype>*>& net_params = this->net_->learnable_params();
  const vector<float>& net_params_lr = this->net_->params_lr();
  Dtype delta = this->param_.delta();
  Dtype local_rate = rate * net_params_lr[param_id];
  // The update is built from the primitives in math_functions; the new
  // version of Caffe differs slightly from the old one here, but the
  // functions themselves are much the same.
  switch (Caffe::mode()) {
  case Caffe::CPU: {
    // compute square of gradient in update
    caffe_powx(net_params[param_id]->count(),
        net_params[param_id]->cpu_diff(), Dtype(2),
        this->update_[param_id]->mutable_cpu_data());  // update[i] = diff[i]^2
    // update history
    caffe_add(net_params[param_id]->count(),
        this->update_[param_id]->cpu_data(),
        this->history_[param_id]->cpu_data(),
        this->history_[param_id]->mutable_cpu_data());  // history[i] += update[i]
    // prepare update
    caffe_powx(net_params[param_id]->count(),
        this->history_[param_id]->cpu_data(), Dtype(0.5),
        this->update_[param_id]->mutable_cpu_data());  // update[i] = sqrt(history[i])
    caffe_add_scalar(net_params[param_id]->count(),
        delta, this->update_[param_id]->mutable_cpu_data());  // update[i] += delta
    caffe_div(net_params[param_id]->count(),
        net_params[param_id]->cpu_diff(),
        this->update_[param_id]->cpu_data(),
        this->update_[param_id]->mutable_cpu_data());  // update[i] = diff[i] / update[i]
    // scale and copy: Y = alpha*X + beta*Y, so diff[i] = local_rate * update[i]
    caffe_cpu_axpby(net_params[param_id]->count(), local_rate,
        this->update_[param_id]->cpu_data(), Dtype(0),
        net_params[param_id]->mutable_cpu_diff());
    break;
  }
  case Caffe::GPU: {
#ifndef CPU_ONLY
    adagrad_update_gpu(net_params[param_id]->count(),
        net_params[param_id]->mutable_gpu_diff(),
        this->history_[param_id]->mutable_gpu_data(), delta, local_rate);
#else
    NO_GPU;
#endif
    break;
  }
  default:
    LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode();
  }
}

INSTANTIATE_CLASS(AdaGradSolver);
REGISTER_SOLVER_CLASS(AdaGrad);

}  // namespace caffe
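Taken together, the CPU branch is just the per-element AdaGrad rule. A minimal standalone sketch (plain C++ with my own names, not Caffe code) of what one step does; the later weight update w -= diff then applies the result:

#include <cmath>
#include <cstddef>

// AdaGrad: accumulate squared gradients, scale the step by their root.
// g is the gradient (Caffe's diff), rewritten in place; h is the history.
void adagrad_step(float* g, float* h, std::size_t n,
                  float local_rate, float delta) {
  for (std::size_t i = 0; i < n; ++i) {
    h[i] += g[i] * g[i];                                   // history += diff^2
    g[i] = local_rate * g[i] / (std::sqrt(h[i]) + delta);  // scaled step
  }
}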
adadelta_solver.cpp:
#include <vector>

#include "caffe/sgd_solvers.hpp"

namespace caffe {

template <typename Dtype>
void AdaDeltaSolver<Dtype>::AdaDeltaPreSolve() {
  // Add the extra history entries for AdaDelta after those from
  // SGDSolver::PreSolve
  const vector<Blob<Dtype>*>& net_params = this->net_->learnable_params();
  for (int i = 0; i < net_params.size(); ++i) {
    const vector<int>& shape = net_params[i]->shape();
    this->history_.push_back(
        shared_ptr<Blob<Dtype> >(new Blob<Dtype>(shape)));
  }
}

#ifndef CPU_ONLY
template <typename Dtype>
void adadelta_update_gpu(int N, Dtype* g, Dtype* h, Dtype* h2, Dtype momentum,
    Dtype delta, Dtype local_rate);
#endif

template <typename Dtype>
void AdaDeltaSolver<Dtype>::ComputeUpdateValue(int param_id, Dtype rate) {
  const vector<Blob<Dtype>*>& net_params = this->net_->learnable_params();
  const vector<float>& net_params_lr = this->net_->params_lr();
  Dtype delta = this->param_.delta();
  Dtype momentum = this->param_.momentum();
  Dtype local_rate = rate * net_params_lr[param_id];
  size_t update_history_offset = net_params.size();
  switch (Caffe::mode()) {
  case Caffe::CPU: {
    caffe_powx(net_params[param_id]->count(),
        net_params[param_id]->cpu_diff(), Dtype(2),
        this->update_[param_id]->mutable_cpu_data());  // update[i] = diff[i]^2
    // history[i] = (1 - momentum) * update[i] + momentum * history[i]
    caffe_cpu_axpby(net_params[param_id]->count(), Dtype(1) - momentum,
        this->update_[param_id]->cpu_data(), momentum,
        this->history_[param_id]->mutable_cpu_data());
    caffe_set(net_params[param_id]->count(), delta,
        this->temp_[param_id]->mutable_cpu_data());  // temp[i] = delta
    // update[i] = delta + history2[i] (the squared-update history)
    caffe_add(net_params[param_id]->count(),
        this->temp_[param_id]->cpu_data(),
        this->history_[update_history_offset + param_id]->cpu_data(),
        this->update_[param_id]->mutable_cpu_data());
    caffe_add(net_params[param_id]->count(),
        this->temp_[param_id]->cpu_data(),
        this->history_[param_id]->cpu_data(),
        this->temp_[param_id]->mutable_cpu_data());  // temp[i] = delta + history[i]
    caffe_div(net_params[param_id]->count(),
        this->update_[param_id]->cpu_data(),
        this->temp_[param_id]->cpu_data(),
        this->update_[param_id]->mutable_cpu_data());  // update[i] = update[i] / temp[i]
    caffe_powx(net_params[param_id]->count(),
        this->update_[param_id]->cpu_data(), Dtype(0.5),
        this->update_[param_id]->mutable_cpu_data());  // update[i] = sqrt(update[i])
    caffe_mul(net_params[param_id]->count(),
        net_params[param_id]->cpu_diff(),
        this->update_[param_id]->cpu_data(),
        net_params[param_id]->mutable_cpu_diff());  // diff[i] = diff[i] * update[i]
    caffe_powx(net_params[param_id]->count(),
        net_params[param_id]->cpu_diff(), Dtype(2),
        this->update_[param_id]->mutable_cpu_data());  // update[i] = diff[i]^2
    // history2[i] = (1 - momentum) * update[i] + momentum * history2[i]
    caffe_cpu_axpby(net_params[param_id]->count(), Dtype(1) - momentum,
        this->update_[param_id]->cpu_data(), momentum,
        this->history_[update_history_offset + param_id]->mutable_cpu_data());
    caffe_cpu_scale(net_params[param_id]->count(), local_rate,
        net_params[param_id]->cpu_diff(),
        net_params[param_id]->mutable_cpu_diff());  // diff[i] = local_rate * diff[i]
    break;
  }
  case Caffe::GPU: {
#ifndef CPU_ONLY
    adadelta_update_gpu(net_params[param_id]->count(),
        net_params[param_id]->mutable_gpu_diff(),
        this->history_[param_id]->mutable_gpu_data(),
        this->history_[update_history_offset + param_id]->mutable_gpu_data(),
        momentum, delta, local_rate);
#else
    NO_GPU;
#endif
    break;
  }
  default:
    LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode();
  }
}

INSTANTIATE_CLASS(AdaDeltaSolver);
REGISTER_SOLVER_CLASS(AdaDelta);

}  // namespace caffe
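Chaining those calls gives the per-element AdaDelta rule. A minimal standalone sketch (my own names, not Caffe code); again the later weight update w -= g applies the result:

#include <cmath>
#include <cstddef>

// AdaDelta: h holds the decayed squared gradients, h2 the decayed squared
// updates; momentum is the decay rate and delta the stability term.
void adadelta_step(float* g, float* h, float* h2, std::size_t n,
                   float momentum, float delta, float local_rate) {
  for (std::size_t i = 0; i < n; ++i) {
    h[i] = (1 - momentum) * g[i] * g[i] + momentum * h[i];
    float step = g[i] * std::sqrt((h2[i] + delta) / (h[i] + delta));
    h2[i] = (1 - momentum) * step * step + momentum * h2[i];
    g[i] = local_rate * step;  // Caffe still scales by the learning rate
  }
}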
adam_solver.cpp:
The implementation follows the Adam update rule:
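For reference, the update the code below computes is the standard Adam rule, with the bias correction folded into the step size:

m_t = beta1 * m_{t-1} + (1 - beta1) * g_t
v_t = beta2 * v_{t-1} + (1 - beta2) * g_t^2
w_t = w_{t-1} - local_rate * (sqrt(1 - beta2^t) / (1 - beta1^t)) * m_t / (sqrt(v_t) + eps_hat)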
#include <vector>

#include "caffe/sgd_solvers.hpp"

namespace caffe {

template <typename Dtype>
void AdamSolver<Dtype>::AdamPreSolve() {
  // Add the extra history entries for Adam after those from
  // SGDSolver::PreSolve
  const vector<Blob<Dtype>*>& net_params = this->net_->learnable_params();
  for (int i = 0; i < net_params.size(); ++i) {
    const vector<int>& shape = net_params[i]->shape();
    this->history_.push_back(
        shared_ptr<Blob<Dtype> >(new Blob<Dtype>(shape)));
  }
}

#ifndef CPU_ONLY
template <typename Dtype>
void adam_update_gpu(int N, Dtype* g, Dtype* m, Dtype* v, Dtype beta1,
    Dtype beta2, Dtype eps_hat, Dtype corrected_local_rate);
#endif

template <typename Dtype>
void AdamSolver<Dtype>::ComputeUpdateValue(int param_id, Dtype rate) {
  const vector<Blob<Dtype>*>& net_params = this->net_->learnable_params();
  const vector<float>& net_params_lr = this->net_->params_lr();
  Dtype local_rate = rate * net_params_lr[param_id];
  const Dtype beta1 = this->param_.momentum();
  const Dtype beta2 = this->param_.momentum2();

  // we create aliases for convenience
  size_t update_history_offset = net_params.size();
  Blob<Dtype>* val_m = this->history_[param_id].get();
  Blob<Dtype>* val_v = this->history_[param_id + update_history_offset].get();
  Blob<Dtype>* val_t = this->temp_[param_id].get();

  const int t = this->iter_ + 1;
  // correction = sqrt(1 - beta2^t) / (1 - beta1^t)
  const Dtype correction = std::sqrt(Dtype(1) - pow(beta2, t)) /
      (Dtype(1.) - pow(beta1, t));
  const int N = net_params[param_id]->count();
  const Dtype eps_hat = this->param_.delta();

  switch (Caffe::mode()) {
  case Caffe::CPU: {
    // update m <- beta_1 * m_{t-1} + (1 - beta_1) * g_t
    caffe_cpu_axpby(N, Dtype(1) - beta1,
        net_params[param_id]->cpu_diff(), beta1,
        val_m->mutable_cpu_data());
    // update v <- beta_2 * v_{t-1} + (1 - beta_2) * g_t^2
    caffe_mul(N,
        net_params[param_id]->cpu_diff(),
        net_params[param_id]->cpu_diff(),
        val_t->mutable_cpu_data());
    caffe_cpu_axpby(N, Dtype(1) - beta2,
        val_t->cpu_data(), beta2,
        val_v->mutable_cpu_data());
    // set update
    // t[i] = sqrt(v[i])
    caffe_powx(N, val_v->cpu_data(), Dtype(0.5), val_t->mutable_cpu_data());
    // t[i] += eps_hat
    caffe_add_scalar(N, eps_hat, val_t->mutable_cpu_data());
    // t[i] = m[i] / t[i]
    caffe_div(N, val_m->cpu_data(), val_t->cpu_data(),
        val_t->mutable_cpu_data());
    // diff[i] = local_rate * correction * t[i]
    caffe_cpu_scale(N, local_rate * correction, val_t->cpu_data(),
        net_params[param_id]->mutable_cpu_diff());
    break;
  }
  case Caffe::GPU: {
#ifndef CPU_ONLY
    adam_update_gpu(N, net_params[param_id]->mutable_gpu_diff(),
        val_m->mutable_gpu_data(), val_v->mutable_gpu_data(), beta1, beta2,
        eps_hat, local_rate * correction);
#else
    NO_GPU;
#endif
    break;
  }
  default:
    LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode();
  }
}

INSTANTIATE_CLASS(AdamSolver);
REGISTER_SOLVER_CLASS(Adam);

}  // namespace caffe
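The same thing as a minimal standalone per-element sketch (my own names, not Caffe code):

#include <cmath>
#include <cstddef>

// Adam at iteration t (1-based): m and v are the first/second moment
// histories; g is rewritten in place so the later update w -= g applies it.
void adam_step(float* g, float* m, float* v, std::size_t n, int t,
               float beta1, float beta2, float eps_hat, float local_rate) {
  const float correction =
      std::sqrt(1.0f - std::pow(beta2, t)) / (1.0f - std::pow(beta1, t));
  for (std::size_t i = 0; i < n; ++i) {
    m[i] = beta1 * m[i] + (1 - beta1) * g[i];
    v[i] = beta2 * v[i] + (1 - beta2) * g[i] * g[i];
    g[i] = local_rate * correction * m[i] / (std::sqrt(v[i]) + eps_hat);
  }
}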
nesterov_solver.cpp:
For this solver there is a good illustrated explanation on Zhihu: here
Implementation:
#include <vector>

#include "caffe/sgd_solvers.hpp"

namespace caffe {

#ifndef CPU_ONLY
template <typename Dtype>
void nesterov_update_gpu(int N, Dtype* g, Dtype* h, Dtype momentum,
    Dtype local_rate);
#endif

template <typename Dtype>
void NesterovSolver<Dtype>::ComputeUpdateValue(int param_id, Dtype rate) {
  CHECK(Caffe::root_solver());
  const vector<Blob<Dtype>*>& net_params = this->net_->learnable_params();
  const vector<float>& net_params_lr = this->net_->params_lr();
  Dtype momentum = this->param_.momentum();
  Dtype local_rate = rate * net_params_lr[param_id];
  switch (Caffe::mode()) {
  case Caffe::CPU: {
    // save history momentum for stepping back
    // Nesterov first takes the momentum step and then corrects it, so the
    // current history must be saved. Momentum itself can be understood
    // physically: each descent step carries some velocity forward.
    caffe_copy(net_params[param_id]->count(),
        this->history_[param_id]->cpu_data(),
        this->update_[param_id]->mutable_cpu_data());
    // update history
    // The algorithm is not complicated, so the code is fairly short:
    // history[i] = local_rate * diff[i] + momentum * history[i]
    caffe_cpu_axpby(net_params[param_id]->count(), local_rate,
        net_params[param_id]->cpu_diff(), momentum,
        this->history_[param_id]->mutable_cpu_data());
    // compute update: step back then over step
    // update[i] = (1 + momentum) * history[i] - momentum * update[i]
    caffe_cpu_axpby(net_params[param_id]->count(), Dtype(1) + momentum,
        this->history_[param_id]->cpu_data(), -momentum,
        this->update_[param_id]->mutable_cpu_data());
    // copy
    caffe_copy(net_params[param_id]->count(),
        this->update_[param_id]->cpu_data(),
        net_params[param_id]->mutable_cpu_diff());
    break;
  }
  case Caffe::GPU: {
#ifndef CPU_ONLY
    nesterov_update_gpu(net_params[param_id]->count(),
        net_params[param_id]->mutable_gpu_diff(),
        this->history_[param_id]->mutable_gpu_data(),
        momentum, local_rate);
#else
    NO_GPU;
#endif
    break;
  }
  default:
    LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode();
  }
}

INSTANTIATE_CLASS(NesterovSolver);
REGISTER_SOLVER_CLASS(Nesterov);

}  // namespace caffe
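As a minimal standalone per-element sketch (my own names, not Caffe code) of the save / update / step-back-then-over-step sequence:

#include <cstddef>

// Nesterov momentum: h is the momentum history; g is rewritten in place
// so the later weight update w -= g applies the step.
void nesterov_step(float* g, float* h, std::size_t n,
                   float momentum, float local_rate) {
  for (std::size_t i = 0; i < n; ++i) {
    float h_prev = h[i];                               // save history
    h[i] = local_rate * g[i] + momentum * h[i];        // update history
    g[i] = (1 + momentum) * h[i] - momentum * h_prev;  // step back, then over-step
  }
}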
rmsprop_solver.cpp:
The official site has an introduction: here
A derivation of the RMSProp algorithm: here
Implementation:
#include <vector>

#include "caffe/sgd_solvers.hpp"

namespace caffe {

#ifndef CPU_ONLY
template <typename Dtype>
void rmsprop_update_gpu(int N, Dtype* g, Dtype* h, Dtype rms_decay,
    Dtype delta, Dtype local_rate);
#endif

template <typename Dtype>
void RMSPropSolver<Dtype>::ComputeUpdateValue(int param_id, Dtype rate) {
  const vector<Blob<Dtype>*>& net_params = this->net_->learnable_params();
  const vector<float>& net_params_lr = this->net_->params_lr();

  // get the learning rate
  Dtype delta = this->param_.delta();
  Dtype rms_decay = this->param_.rms_decay();
  Dtype local_rate = rate * net_params_lr[param_id];

  switch (Caffe::mode()) {
  case Caffe::CPU:
    // compute square of gradient in update
    // update[i] = diff[i]^2
    caffe_powx(net_params[param_id]->count(),
        net_params[param_id]->cpu_diff(), Dtype(2),
        this->update_[param_id]->mutable_cpu_data());
    // update history
    // history[i] = (1 - rms_decay) * update[i] + rms_decay * history[i]
    caffe_cpu_axpby(net_params[param_id]->count(),
        Dtype(1 - rms_decay), this->update_[param_id]->cpu_data(),
        rms_decay, this->history_[param_id]->mutable_cpu_data());
    // prepare update
    // update[i] = sqrt(history[i])
    caffe_powx(net_params[param_id]->count(),
        this->history_[param_id]->cpu_data(), Dtype(0.5),
        this->update_[param_id]->mutable_cpu_data());
    // update[i] += delta
    caffe_add_scalar(net_params[param_id]->count(),
        delta, this->update_[param_id]->mutable_cpu_data());
    // update[i] = diff[i] / update[i]
    caffe_div(net_params[param_id]->count(),
        net_params[param_id]->cpu_diff(),
        this->update_[param_id]->cpu_data(),
        this->update_[param_id]->mutable_cpu_data());
    // scale and copy
    // diff[i] = local_rate * update[i]
    caffe_cpu_axpby(net_params[param_id]->count(), local_rate,
        this->update_[param_id]->cpu_data(), Dtype(0),
        net_params[param_id]->mutable_cpu_diff());
    break;
  case Caffe::GPU:
#ifndef CPU_ONLY
    rmsprop_update_gpu(net_params[param_id]->count(),
        net_params[param_id]->mutable_gpu_diff(),
        this->history_[param_id]->mutable_gpu_data(),
        rms_decay, delta, local_rate);
#else
    NO_GPU;
#endif
    break;
  default:
    LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode();
  }
}

INSTANTIATE_CLASS(RMSPropSolver);
REGISTER_SOLVER_CLASS(RMSProp);

}  // namespace caffe
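RMSProp reduces to a per-element rule very close to AdaGrad, with a decayed rather than a summed history. A minimal standalone sketch (my own names, not Caffe code):

#include <cmath>
#include <cstddef>

// RMSProp: h is the decayed squared-gradient history; g is rewritten in
// place so the later weight update w -= g applies the step.
void rmsprop_step(float* g, float* h, std::size_t n,
                  float rms_decay, float delta, float local_rate) {
  for (std::size_t i = 0; i < n; ++i) {
    h[i] = (1 - rms_decay) * g[i] * g[i] + rms_decay * h[i];
    g[i] = local_rate * g[i] / (std::sqrt(h[i]) + delta);
  }
}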