Caffe uses CUDA for acceleration. Once you have read through the Caffe CPU code, the CUDA code becomes easy to follow.
In the layer files, the CUDA code closely mirrors the CPU code; the only difference is which math_functions.hpp routines are called: the CPU path uses the caffe_* functions and the GPU path uses the caffe_gpu_* ones. Take absval_layer.cu as an example:
#include <vector>

#include "caffe/layer.hpp"
#include "caffe/util/math_functions.hpp"
#include "caffe/vision_layers.hpp"

namespace caffe {

template <typename Dtype>
void AbsValLayer<Dtype>::Forward_gpu(
    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
  const int count = top[0]->count();
  Dtype* top_data = top[0]->mutable_gpu_data();
  // Difference from the CPU version: call the GPU math function
  caffe_gpu_abs(count, bottom[0]->gpu_data(), top_data);
}

template <typename Dtype>
void AbsValLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
  const int count = top[0]->count();
  const Dtype* top_diff = top[0]->gpu_diff();
  if (propagate_down[0]) {
    const Dtype* bottom_data = bottom[0]->gpu_data();
    Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
    // Difference from the CPU version: call the GPU math functions
    caffe_gpu_sign(count, bottom_data, bottom_diff);
    caffe_gpu_mul(count, bottom_diff, top_diff, bottom_diff);
  }
}

INSTANTIATE_LAYER_GPU_FUNCS(AbsValLayer);

}  // namespace caffe
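For comparison, the CPU forward pass in absval_layer.cpp has the same structure and only swaps the math_functions call. A minimal sketch (the exact body may differ slightly between Caffe versions):

// Sketch of the corresponding CPU forward pass (absval_layer.cpp).
// Same shape as Forward_gpu; only the math_functions call and the
// cpu_data/gpu_data accessors change.
template <typename Dtype>
void AbsValLayer<Dtype>::Forward_cpu(
    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
  const int count = top[0]->count();
  Dtype* top_data = top[0]->mutable_cpu_data();
  caffe_abs(count, bottom[0]->cpu_data(), top_data);  // CPU counterpart of caffe_gpu_abs
}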
Layers also use a second pattern: instead of calling a ready-made GPU math function, the layer defines its own __global__ CUDA kernel and launches it. bnll_layer.cu is an example:
#include <algorithm>
#include <vector>

#include "caffe/layer.hpp"
#include "caffe/vision_layers.hpp"

// if x > 0: y = x + log(1 + exp(-x)); otherwise: y = log(1 + exp(x))
namespace caffe {

const float kBNLL_THRESHOLD = 50.;

First, the forward computation is wrapped in a __global__ kernel, which Forward_gpu then launches:

template <typename Dtype>
__global__ void BNLLForward(const int n, const Dtype* in, Dtype* out) {
  CUDA_KERNEL_LOOP(index, n) {
    // if x > 0: y = x + log(1 + exp(-x)); otherwise: y = log(1 + exp(x))
    out[index] = in[index] > 0 ?
        in[index] + log(1. + exp(-in[index])) :
        log(1. + exp(in[index]));
  }
}

template <typename Dtype>
void BNLLLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
    const vector<Blob<Dtype>*>& top) {
  const Dtype* bottom_data = bottom[0]->gpu_data();
  Dtype* top_data = top[0]->mutable_gpu_data();
  const int count = bottom[0]->count();
  // Difference from the CPU version: launch the kernel on the GPU
  // NOLINT_NEXT_LINE(whitespace/operators)
  BNLLForward<Dtype><<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
      count, bottom_data, top_data);
  CUDA_POST_KERNEL_CHECK;
}

template <typename Dtype>
__global__ void BNLLBackward(const int n, const Dtype* in_diff,
    const Dtype* in_data, Dtype* out_diff) {
  CUDA_KERNEL_LOOP(index, n) {
    // x is clamped at kBNLL_THRESHOLD (50) so exp() does not overflow;
    // the gradient is y' = exp(x) / (1 + exp(x))
    Dtype expval = exp(min(in_data[index], Dtype(kBNLL_THRESHOLD)));
    out_diff[index] = in_diff[index] * expval / (expval + 1.);
  }
}

template <typename Dtype>
void BNLLLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
  if (propagate_down[0]) {
    const Dtype* bottom_data = bottom[0]->gpu_data();
    const Dtype* top_diff = top[0]->gpu_diff();
    Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
    const int count = bottom[0]->count();
    // Difference from the CPU version: launch the kernel on the GPU
    // NOLINT_NEXT_LINE(whitespace/operators)
    BNLLBackward<Dtype><<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
        count, top_diff, bottom_data, bottom_diff);
    CUDA_POST_KERNEL_CHECK;
  }
}

INSTANTIATE_LAYER_GPU_FUNCS(BNLLLayer);

}  // namespace caffe
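The launch configuration <<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>> and the CUDA_KERNEL_LOOP macro come from caffe/util/device_alternate.hpp. Roughly, they look like the sketch below (the exact thread count and error-check wrapper may differ between Caffe versions):

// Sketch of the helpers declared in caffe/util/device_alternate.hpp.
// CUDA_KERNEL_LOOP is a grid-stride loop: each thread processes element
// index, then index + blockDim.x * gridDim.x, and so on, so any count
// is covered even if it exceeds blocks * threads.
#define CUDA_KERNEL_LOOP(i, n) \
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; \
       i < (n); \
       i += blockDim.x * gridDim.x)

// Threads per block used by Caffe kernels (512 in older versions).
const int CAFFE_CUDA_NUM_THREADS = 512;

// Number of blocks: enough so that blocks * threads >= N.
inline int CAFFE_GET_BLOCKS(const int N) {
  return (N + CAFFE_CUDA_NUM_THREADS - 1) / CAFFE_CUDA_NUM_THREADS;
}

// CUDA_POST_KERNEL_CHECK surfaces launch errors immediately after the call.
#define CUDA_POST_KERNEL_CHECK CUDA_CHECK(cudaPeekAtLastError())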