在blob.hpp中我们会看到protected成员变量:
protected:
shared_ptr<SyncedMemory> data_;
shared_ptr<SyncedMemory> diff_;
shared_ptr<SyncedMemory> shape_data_;
vector<int> shape_;
int count_;
int capacity_;
其中data_,diff_,shape_data_ 就是指向数据,梯度,和形状数据的shared_ptr指针。在这里顺便学习一下shared_ptr,主要参考http://blog.csdn.net/sndaxdrs/article/details/6175701。自己写了点代码,体会了一下。
注意,编译时:g++ shared_ptr.cpp -lboost_system
// shared_ptr.cpp
#include <iostream>
#include <vector>
#include <boost/shared_ptr.hpp>
using namespace std;
using boost::shared_ptr;
// Small value holder used by the shared_ptr demo: stores two ints and can
// print or overwrite each of them.
class inS
{
public:
    // Builds an object with m = i and n = j.
    inS(int i, int j) : m(i), n(j) {}
    // Default construction zeroes both members so that printm()/printn()
    // never read indeterminate values.
    inS() : m(0), n(0) {}
    // Writes "m = <m>" followed by a line break to std::cout.
    void printm() const { std::cout << "m = " << m << std::endl; }
    // Writes "n = <n>" followed by a line break to std::cout.
    void printn() const { std::cout << "n = " << n << std::endl; }
    // Overwrites m with i.
    void setm(int i) { m = i; }
    // Overwrites n with j.
    void setn(int j) { n = j; }
private:
    int m;
    int n;
};
class S
{
public:
S() {inS1.reset(new inS(0,0));
inS2.reset(new inS(0,0));}
void print_inS1() { cout << "print_inS1" << endl;
inS1->printm();
inS1->printn(); }
void print_inS2() { cout << "print_inS2" << endl;
inS2->printm();
inS2->printn(); }
void set_inS1(int i, int j) { inS1->setm(i);
inS1->setn(j); }
void set_inS2(int i, int j) { inS2->setm(i);
inS2->setn(j); }
void S1pointtoS2() { inS1 = inS2; }
void printcount() { cout << "inS1.count: " << inS1.use_count() << endl;
cout << "inS2.count: " << inS2.use_count() << endl; }
private:
shared_ptr inS1;
shared_ptr inS2;
};
int main()
{
shared_ptr<int> ap(new int(10));
shared_ptr<int> ap1 = ap;
bool a = (ap1 == ap);
cout << "ap1==ap" << a << endl;
cout << "*ap = " << *ap << endl;
cout << "*ap1 = " << *ap1 << endl;
*ap1 = 22;
cout << "*ap = " << *ap <22);
cout << ap.use_count() << endl;
// test class S
int m = 1;
int n = 2;
S s;
s.print_inS1();
s.print_inS2();
s.set_inS1(m,n);
s.print_inS1();
s.print_inS2();
s.printcount();
s.S1pointtoS2();
s.print_inS1();
s.print_inS2();
s.printcount();
// shared_ptr in vector
typedef vector< shared_ptr<int> > vs;
vs v(10);
int i = 0;
for(vs::iterator pos = v.begin(); pos != v.end(); ++pos)
{
shared_ptr<int> vp(new int(++i));
*pos = vp;
}
for(int i = 0; i < 10; ++i)
{
cout << *(v[i]) << endl;
cout << v[i].use_count() << endl;
}
shared_ptr<int> vp1 = v[1];
*vp1 = 11;
cout << *v[1] << endl;
cout << v[1].use_count() << endl;
return 0;
}
在之前的Blob学习中简单的了解了SyncedMemory类中都有些什么东西,为了进一步了解caffe是如何完成cpu和gpu数据管理,还是读一读源码吧。
syncedmem.hpp
#ifndef CAFFE_SYNCEDMEM_HPP_
#define CAFFE_SYNCEDMEM_HPP_
#include <cstdlib>
#include "caffe/common.hpp"
namespace caffe {
// If CUDA is available and in GPU mode, host memory will be allocated pinned,
// using cudaMallocHost. It avoids dynamic pinning for transfers (DMA).
// The improvement in performance seems negligible in the single GPU case,
// but might be more significant for parallel training. Most importantly,
// it improved stability for large models on many GPUs.
// In a CPU_ONLY build, or when Caffe is not in GPU mode, plain malloc is
// used instead.
//
// Host allocation helper: stores the new buffer of `size` bytes in *ptr and
// records in *use_cuda whether cudaMallocHost (true) or malloc (false)
// produced it, so that CaffeFreeHost can release it with the matching call.
inline void CaffeMallocHost(void** ptr, size_t size, bool* use_cuda) {
#ifndef CPU_ONLY
  const bool gpu_mode = (Caffe::mode() == Caffe::GPU);
  if (gpu_mode) {
    CUDA_CHECK(cudaMallocHost(ptr, size));
    *use_cuda = true;
    return;
  }
#endif
  // Plain heap path.
  *use_cuda = false;
  *ptr = malloc(size);
  CHECK(*ptr) << "host allocation of size " << size << " failed";
}
// Host deallocation helper, the counterpart of CaffeMallocHost. `use_cuda`
// must be the flag that CaffeMallocHost reported for `ptr`: true releases
// through cudaFreeHost, false through free. A CPU_ONLY build always uses
// free.
inline void CaffeFreeHost(void* ptr, bool use_cuda) {
#ifdef CPU_ONLY
  free(ptr);
#else
  if (use_cuda) {
    CUDA_CHECK(cudaFreeHost(ptr));
  } else {
    free(ptr);
  }
#endif
}
/**
 * @brief Manages memory allocation and synchronization between the host (CPU)
 *        and device (GPU).
 *
 * A SyncedMemory holds one host pointer and one device pointer for a buffer
 * of size() bytes, plus a head() state that records which side holds the
 * current data. Buffers are allocated on first access through the accessors
 * below.
 *
 * TODO(dox): more thorough description.
 */
class SyncedMemory {
 public:
  // Creates an empty (size 0), uninitialized object that owns no buffers.
  SyncedMemory()
      : cpu_ptr_(NULL), gpu_ptr_(NULL), size_(0), head_(UNINITIALIZED),
        own_cpu_data_(false), cpu_malloc_use_cuda_(false), own_gpu_data_(false),
        gpu_device_(-1) {}
  // Records the requested size in bytes; nothing is allocated here.
  explicit SyncedMemory(size_t size)
      : cpu_ptr_(NULL), gpu_ptr_(NULL), size_(size), head_(UNINITIALIZED),
        own_cpu_data_(false), cpu_malloc_use_cuda_(false), own_gpu_data_(false),
        gpu_device_(-1) {}
  // Releases the host and device buffers this object owns.
  ~SyncedMemory();

  // Brings the host copy up to date (see to_cpu) and returns a read-only
  // pointer to it. The pointer is untyped; callers cast it to the element
  // type they stored.
  const void* cpu_data();
  // Makes `data` the host buffer and sets head to HEAD_AT_CPU. A previously
  // owned host buffer is freed; `data` itself is not owned by this object.
  void set_cpu_data(void* data);
  // Device-side counterparts of cpu_data() / set_cpu_data().
  const void* gpu_data();
  void set_gpu_data(void* data);
  // Like cpu_data() / gpu_data(), but return a writable pointer and mark
  // that side as holding the current data (HEAD_AT_CPU / HEAD_AT_GPU).
  void* mutable_cpu_data();
  void* mutable_gpu_data();

  // Where the current data lives: nowhere yet (UNINITIALIZED), on the host
  // only (HEAD_AT_CPU), on the device only (HEAD_AT_GPU), or on both after a
  // copy in either direction (SYNCED). to_cpu() / to_gpu() use this state to
  // decide whether to allocate, copy, or do nothing.
  enum SyncedHead { UNINITIALIZED, HEAD_AT_CPU, HEAD_AT_GPU, SYNCED };
  // Current synchronization state. Does not modify the object.
  SyncedHead head() const { return head_; }
  // Buffer size in bytes, as passed to the constructor. Does not modify the
  // object.
  size_t size() const { return size_; }

#ifndef CPU_ONLY
  // Requires head() == HEAD_AT_CPU. Enqueues an asynchronous host-to-device
  // copy on `stream` and sets head to SYNCED; the caller must synchronize on
  // the stream before using the device data.
  void async_gpu_push(const cudaStream_t& stream);
#endif

 private:
  // Makes the host copy current, allocating and/or copying from the device
  // depending on head_.
  void to_cpu();
  // Makes the device copy current, allocating and/or copying from the host
  // depending on head_.
  void to_gpu();

  void* cpu_ptr_;             // host buffer, NULL until allocated or set
  void* gpu_ptr_;             // device buffer, NULL until allocated or set
  size_t size_;               // buffer size in bytes
  SyncedHead head_;           // which side holds the current data
  bool own_cpu_data_;         // true when cpu_ptr_ must be freed by this object
  bool cpu_malloc_use_cuda_;  // allocator flag reported by CaffeMallocHost
  bool own_gpu_data_;         // true when gpu_ptr_ must be freed by this object
  int gpu_device_;            // device current when gpu_ptr_ was allocated, -1 if none

  // Forbids copy construction and copy assignment.
  DISABLE_COPY_AND_ASSIGN(SyncedMemory);
};  // class SyncedMemory
} // namespace caffe
#endif // CAFFE_SYNCEDMEM_HPP_
通过读头文件再参考下面的cpp文件,已经对SyncedMemory的功能很清楚了,并且能够大体了解它是如何实现的。如果再深究的话,可能就要研究cuda,这里只需简单理解它给出的函数的意义就可以了,比如cudaMemcpyAsync。下面的cpp文件内容少、易读,上面的注释就是参考它写的,所以就不再赘述了,为了方便还是贴出来吧。
#include "caffe/common.hpp"
#include "caffe/syncedmem.hpp"
#include "caffe/util/math_functions.hpp"
namespace caffe {
// Releases whatever this object owns: the host buffer through CaffeFreeHost,
// and the device buffer through cudaFree. Before the cudaFree, the current
// device is switched to gpu_device_ when one was recorded; afterwards the
// device that was current on entry is selected again.
SyncedMemory::~SyncedMemory() {
  const bool release_host = own_cpu_data_ && cpu_ptr_;
  if (release_host) {
    CaffeFreeHost(cpu_ptr_, cpu_malloc_use_cuda_);
  }

#ifndef CPU_ONLY
  const bool release_device = own_gpu_data_ && gpu_ptr_;
  if (!release_device) {
    return;
  }
  int entry_device;
  cudaGetDevice(&entry_device);
  const bool device_recorded = (gpu_device_ != -1);
  if (device_recorded) {
    CUDA_CHECK(cudaSetDevice(gpu_device_));
  }
  CUDA_CHECK(cudaFree(gpu_ptr_));
  cudaSetDevice(entry_device);
#endif  // CPU_ONLY
}
// Makes the host buffer hold the current data, depending on head_:
//  - UNINITIALIZED: allocate a zero-filled host buffer, head -> HEAD_AT_CPU.
//  - HEAD_AT_GPU:   allocate the host buffer if there is none, copy the
//                   device data into it, head -> SYNCED (NO_GPU when built
//                   with CPU_ONLY).
//  - HEAD_AT_CPU / SYNCED: nothing to do.
inline void SyncedMemory::to_cpu() {
  if (head_ == UNINITIALIZED) {
    CaffeMallocHost(&cpu_ptr_, size_, &cpu_malloc_use_cuda_);
    caffe_memset(size_, 0, cpu_ptr_);
    own_cpu_data_ = true;
    head_ = HEAD_AT_CPU;
    return;
  }

  if (head_ == HEAD_AT_GPU) {
#ifdef CPU_ONLY
    NO_GPU;
#else
    const bool need_host_buffer = (cpu_ptr_ == NULL);
    if (need_host_buffer) {
      CaffeMallocHost(&cpu_ptr_, size_, &cpu_malloc_use_cuda_);
      own_cpu_data_ = true;
    }
    caffe_gpu_memcpy(size_, gpu_ptr_, cpu_ptr_);
    head_ = SYNCED;
#endif
    return;
  }

  // HEAD_AT_CPU and SYNCED: the host copy is already current.
}
// Makes the device buffer hold the current data, depending on head_:
//  - UNINITIALIZED: record the current device, allocate a zero-filled device
//                   buffer, head -> HEAD_AT_GPU.
//  - HEAD_AT_CPU:   allocate the device buffer if there is none (recording
//                   the current device), copy the host data into it,
//                   head -> SYNCED.
//  - HEAD_AT_GPU / SYNCED: nothing to do.
// A CPU_ONLY build only executes NO_GPU.
inline void SyncedMemory::to_gpu() {
#ifdef CPU_ONLY
  NO_GPU;
#else
  if (head_ == UNINITIALIZED) {
    CUDA_CHECK(cudaGetDevice(&gpu_device_));
    CUDA_CHECK(cudaMalloc(&gpu_ptr_, size_));
    caffe_gpu_memset(size_, 0, gpu_ptr_);
    own_gpu_data_ = true;
    head_ = HEAD_AT_GPU;
    return;
  }

  if (head_ == HEAD_AT_CPU) {
    const bool need_device_buffer = (gpu_ptr_ == NULL);
    if (need_device_buffer) {
      CUDA_CHECK(cudaGetDevice(&gpu_device_));
      CUDA_CHECK(cudaMalloc(&gpu_ptr_, size_));
      own_gpu_data_ = true;
    }
    caffe_gpu_memcpy(size_, cpu_ptr_, gpu_ptr_);
    head_ = SYNCED;
    return;
  }

  // HEAD_AT_GPU and SYNCED: the device copy is already current.
#endif
}
// Read-only access to the host buffer. Runs to_cpu() first so the host copy
// is current before the pointer is handed out.
const void* SyncedMemory::cpu_data() {
  to_cpu();
  const void* host_view = cpu_ptr_;
  return host_view;
}
// Replaces the host buffer with caller-provided memory. `data` must be
// non-null (CHECK). A host buffer previously owned by this object is freed;
// the new one is marked as not owned, and head becomes HEAD_AT_CPU.
void SyncedMemory::set_cpu_data(void* data) {
  CHECK(data);
  const bool release_previous = own_cpu_data_;
  if (release_previous) {
    CaffeFreeHost(cpu_ptr_, cpu_malloc_use_cuda_);
  }
  own_cpu_data_ = false;
  head_ = HEAD_AT_CPU;
  cpu_ptr_ = data;
}
// Read-only access to the device buffer. Runs to_gpu() first so the device
// copy is current. A CPU_ONLY build executes NO_GPU and returns NULL.
const void* SyncedMemory::gpu_data() {
#ifdef CPU_ONLY
  NO_GPU;
  return NULL;
#else
  to_gpu();
  const void* device_view = gpu_ptr_;
  return device_view;
#endif
}
// Replaces the device buffer with caller-provided memory. `data` must be
// non-null (CHECK). A device buffer previously owned by this object is
// released with cudaFree, switching to gpu_device_ first when one was
// recorded and re-selecting the entry device afterwards. The new buffer is
// marked as not owned and head becomes HEAD_AT_GPU. A CPU_ONLY build only
// executes NO_GPU.
void SyncedMemory::set_gpu_data(void* data) {
#ifdef CPU_ONLY
  NO_GPU;
#else
  CHECK(data);
  const bool release_previous = own_gpu_data_;
  if (release_previous) {
    int entry_device;
    cudaGetDevice(&entry_device);
    const bool device_recorded = (gpu_device_ != -1);
    if (device_recorded) {
      CUDA_CHECK(cudaSetDevice(gpu_device_));
    }
    CUDA_CHECK(cudaFree(gpu_ptr_));
    cudaSetDevice(entry_device);
  }
  own_gpu_data_ = false;
  head_ = HEAD_AT_GPU;
  gpu_ptr_ = data;
#endif
}
// Writable access to the host buffer. The host copy is made current first;
// head is then set to HEAD_AT_CPU because the caller may modify the data.
void* SyncedMemory::mutable_cpu_data() {
  to_cpu();
  void* const writable_host = cpu_ptr_;
  head_ = HEAD_AT_CPU;
  return writable_host;
}
// Writable access to the device buffer. The device copy is made current
// first; head is then set to HEAD_AT_GPU because the caller may modify the
// data. A CPU_ONLY build executes NO_GPU and returns NULL.
void* SyncedMemory::mutable_gpu_data() {
#ifdef CPU_ONLY
  NO_GPU;
  return NULL;
#else
  to_gpu();
  void* const writable_device = gpu_ptr_;
  head_ = HEAD_AT_GPU;
  return writable_device;
#endif
}
#ifndef CPU_ONLY
void SyncedMemory::async_gpu_push(const cudaStream_t& stream) {
CHECK(head_ == HEAD_AT_CPU);
if (gpu_ptr_ == NULL) {
CUDA_CHECK(cudaGetDevice(&gpu_device_));
CUDA_CHECK(cudaMalloc(&gpu_ptr_, size_));
own_gpu_data_ = true;
}
const cudaMemcpyKind put = cudaMemcpyHostToDevice;
CUDA_CHECK(cudaMemcpyAsync(gpu_ptr_, cpu_ptr_, size_, put, stream));
// Assume caller will synchronize on the stream before use
head_ = SYNCED;
}
#endif
} // namespace caffe
总结
通过读SyncedMemory的源代码,可以看到,用它可以为数据分配内存空间,根据需要储存,获取,修改cpu或gpu数据,比如,set_cpu_data,set_gpu_data,cpu_data,gpu_data,mutable_cpu_data,mutable_gpu_data。并可以在cpu和gpu之间进行同步,不用关心其具体细节。
希望大家批评指正。
参考:
【1】http://www.cnblogs.com/louyihang-loves-baiyan/p/5150554.html