Caffe Beginner Notes (5): CUDA Parallelization Examples

Original Source:

http://blog.csdn.net/augusdi/article/details/12833235

A few simple CUDA example programs, which I have organized and annotated here.


Before this, readers interested in GPU selection (which touches on streaming-multiprocessor clusters) and the Intel vs. Nvidia debate may also want to take a look at those topics.


Note:

1. Functions without any qualifier are host code. Host functions can also be declared explicitly with the __host__ qualifier (a short sketch follows this list).

2. Device code must be compiled by NVCC, while host code only needs the host compiler (e.g. GCC).

3. Host code handles the required setup, such as device initialization and data transfer; device code is responsible only for the computation.

4. In host code, functions whose names begin with "cuda" belong to the CUDA Runtime API; these runtime functions handle device initialization, memory allocation, memory copies, and similar tasks.
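A minimal sketch of these qualifiers (my own illustration, not from the original post; __device__ marks device-only helpers, a qualifier the notes above do not mention):

__host__ void runsOnCpu() { }                        // host code, compiled by the host compiler
__device__ float square(float x) { return x * x; }   // device-only helper, callable from GPU code
__global__ void squareKernel(float *out, const float *in)  // kernel: launched by the host, runs on the device
{
    out[threadIdx.x] = square(in[threadIdx.x]);
}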



1. Getting the device count and properties:

// Query the number of CUDA devices and the properties of each.  
cudaError_t cudaStatus;  
int num = 0;  
cudaDeviceProp prop;  
cudaStatus = cudaGetDeviceCount(&num);  
for (int i = 0; i < num; i++)  
{  
    cudaGetDeviceProperties(&prop, i);  
}  
cudaError_t is the CUDA error type; its values are integer error codes.
cudaDeviceProp is the device-properties struct, defined as follows:

/** 
 * CUDA device properties 
 */  
struct __device_builtin__ cudaDeviceProp  
{  
    char   name[256];                  /**< ASCII string identifying device */  
    size_t totalGlobalMem;             /**< Global memory available on device in bytes */  
    size_t sharedMemPerBlock;          /**< Shared memory available per block in bytes */  
    int    regsPerBlock;               /**< 32-bit registers available per block */  
    int    warpSize;                   /**< Warp size in threads */  
    size_t memPitch;                   /**< Maximum pitch in bytes allowed by memory copies */  
    int    maxThreadsPerBlock;         /**< Maximum number of threads per block */  
    int    maxThreadsDim[3];           /**< Maximum size of each dimension of a block */  
    int    maxGridSize[3];             /**< Maximum size of each dimension of a grid */  
    int    clockRate;                  /**< Clock frequency in kilohertz */  
    size_t totalConstMem;              /**< Constant memory available on device in bytes */  
    int    major;                      /**< Major compute capability */  
    int    minor;                      /**< Minor compute capability */  
    size_t textureAlignment;           /**< Alignment requirement for textures */  
    size_t texturePitchAlignment;      /**< Pitch alignment requirement for texture references bound to pitched memory */  
    int    deviceOverlap;              /**< Device can concurrently copy memory and execute a kernel. Deprecated. Use instead asyncEngineCount. */  
    int    multiProcessorCount;        /**< Number of multiprocessors on device */  
    int    kernelExecTimeoutEnabled;   /**< Specified whether there is a run time limit on kernels */  
    int    integrated;                 /**< Device is integrated as opposed to discrete */  
    int    canMapHostMemory;           /**< Device can map host memory with cudaHostAlloc/cudaHostGetDevicePointer */  
    int    computeMode;                /**< Compute mode (See ::cudaComputeMode) */  
    int    maxTexture1D;               /**< Maximum 1D texture size */  
    int    maxTexture1DMipmap;         /**< Maximum 1D mipmapped texture size */  
    int    maxTexture1DLinear;         /**< Maximum size for 1D textures bound to linear memory */  
    int    maxTexture2D[2];            /**< Maximum 2D texture dimensions */  
    int    maxTexture2DMipmap[2];      /**< Maximum 2D mipmapped texture dimensions */  
    int    maxTexture2DLinear[3];      /**< Maximum dimensions (width, height, pitch) for 2D textures bound to pitched memory */  
    int    maxTexture2DGather[2];      /**< Maximum 2D texture dimensions if texture gather operations have to be performed */  
    int    maxTexture3D[3];            /**< Maximum 3D texture dimensions */  
    int    maxTextureCubemap;          /**< Maximum Cubemap texture dimensions */  
    int    maxTexture1DLayered[2];     /**< Maximum 1D layered texture dimensions */  
    int    maxTexture2DLayered[3];     /**< Maximum 2D layered texture dimensions */  
    int    maxTextureCubemapLayered[2];/**< Maximum Cubemap layered texture dimensions */  
    int    maxSurface1D;               /**< Maximum 1D surface size */  
    int    maxSurface2D[2];            /**< Maximum 2D surface dimensions */  
    int    maxSurface3D[3];            /**< Maximum 3D surface dimensions */  
    int    maxSurface1DLayered[2];     /**< Maximum 1D layered surface dimensions */  
    int    maxSurface2DLayered[3];     /**< Maximum 2D layered surface dimensions */  
    int    maxSurfaceCubemap;          /**< Maximum Cubemap surface dimensions */  
    int    maxSurfaceCubemapLayered[2];/**< Maximum Cubemap layered surface dimensions */  
    size_t surfaceAlignment;           /**< Alignment requirements for surfaces */  
    int    concurrentKernels;          /**< Device can possibly execute multiple kernels concurrently */  
    int    ECCEnabled;                 /**< Device has ECC support enabled */  
    int    pciBusID;                   /**< PCI bus ID of the device */  
    int    pciDeviceID;                /**< PCI device ID of the device */  
    int    pciDomainID;                /**< PCI domain ID of the device */  
    int    tccDriver;                  /**< 1 if device is a Tesla device using TCC driver, 0 otherwise */  
    int    asyncEngineCount;           /**< Number of asynchronous engines */  
    int    unifiedAddressing;          /**< Device shares a unified address space with the host */  
    int    memoryClockRate;            /**< Peak memory clock frequency in kilohertz */  
    int    memoryBusWidth;             /**< Global memory bus width in bits */  
    int    l2CacheSize;                /**< Size of L2 cache in bytes */  
    int    maxThreadsPerMultiProcessor;/**< Maximum resident threads per multiprocessor */  
}; 
name: device name;
totalGlobalMem: total global (device) memory, in bytes;
clockRate: GPU clock frequency, in kHz;
multiProcessorCount: number of streaming multiprocessors (SM, Streaming Multiprocessor); each SM contains multiple streaming processors (SP, Streaming Processor);
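As a hedged usage sketch (not from the original post), the fields above can be printed per device; cudaGetErrorString turns an error code into readable text:

#include "cuda_runtime.h"  
#include <stdio.h>  

int main()  
{  
    int num = 0;  
    cudaGetDeviceCount(&num);  
    for (int i = 0; i < num; i++)  
    {  
        cudaDeviceProp prop;  
        cudaError_t status = cudaGetDeviceProperties(&prop, i);  
        if (status != cudaSuccess)  
        {  
            // convert the error code to a human-readable message  
            fprintf(stderr, "device %d: %s\n", i, cudaGetErrorString(status));  
            continue;  
        }  
        printf("Device %d: %s\n", i, prop.name);  
        printf("  totalGlobalMem      : %zu MB\n", prop.totalGlobalMem >> 20);  
        printf("  clockRate           : %d kHz\n", prop.clockRate);  
        printf("  multiProcessorCount : %d\n", prop.multiProcessorCount);  
    }  
    return 0;  
}  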


Checking the number of SPs (cores per SM):

// Beginning of GPU Architecture definitions  
inline int _ConvertSMVer2Cores(int major, int minor)  
{  
    // Defines for GPU Architecture types (using the SM version to determine the # of cores per SM)  
    typedef struct  
    {  
        int SM; // 0xMm (hexadecimal notation), M = SM Major version, and m = SM minor version  
        int Cores;  
    } sSMtoCores;  
  
    sSMtoCores nGpuArchCoresPerSM[] =  
    {  
        { 0x10,  8 }, // Tesla Generation (SM 1.0) G80 class  
        { 0x11,  8 }, // Tesla Generation (SM 1.1) G8x class  
        { 0x12,  8 }, // Tesla Generation (SM 1.2) G9x class  
        { 0x13,  8 }, // Tesla Generation (SM 1.3) GT200 class  
        { 0x20, 32 }, // Fermi Generation (SM 2.0) GF100 class  
        { 0x21, 48 }, // Fermi Generation (SM 2.1) GF10x class  
        { 0x30, 192}, // Kepler Generation (SM 3.0) GK10x class  
        { 0x35, 192}, // Kepler Generation (SM 3.5) GK11x class  
        {   -1, -1 }  
    };  
  
    int index = 0;  
  
    while (nGpuArchCoresPerSM[index].SM != -1)  
    {  
        if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor))  
        {  
            return nGpuArchCoresPerSM[index].Cores;  
        }  
  
        index++;  
    }  
  
    // If the SM version is not in the table, fall back to the last known entry so we can still run  
    printf("MapSMtoCores for SM %d.%d is undefined.  Default to use %d Cores/SM\n", major, minor, nGpuArchCoresPerSM[7].Cores);  
    return nGpuArchCoresPerSM[7].Cores;  
}  
// end of GPU Architecture definitions
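A small usage sketch (the totalCores helper name is my own): the total SP count is cores-per-SM times the SM count:

inline int totalCores(const cudaDeviceProp &prop)  
{  
    // cores per SM (looked up from the table above) * number of SMs  
    return _ConvertSMVer2Cores(prop.major, prop.minor) * prop.multiProcessorCount;  
}  

For example, a Fermi SM 2.0 device with 14 SMs reports 32 * 14 = 448 SPs.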


2. Thread-level parallelism:

On the CPU (host) side, the process is the basic unit of resource allocation and the thread is the basic unit of CPU time scheduling.

On the GPU (device) side, the thread is the smallest unit that executes a CUDA program. GPU threads have no notion of priority; all threads get an equal chance to run, and a thread is in one of only two states: waiting for resources, or executing.

If resources are not ready, the thread waits; once they are ready, it executes immediately.

When GPU resources are plentiful, all threads run concurrently and the speedup comes close to the theoretical speedup ratio;

when resources are fewer than the total number of threads, some threads must wait for earlier threads to release resources, and execution becomes partially serialized.

#include "cuda_runtime.h"           //CUDA运行时API  
#include "device_launch_parameters.h"     
#include   
cudaError_t addWithCuda(int *c, const int *a, const int *b, size_t size);  
__global__ void addKernel(int *c, const int *a, const int *b)  
{  
    int i = threadIdx.x;  // each thread reads its own ID; only one launch dimension was given, so of (x, y, z) only x is meaningful  
    c[i] = a[i] + b[i];  
}  
int main()  
{  
    const int arraySize = 5;  
    const int a[arraySize] = { 1, 2, 3, 4, 5 };  
    const int b[arraySize] = { 10, 20, 30, 40, 50 };  
    int c[arraySize] = { 0 };  
    // Add vectors in parallel.  
    cudaError_t cudaStatus;  
    int num = 0;  
    cudaDeviceProp prop;  
    cudaStatus = cudaGetDeviceCount(&num);  
    for (int i = 0; i < num; i++)  
    {  
        cudaGetDeviceProperties(&prop, i);   // query each device's properties  
    }  
    // Add vectors in parallel.  
    cudaStatus = addWithCuda(c, a, b, arraySize);  
    if (cudaStatus != cudaSuccess)  
    {  
        fprintf(stderr, "addWithCuda failed!");  
        return 1;  
    }  
    printf("{1,2,3,4,5} + {10,20,30,40,50} = {%d,%d,%d,%d,%d}\n",  
           c[0], c[1], c[2], c[3], c[4]);  
    return 0;  
}  

// Helper function for using CUDA to add vectors in parallel.  
cudaError_t addWithCuda(int *c, const int *a, const int *b, size_t size)  
{  
    int *dev_a = 0;  
    int *dev_b = 0;  
    int *dev_c = 0;  
    cudaError_t cudaStatus;  
    // Choose which GPU to run on (change this on a multi-GPU system).  
    cudaStatus = cudaSetDevice(0);  
    if (cudaStatus != cudaSuccess)  
    {  
        fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");  
        goto Error;  
    }  
    // Allocate GPU buffers for three vectors (two input, one output);  
    // error checks on cudaMalloc/cudaMemcpy are omitted here for brevity.  
    cudaMalloc((void**)&dev_c, size * sizeof(int));  
    cudaMalloc((void**)&dev_a, size * sizeof(int));  
    cudaMalloc((void**)&dev_b, size * sizeof(int));  
    // Copy input vectors from host memory to GPU buffers.  
    cudaMemcpy(dev_a, a, size * sizeof(int), cudaMemcpyHostToDevice);  
    cudaMemcpy(dev_b, b, size * sizeof(int), cudaMemcpyHostToDevice);  
    // Launch the kernel: <<<1, size>>> = one block containing `size` threads.  
    addKernel<<<1, size>>>(dev_c, dev_a, dev_b);  
	// cudaThreadSynchronize waits for the kernel to finish, and returns  
    // any errors encountered during the launch.  
    cudaStatus = cudaThreadSynchronize(); // wait for the kernel to finish (deprecated; newer CUDA uses cudaDeviceSynchronize)  
    if (cudaStatus != cudaSuccess)   
    {  
        fprintf(stderr, "cudaThreadSynchronize returned error code %d after launching addKernel!\n", cudaStatus);  
        goto Error;  
    }  
    // Copy output vector from GPU buffer to host memory.  
    cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(int), cudaMemcpyDeviceToHost);      // copy the result back to the host  
    if (cudaStatus != cudaSuccess)   
    {  
        fprintf(stderr, "cudaMemcpy failed!");  
        goto Error;  
    }  
Error:  
    cudaFree(dev_c);    // free GPU device memory  
    cudaFree(dev_a);  
    cudaFree(dev_b);      
    return cudaStatus;  
}
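With the inputs above, a successful run should print {1,2,3,4,5} + {10,20,30,40,50} = {11,22,33,44,55}.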


3. Block-level parallelism

In the thread-parallel code, change:

addKernel<<<1, size>>>(dev_c, dev_a, dev_b);
// to
addKernel<<<size, 1>>>(dev_c, dev_a, dev_b);

and change the kernel to:

__global__ void addKernel(int *c, const int *a, const int *b)  
{  
    int i = blockIdx.x; // previously threadIdx.x  
    c[i] = a[i] + b[i];  
}  
Thread parallelism is fine-grained, so scheduling is efficient;

block parallelism is coarse-grained: each scheduling step reallocates resources, and when resources run short, blocks may have to execute serially.
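In practice the two granularities are combined: a launch uses many blocks of many threads, and each thread derives a global index from both IDs. A minimal sketch of this common pattern (my own addition, not from the original post):

__global__ void addKernelGrid(int *c, const int *a, const int *b, int n)  
{  
    // global index = block offset + thread offset within the block  
    int i = blockIdx.x * blockDim.x + threadIdx.x;  
    if (i < n)   // guard: the last block may be only partially full  
        c[i] = a[i] + b[i];  
}  

// launch enough 256-thread blocks to cover all n elements:  
// addKernelGrid<<<(n + 255) / 256, 256>>>(dev_c, dev_a, dev_b, n);  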


4. Stream-level parallelism

Thread parallelism is fine-grained, while block parallelism is coarse-grained.

A group of threads executing in parallel is organized into a block, and a group of blocks executing in parallel is organized into a grid.

Using multiple grids to carry out parallel processing is stream parallelism (each kernel launch constitutes one grid).


Streams make it possible to run multiple kernels on a single device.

With block parallelism and thread parallelism, every running kernel is the same (the same code);

with stream parallelism, different kernels can run, or the same kernel can be launched with different arguments, which achieves task-level parallelism.

Streams are represented in CUDA by the cudaStream_t type, and the following APIs are involved:

cudaStreamCreate(cudaStream_t *s)      // create a stream
cudaStreamDestroy(cudaStream_t s)      // destroy a stream
cudaStreamSynchronize(cudaStream_t s)  // synchronize a single stream
cudaDeviceSynchronize()                // synchronize all streams on the device
cudaStreamQuery(cudaStream_t s)        // query whether a stream's work has completed
E.g.

#include "cuda_runtime.h"  
#include "device_launch_parameters.h"  
#include <stdio.h>  
cudaError_t addWithCuda(int *c, const int *a, const int *b, size_t size);  
__global__ void addKernel(int *c, const int *a, const int *b)  
{  
    int i = blockIdx.x;  
    c[i] = a[i] + b[i];  
}  
int main()  
{  
    const int arraySize = 5;  
    const int a[arraySize] = { 1, 2, 3, 4, 5 };  
    const int b[arraySize] = { 10, 20, 30, 40, 50 };  
    int c[arraySize] = { 0 };  
    // Add vectors in parallel.  
    cudaError_t cudaStatus;  
    int num = 0;  
    cudaDeviceProp prop;  
    cudaStatus = cudaGetDeviceCount(&num);  
    for (int i = 0; i < num; i++)  
    {  
        cudaGetDeviceProperties(&prop, i);  
    }  
    cudaStatus = cudaSetDevice(0);  
    if (cudaStatus != cudaSuccess)  
    {  
        fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");  
        return 1;  
    }  
    int *dev_a = 0;  
    int *dev_b = 0;  
    int *dev_c = 0;  
    // Allocate GPU buffers and copy the inputs over  
    // (error checks on cudaMalloc/cudaMemcpy omitted for brevity).  
    cudaMalloc((void**)&dev_c, arraySize * sizeof(int));  
    cudaMalloc((void**)&dev_a, arraySize * sizeof(int));  
    cudaMalloc((void**)&dev_b, arraySize * sizeof(int));  
    cudaMemcpy(dev_a, a, arraySize * sizeof(int), cudaMemcpyHostToDevice);  
    cudaMemcpy(dev_b, b, arraySize * sizeof(int), cudaMemcpyHostToDevice);  
    cudaStream_t stream[5];  
    for (int i = 0; i < 5; i++)  
    {  
        cudaStreamCreate(&stream[i]);   // create the streams  
    }  
    for (int i = 0; i < 5; i++)  
    {  
        addKernel<<<1, 1, 0, stream[i]>>>(dev_c + i, dev_a + i, dev_b + i);    // launch on stream i  
        // 1         : number of blocks  
        // 1         : number of threads per block  
        // 0         : bytes of dynamic shared memory used by the block  
        // stream[i] : the stream this launch runs on; each stream carries one kernel launch  
        // Each launch works on different data, which yields task-level parallelism.  
        // When there are several independent tasks, you can write several kernels  
        // and, resources permitting, dispatch them onto different streams.  
    }  
    cudaDeviceSynchronize();  
    // cudaThreadSynchronize waits for the kernel to finish, and returns  
    // any errors encountered during the launch.  
    cudaStatus = cudaThreadSynchronize();   // deprecated; cudaDeviceSynchronize above already waits  
    if (cudaStatus != cudaSuccess)   
    {  
        fprintf(stderr, "cudaThreadSynchronize returned error code %d after launching addKernel!\n", cudaStatus);  
        goto Error;  
    }  
    // Copy output vector from GPU buffer to host memory.  
    cudaStatus = cudaMemcpy(c, dev_c, arraySize * sizeof(int), cudaMemcpyDeviceToHost);  
    if (cudaStatus != cudaSuccess)   
    {  
        fprintf(stderr, "cudaMemcpy failed!");  
        goto Error;  
    }  
    printf("{1,2,3,4,5} + {10,20,30,40,50} = {%d,%d,%d,%d,%d}\n",  
           c[0], c[1], c[2], c[3], c[4]);  
Error:  
    for (int i = 0; i < 5; i++)  
    {  
        cudaStreamDestroy(stream[i]);   // destroy the streams  
    }  
	cudaFree(dev_c);  
    cudaFree(dev_a);  
    cudaFree(dev_b);      
    return cudaStatus;  
} 
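Streams can also overlap memory copies with kernel execution on devices whose asyncEngineCount (see the property struct above) is nonzero; this requires cudaMemcpyAsync and page-locked host memory. A minimal sketch under those assumptions, reusing the names from the example above (hostA is a hypothetical pinned buffer):

// allocate page-locked (pinned) host memory so the copy can be truly asynchronous  
int *hostA = 0;  
cudaMallocHost((void**)&hostA, arraySize * sizeof(int));  
// work enqueued on one stream executes in issue order within that stream,  
// but may overlap with copies/kernels enqueued on other streams  
cudaMemcpyAsync(dev_a, hostA, arraySize * sizeof(int),  
                cudaMemcpyHostToDevice, stream[0]);  
addKernel<<<1, 1, 0, stream[0]>>>(dev_c, dev_a, dev_b);  
cudaStreamSynchronize(stream[0]);   // block the host until stream 0 drains  
cudaFreeHost(hostA);  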


