CUDA小记(4)something before programming(一)


我是参考了《GPU高性能CUDA实战》来学习CUDA C编程的,具体分析样例时采用了NVIDIA自带的样例,API有一些不一致的地方我会做简单的说明。
在编程开始之前,我们要对GPU有个简单的理解。GPU即图形处理单元(Graphics Processing Unit),最初为图形渲染而设计,而我使用CUDA编程最主要的目的是把GPU强大的计算能力应用到通用并行计算中。我们将CPU以及系统内存称为主机(host),而将GPU以及其显存称为设备(device)。
下面我们仔细分析一下之前的样例,在并行编程之前了解一些常用API和概念。(本节我们仅关注内存的分配、释放和数据传输。)
// includes, system
#include <stdio.h>

// includes CUDA Runtime
#include <cuda_runtime.h>

// includes, project
#include <helper_cuda.h>
#include <helper_functions.h> // helper utility functions

    ...
// Verifies that every element of data[0..n) equals the reference value x.
// On the first mismatch it reports the offending index and values, then
// returns false; returns true when all n elements match (including n == 0).
bool correct_output(int *data, const int n, const int x)
{
    int idx = 0;

    while (idx < n)
    {
        if (data[idx] == x)
        {
            ++idx;
            continue;
        }

        printf("Error! data[%d] = %d, ref = %d\n", idx, data[idx], x);
        return false;
    }

    return true;
}

// Demonstrates pinned-host allocation, device allocation/initialization, and
// asynchronous host<->device copies on the default stream.
// NOTE(review): the "..." lines are elisions made by the article's author
// (the original NVIDIA asyncAPI sample's kernel launch and event-timing code
// was removed), so this fragment is illustrative and not compilable as-is.
int main(int argc, char *argv[])
{
    int devID;
    cudaDeviceProp deviceProps;  // filled in by cudaGetDeviceProperties below

    printf("[%s] - Starting...\n", argv[0]);

    // This will pick the best possible CUDA capable device
    // (findCudaDevice / checkCudaErrors come from helper_cuda.h)
    devID = findCudaDevice(argc, (const char **)argv);

    // get device name
    checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID));
    printf("CUDA device [%s]\n", deviceProps.name);

    int n = 16 * 1024 * 1024;      // 16M elements
    int nbytes = n * sizeof(int);  // 64 MB buffer size (fits in int)
    int value = 26;                // reference value the output is checked against

    // allocate host memory
    // cudaMallocHost returns page-locked (pinned) memory, which is what makes
    // the cudaMemcpyAsync calls below able to run asynchronously
    int *a = 0;
    checkCudaErrors(cudaMallocHost((void **)&a, nbytes));
    memset(a, 0, nbytes);

    // allocate device memory
    int *d_a=0;
    checkCudaErrors(cudaMalloc((void **)&d_a, nbytes));
    // cudaMemset is byte-wise: every byte becomes 0xFF, i.e. each int is -1.
    // The host->device copy below overwrites this content anyway.
    checkCudaErrors(cudaMemset(d_a, 255, nbytes));

        ...
    // block the host until all previously issued device work has completed
    checkCudaErrors(cudaDeviceSynchronize());
    ...
    // asynchronous copy host -> device on the default stream (stream 0)
    cudaMemcpyAsync(d_a, a, nbytes, cudaMemcpyHostToDevice, 0);
    ...
    // asynchronous copy device -> host on the default stream
    cudaMemcpyAsync(a, d_a, nbytes, cudaMemcpyDeviceToHost, 0);

    ...
   
    // check the output for correctness
    // presumably the elided kernel wrote `value` into every element (the
    // original sample launches increment_kernel between the two copies) —
    // TODO confirm against the full sample source
    bool bFinalResults = correct_output(a, n, value);

    // release resources
    ...
    checkCudaErrors(cudaFreeHost(a));  // pinned memory must be freed with cudaFreeHost
    checkCudaErrors(cudaFree(d_a));

    exit(bFinalResults ? EXIT_SUCCESS : EXIT_FAILURE);
}

问题:内存的分配、释放和数据传输
几个相关的API:
cudaMalloc()        分配设备内存,第一个参数是指针(指向保存新分配内存地址的变量),第二个参数是分配内存的大小。
cudaFree()        对应地释放内存,第一个参数是指针(指向要释放的内存地址的变量)。
cudaMemcpy()        数据在主机和设备间的传递,第一个参数是目标指针,第二个参数是源指针,第三个参数是数据大小,第四个参数指明数据流向(cudaMemcpyKind枚举,取值包括:cudaMemcpyHostToHost,cudaMemcpyHostToDevice,cudaMemcpyDeviceToHost,cudaMemcpyDeviceToDevice,以及由统一虚拟寻址自动推断方向的cudaMemcpyDefault)。
cudaMemset()        初始化或给设备内存赋值,第一个参数是指针(指向要初始化的内存地址),第二个参数是初始化使用的值,第三个参数是需要初始化的内存大小。
cudaMallocHost()    给主机分配页面锁定内存,第一个参数是指针(指向保存新分配内存地址的变量),第二个参数是分配内存的大小。
cudaFreeHost()        对应地释放内存,第一个参数是指针(指向要释放的内存地址的变量)。

memset()        和malloc()、free()一样,同为C语言标准函数,用于内存初始化或赋值。

memcpy()        主机到主机的数据传递。

cudaMemcpyAsync()    数据在主机和设备间的传递,与cudaMemcpy()的不同在于:它相对主机是异步的(调用后立即返回,不等待拷贝完成),并且参数里多了一个cudaStream_t用于指定所在的流;要实现真正的异步传输,主机内存必须是页面锁定内存。我们在之后的章节会再讨论。

在CUDA Runtime API文档中可以找到它们的介绍,如下:
__host__ ​ __device__ ​cudaError_t cudaMalloc ( void** devPtr, size_t size )
    Allocate memory on the device.

__host__ ​ __device__ ​cudaError_t cudaFree ( void* devPtr )
    Frees memory on the device.

__host__ ​cudaError_t cudaMemcpy ( void* dst, const void* src, size_t count, cudaMemcpyKind kind )
    Copies data between host and device.

__host__ ​cudaError_t cudaMemset ( void* devPtr, int  value, size_t count )
    Initializes or sets device memory to a value.

__host__ ​cudaError_t cudaMallocHost ( void** ptr, size_t size )
    Allocates page-locked memory on the host.

__host__ ​cudaError_t cudaFreeHost ( void* ptr )
    Frees page-locked memory.

__host__ ​ __device__ ​cudaError_t cudaMemcpyAsync ( void* dst, const void* src, size_t count, cudaMemcpyKind kind, cudaStream_t stream = 0 )
    Copies data between host and device.

一个错误类型和一个检查宏:
注意到之前提到的运行时函数返回值都是cudaError_t,它等同于枚举类型cudaError,是CUDA的错误码类型。
enum cudaError { cudaSuccess =  0, cudaErrorMissingConfiguration =  1,
           cudaErrorMemoryAllocation =  2, cudaErrorInitializationError =  3,
           cudaErrorLaunchFailure =  4, cudaErrorPriorLaunchFailure =  5,
           cudaErrorLaunchTimeout =  6, cudaErrorLaunchOutOfResources =  7,
           cudaErrorInvalidDeviceFunction =  8, cudaErrorInvalidConfiguration
           =  9, cudaErrorInvalidDevice =  10, cudaErrorInvalidValue =  11,
           cudaErrorInvalidPitchValue =  12, cudaErrorInvalidSymbol =  13,
           cudaErrorMapBufferObjectFailed =  14,
           cudaErrorUnmapBufferObjectFailed =  15, cudaErrorInvalidHostPointer
           =  16, cudaErrorInvalidDevicePointer =  17, cudaErrorInvalidTexture
           =  18, cudaErrorInvalidTextureBinding =  19,
           cudaErrorInvalidChannelDescriptor =  20,
           cudaErrorInvalidMemcpyDirection =  21, cudaErrorAddressOfConstant =
           22, cudaErrorTextureFetchFailed =  23, cudaErrorTextureNotBound =
           24, cudaErrorSynchronizationError =  25,
           cudaErrorInvalidFilterSetting =  26, cudaErrorInvalidNormSetting =
           27, cudaErrorMixedDeviceExecution =  28, cudaErrorCudartUnloading =
           29, cudaErrorUnknown =  30, cudaErrorNotYetImplemented =  31,
           cudaErrorMemoryValueTooLarge =  32, cudaErrorInvalidResourceHandle
           =  33, cudaErrorNotReady =  34, cudaErrorInsufficientDriver =  35,
           cudaErrorSetOnActiveProcess =  36, cudaErrorInvalidSurface =  37,
           cudaErrorNoDevice =  38, cudaErrorECCUncorrectable =  39,
           cudaErrorSharedObjectSymbolNotFound =  40,
           cudaErrorSharedObjectInitFailed =  41, cudaErrorUnsupportedLimit =
           42, cudaErrorDuplicateVariableName =  43,
           cudaErrorDuplicateTextureName =  44, cudaErrorDuplicateSurfaceName
           =  45, cudaErrorDevicesUnavailable =  46,
           cudaErrorInvalidKernelImage =  47, cudaErrorNoKernelImageForDevice
           =  48, cudaErrorIncompatibleDriverContext =  49,
           cudaErrorPeerAccessAlreadyEnabled =  50,
           cudaErrorPeerAccessNotEnabled =  51, cudaErrorDeviceAlreadyInUse =
           54, cudaErrorProfilerDisabled =  55,
           cudaErrorProfilerNotInitialized =  56,
           cudaErrorProfilerAlreadyStarted =  57,
           cudaErrorProfilerAlreadyStopped =  58, cudaErrorAssert =  59,
           cudaErrorTooManyPeers =  60, cudaErrorHostMemoryAlreadyRegistered =
           61, cudaErrorHostMemoryNotRegistered =  62,
           cudaErrorOperatingSystem =  63, cudaErrorPeerAccessUnsupported =
           64, cudaErrorLaunchMaxDepthExceeded =  65,
           cudaErrorLaunchFileScopedTex =  66, cudaErrorLaunchFileScopedSurf =
           67, cudaErrorSyncDepthExceeded =  68,
           cudaErrorLaunchPendingCountExceeded =  69, cudaErrorNotPermitted =
           70, cudaErrorNotSupported =  71, cudaErrorHardwareStackError =  72,
           cudaErrorIllegalInstruction =  73, cudaErrorMisalignedAddress =
           74, cudaErrorInvalidAddressSpace =  75, cudaErrorInvalidPc =  76,
           cudaErrorIllegalAddress =  77, cudaErrorInvalidPtx =  78,
           cudaErrorInvalidGraphicsContext =  79, cudaErrorNvlinkUncorrectable
           =  80, cudaErrorStartupFailure =  0x7f, cudaErrorApiFailureBase =
           10000 }
checkCudaErrors()是helper_cuda.h中定义的一个宏,用来检查CUDA API调用的返回值是否为cudaSuccess,从而判断调用是否成功。
更多内存管理的细节和API可参见:
http://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#axzz4kVApOTcy



你可能感兴趣的:(CUDA)