I spent two evenings putting together a simple example that uses the CUDA driver API. You can start from this example and modify it into whatever program you want. What the example does is trivial: Hello CUDA! (which I changed to "New CUDA!", heh). Enough chatter; the explanation and the code follow.
1) The program consists of two parts:
a) kernel.cubin, compiled from kernel.cu (typically produced with nvcc's -cubin option). I am using SDK 3.0, so the cubin comes out in the ELF file format. That does not matter; it can still be loaded and used when the program runs.
The code of kernel.cu is as follows:
__device__ void HelloCUDA(char *result, int num)
{
int i = 0;
char p_HelloCUDA[] = "New CUDA!";
for(i = 0; i < num; i++) {
result[i] = p_HelloCUDA[i];
}
}
__global__ static void GPUMain(char* result, int num)
{
HelloCUDA(result, num);
}
Two points to note here:
i: The compiler may optimize your program. Because HelloCUDA is a __device__ function, it is most likely inlined into GPUMain, so you will not find a HelloCUDA function in kernel.cubin.
ii: In kernel.cubin, the name of GPUMain becomes the C++-mangled symbol _Z7GPUMainPci. Heh. (A way around the mangled name is sketched right below.)
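If you would rather not deal with the mangled symbol, one option is to give the kernel C linkage. This is a minimal sketch of my own (dropping the static qualifier), not how the original kernel is written:
extern "C" __global__ void GPUMain(char* result, int num)
{
    HelloCUDA(result, num);
}
With that, cuModuleGetFunction(&func, module, "GPUMain") should find the kernel by its plain name.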
b) The host program, sample.cpp.
Huh? cpp? Not cu?
Yes, cpp! And it compiles under VC 6.0! At last, the "familiar face" of VC6 again....
(Just like an ordinary VC6 project, no special settings are needed. Of course, don't forget to set the include path and the lib path, and don't forget to add cuda.lib to the linker's library list.)
Below is the code of sample.cpp:
/********************************************************************
* sample.cpp
* This is an example of a CUDA driver API program.
*********************************************************************/
#include <fcntl.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <io.h>
#include <stdio.h>
#include <stdlib.h>
#include <windows.h>
#include <cuda.h>
#include <cutil.h>
/************************************************************************/
/* Define for CUDA */
/************************************************************************/
#define ALIGN_UP(offset, alignment) \
(offset)=(((offset)+(alignment)-1) & ~((alignment)-1))
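/* ALIGN_UP rounds offset up to the next multiple of alignment (a power of two); */
/* e.g. ALIGN_UP(offset, 4) turns an offset of 5 into 8. The driver API's */
/* offset-based parameter passing further below relies on this. */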
/************************************************************************/
/* Init CUDA */
/************************************************************************/
bool InitCUDA(void)
{
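/* Approximate CUDA cores per multiprocessor, indexed by the compute capability */
/* major number (1.x -> 8, 2.x/Fermi -> 32); index 0 is unused. */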
static int nGpuArchCoresPerSM[] = { -1, 8, 32 };
int driverVersion;
char deviceName[256];
int major;
int minor;
unsigned int totalGlobalMem;
int multiProcessorCount;
int totalConstantMemory;
int sharedMemPerBlock;
int regsPerBlock;
int warpSize;
int maxThreadsPerBlock;
int blockDim[3];
int gridDim[3];
int memPitch;
int clockRate;
int gpuOverlap;
int textureAlign;
int kernelExecTimeoutEnabled;
int integrated;
int canMapHostMemory;
int computeMode;
CUresult rc;
int count = 0;
int i = 0;
/* Init */
rc=cuInit(0);
if(rc!=CUDA_SUCCESS)
{
fprintf(stderr, "CUDA init error.\n");
return false;
}
/* Get device num */
rc=cuDeviceGetCount(&count);
if((rc!=CUDA_SUCCESS)|| (count==0))
{
fprintf(stderr, "There is no device supporting CUDA.\n");
return false;
}
/* Get API version */
rc=cuDriverGetVersion(&driverVersion);
if(rc!=CUDA_SUCCESS)
{
fprintf(stderr, "There is an error on cuDriverGetVersion.\n");
return false;
}
printf("CUDA Driver Version: %d.%d\n\n", driverVersion/1000, driverVersion%100);
/* Get device info */
for(i=0; i<count; i++)
{
rc=cuDeviceComputeCapability(&major, &minor,i);
if(rc==CUDA_SUCCESS)
{
if ((major == 9999) && (minor == 9999))
{ /* emu device */
fprintf(stderr, "Found the device-emulation device.\n");
continue;
}
/* display the info */
rc=cuDeviceGetName(deviceName, 256, i);
if(rc!=CUDA_SUCCESS)
{
fprintf(stderr, "There is an error on cuDeviceGetName(%ld).\n", i);
continue;
}
printf("Device %ld: \"%s\"\n", i, deviceName);
printf(" CUDA Capability Major revision number: %d\n", major);
printf(" CUDA Capability Minor revision number: %d\n", minor);
rc=cuDeviceTotalMem(&totalGlobalMem, i);
if(rc!=CUDA_SUCCESS)
{
fprintf(stderr, "There is an error on cuDeviceTotalMem(%ld).\n", i);
continue;
}
printf(" Total amount of global memory: %u bytes\n", totalGlobalMem);
rc=cuDeviceGetAttribute( &multiProcessorCount, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, i);
if(rc!=CUDA_SUCCESS)
{
fprintf(stderr, "There is an error on cuDeviceGetAttribute(%ld).\n", i);
continue;
}
printf(" Number of multiprocessors: %d\n", multiProcessorCount);
printf(" Number of cores: %d\n", nGpuArchCoresPerSM[major] * multiProcessorCount);
rc=cuDeviceGetAttribute( &totalConstantMemory, CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, i);
if(rc!=CUDA_SUCCESS)
{
fprintf(stderr, "There is an error on cuDeviceGetAttribute(%ld).\n", i);
continue;
}
printf(" Total amount of constant memory: %u bytes\n", totalConstantMemory);
rc=cuDeviceGetAttribute( &sharedMemPerBlock, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, i);
if(rc!=CUDA_SUCCESS)
{
fprintf(stderr, "There is an error on cuDeviceGetAttribute(%ld).\n", i);
continue;
}
printf(" Total amount of shared memory per block: %u bytes\n", sharedMemPerBlock);
rc=cuDeviceGetAttribute( &regsPerBlock, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, i);
if(rc!=CUDA_SUCCESS)
{
fprintf(stderr, "There is an error on cuDeviceGetAttribute(%ld).\n", i);
continue;
}
printf(" Total number of registers available per block: %d\n", regsPerBlock);
rc=cuDeviceGetAttribute( &warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, i);
if(rc!=CUDA_SUCCESS)
{
fprintf(stderr, "There is an error on cuDeviceGetAttribute(%ld).\n", i);
continue;
}
printf(" Warp size: %d\n", warpSize);
rc=cuDeviceGetAttribute( &maxThreadsPerBlock, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, i);
if(rc!=CUDA_SUCCESS)
{
fprintf(stderr, "There is an error on cuDeviceGetAttribute(%ld).\n", i);
continue;
}
printf(" Maximum number of threads per block: %d\n", maxThreadsPerBlock);
rc=cuDeviceGetAttribute( &blockDim[0], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, i);
if(rc!=CUDA_SUCCESS)
{
fprintf(stderr, "There is an error on cuDeviceGetAttribute(%ld).\n", i);
continue;
}
rc=cuDeviceGetAttribute( &blockDim[1], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, i);
if(rc!=CUDA_SUCCESS)
{
fprintf(stderr, "There is an error on cuDeviceGetAttribute(%ld).\n", i);
continue;
}
rc=cuDeviceGetAttribute( &blockDim[2], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, i);
if(rc!=CUDA_SUCCESS)
{
fprintf(stderr, "There is an error on cuDeviceGetAttribute(%ld).\n", i);
continue;
}
printf(" Maximum sizes of each dimension of a block: %d x %d x %d\n", blockDim[0], blockDim[1], blockDim[2]);
rc=cuDeviceGetAttribute( &gridDim[0], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, i);
if(rc!=CUDA_SUCCESS)
{
fprintf(stderr, "There is an error on cuDeviceGetAttribute(%ld).\n", i);
continue;
}
rc=cuDeviceGetAttribute( &gridDim[1], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, i);
if(rc!=CUDA_SUCCESS)
{
fprintf(stderr, "There is an error on cuDeviceGetAttribute(%ld).\n", i);
continue;
}
rc=cuDeviceGetAttribute( &gridDim[2], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, i);
if(rc!=CUDA_SUCCESS)
{
fprintf(stderr, "There is an error on cuDeviceGetAttribute(%ld).\n", i);
continue;
}
printf(" Maximum sizes of each dimension of a grid: %d x %d x %d\n", gridDim[0], gridDim[1], gridDim[2]);
rc=cuDeviceGetAttribute( &memPitch, CU_DEVICE_ATTRIBUTE_MAX_PITCH, i);
if(rc!=CUDA_SUCCESS)
{
fprintf(stderr, "There is an error on cuDeviceGetAttribute(%ld).\n", i);
continue;
}
printf(" Maximum memory pitch: %u bytes\n", memPitch);
rc=cuDeviceGetAttribute( &textureAlign, CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, i);
if(rc!=CUDA_SUCCESS)
{
fprintf(stderr, "There is an error on cuDeviceGetAttribute(%ld).\n", i);
continue;
}
printf(" Texture alignment: %u bytes\n", textureAlign);
rc=cuDeviceGetAttribute( &clockRate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, i);
if(rc!=CUDA_SUCCESS)
{
fprintf(stderr, "There is an error on cuDeviceGetAttribute(%ld).\n", i);
continue;
}
printf(" Clock rate: %.2f GHz\n", clockRate * 1e-6f);
rc=cuDeviceGetAttribute( &gpuOverlap, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, i);
if(rc!=CUDA_SUCCESS)
{
fprintf(stderr, "There is an error on cuDeviceGetAttribute(%ld).\n", i);
continue;
}
printf(" Concurrent copy and execution: %s\n", gpuOverlap ? "Yes" : "No");
rc=cuDeviceGetAttribute( &kernelExecTimeoutEnabled, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, i);
if(rc!=CUDA_SUCCESS)
{
fprintf(stderr, "There is an error on cuDeviceGetAttribute(%ld).\n", i);
continue;
}
printf(" Run time limit on kernels: %s\n", kernelExecTimeoutEnabled ? "Yes" : "No");
rc=cuDeviceGetAttribute( &integrated, CU_DEVICE_ATTRIBUTE_INTEGRATED, i);
if(rc!=CUDA_SUCCESS)
{
fprintf(stderr, "There is an error on cuDeviceGetAttribute(%ld).\n", i);
continue;
}
printf(" Integrated: %s\n", integrated ? "Yes" : "No");
rc=cuDeviceGetAttribute( &canMapHostMemory, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, i);
if(rc!=CUDA_SUCCESS)
{
fprintf(stderr, "There is an error on cuDeviceGetAttribute(%ld).\n", i);
continue;
}
printf(" Support host page-locked memory mapping: %s\n", canMapHostMemory ? "Yes" : "No");
rc=cuDeviceGetAttribute( &computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, i);
if(rc!=CUDA_SUCCESS)
{
fprintf(stderr, "There is an error on cuDeviceGetAttribute(%ld).\n", i);
continue;
}
printf(" Compute mode: %s\n", computeMode == 0 ?
"Default (multiple host threads can use this device)" :
computeMode == 1 ?
"Exclusive (only one host thread at a time can use this device)" :
computeMode == 2 ?
"Prohibited (no host thread can use this device)" :
"Unknown");
}
}
return true;
}
/************************************************************************/
/* HelloCUDA */
/************************************************************************/
int main(int argc, char* argv[])
{
CUdevice dev;
CUcontext ctx;
CUdeviceptr device_result =0;
CUmodule module;
CUfunction func;
int offset;
void* paramptr;
int paramint;
CUresult rc;
char *codemem=0;
long codesize=0;
int fp;
struct _stat fstatus;
int rt;
CUevent startevent;
CUevent stopevent;
char host_result[12] ={0};
float tm;
float tottm;
int i;
if(!InitCUDA()) {
return 0;
}
printf("CUDA initialized.\n");
/* Get the handle */
rc=cuDeviceGet(&dev, 0);
if(rc!=CUDA_SUCCESS)
{
fprintf(stderr, "There is an error on cuDeviceGet(%ld).\n", rc);
return 0;
}
/* Get cubin file size */
fp=_open("kernel.cubin", _O_BINARY|_O_RDONLY);
if(fp<0)
{
fprintf(stderr, "There is an error on _open(kernel.cubin).\n");
/* free all resource */
return 0;
}
rt=_fstat(fp, &fstatus);
if(rt!=0)
{
fprintf(stderr, "There is an error on _fstat(kernel.cubin).\n");
/* free all resource */
_close(fp);
return 0;
}
codesize=fstatus.st_size;
/* Alloc code memory */
codemem=(char *)malloc(codesize+1);
if(codemem==NULL)
{
fprintf(stderr, "There is an error on malloc(%ld).\n", codesize+1);
/* free all resource */
_close(fp);
return 0;
}
codemem[codesize]=0;
/* Read cubin file */
rt=_read(fp, codemem, codesize);
if(rt!=codesize)
{
fprintf(stderr, "There is an error on _read(kernel.cubin)(%ld).\n", rt);
/* free all resource */
free(codemem);
_close(fp);
return 0;
}
/* Close the cubin file */
rt=_close(fp);
if(rt!=0)
{
fprintf(stderr, "There is an error on _close(kernel.cubin).\n");
/* free all resource */
free(codemem);
return 0;
}
/* Create the context */
/* Normal flag is CU_CTX_SCHED_AUTO */
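/* CU_CTX_BLOCKING_SYNC asks the driver to block the host thread on a sync */
/* primitive instead of spin-waiting when it waits for the GPU. */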
rc=cuCtxCreate(&ctx, CU_CTX_BLOCKING_SYNC, dev);
if(rc!=CUDA_SUCCESS)
{
fprintf(stderr, "There is an error on cuCtxCreate(%ld).\n", rc);
free(codemem);
return 0;
}
/* Load cuda module */
rc=cuModuleLoadData(&module, codemem);
if(rc!=CUDA_SUCCESS)
{
fprintf(stderr, "There is an error on cuModuleLoadData(%ld).\n", rc);
/* free all resource */
free(codemem);
cuCtxDestroy(ctx);
return 0;
}
/* Malloc gpu memory */
rc=cuMemAlloc(&device_result, sizeof(char) * 10);
if(rc!=CUDA_SUCCESS)
{
fprintf(stderr, "There is an error on cuMemAlloc(%ld).\n", rc);
/* free all resource */
cuModuleUnload(module);
free(codemem);
cuCtxDestroy(ctx);
return 0;
}
/* Create a event for start time point */
rc=cuEventCreate(&startevent, CU_EVENT_DEFAULT);
if(rc!=CUDA_SUCCESS)
{
fprintf(stderr, "There is an error on cuEventCreate(startevent)(%ld).\n", rc);
/* free all resource */
cuMemFree(device_result);
cuModuleUnload(module);
free(codemem);
cuCtxDestroy(ctx);
return 0;
}
/* Create a event for stop time point */
rc=cuEventCreate(&stopevent, CU_EVENT_DEFAULT);
if(rc!=CUDA_SUCCESS)
{
fprintf(stderr, "There is an error on cuEventCreate(stopevent)(%ld).\n", rc);
/* free all resource */
cuEventDestroy(startevent);
cuMemFree(device_result);
cuModuleUnload(module);
free(codemem);
cuCtxDestroy(ctx);
return 0;
}
/* Get the function */
rc=cuModuleGetFunction(&func, module, "_Z7GPUMainPci");
if(rc!=CUDA_SUCCESS)
{
fprintf(stderr, "There is an error on cuModuleGetFunction(GPUMain)(%ld).\n", rc);
/* free all resource */
cuEventDestroy(stopevent);
cuEventDestroy(startevent);
cuMemFree(device_result);
cuModuleUnload(module);
free(codemem);
cuCtxDestroy(ctx);
return 0;
}
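/* The loop below uses the pre-CUDA-4.0 driver API launch sequence: push the */
/* kernel arguments at properly aligned offsets with cuParamSetv/cuParamSeti, */
/* declare their total size with cuParamSetSize, set the block shape, and then */
/* launch with cuLaunchGrid. Each launch is timed with a pair of events. */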
/* run kernel for 10000 times */
tottm=0.0;
for(i=0; i<10000; i++)
{ /* Set param 1 */
offset=0;
/* void* should be used to determine CUdeviceptr’s alignment */
paramptr=(void*)(size_t)device_result;
ALIGN_UP(offset, __alignof(void *));
rc=cuParamSetv(func, offset, &paramptr, sizeof(paramptr));
if(rc!=CUDA_SUCCESS)
{
fprintf(stderr, "There is an error on cuParamSetv(GPUMain)(%ld).\n", rc);
/* free all resource */
cuEventDestroy(stopevent);
cuEventDestroy(startevent);
cuMemFree(device_result);
cuModuleUnload(module);
free(codemem);
cuCtxDestroy(ctx);
return 0;
}
offset += sizeof(paramptr);
/* Set param 2 */
paramint=10;
ALIGN_UP(offset, __alignof(int));
rc=cuParamSeti(func, offset, paramint);
if(rc!=CUDA_SUCCESS)
{
fprintf(stderr, "There is an error on cuParamSeti(GPUMain)(%ld).\n", rc);
/* free all resource */
cuEventDestroy(stopevent);
cuEventDestroy(startevent);
cuMemFree(device_result);
cuModuleUnload(module);
free(codemem);
cuCtxDestroy(ctx);
return 0;
}
offset += sizeof(paramint);
/* Set param size */
rc=cuParamSetSize(func, offset);
if(rc!=CUDA_SUCCESS)
{
fprintf(stderr, "There is an error on cuParamSetSize(GPUMain)(%ld).\n", rc);
/* free all resource */
cuEventDestroy(stopevent);
cuEventDestroy(startevent);
cuMemFree(device_result);
cuModuleUnload(module);
free(codemem);
cuCtxDestroy(ctx);
return 0;
}
/* Set block size */
rc=cuFuncSetBlockShape(func, 1, 1, 1);
if(rc!=CUDA_SUCCESS)
{
fprintf(stderr, "There is an error on cuFuncSetBlockShape(GPUMain)(%ld).\n", rc);
/* free all resource */
cuEventDestroy(stopevent);
cuEventDestroy(startevent);
cuMemFree(device_result);
cuModuleUnload(module);
free(codemem);
cuCtxDestroy(ctx);
return 0;
}
/* Record the begin time */
/* kernel is default at stream 0 */
rc=cuEventRecord(startevent, 0);
if(rc!=CUDA_SUCCESS)
{
fprintf(stderr, "There is an error on cuEventRecord(startevent)(%ld).\n", rc);
/* free all resource */
cuEventDestroy(stopevent);
cuEventDestroy(startevent);
cuMemFree(device_result);
cuModuleUnload(module);
free(codemem);
cuCtxDestroy(ctx);
return 0;
}
/* Launch the grid */
rc=cuLaunchGrid(func, 1, 1);
if(rc!=CUDA_SUCCESS)
{
fprintf(stderr, "There is an error on cuLaunchGrid(GPUMain)(%ld).\n", rc);
/* free all resource */
cuEventDestroy(stopevent);
cuEventDestroy(startevent);
cuMemFree(device_result);
cuModuleUnload(module);
free(codemem);
cuCtxDestroy(ctx);
return 0;
}
/* Waiting for the kernel finish */
rc=cuCtxSynchronize();
if(rc!=CUDA_SUCCESS)
{
fprintf(stderr, "There is an error on cuCtxSynchronize(GPUMain)(%ld).\n", rc);
/* free all resource */
cuEventDestroy(stopevent);
cuEventDestroy(startevent);
cuMemFree(device_result);
cuModuleUnload(module);
free(codemem);
cuCtxDestroy(ctx);
return 0;
}
/* Record the end time */
rc=cuEventRecord(stopevent, 0);
if(rc!=CUDA_SUCCESS)
{
fprintf(stderr, "There is an error on cuEventRecord(stopevent)(%ld).\n", rc);
/* free all resource */
cuEventDestroy(stopevent);
cuEventDestroy(startevent);
cuMemFree(device_result);
cuModuleUnload(module);
free(codemem);
cuCtxDestroy(ctx);
return 0;
}
/* Wait until the stop event has actually been recorded */
rc=cuEventSynchronize(stopevent);
if(rc!=CUDA_SUCCESS)
{
fprintf(stderr, "There is an error on cuEventSynchronize(stopevent)(%ld).\n", rc);
/* free all resource */
cuEventDestroy(stopevent);
cuEventDestroy(startevent);
cuMemFree(device_result);
cuModuleUnload(module);
free(codemem);
cuCtxDestroy(ctx);
return 0;
}
/* Calc the time of processing */
rc=cuEventElapsedTime(&tm, startevent, stopevent);
if(rc!=CUDA_SUCCESS)
{
fprintf(stderr, "There is an error on cuEventElapsedTime(%ld).\n", rc);
/* free all resource */
cuEventDestroy(stopevent);
cuEventDestroy(startevent);
cuMemFree(device_result);
cuModuleUnload(module);
free(codemem);
cuCtxDestroy(ctx);
return 0;
}
tottm+=tm;
}
/* Copy the result */
rc=cuMemcpyDtoH(host_result, device_result, sizeof(char)*10);
if(rc!=CUDA_SUCCESS)
{
fprintf(stderr, "There is an error on cuMemcpyDtoH(%ld).\n", rc);
/* free all resource */
cuEventDestroy(stopevent);
cuEventDestroy(startevent);
cuMemFree(device_result);
cuModuleUnload(module);
free(codemem);
cuCtxDestroy(ctx);
return 0;
}
printf("Processing time: %f (ms)\n", tottm/i);
/* Free the event */
rc=cuEventDestroy(stopevent);
if(rc!=CUDA_SUCCESS)
{
fprintf(stderr, "There is an error on cuEventDestroy(stopevent)(%ld).\n", rc);
/* free all resource */
cuEventDestroy(startevent);
cuMemFree(device_result);
cuModuleUnload(module);
free(codemem);
cuCtxDestroy(ctx);
return 0;
}
rc=cuEventDestroy(startevent);
if(rc!=CUDA_SUCCESS)
{
fprintf(stderr, "There is an error on cuEventDestroy(startevent)(%ld).\n", rc);
/* free all resource */
cuMemFree(device_result);
cuModuleUnload(module);
free(codemem);
cuCtxDestroy(ctx);
return 0;
}
/* Free gpu memory */
rc=cuMemFree(device_result);
if(rc!=CUDA_SUCCESS)
{
fprintf(stderr, "There is an error on cuMemFree(%ld).\n", rc);
/* free all resource */
cuModuleUnload(module);
free(codemem);
cuCtxDestroy(ctx);
return 0;
}
/* Unload cuda module */
rc=cuModuleUnload(module);
if(rc!=CUDA_SUCCESS)
{
fprintf(stderr, "There is an error on cuModuleUnload(%ld).\n", rc);
/* free all resource */
free(codemem);
cuCtxDestroy(ctx);
return 0;
}
/* Free code memory */
free(codemem);
/* Destroy the context */
rc=cuCtxDestroy(ctx);
if(rc!=CUDA_SUCCESS)
{
fprintf(stderr, "There is an error on cuCtxDestroy(%ld).\n", rc);
return 0;
}
printf("%s\n", host_result);
printf("CUDA Sample is finished.\n");
return 0;
}
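By the way, every error branch in sample.cpp repeats the same teardown calls. A common way to avoid that in plain C/C++ is to funnel all failure paths through a single cleanup label. Here is a minimal sketch of that idea, using my own hypothetical helper name RunKernelModule; it is a restructuring suggestion, not part of the original sample:
#include <cuda.h>

/* Hypothetical helper: create a context, load a module, allocate the result buffer. */
/* All failure paths fall through to the single cleanup label at the bottom. */
static int RunKernelModule(CUdevice dev, const void *codemem)
{
    CUresult rc;
    CUcontext ctx = 0;
    CUmodule module = 0;
    CUdeviceptr device_result = 0;
    int ok = 0;

    rc = cuCtxCreate(&ctx, CU_CTX_BLOCKING_SYNC, dev);
    if (rc != CUDA_SUCCESS) goto cleanup;

    rc = cuModuleLoadData(&module, codemem);
    if (rc != CUDA_SUCCESS) goto cleanup;

    rc = cuMemAlloc(&device_result, sizeof(char) * 10);
    if (rc != CUDA_SUCCESS) goto cleanup;

    /* ... set the parameters, launch the kernel, copy the result back ... */

    ok = 1;

cleanup:
    /* Release only what was actually created, in reverse order of creation. */
    if (device_result) cuMemFree(device_result);
    if (module) cuModuleUnload(module);
    if (ctx) cuCtxDestroy(ctx);
    return ok;
}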
Thanks to cuda2010 for pointing out the errors! They have been corrected in the text (the parts that were marked in red in the original post).