下面是一个简单的CUDA式的Hello World。
/************************************************************************
 * [!output PROJECT_NAME].cu
 * This is an example of a CUDA program.
 ************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <cuda_runtime.h>

/*
 * Evaluate a CUDA runtime call and terminate with a readable message if it
 * does not return cudaSuccess. Without this, an early failure (no device,
 * failed allocation, bad launch) silently produces an empty result string.
 */
#define CUDA_CHECK(call)                                                  \
    do {                                                                  \
        cudaError_t cuda_check_err_ = (call);                             \
        if (cuda_check_err_ != cudaSuccess) {                             \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__, \
                    cudaGetErrorString(cuda_check_err_));                 \
            exit(EXIT_FAILURE);                                           \
        }                                                                 \
    } while (0)

/************************************************************************/
/* Example                                                              */
/************************************************************************/

/*
 * Copies the first `num` bytes of the string "Hello CUDA!" into `result`
 * and stores the elapsed device clock ticks of the copy loop in `*time`.
 *
 * result : device buffer with room for at least `num` bytes.
 * num    : number of bytes to copy; clamped to the size of the source
 *          string (12 bytes including the terminating NUL) so the loop
 *          never reads past the local array.
 * time   : device pointer receiving clock() ticks spent in the loop.
 *
 * Written for the <<<1, 1>>> launch used in main(): every thread executes
 * the same serial loop and writes the same locations.
 */
__global__ static void HelloCUDA(char* result, int num, clock_t* time)
{
    const char p_HelloCUDA[] = "Hello CUDA!";
    const int available = (int)sizeof(p_HelloCUDA);

    /* Guard against callers asking for more bytes than the string holds. */
    if (num > available) {
        num = available;
    }

    clock_t start = clock();
    for (int i = 0; i < num; i++) {
        result[i] = p_HelloCUDA[i];
    }
    *time = clock() - start;
}

/*
 * Lists the compute capability of every CUDA device, runs HelloCUDA on a
 * single thread, and prints the copied string followed by the device
 * clock ticks the copy took, as "<string>,<ticks>".
 *
 * Returns 0 on success; exits with EXIT_FAILURE if no device is present
 * or any CUDA runtime call fails.
 */
int main(int argc, char** argv)
{
    /* Length of "Hello CUDA!" without the terminating NUL. */
    const int kMessageLen = 11;

    char *device_result = 0;
    clock_t *time = 0;
    char host_result[12] = {0};   /* zero-filled, so 11 copied bytes stay NUL-terminated */
    clock_t time_used = 0;
    int deviceCount = 0;
    int device;

    (void)argc;
    (void)argv;

    CUDA_CHECK(cudaGetDeviceCount(&deviceCount));
    if (deviceCount == 0) {
        fprintf(stderr, "No CUDA-capable device found.\n");
        return EXIT_FAILURE;
    }

    for (device = 0; device < deviceCount; ++device) {
        cudaDeviceProp deviceProp;
        CUDA_CHECK(cudaGetDeviceProperties(&deviceProp, device));
        printf("Device %d has compute capability %d.%d .\n",
               device, deviceProp.major, deviceProp.minor);
    }

    CUDA_CHECK(cudaMalloc((void**) &device_result, sizeof(char) * kMessageLen));
    CUDA_CHECK(cudaMalloc((void**) &time, sizeof(clock_t)));

    HelloCUDA<<<1, 1, 0>>>(device_result, kMessageLen, time);
    /* A kernel launch returns nothing; launch errors must be fetched explicitly. */
    CUDA_CHECK(cudaGetLastError());

    /* The blocking copies wait for the kernel and surface any execution error. */
    CUDA_CHECK(cudaMemcpy(host_result, device_result, sizeof(char) * kMessageLen,
                          cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(&time_used, time, sizeof(clock_t),
                          cudaMemcpyDeviceToHost));

    CUDA_CHECK(cudaFree(device_result));
    CUDA_CHECK(cudaFree(time));

    /* clock_t is not int on every platform; print it through long. */
    printf("%s,%ld\n", host_result, (long)time_used);
    return 0;
}
用命令行来编译相对简单许多。可以先执行 nvcc.exe --help >nvcc.txt 把帮助文档导出到文本文件,方便查看。对应上面的程序,批处理文件如下(保存为make.bat,双击运行):
@echo off
rem Build %myFun%.cu with nvcc, link it against the CUDA runtime using the
rem Visual Studio 2008 toolchain, then run the resulting executable.
rem The success banner is printed only when both the compile and the link
rem step succeeded; otherwise the script reports the failure and stops.

set myFun=sample

rem Pull the VS2008 compiler/linker and the CUDA headers, libraries and tools
rem into the environment of this script.
call "%VS90COMNTOOLS%vsvars32.bat"
set include=%CUDA_INC_PATH%;%include%
set lib=%CUDA_LIB_PATH%;%lib%
set path=%CUDA_BIN_PATH%;%path%

echo ------------------===By GoldenSpider 2011-10-8===------------------

rem /MD makes the host object use the DLL C runtime, matching msvcrt.lib below.
nvcc %myFun%.cu -c -Xcompiler "/MD " -o "%myFun%.obj"
if errorlevel 1 goto failed

link /OUT:"%myFun%.exe" /SUBSYSTEM:console /nologo %myFun%.obj cudart.lib kernel32.lib msvcrt.lib
if errorlevel 1 goto failed

echo -------------------------------------------------------------------
echo Good Job, Compiler Success!! Run EXE(Y/?)
pause
%myFun%.exe
pause
goto :eof

:failed
echo -------------------------------------------------------------------
echo Build failed; %myFun%.exe was not run.
pause
exit /b 1
效果:(好像不支持上传图片,就复制一下cmd下的结果吧^_^)
Setting environment for using Microsoft Visual Studio 2008 x86 tools.
------------------===By GoldenSpider 2011-10-8===------------------
sample.cu
tmpxft_00000cf0_00000000-3_sample.cudafe1.gpu
tmpxft_00000cf0_00000000-8_sample.cudafe2.gpu
sample.cu
tmpxft_00000cf0_00000000-3_sample.cudafe1.cpp
tmpxft_00000cf0_00000000-14_sample.ii
-------------------------------------------------------------------
Good Job, Compiler Success!! Run EXE(Y/?)
请按任意键继续. . .
Device 0 has compute capability 1.2 .
Hello CUDA!,8876
请按任意键继续. . .
上面是基本入门。如果想用VC6.0编译怎么办呢?要是用汇编又该怎么写呢?思路也很简单,就是用CUDA Driver API:设备码交给nvcc编译,得到ptx或cubin;主机码交给VC或汇编器来编译。对主机程序来说,ptx、cubin仅仅是数据,运行时用cuModuleLoad加载即可,实质上也确实是这么做的。具体可以参考vectorAddDrv这个实例。你可以先用下面的批处理把设备码编译成ptx:
:: Compile the device code in VecAdd.cu to PTX (VecAdd.ptx) with nvcc.
:: nvcc needs a host compiler, so the VS2008 environment is loaded first.
call "%VS90COMNTOOLS%vsvars32.bat"

:: Make the CUDA tools, headers and libraries visible to this session.
set path=%CUDA_BIN_PATH%;%path%
set include=%CUDA_INC_PATH%;%include%
set lib=%CUDA_LIB_PATH%;%lib%

:: -ptx stops after generating the PTX file; no host object is produced.
nvcc -ptx VecAdd.cu
再用VC6.0编译、链接主机码并运行:
@echo off
rem Build the host-side CUDA Driver API program with Visual C++ 6.0 and run it.
rem The executable is started only if both the compile and the link step
rem succeeded, so a stale or missing %myHost%.exe is never run by mistake.

rem NOTE: this vcvars32.bat path is specific to the author's VC6 install;
rem adjust it to the local installation directory.
call "E:\Microsoft Visual Studio\VC98\Bin\vcvars32.bat"
set include=%CUDA_INC_PATH%;%include%
set lib=%CUDA_LIB_PATH%;%lib%

set myHost=main

cl /c /MD %myHost%.cpp
if errorlevel 1 goto failed

rem cuda.lib is the import library for the Driver API (nvcuda.dll).
link /SUBSYSTEM:console /nologo %myHost%.obj cuda.lib kernel32.lib msvcrt.lib
if errorlevel 1 goto failed

%myHost%.exe
pause
goto :eof

:failed
echo Build failed; %myHost%.exe was not run.
pause
exit /b 1
执行效果:
cuDeviceGet returns: 0 cuCtxCreate returns: 0 cuModuleLoad returns: 0 allocating d_a returns: 0 copy data for a returns: 0 getting the function handle returns: 0 kernel launch returns: 0 copy from device to host returns: 0 2.1000 ....
查看其导入库:
导入表所处的节: .rdata
----------------------------------------------------------
导入库: nvcuda.dll
----------------------------------------------------------
OriginalFirstThunk 000020FC
TimeDateStamp 00000000
ForwarderChain 00000000
FirstThunk 00002044
----------------------------------------------------------
导入序号 导入函数名称
----------------------------------------------------------
00000084 cuInit
00000059 cuDeviceGetCount
00000057 cuDeviceGet
0000000D cuCtxCreate_v2
000000E0 cuModuleLoad
0000008E cuMemAlloc_v2
000000C6 cuMemcpyHtoD_v2
000000DB cuModuleGetFunction
00000088 cuLaunchKernel
000000BE cuMemcpyDtoH_v2
----------------------------------------------------------
导入库: MSVCRT.dll
----------------------------------------------------------
OriginalFirstThunk 000020B8
TimeDateStamp 00000000
ForwarderChain 00000000
FirstThunk 00002000
可以看到,运行库已经不再是MSVCR90.dll,而是MSVCRT.dll,呵呵。上面用到的VecAdd.cu代码如下:
// Element-wise vector addition: C[k] = A[k] + B[k] for 0 <= k < N.
//
// A, B : input arrays, each readable for at least N floats.
// C    : output array, writable for at least N floats.
// N    : number of elements to process.
//
// Each thread handles the single element selected by its 1D global index;
// threads whose index falls at or beyond N do nothing, so the launch may
// contain more threads than elements.
__global__ void VecAdd(const float* A, const float* B, float* C, int N)
{
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;

    // Surplus threads past the end of the arrays exit without touching memory.
    if (idx >= N) {
        return;
    }

    C[idx] = A[idx] + B[idx];
}