在网上查了如何在 NVIDIA GPU 上运行 OpenCL,结果大部分资料都是介绍在 AMD GPU 上开发的。所以在此写一个简单的例子,介绍如何在 NVIDIA GPU 上进行 OpenCL 开发。
NVIDIA的显卡上,cuda装好opencl就自然有了,所以装cuda=装opencl. 【CUDA安装见 CUDA安装文章】
以下代码的功能是实现向量加法:C = A + B。代码参照了一本介绍在 AMD GPU 上开发的 OpenCL 教程(具体书名记不清了),我在此基础上略微做了一些添加和改动。
系统:redhat Linux
GPU:NVIDIA K40
软件环境:CUDA-6.0
一:此程序有两个文件 1.vecadd.c 2 add.cl
vecadd.c #include<stdio.h> #include<CL/cl.h> #include<stdlib.h> #include<time.h> #include<string.h> #include<iostream> #include<fstream> using namespace std; #define NWITEMS 262144 //2^18=262144 //read a text file into a string int convertToString(const char *filename, std::string& s) { size_t size; char* str; std::fstream f(filename,(std::fstream::in | std::fstream::binary)); if(f.is_open()){ size_t filesize; f.seekg(0,std::fstream::end); size=filesize=(size_t)f.tellg(); f.seekg(0,std::fstream::beg); str = new char[size+1]; if(!str){ f.close(); return -1; } f.read(str,filesize); f.close(); str[size]='\0'; s=str; delete[] str; return 0; } printf("Error: Failed to open file %s\n",filename); return 1; } int main() { float* buff1=NULL; float* buff2=NULL; float* buff=NULL; buff1=(float*)malloc(NWITEMS*sizeof(float)); buff2=(float*)malloc(NWITEMS*sizeof(float)); buff=(float*)malloc(NWITEMS*sizeof(float)); int i; srand((unsigned)time(NULL)); for(i=0;i<NWITEMS;i++){ buff1[1]=rand()%RAND_MAX; } srand((unsigned)time(NULL)+1000); for(i=0;i<NWITEMS;i++){ buff2[1]=rand()%RAND_MAX; } for(i=0;i<NWITEMS;i++){ buff[i]=buff1[i]+buff2[i]; } /************openCL begain*******/ cl_int status=0; size_t deviceListSize; cl_uint numPlatforms; cl_platform_id platform=NULL; /**********get platform*************/ status=clGetPlatformIDs(0,NULL,&numPlatforms);//get number of platform //printf("status=%d\nnumPlatforms=%d\n",status,numPlatforms); status=clGetPlatformIDs(1,&platform,NULL);//choose NV platform /**********************get device form NV platform**************/ cl_uint numdevices; cl_device_id device=NULL; clGetDeviceIDs(platform,CL_DEVICE_TYPE_GPU,0,NULL,&numdevices);//get device number of NV platform printf("numDevice=%d\n",numdevices); clGetDeviceIDs(platform,CL_DEVICE_TYPE_GPU,1,&device,NULL);//choose a NV GPU device cl_context context = clCreateContext(NULL,1,&device,NULL,NULL,NULL); //create Context; cl_command_queue queue = 
clCreateCommandQueue(context,device,CL_QUEUE_PROFILING_ENABLE,NULL);//create commmand queue and enable profiling /********************create openCL buffer object, and then copy buff1 to clbuff1, buf2 copy to clbuff2*****************/ cl_mem clbuf1 = clCreateBuffer(context,CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,NWITEMS*sizeof(cl_float),buff1,NULL); cl_mem clbuf2 = clCreateBuffer(context,CL_MEM_READ_ONLY,NWITEMS*sizeof(cl_float),NULL,NULL); status = clEnqueueWriteBuffer(queue,clbuf2,1,0,NWITEMS*sizeof(cl_float),buff2,0,0,0); cl_mem clbuf = clCreateBuffer(context,CL_MEM_WRITE_ONLY,NWITEMS*sizeof(cl_float),NULL,NULL); const char* filename="add.cl"; std::string sourceStr; status = convertToString(filename,sourceStr); const char* source = sourceStr.c_str(); size_t sourceSize[]={ strlen(source) }; /************create program object***********/ cl_program program = clCreateProgramWithSource(context,1,&source,sourceSize,NULL); status = clBuildProgram(program, 1, &device, NULL, NULL, NULL); if(status!=0){ fprintf(stderr,"clBuild failed:%d\n",status); char tbuf[100]; clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0x10000, tbuf, NULL); fprintf(stderr,"\n%s\n",tbuf); return -1; } /***********create kernel object************/ cl_kernel kernel = clCreateKernel(program, "vecadd", NULL); //set kernel argument cl_int clnum=NWITEMS; clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&clbuf1); clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&clbuf2); clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&clbuf); cl_event ev; size_t global_work_size=NWITEMS; clEnqueueNDRangeKernel(queue,kernel,1,NULL, &global_work_size,NULL,0,NULL,&ev); clFinish(queue); /**********copy data from device to host memory *****/ cl_float *ptr; ptr=(cl_float *) clEnqueueMapBuffer(queue,clbuf, CL_TRUE, CL_MAP_READ, 0, NWITEMS * sizeof(cl_float),0,NULL, NULL, NULL); /***************vertify the result******/ if(!memcmp(buff,ptr,NWITEMS)){ printf("vertify passed\n"); }else{ printf("vertify failed\n"); } 
if(buff) free(buff); if(buff1) free(buff1); if(buff2) free(buff2); /*************free OpenCL resource object************/ clReleaseMemObject(clbuf1); clReleaseMemObject(clbuf2); clReleaseMemObject(clbuf); clReleaseProgram(program); clReleaseCommandQueue(queue); clReleaseContext(context); return 0; }
// add.cl -- element-wise vector addition: C[i] = A[i] + B[i].
//
// Each work-item handles the single element selected by its global id in
// dimension 0. There is no bounds check, so the host must not launch more
// work-items than the buffers have elements.
//
//   A, B : input vectors (read only)
//   C    : output vector
__kernel void vecadd(__global const float* A,
                     __global const float* B,
                     __global float* C)
{
    int gid = get_global_id(0);
    const float sum = A[gid] + B[gid];
    C[gid] = sum;
}
二:编译
nvcc vecadd.cpp -o vecadd -lOpenCL //源文件中使用了 C++(iostream、std::string 等),需保存为 .cpp 后再编译;注意 -lOpenCL 的大小写
三:执行
./vecadd
四:结果
numPlatforms=1 //环境中只有NVIDIA的平台,所以为 1
numDevice=3 //共有三块NVIDIA卡
verify passed