OpenCL学习任务紧迫,仅看文字资料速度太慢,还是跟着大家一起通过尝试不同的代码,边做边学。昨天把上次写的一维数组相加的代码做了修改,变成二维数组相加。初学者就是这么无聊,大家见谅!代码还是在freescale的i.MX6q上跑的,这个芯片的GPU中work group大小最大为1024,因此一维数组元素个数最大就是1024,二维中也只能width * height <= 1024。另外,此代码中通过事件的方式,利用OpenCL的API对kernel运算进行了计时。借此可以慢慢了解一些事件相关的使用。
#include <stdio.h> #include <stdlib.h> #include <time.h> #include <iostream> #include <CL/cl.h> using namespace std; int main() { int width = 32, height = 32; float *buf1 = 0; float *buf2 = 0; float *buf = 0; buf1 = (float *)malloc(width * height * sizeof(float)); buf2 = (float *)malloc(width * height * sizeof(float)); buf = (float *)malloc(width * height * sizeof(float)); for(int i=0; i<32*32; i++) { buf1[i] = i*1.2; buf2[i] = i*1.5; } cl_int ret; //get platform id cl_uint numplatforms; cl_platform_id platform; cl_platform_id *platforms; ret = clGetPlatformIDs(0, NULL, &numplatforms); if( ret != CL_SUCCESS ) { printf("get platformID failed!"); return 0; } platforms = new cl_platform_id[numplatforms]; ret = clGetPlatformIDs(numplatforms, platforms, NULL); platform = platforms[0];//here we just use the first one of the platforms delete[] platforms; //get device id cl_uint numdevices; cl_device_id device; cl_device_id *devices; ret = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, NULL, &numdevices); if( ret != CL_SUCCESS ) { printf("get deviceID failed!"); return 0; } devices = new cl_device_id[numdevices]; ret = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, numdevices, devices, NULL); device = devices[0]; delete[] devices; //create context cl_context context = clCreateContext(NULL, 1, &device, NULL, NULL, NULL); //create commandqueue cl_command_queue queue = clCreateCommandQueue( context, device, CL_QUEUE_PROFILING_ENABLE, &ret ); if( ret != CL_SUCCESS ) { printf(" queue creation failure!\n"); return 0; } //create opencl memory objects //and copy buf1 to clbuf1 implicitly //while copy buf2 to clbuf2 explicitly cl_mem clbuf1 = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, width*height*sizeof(cl_float), buf1, NULL); cl_mem clbuf2 = clCreateBuffer(context, CL_MEM_READ_ONLY, width*height*sizeof(cl_float), NULL, NULL); ret = clEnqueueWriteBuffer(queue, clbuf2, 1, 0, width*height*sizeof(cl_float), buf2, 0, 0, NULL);// &writeEvt); cl_mem clbuf = 
clCreateBuffer(context, CL_MEM_WRITE_ONLY, width*height*sizeof(cl_float), NULL, NULL); const char* source = " __kernel void vecadd(__global const float *a, \ __global const float *b, __global const float *c)\ {\ int x = get_global_id(0);\ int y = get_global_id(1);\ int width = get_global_size(0);\ int height = get_global_size(1);\ c[x+y*width] = a[x+y*width]+b[x+y*width];\ }"; //create program object cl_program program = clCreateProgramWithSource(context, 1, &source, NULL, &ret); if( ret != CL_SUCCESS ) { printf(" program creation failure!\n"); return 0; } //build program object ret = clBuildProgram( program, 1, &device, NULL, NULL, NULL); if( ret != CL_SUCCESS ) { printf(" program build failure!\n"); return 0; } //create kernel cl_kernel kernel = clCreateKernel(program, "vecadd", &ret); if( ret != CL_SUCCESS ) { printf("kernel creation failure!\n"); return 0; } //set kernel argument cl_int num = width*height; clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&clbuf1); clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&clbuf2); clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&clbuf); //create kernel, rang 2dim, work itmes width*height size_t localx, localy; if(width/8 >4) localx = 16; else if(width<8) localx = width; else localx = 8; if(height/8 >4) localy = 16; else if(height<8) localy = height; else localy = 8; size_t globalThreads[] = {width, height}; size_t localThreads[] = {localx, localy}; cl_event ev; //execu kernel ret = clEnqueueNDRangeKernel(queue, kernel, 2, 0, globalThreads, localThreads, 0, NULL, &ev); if( ret != CL_SUCCESS ) { if(ret == CL_INVALID_KERNEL) printf("invalid kernel!\n"); printf("%d\n", ret); printf("execution failure!\n"); return 0; } clFinish( queue ); //calc the execu time cl_ulong startTime = 0, endTime = 0; clGetEventProfilingInfo(ev, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &startTime, NULL); clGetEventProfilingInfo(ev, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime, NULL); cl_ulong time = endTime - startTime; 
printf("simple kernel start time: %8.6f ms\n", startTime*1e-6); printf("simple kernel end time: %8.6f ms\n", endTime*1e-6); printf("simple kernel exec time: %8.6f ms\n", time*1e-6); return 0; }
本来想把同样的计时方式添加到上篇中一维数组的计算当中,但不论怎么修改,程序运行都正常,但计时始终显示为0,不能得到有效时间。有感兴趣的初学者朋友可以试试,ok的话给留个言。多谢。
刚说完问题就找到了,在创建命令队列的时候有一个参数需要使能。多亏一起学习的哥们帮忙,看来常用的API每一个参数的功能还是要好好看。
创建命令队列的函数原型为:
cl_command_queue clCreateCommandQueue ( cl_context context ,
cl_device_id device ,
cl_command_queue_properties properties,
cl_int *errcode_ret )
第一个参数为一个有效的上下文context。
第二个参数为与该上下文绑定的设备device。
第三个参数为要建立的队列的属性,有两个备选的属性,CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE和CL_QUEUE_PROFILING_ENABLE。前者决定该命令队列是否按顺序执行,还是乱序执行,如果设置了,则该命令队列将乱序执行,否则按顺序执行。第二个属性设置是否剖析命令。如果设置则剖析命令,否则不剖析。这个剖析命令的使能设置就影响了该命令执行计时的问题。 详情请查询opencl手册。
第四个参数返回错误码。
具体问题如下:
此篇中,二维数组命令队列建立语句为:
//create commandqueue
cl_command_queue queue = clCreateCommandQueue( context, device, CL_QUEUE_PROFILING_ENABLE, &ret );
上篇中,一维数组由于没想使用事件,命令队列的建立语句为:
//Command-queue
queue = clCreateCommandQueue( context, device, 0, &errNum );
其中第三个参数在一维数组的计算中没有设置剖析命令,因此无法获取命令执行时间。设置为CL_QUEUE_PROFILING_ENABLE即可计算运行时间。
对比发现,在freescale的i.MX6q上,两个含有1024个元素的一维数组相加耗时,是两个32*32的二维数组相加耗时的至少两倍。