学了OpenCL有一段时间了,对于其思想已经有了一些了解。但是对于一些问题仍然难以理解。学习就是要通过阅读代码和自己试验总结经验。所以就有了这个系列的想法。
今天看的是AtomicCounters例子。这里对环境的配置等就略过了,因为这些可以在网上直接找到。
/*
 * Enable the 32-bit atomic counter extension; the counter32_t parameter of
 * the kernel below presumably comes from it (NOTE(review): confirm against
 * the extension specification).
 */
#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable

/*
 * Counts occurrences of value in the input array using an atomic counter.
 * Each work-item compares one element, input[get_global_id(0)], against
 * value and increments counter on a match.
 * There is no bounds check: the launch is assumed to use no more work-items
 * than input has elements - TODO confirm against the host code.
 */
__kernel void atomicCounters(
    volatile __global uint *input,
    uint value,
    counter32_t counter)
{
    size_t globalId = get_global_id(0);

    if(value == input[globalId])
        atomic_inc(counter);
}

/**
 * Counts number of occurrences of value in input array using
 * global atomics. Same comparison as atomicCounters, but the increment is
 * applied to counter[0], an unsigned int in __global memory.
 */
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
__kernel void globalAtomics(
    volatile __global uint *input,
    uint value,
    __global uint* counter)
{
    size_t globalId = get_global_id(0);

    if(value == input[globalId])
        atomic_inc(&counter[0]);
}
在内核函数中因为要使用到OpenCL中的内置函数,所以在内核中加入了扩展编译选项#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable 以及#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable。先来看下OpenCL扩展,它的格式是:
#pragma OPENCL EXTENSION 扩展项:enable/disable
还有一些比如是否允许化简表达式,格式为:
#pragma OPENCL FP_CONTRACT :ON/OFF/DEFAULT
对于OpenCL的扩展项,其命名格式为:由Khronos批准的扩展使用cl_khr_<name>,由开发商(厂商)提供的扩展使用cl_<vendor_name>_<name>。
再看这两个内核,实际上区别非常小,主要在于atomic_inc函数作用的对象:globalAtomics作用于全局内存中的变量(__global uint*),而atomicCounters作用于counter32_t类型的原子计数器(非全局变量)。下面看一下主机端向内核传入参数时的构建方式:
1)atomicCounter:
int AtomicCounters::runAtomicCounterKernel() { cl_int status = CL_SUCCESS; // Set Global and Local work items size_t globalWorkItems = length; size_t localWorkItems = counterWorkGroupSize; // Initialize the counter value cl_event writeEvt; status = clEnqueueWriteBuffer( commandQueue, counterOutBuf, CL_FALSE, 0, sizeof(cl_uint), &initValue, 0, NULL, &writeEvt); CHECK_OPENCL_ERROR(status, "clEnqueueWriteBuffer(counterOutBuf) failed."); status = clFlush(commandQueue); CHECK_OPENCL_ERROR(status, "clFlush(commandQueue)failed."); // Wait for event and release event status = waitForEventAndRelease(&writeEvt); CHECK_OPENCL_ERROR(status, "waitForEventAndRelease(writeEvt) failed."); // Set kernel arguments status = clSetKernelArg(counterKernel, 0, sizeof(cl_mem), &inBuf); CHECK_OPENCL_ERROR(status, "clSetKernelArg(inBuf) failed."); status = clSetKernelArg(counterKernel, 1, sizeof(cl_uint), &value); CHECK_OPENCL_ERROR(status, "clSetKernelArg(value) failed."); status = clSetKernelArg(counterKernel, 2, sizeof(cl_mem), &counterOutBuf); CHECK_OPENCL_ERROR(status, "clSetKernelArg(counterOutBuf) failed."); // Run Kernel cl_event ndrEvt; status = clEnqueueNDRangeKernel( commandQueue, counterKernel, 1, NULL, &globalWorkItems, &localWorkItems, 0, NULL, &ndrEvt); CHECK_OPENCL_ERROR(status, "clEnqueueNDRangeKernel(counterKernel) failed."); status = clFlush(commandQueue); CHECK_OPENCL_ERROR(status, "clFlush(commandQueue) failed."); cl_int eventStatus = CL_QUEUED; while(eventStatus != CL_COMPLETE) { status = clGetEventInfo( ndrEvt, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &eventStatus, NULL); CHECK_OPENCL_ERROR(status, "clGetEventInfo(ndrEvt) failed."); } cl_ulong startTime; cl_ulong endTime; // Get profiling information status = clGetEventProfilingInfo( ndrEvt, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &startTime, NULL); CHECK_OPENCL_ERROR(status, "clGetEventProfilingInfo(CL_PROFILING_COMMAND_START) failed."); status = clGetEventProfilingInfo( ndrEvt, 
CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime, NULL); CHECK_OPENCL_ERROR(status, "clGetEventProfilingInfo(CL_PROFILING_COMMAND_END) failed."); double sec = 1e-9 * (endTime - startTime); kTimeAtomCounter += sec; status = clReleaseEvent(ndrEvt); CHECK_OPENCL_ERROR(status, "clReleaseEvent(ndrEvt) failed."); // Get the occurrences of Value from atomicKernel cl_event readEvt; status = clEnqueueReadBuffer( commandQueue, counterOutBuf, CL_FALSE, 0, sizeof(cl_uint), &counterOut, 0, NULL, &readEvt); CHECK_OPENCL_ERROR(status, "clEnqueueReadBuffer(counterOutBuf) failed."); status = clFlush(commandQueue); CHECK_OPENCL_ERROR(status, "clFlush() failed."); // Wait for event and release event status = waitForEventAndRelease(&readEvt); CHECK_OPENCL_ERROR(status, "waitForEventAndRelease(readEvt) failed."); return SDK_SUCCESS; }
// Create the command queue with profiling enabled. The kernel-run functions
// read CL_PROFILING_COMMAND_START / CL_PROFILING_COMMAND_END through
// clGetEventProfilingInfo on events from this queue.
// NOTE(review): status is not checked in this excerpt - confirm the
// surrounding code checks it right after the call.
cl_command_queue_properties props = CL_QUEUE_PROFILING_ENABLE; commandQueue = clCreateCommandQueue(context, devices[sampleArgs->deviceId], props, &status);
int AtomicCounters::runGlobalAtomicKernel() { cl_int status = CL_SUCCESS; // Set Global and Local work items size_t globalWorkItems = length; size_t localWorkItems = globalWorkGroupSize; // Initialize the counter value cl_event writeEvt; status = clEnqueueWriteBuffer( commandQueue, globalOutBuf, CL_FALSE, 0, sizeof(cl_uint), &initValue, 0, NULL, &writeEvt); CHECK_OPENCL_ERROR(status, "clEnqueueWriteBuffer(globalOutBuf) failed."); status = clFlush(commandQueue); CHECK_OPENCL_ERROR(status, "clFlush() failed."); // Wait for event and release event status = waitForEventAndRelease(&writeEvt); CHECK_OPENCL_ERROR(status, "waitForEventAndRelease(writeEvt) failed."); // Set kernel arguments status = clSetKernelArg(globalKernel, 0, sizeof(cl_mem), &inBuf); CHECK_OPENCL_ERROR(status, "clSetKernelArg(inBuf) failed."); status = clSetKernelArg(globalKernel, 1, sizeof(cl_uint), &value); CHECK_OPENCL_ERROR(status, "clSetKernelArg(value) failed."); status = clSetKernelArg(globalKernel, 2, sizeof(cl_mem), &globalOutBuf); CHECK_OPENCL_ERROR(status, "clSetKernelArg(globalOutBuf) failed."); // Run Kernel cl_event ndrEvt; status = clEnqueueNDRangeKernel( commandQueue, globalKernel, 1, NULL, &globalWorkItems, &localWorkItems, 0, NULL, &ndrEvt); CHECK_OPENCL_ERROR(status, "clEnqueueNDRangeKernel(globalKernel) failed."); status = clFlush(commandQueue); CHECK_OPENCL_ERROR(status, "clFlush(commandQueue) failed."); cl_int eventStatus = CL_QUEUED; while(eventStatus != CL_COMPLETE) { status = clGetEventInfo( ndrEvt, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &eventStatus, NULL); CHECK_OPENCL_ERROR(status, "clGetEventInfo(ndrEvt) failed."); } cl_ulong startTime; cl_ulong endTime; // Get profiling information status = clGetEventProfilingInfo( ndrEvt, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &startTime, NULL); CHECK_OPENCL_ERROR(status, "clGetEventProfilingInfo(CL_PROFILING_COMMAND_START) failed."); status = clGetEventProfilingInfo( ndrEvt, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), 
&endTime, NULL); CHECK_OPENCL_ERROR(status, "clGetEventProfilingInfo(CL_PROFILING_COMMAND_END) failed."); double sec = 1e-9 * (endTime - startTime); kTimeAtomGlobal += sec; status = clReleaseEvent(ndrEvt); CHECK_OPENCL_ERROR(status, "clReleaseEvent(ndrEvt) failed."); // Get the occurrences of Value from atomicKernel cl_event readEvt; status = clEnqueueReadBuffer( commandQueue, globalOutBuf, CL_FALSE, 0, sizeof(cl_uint), &globalOut, 0, NULL, &readEvt); CHECK_OPENCL_ERROR(status, "clEnqueueReadBuffer(globalOutBuf) failed."); status = clFlush(commandQueue); CHECK_OPENCL_ERROR(status, "clFlush() failed."); // Wait for event and release event status = waitForEventAndRelease(&readEvt); CHECK_OPENCL_ERROR(status, "waitForEventAndRelease(readEvt) failed."); return SDK_SUCCESS; }