学了OpenCL有一段时间了,对于其思想已经有了一些了解。但是对于一些问题仍然难以理解。学习就是要通过阅读代码和自己试验总结经验。所以就有了这个系列的想法。
今天看的是AtomicCounters例子。这里对环境的配置等就略过了,因为这些可以在网上直接找到。
#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable
/**
 * Counts the number of occurrences of value in the input array using
 * an atomic counter object (counter32_t).
 *
 * Each work-item inspects exactly one element, selected by its global id
 * in dimension 0, and bumps the counter when that element equals value.
 * There is no bounds check, so the global work size must not exceed the
 * number of elements in input.
 */
__kernel
void atomicCounters(
    volatile __global uint *input,
    uint value,
    counter32_t counter)
{
    const size_t idx = get_global_id(0);

    if (input[idx] == value) {
        atomic_inc(counter);
    }
}
/**
 * Counts the number of occurrences of value in the input array using
 * a 32-bit atomic increment on a counter held in global memory.
 *
 * Each work-item inspects exactly one element, selected by its global id
 * in dimension 0, and atomically increments counter[0] when that element
 * equals value. There is no bounds check, so the global work size must
 * not exceed the number of elements in input.
 */
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
__kernel
void globalAtomics(
    volatile __global uint *input,
    uint value,
    __global uint* counter)
{
    const size_t idx = get_global_id(0);

    if (input[idx] == value) {
        atomic_inc(&counter[0]);
    }
}
在内核函数中因为要使用到OpenCL中的内置函数,所以在内核中加入了扩展编译选项#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable 以及#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable。先来看下OpenCL扩展,它的格式是:
#pragma OPENCL EXTENSION 扩展项:enable/disable
还有一些比如是否允许化简表达式,格式为:
#pragma OPENCL FP_CONTRACT ON/OFF/DEFAULT
对于OpenCL的扩展项,其命名的格式为cl_khr_<扩展名>,例如上面用到的cl_khr_global_int32_base_atomics。
再看这两个内核实际上区别非常小,主要就是atomic_inc函数的作用在哪里:globalAtomics作用于全局内存(__global)中的变量,atomicCounters作用于非全局变量,即counter32_t类型的原子计数器。这里看一下在传入内核参数时候的构建方式:
1)atomicCounter:
int
AtomicCounters::runAtomicCounterKernel()
{
cl_int status = CL_SUCCESS;
// Set Global and Local work items
size_t globalWorkItems = length;
size_t localWorkItems = counterWorkGroupSize;
// Initialize the counter value
cl_event writeEvt;
status = clEnqueueWriteBuffer(
commandQueue,
counterOutBuf,
CL_FALSE,
0,
sizeof(cl_uint),
&initValue,
0,
NULL,
&writeEvt);
CHECK_OPENCL_ERROR(status, "clEnqueueWriteBuffer(counterOutBuf) failed.");
status = clFlush(commandQueue);
CHECK_OPENCL_ERROR(status, "clFlush(commandQueue)failed.");
// Wait for event and release event
status = waitForEventAndRelease(&writeEvt);
CHECK_OPENCL_ERROR(status, "waitForEventAndRelease(writeEvt) failed.");
// Set kernel arguments
status = clSetKernelArg(counterKernel, 0, sizeof(cl_mem), &inBuf);
CHECK_OPENCL_ERROR(status, "clSetKernelArg(inBuf) failed.");
status = clSetKernelArg(counterKernel, 1, sizeof(cl_uint), &value);
CHECK_OPENCL_ERROR(status, "clSetKernelArg(value) failed.");
status = clSetKernelArg(counterKernel, 2, sizeof(cl_mem), &counterOutBuf);
CHECK_OPENCL_ERROR(status, "clSetKernelArg(counterOutBuf) failed.");
// Run Kernel
cl_event ndrEvt;
status = clEnqueueNDRangeKernel(
commandQueue,
counterKernel,
1,
NULL,
&globalWorkItems,
&localWorkItems,
0,
NULL,
&ndrEvt);
CHECK_OPENCL_ERROR(status, "clEnqueueNDRangeKernel(counterKernel) failed.");
status = clFlush(commandQueue);
CHECK_OPENCL_ERROR(status, "clFlush(commandQueue) failed.");
cl_int eventStatus = CL_QUEUED;
while(eventStatus != CL_COMPLETE)
{
status = clGetEventInfo(
ndrEvt,
CL_EVENT_COMMAND_EXECUTION_STATUS,
sizeof(cl_int),
&eventStatus,
NULL);
CHECK_OPENCL_ERROR(status, "clGetEventInfo(ndrEvt) failed.");
}
cl_ulong startTime;
cl_ulong endTime;
// Get profiling information
status = clGetEventProfilingInfo(
ndrEvt,
CL_PROFILING_COMMAND_START,
sizeof(cl_ulong),
&startTime,
NULL);
CHECK_OPENCL_ERROR(status,
"clGetEventProfilingInfo(CL_PROFILING_COMMAND_START) failed.");
status = clGetEventProfilingInfo(
ndrEvt,
CL_PROFILING_COMMAND_END,
sizeof(cl_ulong),
&endTime,
NULL);
CHECK_OPENCL_ERROR(status,
"clGetEventProfilingInfo(CL_PROFILING_COMMAND_END) failed.");
double sec = 1e-9 * (endTime - startTime);
kTimeAtomCounter += sec;
status = clReleaseEvent(ndrEvt);
CHECK_OPENCL_ERROR(status, "clReleaseEvent(ndrEvt) failed.");
// Get the occurrences of Value from atomicKernel
cl_event readEvt;
status = clEnqueueReadBuffer(
commandQueue,
counterOutBuf,
CL_FALSE,
0,
sizeof(cl_uint),
&counterOut,
0,
NULL,
&readEvt);
CHECK_OPENCL_ERROR(status, "clEnqueueReadBuffer(counterOutBuf) failed.");
status = clFlush(commandQueue);
CHECK_OPENCL_ERROR(status, "clFlush() failed.");
// Wait for event and release event
status = waitForEventAndRelease(&readEvt);
CHECK_OPENCL_ERROR(status, "waitForEventAndRelease(readEvt) failed.");
return SDK_SUCCESS;
}
// Create the command queue with profiling enabled. The run*Kernel functions
// query CL_PROFILING_COMMAND_START/END on events from this queue, which the
// OpenCL runtime only records when CL_QUEUE_PROFILING_ENABLE is set.
// NOTE(review): status is written here but its check is not visible in this
// excerpt -- confirm the caller tests it.
cl_command_queue_properties props = CL_QUEUE_PROFILING_ENABLE;
commandQueue = clCreateCommandQueue(context, devices[sampleArgs->deviceId],
props, &status);
int
AtomicCounters::runGlobalAtomicKernel()
{
cl_int status = CL_SUCCESS;
// Set Global and Local work items
size_t globalWorkItems = length;
size_t localWorkItems = globalWorkGroupSize;
// Initialize the counter value
cl_event writeEvt;
status = clEnqueueWriteBuffer(
commandQueue,
globalOutBuf,
CL_FALSE,
0,
sizeof(cl_uint),
&initValue,
0,
NULL,
&writeEvt);
CHECK_OPENCL_ERROR(status, "clEnqueueWriteBuffer(globalOutBuf) failed.");
status = clFlush(commandQueue);
CHECK_OPENCL_ERROR(status, "clFlush() failed.");
// Wait for event and release event
status = waitForEventAndRelease(&writeEvt);
CHECK_OPENCL_ERROR(status, "waitForEventAndRelease(writeEvt) failed.");
// Set kernel arguments
status = clSetKernelArg(globalKernel, 0, sizeof(cl_mem), &inBuf);
CHECK_OPENCL_ERROR(status, "clSetKernelArg(inBuf) failed.");
status = clSetKernelArg(globalKernel, 1, sizeof(cl_uint), &value);
CHECK_OPENCL_ERROR(status, "clSetKernelArg(value) failed.");
status = clSetKernelArg(globalKernel, 2, sizeof(cl_mem), &globalOutBuf);
CHECK_OPENCL_ERROR(status, "clSetKernelArg(globalOutBuf) failed.");
// Run Kernel
cl_event ndrEvt;
status = clEnqueueNDRangeKernel(
commandQueue,
globalKernel,
1,
NULL,
&globalWorkItems,
&localWorkItems,
0,
NULL,
&ndrEvt);
CHECK_OPENCL_ERROR(status, "clEnqueueNDRangeKernel(globalKernel) failed.");
status = clFlush(commandQueue);
CHECK_OPENCL_ERROR(status, "clFlush(commandQueue) failed.");
cl_int eventStatus = CL_QUEUED;
while(eventStatus != CL_COMPLETE)
{
status = clGetEventInfo(
ndrEvt,
CL_EVENT_COMMAND_EXECUTION_STATUS,
sizeof(cl_int),
&eventStatus,
NULL);
CHECK_OPENCL_ERROR(status, "clGetEventInfo(ndrEvt) failed.");
}
cl_ulong startTime;
cl_ulong endTime;
// Get profiling information
status = clGetEventProfilingInfo(
ndrEvt,
CL_PROFILING_COMMAND_START,
sizeof(cl_ulong),
&startTime,
NULL);
CHECK_OPENCL_ERROR(status,
"clGetEventProfilingInfo(CL_PROFILING_COMMAND_START) failed.");
status = clGetEventProfilingInfo(
ndrEvt,
CL_PROFILING_COMMAND_END,
sizeof(cl_ulong),
&endTime,
NULL);
CHECK_OPENCL_ERROR(status,
"clGetEventProfilingInfo(CL_PROFILING_COMMAND_END) failed.");
double sec = 1e-9 * (endTime - startTime);
kTimeAtomGlobal += sec;
status = clReleaseEvent(ndrEvt);
CHECK_OPENCL_ERROR(status, "clReleaseEvent(ndrEvt) failed.");
// Get the occurrences of Value from atomicKernel
cl_event readEvt;
status = clEnqueueReadBuffer(
commandQueue,
globalOutBuf,
CL_FALSE,
0,
sizeof(cl_uint),
&globalOut,
0,
NULL,
&readEvt);
CHECK_OPENCL_ERROR(status, "clEnqueueReadBuffer(globalOutBuf) failed.");
status = clFlush(commandQueue);
CHECK_OPENCL_ERROR(status, "clFlush() failed.");
// Wait for event and release event
status = waitForEventAndRelease(&readEvt);
CHECK_OPENCL_ERROR(status, "waitForEventAndRelease(readEvt) failed.");
return SDK_SUCCESS;
}