AMD OpenCL例子阅读笔记系列之AtomicCounters

         学了OpenCL有一段时间了,对于其思想已经有了一些了解。但是对于一些问题仍然难以理解。学习就是要通过阅读代码和自己试验总结经验。所以就有了这个系列的想法。

         今天看的是AtomicCounters例子。这里对环境的配置等就略过了,因为这些可以在网上直接找到。

       

#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable

/**
 * Counts number of occurrences of value in input array using an
 * atomic counter (cl_ext_atomic_counters_32).
 *
 * Each work-item reads the element at its own global id and
 * increments the counter when that element equals value.
 */
__kernel void atomicCounters(volatile __global uint *input,
                             uint value,
                             counter32_t counter)
{
    const size_t idx = get_global_id(0);
    const uint element = input[idx];

    if (element == value) {
        atomic_inc(counter);
    }
}



/**
 * Tallies how many elements of the input array equal value, using a
 * global-memory atomic increment on the first element of counter.
 *
 * Each work-item reads the element at its own global id and
 * increments counter[0] when that element equals value.
 */

#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
__kernel void globalAtomics(volatile __global uint *input,
                            uint value,
                            __global uint *counter)
{
    const size_t idx = get_global_id(0);
    const uint element = input[idx];

    if (element == value) {
        atomic_inc(counter);
    }
}
     在内核函数中因为要使用到OpenCL中的内置函数,所以在内核中加入了扩展编译选项#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable   以及#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable。先来看下OpenCL扩展,它的格式是:

     #pragma  OPENCL EXTENSION  扩展项:enable/disable

    还有一些其他的编译指示,比如是否允许对浮点表达式进行收缩(contraction,例如把乘法和加法合并为一条融合乘加指令),格式为:

    #pragma OPENCL FP_CONTRACT ON/OFF/DEFAULT

    对于OpenCL的扩展项,其定义的格式为 cl_khr_<扩展名>(Khronos 批准的扩展)或者 cl_<开发商名>_<扩展名>(开发商形式的扩展名)。

     再看这两个内核实际上区别非常小,主要就是atomic_inc函数作用的对象不同:一个作用于counter32_t类型的原子计数器,一个作用于__global地址空间中的uint变量。这里看一下在传入内核参数时候的构建方式:

1)atomicCounter:

int
AtomicCounters::runAtomicCounterKernel()
{
    cl_int status = CL_SUCCESS;
    // Set Global and Local work items
    size_t globalWorkItems = length;
    size_t localWorkItems = counterWorkGroupSize;
    // Initialize the counter value
    cl_event writeEvt;
    status = clEnqueueWriteBuffer(
                 commandQueue,
                 counterOutBuf,
                 CL_FALSE,
                 0,
                 sizeof(cl_uint),
                 &initValue,
                 0,
                 NULL,
                 &writeEvt);
    CHECK_OPENCL_ERROR(status, "clEnqueueWriteBuffer(counterOutBuf) failed.");
    status = clFlush(commandQueue);
    CHECK_OPENCL_ERROR(status, "clFlush(commandQueue)failed.");
    // Wait for event and release event
    status = waitForEventAndRelease(&writeEvt);
    CHECK_OPENCL_ERROR(status, "waitForEventAndRelease(writeEvt) failed.");
    // Set kernel arguments
    status = clSetKernelArg(counterKernel, 0, sizeof(cl_mem), &inBuf);
    CHECK_OPENCL_ERROR(status, "clSetKernelArg(inBuf) failed.");
    status = clSetKernelArg(counterKernel, 1, sizeof(cl_uint), &value);
    CHECK_OPENCL_ERROR(status, "clSetKernelArg(value) failed.");
    status = clSetKernelArg(counterKernel, 2, sizeof(cl_mem), &counterOutBuf);
    CHECK_OPENCL_ERROR(status, "clSetKernelArg(counterOutBuf) failed.");
    // Run Kernel
    cl_event ndrEvt;
    status = clEnqueueNDRangeKernel(
                 commandQueue,
                 counterKernel,
                 1,
                 NULL,
                 &globalWorkItems,
                 &localWorkItems,
                 0,
                 NULL,
                 &ndrEvt);
    CHECK_OPENCL_ERROR(status, "clEnqueueNDRangeKernel(counterKernel) failed.");
    status = clFlush(commandQueue);
    CHECK_OPENCL_ERROR(status, "clFlush(commandQueue) failed.");
    cl_int eventStatus = CL_QUEUED;
    while(eventStatus != CL_COMPLETE)
    {
        status = clGetEventInfo(
                     ndrEvt,
                     CL_EVENT_COMMAND_EXECUTION_STATUS,
                     sizeof(cl_int),
                     &eventStatus,
                     NULL);
        CHECK_OPENCL_ERROR(status, "clGetEventInfo(ndrEvt) failed.");
    }
    cl_ulong startTime;
    cl_ulong endTime;
    // Get profiling information
    status = clGetEventProfilingInfo(
                 ndrEvt,
                 CL_PROFILING_COMMAND_START,
                 sizeof(cl_ulong),
                 &startTime,
                 NULL);
    CHECK_OPENCL_ERROR(status,
                       "clGetEventProfilingInfo(CL_PROFILING_COMMAND_START) failed.");
    status = clGetEventProfilingInfo(
                 ndrEvt,
                 CL_PROFILING_COMMAND_END,
                 sizeof(cl_ulong),
                 &endTime,
                 NULL);
    CHECK_OPENCL_ERROR(status,
                       "clGetEventProfilingInfo(CL_PROFILING_COMMAND_END) failed.");
    double sec = 1e-9 * (endTime - startTime);
    kTimeAtomCounter += sec;
    status = clReleaseEvent(ndrEvt);
    CHECK_OPENCL_ERROR(status, "clReleaseEvent(ndrEvt) failed.");
    // Get the occurrences of Value from atomicKernel
    cl_event readEvt;
    status = clEnqueueReadBuffer(
                 commandQueue,
                 counterOutBuf,
                 CL_FALSE,
                 0,
                 sizeof(cl_uint),
                 &counterOut,
                 0,
                 NULL,
                 &readEvt);
    CHECK_OPENCL_ERROR(status, "clEnqueueReadBuffer(counterOutBuf) failed.");
    status = clFlush(commandQueue);
    CHECK_OPENCL_ERROR(status, "clFlush() failed.");
    // Wait for event and release event
    status = waitForEventAndRelease(&readEvt);
    CHECK_OPENCL_ERROR(status, "waitForEventAndRelease(readEvt) failed.");
    return SDK_SUCCESS;
}

        所有的OpenCL程序应该都遵循着将需要的内存空间从主机拷贝到设备,设备运算,然后设备再拷回主机的过程,这里主要需要注意的是对于counter32_t这个类型的内核形参变量也需要有这样的操作。同时在函数中使用了事件的方式来查看本次搜索的性能,需要注意的是,如果需要通过这种方式查看性能,则需要在创建命令队列时增加一个属性:

    // Request profiling on the queue so that clGetEventProfilingInfo can
    // report start/end timestamps for commands enqueued on it.
    cl_command_queue_properties props = CL_QUEUE_PROFILING_ENABLE;
    // Create the queue for the device selected by sampleArgs->deviceId;
    // the OpenCL result code is written to status.
    commandQueue = clCreateCommandQueue(context, devices[sampleArgs->deviceId],
                                        props, &status);

    而Global形式的原子操作主机调用函数为:

int
AtomicCounters::runGlobalAtomicKernel()
{
    cl_int status = CL_SUCCESS;
    // Set Global and Local work items
    size_t globalWorkItems = length;
    size_t localWorkItems = globalWorkGroupSize;
    // Initialize the counter value
    cl_event writeEvt;
    status = clEnqueueWriteBuffer(
                 commandQueue,
                 globalOutBuf,
                 CL_FALSE,
                 0,
                 sizeof(cl_uint),
                 &initValue,
                 0,
                 NULL,
                 &writeEvt);
    CHECK_OPENCL_ERROR(status, "clEnqueueWriteBuffer(globalOutBuf) failed.");
    status = clFlush(commandQueue);
    CHECK_OPENCL_ERROR(status, "clFlush() failed.");
    // Wait for event and release event
    status = waitForEventAndRelease(&writeEvt);
    CHECK_OPENCL_ERROR(status, "waitForEventAndRelease(writeEvt) failed.");
    // Set kernel arguments
    status = clSetKernelArg(globalKernel, 0, sizeof(cl_mem), &inBuf);
    CHECK_OPENCL_ERROR(status, "clSetKernelArg(inBuf) failed.");
    status = clSetKernelArg(globalKernel, 1, sizeof(cl_uint), &value);
    CHECK_OPENCL_ERROR(status, "clSetKernelArg(value) failed.");
    status = clSetKernelArg(globalKernel, 2, sizeof(cl_mem), &globalOutBuf);
    CHECK_OPENCL_ERROR(status, "clSetKernelArg(globalOutBuf) failed.");
    // Run Kernel
    cl_event ndrEvt;
    status = clEnqueueNDRangeKernel(
                 commandQueue,
                 globalKernel,
                 1,
                 NULL,
                 &globalWorkItems,
                 &localWorkItems,
                 0,
                 NULL,
                 &ndrEvt);
    CHECK_OPENCL_ERROR(status, "clEnqueueNDRangeKernel(globalKernel) failed.");
    status = clFlush(commandQueue);
    CHECK_OPENCL_ERROR(status, "clFlush(commandQueue) failed.");
    cl_int eventStatus = CL_QUEUED;
    while(eventStatus != CL_COMPLETE)
    {
        status = clGetEventInfo(
                     ndrEvt,
                     CL_EVENT_COMMAND_EXECUTION_STATUS,
                     sizeof(cl_int),
                     &eventStatus,
                     NULL);
        CHECK_OPENCL_ERROR(status, "clGetEventInfo(ndrEvt) failed.");
    }
    cl_ulong startTime;
    cl_ulong endTime;
    // Get profiling information
    status = clGetEventProfilingInfo(
                 ndrEvt,
                 CL_PROFILING_COMMAND_START,
                 sizeof(cl_ulong),
                 &startTime,
                 NULL);
    CHECK_OPENCL_ERROR(status,
                       "clGetEventProfilingInfo(CL_PROFILING_COMMAND_START) failed.");
    status = clGetEventProfilingInfo(
                 ndrEvt,
                 CL_PROFILING_COMMAND_END,
                 sizeof(cl_ulong),
                 &endTime,
                 NULL);
    CHECK_OPENCL_ERROR(status,
                       "clGetEventProfilingInfo(CL_PROFILING_COMMAND_END) failed.");
    double sec = 1e-9 * (endTime - startTime);
    kTimeAtomGlobal += sec;
    status = clReleaseEvent(ndrEvt);
    CHECK_OPENCL_ERROR(status, "clReleaseEvent(ndrEvt) failed.");
    // Get the occurrences of Value from atomicKernel
    cl_event readEvt;
    status = clEnqueueReadBuffer(
                 commandQueue,
                 globalOutBuf,
                 CL_FALSE,
                 0,
                 sizeof(cl_uint),
                 &globalOut,
                 0,
                 NULL,
                 &readEvt);
    CHECK_OPENCL_ERROR(status, "clEnqueueReadBuffer(globalOutBuf) failed.");
    status = clFlush(commandQueue);
    CHECK_OPENCL_ERROR(status, "clFlush() failed.");
    // Wait for event and release event
    status = waitForEventAndRelease(&readEvt);
    CHECK_OPENCL_ERROR(status, "waitForEventAndRelease(readEvt) failed.");
    return SDK_SUCCESS;
}

    与AtomicCounter那个没有什么大的区别,但是,因为kernel函数中最后一个参数地址空间限定符以及使用的原子操作范围的不同,所以最终的结果导致了只有global的那个原子操作才能得到正确的结果(这个在大数据的情况下更加明显)。其原因也很简单,第一个的atomic_inc按我的理解应该只对本组的进行同步,但是全局则保证了所有组的正常顺序进行操作。

你可能感兴趣的:(OpenCL)