OpenCL Programming Template -- Juliet






#include <stdio.h>

#include <Cl/cl.h>



#include <intrin.h>

#include <windows.h>


//We deliberately do not use oclUtils.h (the oclXXX() and shrXXX() helpers) because those helpers are not part of the OpenCL API itself.

//#include <oclUtils.h>


#include "FloatTest.h"


cl_context cxGPUContext = NULL;             

cl_device_id device;

cl_command_queue cqCommandQueue = NULL;       

cl_mem d_data = NULL;

cl_int ciErrNum = CL_SUCCESS;

cl_program cpProgram = NULL, cpProgram1 = NULL, cpProgram2=NULL, cpProgram3 = NULL; //Because the length of the lenght is limited, so a big .cl needs several cpPrograms.

size_t program_length = 0, program_length1=0, program_length2=0, program_length3 = 0;

cl_kernel kernel = NULL;

size_t max_item[4];

size_t localWorkSize;

size_t globalWorkSize;


const int testnum = 32;

double time[256] = {0.0};

unsigned long ops_cnt = 1 * 1024 * 1024 * 1024UL;

const float mem_size = sizeof(float) * 65536;


static int k = 0;

bool itemFlag = true;


typedef struct


     cl_platform_id platform;

     cl_device_id* devices;

     cl_uint numDevices;


PD* platforms;

cl_uint numPlatforms;


int main()



     unsigned int i;


     //Get counterFreq of your CPU, used in InitTimer()/GetTimer()

     counterFreq = GetCPUSpeed();


     //Get the number of the platforms

     ciErrNum = clGetPlatformIDs(0, NULL, &numPlatforms);


     if(ciErrNum != CL_SUCCESS) return 1;

     if( numPlatforms > 0)


         platforms = new PD[numPlatforms];


         //malloc for these platforms

         cl_platform_id* platformsTem = new cl_platform_id[numPlatforms];


         //get these platforms

         ciErrNum = clGetPlatformIDs(numPlatforms, platformsTem, NULL);


         for(i=0; i<numPlatforms; i++)

              platforms[i].platform = platformsTem[i];

         delete[] platformsTem;


         cl_uint maxPerPlatform = 0;

         for(i = 0; i < numPlatforms; i++)


              char pbuf[100];


              //get detailed info about this platform,e.g., CL_PLATFORM_NAME, /version/vendor

              ciErrNum = clGetPlatformInfo(platforms[i].platform, CL_PLATFORM_NAME, sizeof(pbuf), pbuf, NULL);



              //get the number of devices supported by this platform.(type could be CL_DEVICE_TYPE_ALL)

              ciErrNum = clGetDeviceIDs(platforms[i].platform, CL_DEVICE_TYPE_GPU, 0, NULL, &(platforms[i].numDevices));


              if(platforms[i].numDevices > maxPerPlatform)

                   maxPerPlatform = platforms[i].numDevices;


              //get these devices supported by this platform

              platforms[i].devices = new cl_device_id[platforms[i].numDevices];

              //get one device is simple

              //clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 1, &cdDevice, NULL);

              ciErrNum = clGetDeviceIDs(platforms[i].platform, CL_DEVICE_TYPE_GPU, platforms[i].numDevices, platforms[i].devices, NULL);




         /* get platforms and its supporting devices through oclXXX()


         //get platforms

         char cBuffer[1024];

         cl_platform_id cpPlatform = NULL;        


         clGetPlatformInfo (cpPlatform, CL_PLATFORM_NAME, sizeof(cBuffer), cBuffer, NULL);


         //get devices

         clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 0, NULL, &ciDeviceCount);

         cdDevices = (cl_device_id *)malloc(ciDeviceCount * sizeof(cl_device_id) );

         clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, ciDeviceCount, cdDevices, NULL);


         clGetDeviceInfo(cdDevices[i], CL_DEVICE_NAME, sizeof(cBuffer), &cBuffer, NULL);

         oclPrintDevInfo(LOGBOTH, cdDevices[i]);


         //record into log

         std::string sProfileString = "oclDeviceQuery,[Platform]Device Name = ";

         sProfileString += ", Device = ";

        sProfileString += cBuffer;


         sProfileString += "/n";

         shrLogEx(LOGBOTH | MASTER, 0, sProfileString.c_str());



         if(maxPerPlatform < 0)

              return 1;

         double* dpTime = new double[ 10 * numPlatforms * maxPerPlatform]; //item <= 10

         unsigned long item = 0;

         int flag = FloatTest(dpTime, &item, &ciErrNum, platforms, numPlatforms);

         if(flag != 0 ) return 1;


         for(int j=0; j< 5; j++)

              printf("%.3lf %.3lf GLOPS/n", dpTime[j] / 1000000000.0,dpTime[j+5]/ 1000000000.0);



         delete[] dpTime;

         for(i = 0; i < numPlatforms; i++)

              delete[] platforms[i].devices;

         delete[] platforms;




         return 1;





@ double* dpTime, 返回数据的数组;

@ unsigned long* item, 测试项的个数,在每个平台下的每个平台上都会测这几个测试项,例如AddMulMad

@ cl_int* pciErrNum,错误代号;

@ PD* platforms,自定义的PD类型的platforms[]

@ cl_uint numPlatforms,用来说明平台个数,即platforms[]数组元素个数;

@ __int64 counterFreq_main,计时函数中需要用到的一个参数;


int FloatTest(double* dpTime, unsigned long* item, cl_int* pciErrNum, PD* platforms, cl_uint numPlatforms)



     cl_uint pi,di;

     for(pi = 0; pi<numPlatforms; pi++)


         cxGPUContext = clCreateContext(0, platforms[pi].numDevices, platforms[pi].devices, NULL, NULL, &*pciErrNum);


         //-----------reate context---------NO clCreateContextFromType().x and later driver doesn't support it well


         cxGPUContext = clCreateContextFromType(0, CL_DEVICE_TYPE_GPU, NULL, NULL, &*pciErrNum);  //Jun: cxGPUContext = clCreateContext(0, pInfoDlg->m_iCLDeviceBMCount, pInfoDlg->m_idCLDeviceListBM, NULL, NULL, &ciErrNum);


         //--------choose device----------

         //Above, we use platform info to get devices info.

         //Here, we could get devices info from context.We could get devices under certain platform(context <-> platform), or, we get all devices(just one context).

         size_t nDeviceBytes;

         *pciErrNum |= clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, 0, NULL, &nDeviceBytes);

         ciDeviceCount = (cl_uint)nDeviceBytes/sizeof(cl_device_id);



         for(unsigned int i = 0; i < ciDeviceCount; ++i)


              // get and print the device for this queue

              device = oclGetDev(cxGPUContext, i);

              oclPrintDevName(LOGBOTH, device);






         //program setup, create the program, build the program, create d_data

         int flag = subFloatTest(pciErrNum);

         if(flag != 0)   return 1; //one fail, then return.



    for(di = 0; di<platforms[pi].numDevices; di++)


              device = (platforms[pi].devices)[di];


              //Attention: some global var may be changed during one iteration, so we assign it the initial value.

              //If the value will not be changed, I suggest to declare it as "const"

              ops_cnt = 1 * 1024 * 1024 * 1024UL;


              runOnDevice(dpTime, item, pciErrNum);















int subFloatTest(cl_int* pciErrNum)


     //program setup, .h(char[]) replaces .cl

     //create the program

     cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&FloatTestCLsource, &program_length, &*pciErrNum);



     cpProgram1 = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&kernelMul, &program_length1, &*pciErrNum);



     cpProgram2 = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&kernelMAD, &program_length2, &*pciErrNum);



     cpProgram3 = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&kernelSF, &program_length3, &*pciErrNum);



 //if we use .cl

    char* source_path = "";

    char *source = oclLoadProgSource(source_path, "", &program_length);

    cpProgram = clCreateProgramWithSource(cxGPUContext, 1,

                         (const char **)&source, &program_length, &*pciErrNum);

    if(CL_SUCCESS != *pciErrNum ) return 1;



     //build the program

    *pciErrNum = clBuildProgram(cpProgram, 0, NULL, "", NULL, NULL);



     *pciErrNum = clBuildProgram(cpProgram1, 0, NULL, "", NULL, NULL);



     *pciErrNum = clBuildProgram(cpProgram2, 0, NULL, "", NULL, NULL);



     *pciErrNum = clBuildProgram(cpProgram3, 0, NULL, "", NULL, NULL);



     //inputhost memory

     d_data = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, mem_size, NULL, &*pciErrNum);


     //cl_mem h_A = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,mem_size_A, h_A_data, &ciErrNum);


     return 0;



int runOnDevice(double* AddIOPS, unsigned long* item, cl_int* pciErrNum)


     //get Throughput, /10^9 (GIOPS)

     double final;


     *pciErrNum = ms(&final,"floatAddTest",pciErrNum,1024, 0);


     if(final > 0.0)

         AddIOPS[k++] = ops_cnt / (final /1000.0);


     *pciErrNum = ms(&final,"floatMulTest",pciErrNum,1024,1);


     if(final > 0.0)

         AddIOPS[k++] = ops_cnt / (final /1000.0);


     *pciErrNum = ms(&final,"floatMADTest",pciErrNum,1024,2);


     if(final > 0.0)

         AddIOPS[k++] = ops_cnt / (final /1000.0) * 2.0;


     ops_cnt = 128 * 1024 * 1024UL;


     *pciErrNum = ms(&final,"floatSFTest",pciErrNum,16,3);


     if(final > 0.0)

         AddIOPS[k++] = ops_cnt / (final /1000.0);


     *pciErrNum = ms(&final,"floatNativeSFTest",pciErrNum,16,3);


     if(final > 0.0)

         AddIOPS[k++] = ops_cnt / (final /1000.0);


     if(itemFlag == true)


         *item = k;

         itemFlag = false;



     return 0;




int ms(double *final,const char* kernelName, cl_int *pciErrNum, int coef,int id)


     // create a command-queue

    cqCommandQueue = clCreateCommandQueue(cxGPUContext, device, 0, &*pciErrNum);

    if(CL_SUCCESS != *pciErrNum )

         return 1;


     //which kernel



     case 0: kernel = clCreateKernel(cpProgram, kernelName, &*pciErrNum); break;

     case 1: kernel = clCreateKernel(cpProgram1, kernelName, &*pciErrNum); break;

     case 2: kernel = clCreateKernel(cpProgram2, kernelName, &*pciErrNum); break;

     case 3: kernel = clCreateKernel(cpProgram3, kernelName, &*pciErrNum); break;


     if(CL_SUCCESS != *pciErrNum )

         return 1;


     //setup execution parameter

     clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(max_item), max_item, NULL);

     localWorkSize = max_item[0] < 256 ? max_item[0] : 256;

     globalWorkSize = ops_cnt / coef;


     //set kernel arg

     *pciErrNum = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&d_data);

    if(CL_SUCCESS != *pciErrNum )

         return 1;



         //input data in device memory. clCreateBuffer + clEnqueueCopyBuffer()

         d_A[i] = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, workSize[i] * sizeof(float) * WA, NULL,NULL);

         clEnqueueCopyBuffer(commandQueue[i], h_A, d_A[i], workOffset[i] * sizeof(float) * WA, 0, workSize[i] * sizeof(float) * WA, 0, NULL, NULL);


         //or, we do like this: create + copy

        d_B[i] = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,

                                mem_size_B, h_B_data, NULL);


         //Asynchronous write of data to GPU device

         clEnqueueWriteBuffer(cqCommandQueue, cmDevSrcA, CL_FALSE, 0, sizeof(cl_float) * szGlobalWorkSize * 4, srcA, 0, NULL, NULL);




     //warmup so we don't time driver startup

    *pciErrNum = clEnqueueNDRangeKernel(cqCommandQueue, kernel, 1, 0,&globalWorkSize, &localWorkSize, 0, NULL, NULL);

     if(CL_SUCCESS != *pciErrNum ) return 1;





     //cl_event kernel_completion;


     //foreach testnum

     for(int i = 0; i< testnum; i++)


     //   shrDeltaT(0);  //us



         *pciErrNum = clEnqueueNDRangeKernel(cqCommandQueue, kernel, 1, NULL, &globalWorkSize, &localWorkSize, 0, NULL, NULL); //&kernel_completion

         if(CL_SUCCESS != *pciErrNum ) return 1;

     //   clWaitForEvents(1, &kernel_completion);

         *pciErrNum = clFinish(cqCommandQueue);

         if(CL_SUCCESS != *pciErrNum ) return 1;


     //   time[i] = shrDeltaT(0) * 1000.0; //ms

         time[i] = GetTimer(); //ms



     //clReleaseEvent(kernel_completion); //we use clFinish() or event to synchronize CPU and GPU


     // Read back results and check accumulated errors

    //clEnqueueReadBuffer(cqCommandQueue, cmDevDst, CL_TRUE, 0, sizeof(cl_float) * szGlobalWorkSize, dst, 0, NULL, NULL);







     //get ms

     *final = GetTimeMin(testnum);


     return 0;



double GetTimeMin(int len)


     double tem = time[0];

     for(int i=1;i<len;i++)

         if(time[i] < tem)

              tem = time[i];

     return tem;


__inline void InitTimer(void)


     counterT0 = __rdtsc();



__inline double GetTimer(void)


     return (__rdtsc() - counterT0) * 1000.0 / counterFreq;



/*
 * Estimates the time-stamp-counter frequency in ticks per second by counting
 * TSC ticks across 1/5 second of QueryPerformanceCounter time and scaling by 5.
 * Stores the estimate in the global counterFreq and also returns it.
 */
unsigned __int64 GetCPUSpeed(void)
{
    unsigned __int64 start, stop;
    LARGE_INTEGER nCtr, nFreq, nCtrStop;

    QueryPerformanceFrequency(&nFreq);

    // __rdtsc() issues the same RDTSC instruction the raw 0x0F 0x31 bytes encoded,
    // and matches what InitTimer()/GetTimer() use.
    start = __rdtsc();

    QueryPerformanceCounter(&nCtrStop);
    nCtrStop.QuadPart += nFreq.QuadPart / 5;    // stop after 1/5 second

    do
    {
        QueryPerformanceCounter(&nCtr);
    } while (nCtr.QuadPart < nCtrStop.QuadPart);

    stop = __rdtsc();

    counterFreq = (stop - start) * 5;

    return counterFreq;
}






// Include guard for the declarations that follow.
// NOTE(review): the guard is named INT32TEST_H although this looks like the content of
// "FloatTest.h", and no matching #endif is visible in this listing -- confirm both.
#ifndef INT32TEST_H
#define INT32TEST_H

// Makes the enclosing function return 1 when the OpenCL status i is not CL_SUCCESS.
// NOTE(review): expands to a bare if statement with its own ';', so using it as the body
// of an if/else without braces can mis-bind the else.
#define CHECK(i) if((i)!=CL_SUCCESS) return 1;

// OpenCL C source for the floatAddTest kernel, passed to clCreateProgramWithSource().
// Written as adjacent string literals so that no line-continuation characters are needed.
// The kernel body is empty in this template listing.
const char *FloatTestCLsource =
    "__kernel void floatAddTest(__global float *dummy_buf)\n"
    "{\n"
    "}\n";


typedef struct
 cl_platform_id platform;
 cl_device_id* devices;
 cl_uint numDevices;
// Timer state: counterT0 is the time-stamp counter captured by InitTimer();
// counterFreq is the tick frequency estimate written by GetCPUSpeed().
// NOTE(review): these are variable definitions placed among header declarations --
// confirm this file is only included from one translation unit.
__int64 counterT0, counterFreq;

// Runs the benchmark on every device of every entry in platforms[]; results go to dpTime.
int FloatTest(double* dpTime, unsigned long* item, cl_int* pciErrNum, PD* platforms, cl_uint numPlatforms);
// Creates and builds the programs and creates the device buffer for the current context.
int subFloatTest(cl_int* pciErrNum);
// Runs all test kernels on the current device and appends their throughput to AddIOPS.
int runOnDevice(double* AddIOPS, unsigned long* item, cl_int* pciErrNum);
// Times the kernel kernelName from program number id; *final receives the best time in ms.
int ms(double *final,const char* kernelName, cl_int *pciErrNum, int coef, int id);
// Returns the minimum of the first len recorded times.
double GetTimeMin(int len);
// Stores the current time-stamp counter in counterT0.
__inline void InitTimer(void);
// Returns the milliseconds elapsed since InitTimer().
__inline double GetTimer(void);
// Estimates the time-stamp-counter frequency and stores it in counterFreq.
unsigned __int64 GetCPUSpeed(void);








// Reference snippets showing the oclXXX()/shrXXX() helper calls that this template
// avoids. They are usage examples, not a compilable unit.

//some log functions

shrCheckCmdLineFlag(argc, (const char**)argv, "noprompt");


shrLog("Hello World!!!\n\n");

shrLog(" CL_PLATFORM_NAME: \t%s\n", cBuffer);



oclLogBuildInfo(cpProgram, oclGetFirstDev(cxGPUContext));

oclLogPtx(cpProgram, oclGetFirstDev(cxGPUContext), "oclDotProduct.ptx");


//some checking functions

oclCheckError(ciErrNum, CL_SUCCESS);

shrCheckErrorEX (ciErrNum, CL_SUCCESS, pCleanup);


//some filling and result-checking functions

void* srcA = (void *)malloc(sizeof(cl_float4) * szGlobalWorkSize);

shrFillArray((float*)srcA, 4 * iNumElements);

shrDiffArray((const float*)dst, (const float*)Golden, iNumElements);

shrCompareL2fe(reference, h_C, size_C, 1e-6f);



//print WINDOWS systemInfo:
// Reference snippet: queries processor, OS version and local time through the Win32 API
// and writes them with shrLog(). Each struct is zeroed and then filled by its query call
// before it is logged.

    #ifdef _WIN32

        SYSTEM_INFO stProcInfo;         // processor info struct

        OSVERSIONINFO stOSVerInfo;      // Win OS info struct

        SYSTEMTIME stLocalDateTime;     // local date / time struct


        // processor

        SecureZeroMemory(&stProcInfo, sizeof(SYSTEM_INFO));

        GetSystemInfo(&stProcInfo);


        // OS

        SecureZeroMemory(&stOSVerInfo, sizeof(OSVERSIONINFO));

        // GetVersionEx() requires the structure size to be set before the call.
        stOSVerInfo.dwOSVersionInfoSize = sizeof(OSVERSIONINFO);

        GetVersionEx(&stOSVerInfo);


        // date and time

        GetLocalTime(&stLocalDateTime);


        // write time and date to logs

        shrLog(" Local Time/Date = %i:%i:%i, %i/%i/%i\n",

            stLocalDateTime.wHour, stLocalDateTime.wMinute, stLocalDateTime.wSecond,

            stLocalDateTime.wMonth, stLocalDateTime.wDay, stLocalDateTime.wYear);


        // write proc and OS info to logs

        shrLog(" CPU Arch: %i\n CPU Level: %i\n # of CPU processors: %u\n Windows Build: %u\n Windows Ver: %u.%u\n\n\n",

            stProcInfo.wProcessorArchitecture, stProcInfo.wProcessorLevel, stProcInfo.dwNumberOfProcessors,

            stOSVerInfo.dwBuildNumber, stOSVerInfo.dwMajorVersion, stOSVerInfo.dwMinorVersion);

    #endif





//oclMatrixMul uses an event to time:
// Returns the execution time of the command behind `event` in seconds, computed from its
// CL_PROFILING_COMMAND_START/END timestamps (nanoseconds). If a query fails, the
// corresponding timestamp stays 0.
double executionTime(cl_event &event)
{
    cl_ulong start = 0, end = 0;

    clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL);
    clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL);

    return (double)1.0e-9 * (end - start); // convert nanoseconds to seconds on return
}


