OpenCL Programming Template -- Juliet

很久了,CSDN博客图片一直处于和谐态,灰常影响博友心情=_=!!!

 

这里(http://download.csdn.net/source/2030769)有OpenCL开发流程以及Platforms/Devices/Context概念澄清的图解,需要的朋友可以下载看看,免积分。本篇是我写OCL程序时用的一个模板,拿出来晒晒太阳,大家喜欢可以拿去用。CUDA与OpenCL的裙带关系已然大白于天下。下一篇博客我会对比CUDA与OpenCL的相关函数,有兴趣的朋友可以做一个转换工具,于人于己也就方便多了。言归正传,代码说事儿:

 

 

// NOTE(review): the header names were stripped when this listing was extracted
// (everything after '<' was lost). The names below are reconstructed from the
// symbols the code uses -- confirm against the original project.
#include <stdio.h>      // printf
#include <CL/cl.h>      // OpenCL host API (cl_context, clCreateBuffer, ...)

//InitTimer()GetTimer()
#include <windows.h>    // QueryPerformanceFrequency / QueryPerformanceCounter
#include <intrin.h>     // __rdtsc

//we refuse to use oclUtils.h, including oclXXX() or shrXXX() because they are not OCL intrinsic functions.
//#include <oclUtils.h>

#include "FloatTest.h"

 

cl_context cxGPUContext = NULL;             

cl_device_id device;

cl_command_queue cqCommandQueue = NULL;       

cl_mem d_data = NULL;

cl_int ciErrNum = CL_SUCCESS;

cl_program cpProgram = NULL, cpProgram1 = NULL, cpProgram2=NULL, cpProgram3 = NULL; //Because the length of the lenght is limited, so a big .cl needs several cpPrograms.

size_t program_length = 0, program_length1=0, program_length2=0, program_length3 = 0;

cl_kernel kernel = NULL;

size_t max_item[4];

size_t localWorkSize;

size_t globalWorkSize;

 

const int testnum = 32;

double time[256] = {0.0};

unsigned long ops_cnt = 1 * 1024 * 1024 * 1024UL;

const float mem_size = sizeof(float) * 65536;

 

static int k = 0;

bool itemFlag = true;

/*
 * Reference copy of the PD record; the real definition lives in FloatTest.h.
 * One PD describes a single OpenCL platform together with its device list.

typedef struct

{

     cl_platform_id platform;

     cl_device_id* devices;

     cl_uint numDevices;

}PD;*/

// Platform table: allocated, filled and freed by main().
PD* platforms;

// Number of platforms reported by clGetPlatformIDs().
cl_uint numPlatforms;

 

int main()

{

    

     unsigned int i;

 

     //Get counterFreq of your CPU, used in InitTimer()/GetTimer()

     counterFreq = GetCPUSpeed();

 

     //Get the number of the platforms

     ciErrNum = clGetPlatformIDs(0, NULL, &numPlatforms);

 

     if(ciErrNum != CL_SUCCESS) return 1;

     if( numPlatforms > 0)

     {

         platforms = new PD[numPlatforms];

        

         //malloc for these platforms

         cl_platform_id* platformsTem = new cl_platform_id[numPlatforms];

        

         //get these platforms

         ciErrNum = clGetPlatformIDs(numPlatforms, platformsTem, NULL);

         CHECK(ciErrNum);

         for(i=0; i

              platforms[i].platform = platformsTem[i];

         delete[] platformsTem;

        

         cl_uint maxPerPlatform = 0;

         for(i = 0; i < numPlatforms; i++)

         {

              char pbuf[100];

 

              //get detailed info about this platform,e.g., CL_PLATFORM_NAME, /version/vendor

              ciErrNum = clGetPlatformInfo(platforms[i].platform, CL_PLATFORM_NAME, sizeof(pbuf), pbuf, NULL);

              CHECK(ciErrNum);

 

              //get the number of devices supported by this platform.(type could be CL_DEVICE_TYPE_ALL)

              ciErrNum = clGetDeviceIDs(platforms[i].platform, CL_DEVICE_TYPE_GPU, 0, NULL, &(platforms[i].numDevices));

              CHECK(ciErrNum);

              if(platforms[i].numDevices > maxPerPlatform)

                   maxPerPlatform = platforms[i].numDevices;

             

              //get these devices supported by this platform

              platforms[i].devices = new cl_device_id[platforms[i].numDevices];

              //get one device is simple

              //clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 1, &cdDevice, NULL);

              ciErrNum = clGetDeviceIDs(platforms[i].platform, CL_DEVICE_TYPE_GPU, platforms[i].numDevices, platforms[i].devices, NULL);

              CHECK(ciErrNum);

         }//for

 

         /* get platforms and its supporting devices through oclXXX()

         {

         //get platforms

         char cBuffer[1024];

         cl_platform_id cpPlatform = NULL;        

         oclGetPlatformID(&cpPlatform);

         clGetPlatformInfo (cpPlatform, CL_PLATFORM_NAME, sizeof(cBuffer), cBuffer, NULL);

        

         //get devices

         clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 0, NULL, &ciDeviceCount);

         cdDevices = (cl_device_id *)malloc(ciDeviceCount * sizeof(cl_device_id) );

         clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, ciDeviceCount, cdDevices, NULL);

 

         clGetDeviceInfo(cdDevices[i], CL_DEVICE_NAME, sizeof(cBuffer), &cBuffer, NULL);

         oclPrintDevInfo(LOGBOTH, cdDevices[i]);

        

         //record into log

         std::string sProfileString = "oclDeviceQuery,[Platform]Device Name = ";

         sProfileString += ", Device = ";

        sProfileString += cBuffer;

 

         sProfileString += "/n";

         shrLogEx(LOGBOTH | MASTER, 0, sProfileString.c_str());

         }

         */

         if(maxPerPlatform < 0)

              return 1;

         double* dpTime = new double[ 10 * numPlatforms * maxPerPlatform]; //item <= 10

         unsigned long item = 0;

         int flag = FloatTest(dpTime, &item, &ciErrNum, platforms, numPlatforms);

         if(flag != 0 ) return 1;

 

         for(int j=0; j< 5; j++)

              printf("%.3lf %.3lf GLOPS/n", dpTime[j] / 1000000000.0,dpTime[j+5]/ 1000000000.0);

 

        

         delete[] dpTime;

         for(i = 0; i < numPlatforms; i++)

              delete[] platforms[i].devices;

         delete[] platforms;

     }

     else

     {

         return 1;

     }

}

 

/*

@ double* dpTime, 返回数据的数组;

@ unsigned long* item, 测试项的个数,在每个平台下的每个平台上都会测这几个测试项,例如AddMulMad

@ cl_int* pciErrNum,错误代号;

@ PD* platforms,自定义的PD类型的platforms[]

@ cl_uint numPlatforms,用来说明平台个数,即platforms[]数组元素个数;

@ __int64 counterFreq_main,计时函数中需要用到的一个参数;

*/

int FloatTest(double* dpTime, unsigned long* item, cl_int* pciErrNum, PD* platforms, cl_uint numPlatforms)

{  

 

     cl_uint pi,di;

     for(pi = 0; pi

     {

         cxGPUContext = clCreateContext(0, platforms[pi].numDevices, platforms[pi].devices, NULL, NULL, &*pciErrNum);

         /*

         //-----------reate context---------NO clCreateContextFromType().x and later driver doesn't support it well

 

         cxGPUContext = clCreateContextFromType(0, CL_DEVICE_TYPE_GPU, NULL, NULL, &*pciErrNum);  //Jun: cxGPUContext = clCreateContext(0, pInfoDlg->m_iCLDeviceBMCount, pInfoDlg->m_idCLDeviceListBM, NULL, NULL, &ciErrNum);

        

         //--------choose device----------

         //Above, we use platform info to get devices info.

         //Here, we could get devices info from context.We could get devices under certain platform(context <-> platform), or, we get all devices(just one context).

         size_t nDeviceBytes;

         *pciErrNum |= clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, 0, NULL, &nDeviceBytes);

         ciDeviceCount = (cl_uint)nDeviceBytes/sizeof(cl_device_id);

 

 

         for(unsigned int i = 0; i < ciDeviceCount; ++i)

         {

              // get and print the device for this queue

              device = oclGetDev(cxGPUContext, i);

              oclPrintDevName(LOGBOTH, device);

         }

         */

        

 

        

         //program setup, create the program, build the program, create d_data

         int flag = subFloatTest(pciErrNum);

         if(flag != 0)   return 1; //one fail, then return.

 

        

    for(di = 0; di

         {

              device = (platforms[pi].devices)[di];

 

              //Attention: some global var may be changed during one iteration, so we assign it the initial value.

              //If the value will not be changed, I suggest to declare it as "const"

              ops_cnt = 1 * 1024 * 1024 * 1024UL;

 

              runOnDevice(dpTime, item, pciErrNum);

         }       

 

     }

 

     //release

     clReleaseProgram(cpProgram);

     clReleaseProgram(cpProgram1);

     clReleaseProgram(cpProgram2);

     clReleaseProgram(cpProgram3);

     clReleaseMemObject(d_data);

     clReleaseContext(cxGPUContext);

}

    

 

int subFloatTest(cl_int* pciErrNum)

{

     //program setup, .h(char[]) replaces .cl

     //create the program

     cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&FloatTestCLsource, &program_length, &*pciErrNum);

     CHECK(*pciErrNum);

 

     cpProgram1 = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&kernelMul, &program_length1, &*pciErrNum);

     CHECK(*pciErrNum);

 

     cpProgram2 = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&kernelMAD, &program_length2, &*pciErrNum);

     CHECK(*pciErrNum);

 

     cpProgram3 = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&kernelSF, &program_length3, &*pciErrNum);

     CHECK(*pciErrNum);

/*

 //if we use .cl

    char* source_path = "FloatTest.cl";

    char *source = oclLoadProgSource(source_path, "", &program_length);

    cpProgram = clCreateProgramWithSource(cxGPUContext, 1,

                         (const char **)&source, &program_length, &*pciErrNum);

    if(CL_SUCCESS != *pciErrNum ) return 1;

*/

 

     //build the program

    *pciErrNum = clBuildProgram(cpProgram, 0, NULL, "", NULL, NULL);

    CHECK(*pciErrNum);

 

     *pciErrNum = clBuildProgram(cpProgram1, 0, NULL, "", NULL, NULL);

    CHECK(*pciErrNum);

 

     *pciErrNum = clBuildProgram(cpProgram2, 0, NULL, "", NULL, NULL);

    CHECK(*pciErrNum);

 

     *pciErrNum = clBuildProgram(cpProgram3, 0, NULL, "", NULL, NULL);

    CHECK(*pciErrNum);

 

     //inputhost memory

     d_data = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, mem_size, NULL, &*pciErrNum);

    CHECK(*pciErrNum);

     //cl_mem h_A = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,mem_size_A, h_A_data, &ciErrNum);

 

     return 0;

}

 

int runOnDevice(double* AddIOPS, unsigned long* item, cl_int* pciErrNum)

{

     //get Throughput, /10^9 (GIOPS)

     double final;

 

     *pciErrNum = ms(&final,"floatAddTest",pciErrNum,1024, 0);

     CHECK(*pciErrNum);

     if(final > 0.0)

         AddIOPS[k++] = ops_cnt / (final /1000.0);

 

     *pciErrNum = ms(&final,"floatMulTest",pciErrNum,1024,1);

     CHECK(*pciErrNum);

     if(final > 0.0)

         AddIOPS[k++] = ops_cnt / (final /1000.0);

 

     *pciErrNum = ms(&final,"floatMADTest",pciErrNum,1024,2);

     CHECK(*pciErrNum);

     if(final > 0.0)

         AddIOPS[k++] = ops_cnt / (final /1000.0) * 2.0;

 

     ops_cnt = 128 * 1024 * 1024UL;

 

     *pciErrNum = ms(&final,"floatSFTest",pciErrNum,16,3);

     CHECK(*pciErrNum);

     if(final > 0.0)

         AddIOPS[k++] = ops_cnt / (final /1000.0);

 

     *pciErrNum = ms(&final,"floatNativeSFTest",pciErrNum,16,3);

     CHECK(*pciErrNum);

     if(final > 0.0)

         AddIOPS[k++] = ops_cnt / (final /1000.0);

 

     if(itemFlag == true)

     {

         *item = k;

         itemFlag = false;

     }

 

     return 0;

}

 

 

int ms(double *final,const char* kernelName, cl_int *pciErrNum, int coef,int id)

{

     // create a command-queue

    cqCommandQueue = clCreateCommandQueue(cxGPUContext, device, 0, &*pciErrNum);

    if(CL_SUCCESS != *pciErrNum )

         return 1;

    

     //which kernel

     switch(id)

     {

     case 0: kernel = clCreateKernel(cpProgram, kernelName, &*pciErrNum); break;

     case 1: kernel = clCreateKernel(cpProgram1, kernelName, &*pciErrNum); break;

     case 2: kernel = clCreateKernel(cpProgram2, kernelName, &*pciErrNum); break;

     case 3: kernel = clCreateKernel(cpProgram3, kernelName, &*pciErrNum); break;

     }

     if(CL_SUCCESS != *pciErrNum )

         return 1;

 

     //setup execution parameter

     clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(max_item), max_item, NULL);

     localWorkSize = max_item[0] < 256 ? max_item[0] : 256;

     globalWorkSize = ops_cnt / coef;

 

     //set kernel arg

     *pciErrNum = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&d_data);

    if(CL_SUCCESS != *pciErrNum )

         return 1;

 

     /*

         //input data in device memory. clCreateBuffer + clEnqueueCopyBuffer()

         d_A[i] = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, workSize[i] * sizeof(float) * WA, NULL,NULL);

         clEnqueueCopyBuffer(commandQueue[i], h_A, d_A[i], workOffset[i] * sizeof(float) * WA, 0, workSize[i] * sizeof(float) * WA, 0, NULL, NULL);

        

         //or, we do like this: create + copy

        d_B[i] = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,

                                mem_size_B, h_B_data, NULL);

    

         //Asynchronous write of data to GPU device

         clEnqueueWriteBuffer(cqCommandQueue, cmDevSrcA, CL_FALSE, 0, sizeof(cl_float) * szGlobalWorkSize * 4, srcA, 0, NULL, NULL);

     */

 

     /*

     //warmup so we don't time driver startup

    *pciErrNum = clEnqueueNDRangeKernel(cqCommandQueue, kernel, 1, 0,&globalWorkSize, &localWorkSize, 0, NULL, NULL);

     if(CL_SUCCESS != *pciErrNum ) return 1;

     */

 

 

     //event

     //cl_event kernel_completion;

 

     //foreach testnum

     for(int i = 0; i< testnum; i++)

     {

     //   shrDeltaT(0);  //us

         InitTimer();

 

         *pciErrNum = clEnqueueNDRangeKernel(cqCommandQueue, kernel, 1, NULL, &globalWorkSize, &localWorkSize, 0, NULL, NULL); //&kernel_completion

         if(CL_SUCCESS != *pciErrNum ) return 1;

     //   clWaitForEvents(1, &kernel_completion);

         *pciErrNum = clFinish(cqCommandQueue);

         if(CL_SUCCESS != *pciErrNum ) return 1;

 

     //   time[i] = shrDeltaT(0) * 1000.0; //ms

         time[i] = GetTimer(); //ms

     }

    

     //clReleaseEvent(kernel_completion); //we use clFinish() or event to synchronize CPU and GPU

    

     // Read back results and check accumulated errors

    //clEnqueueReadBuffer(cqCommandQueue, cmDevDst, CL_TRUE, 0, sizeof(cl_float) * szGlobalWorkSize, dst, 0, NULL, NULL);

   

 

     //release

     clReleaseKernel(kernel);

     clReleaseCommandQueue(cqCommandQueue);

 

     //get ms

     *final = GetTimeMin(testnum);

 

     return 0;

}

 

double GetTimeMin(int len)

{

     double tem = time[0];

     for(int i=1;i

         if(time[i] < tem)

              tem = time[i];

     return tem;

}

__inline void InitTimer(void)

{

     counterT0 = __rdtsc();

}

 

__inline double GetTimer(void)

{

     return (__rdtsc() - counterT0) * 1000.0 / counterFreq;

}

 

/*
 * Estimates the time-stamp-counter frequency in ticks per second.
 *
 * Reads the TSC, busy-waits for one fifth of a second as measured by the
 * Windows performance counter, reads the TSC again and multiplies the tick
 * difference by 5. The result is stored in the global counterFreq and returned.
 *
 * The TSC is read with the __rdtsc() intrinsic, like InitTimer()/GetTimer(),
 * rather than with raw `_asm _emit 0x0F / 0x31` bytes: it is the same RDTSC
 * instruction, but MSVC inline assembly is only available for 32-bit x86.
 * LARGE_INTEGER variables are passed to the performance-counter calls directly
 * instead of casting unsigned __int64 pointers.
 */
unsigned __int64 GetCPUSpeed(void)
{
     LARGE_INTEGER qpcFreq, qpcNow, qpcStop;

     QueryPerformanceFrequency(&qpcFreq);

     const unsigned __int64 start = __rdtsc();

     // wait until 1/5 second of performance-counter time has passed
     QueryPerformanceCounter(&qpcStop);
     qpcStop.QuadPart += qpcFreq.QuadPart / 5;
     do
     {
          QueryPerformanceCounter(&qpcNow);
     }while (qpcNow.QuadPart < qpcStop.QuadPart);

     const unsigned __int64 stop = __rdtsc();

     // ticks in 1/5 s, times 5 = ticks per second
     counterFreq = (stop - start) * 5;

     return counterFreq;
}

 

 

//--------------------.h---------------

 

#ifndef INT32TEST_H
#define INT32TEST_H

#define CHECK(i) if((i)!=CL_SUCCESS) return 1;

const char *FloatTestCLsource = "   /
__kernel void floatAddTest(__global float *dummy_buf)       /

{       /

}       /
";

 

typedef struct
{
 cl_platform_id platform;
 cl_device_id* devices;
 cl_uint numDevices;
}PD;
__int64 counterT0, counterFreq;


int FloatTest(double* dpTime, unsigned long* item, cl_int* pciErrNum, PD* platforms, cl_uint numPlatforms);
int subFloatTest(cl_int* pciErrNum);
int runOnDevice(double* AddIOPS, unsigned long* item, cl_int* pciErrNum);
int ms(double *final,const char* kernelName, cl_int *pciErrNum, int coef, int id);
double GetTimeMin(int len);
__inline void InitTimer(void);
__inline double GetTimer(void);
unsigned __int64 GetCPUSpeed(void);

 

#endif

 

 

 

 

/*

//some log functions

shrCheckCmdLineFlag(argc, (const char**)argv, "noprompt");

shrSetLogFileName("log.txt");

shrLog("Hello World!!!\n\n");

shrLog(" CL_PLATFORM_NAME: \t%s\n", cBuffer);

 

shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR);

oclLogBuildInfo(cpProgram, oclGetFirstDev(cxGPUContext));

oclLogPtx(cpProgram, oclGetFirstDev(cxGPUContext), "oclDotProduct.ptx");

 

//some checking functions

oclCheckError(ciErrNum, CL_SUCCESS);

shrCheckErrorEX (ciErrNum, CL_SUCCESS, pCleanup);

 

//some filling and result-checking functions

void* srcA = (void *)malloc(sizeof(cl_float4) * szGlobalWorkSize);

shrFillArray((float*)srcA, 4 * iNumElements);

shrDiffArray((const float*)dst, (const float*)Golden, iNumElements);

shrCompareL2fe(reference, h_C, size_C, 1e-6f);

 

 

//print WINDOWS systemInfo:

    #ifdef _WIN32

        SYSTEM_INFO stProcInfo;         // processor info struct

        OSVERSIONINFO stOSVerInfo;      // Win OS info struct

        SYSTEMTIME stLocalDateTime;     // local date / time struct

 

        // processor

        SecureZeroMemory(&stProcInfo, sizeof(SYSTEM_INFO));

        GetSystemInfo(&stProcInfo);

 

        // OS

        SecureZeroMemory(&stOSVerInfo, sizeof(OSVERSIONINFO));

        stOSVerInfo.dwOSVersionInfoSize = sizeof(OSVERSIONINFO);

        GetVersionEx(&stOSVerInfo);

 

        // date and time

        GetLocalTime(&stLocalDateTime);

 

        // write time and date to logs

        shrLog(" Local Time/Date = %i:%i:%i, %i/%i/%i\n",

            stLocalDateTime.wHour, stLocalDateTime.wMinute, stLocalDateTime.wSecond,

            stLocalDateTime.wMonth, stLocalDateTime.wDay, stLocalDateTime.wYear);

 

        // write proc and OS info to logs

        shrLog(" CPU Arch: %i\n CPU Level: %i\n # of CPU processors: %u\n Windows Build: %u\n Windows Ver: %u.%u\n\n\n",

            stProcInfo.wProcessorArchitecture, stProcInfo.wProcessorLevel, stProcInfo.dwNumberOfProcessors,

            stOSVerInfo.dwBuildNumber, stOSVerInfo.dwMajorVersion, stOSVerInfo.dwMinorVersion);

    #endif

 

 

 

//oclMatrixMul use event to time:

double executionTime(cl_event &event)

{

    cl_ulong start, end;

   

    clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL);

    clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL);

   

    return (double)1.0e-9 * (end - start); // convert nanoseconds to seconds on return

}

*/

你可能感兴趣的:(C/C++,GPU/CUDA/OpenCL)