很久了,CSDN博客图片一直处于和谐态,灰常影响博友心情=_=!!!
这里(http://download.csdn.net/source/2030769)有OpenCL开发流程,Platforms/Devices/Context概念澄清的图解,需要的朋友可以下载看看,免积分。本篇是我写OCL程序的一个模板,拿出来晒晒太阳,大家喜欢了可以拿去用。CUDA与OpenCL的裙带关系已然大白于天下。下一篇博客我会对比CUDA与OpenCL的相关函数,有兴趣的朋友可以做一个转换工具,于人于己也就方便多了。言归正传,代码说事儿:
#include
#include
//InitTimer()、GetTimer()
#include
#include
//we refuse to use oclUtils.h, including oclXXX() or shrXXX() because they are not OCL intrinsic functions.
//#include
#include "FloatTest.h"
// ---- OpenCL state shared by every function in this file ----
cl_context cxGPUContext = NULL;          // context of the platform currently being tested (set in FloatTest)
cl_device_id device;                     // device currently being tested (set in FloatTest)
cl_command_queue cqCommandQueue = NULL;  // created and released for each kernel run in ms()
cl_mem d_data = NULL;                    // device buffer bound as kernel argument 0
cl_int ciErrNum = CL_SUCCESS;
// The kernel source is split over several strings (the original note says the
// source length is limited), so a big .cl needs several cl_program objects.
cl_program cpProgram = NULL, cpProgram1 = NULL, cpProgram2=NULL, cpProgram3 = NULL;
// Passed as the "lengths" argument of clCreateProgramWithSource(); a length of 0
// means the matching source string is null-terminated.
size_t program_length = 0, program_length1=0, program_length2=0, program_length3 = 0;
cl_kernel kernel = NULL;                 // kernel currently being timed in ms()
size_t max_item[4];                      // receives CL_DEVICE_MAX_WORK_ITEM_SIZES
size_t localWorkSize;
size_t globalWorkSize;
const int testnum = 32;                  // number of timed launches per kernel
double time[256] = {0.0};                // per-launch timings in ms, written by ms()
unsigned long ops_cnt = 1 * 1024 * 1024 * 1024UL; // operation count used for work size and throughput
// Size in bytes of d_data. This is a byte count, so it is a size_t (it used to
// be declared as float and converted implicitly at the clCreateBuffer call).
const size_t mem_size = sizeof(float) * 65536;
static int k = 0;                        // next free slot in the result array filled by runOnDevice()
bool itemFlag = true;                    // true until runOnDevice() has reported the item count once
/*
The PD type is declared in FloatTest.h:
typedef struct
{
cl_platform_id platform;
cl_device_id* devices;
cl_uint numDevices;
}PD;*/
PD* platforms;
cl_uint numPlatforms;
/*
 * Entry point.
 * Enumerates every OpenCL platform and the GPU devices of each platform, runs
 * the float throughput tests on them through FloatTest(), and prints the
 * results.
 * Returns 0 on success and 1 on any failure (no platform, no GPU device, or an
 * OpenCL error).
 */
int main()
{
    unsigned int i;
    //Get counterFreq of your CPU, used in InitTimer()/GetTimer()
    counterFreq = GetCPUSpeed();
    //Get the number of the platforms
    ciErrNum = clGetPlatformIDs(0, NULL, &numPlatforms);
    if(ciErrNum != CL_SUCCESS) return 1;
    if(numPlatforms == 0) return 1;

    platforms = new PD[numPlatforms];
    //temporary array that receives the platform ids
    cl_platform_id* platformsTem = new cl_platform_id[numPlatforms];
    //get these platforms
    ciErrNum = clGetPlatformIDs(numPlatforms, platformsTem, NULL);
    CHECK(ciErrNum);
    for(i = 0; i < numPlatforms; i++)
        platforms[i].platform = platformsTem[i];
    delete[] platformsTem;

    cl_uint maxPerPlatform = 0;
    for(i = 0; i < numPlatforms; i++)
    {
        char pbuf[100];
        //get detailed info about this platform,e.g., CL_PLATFORM_NAME, /version/vendor
        ciErrNum = clGetPlatformInfo(platforms[i].platform, CL_PLATFORM_NAME, sizeof(pbuf), pbuf, NULL);
        CHECK(ciErrNum);
        //get the number of devices supported by this platform.(type could be CL_DEVICE_TYPE_ALL)
        ciErrNum = clGetDeviceIDs(platforms[i].platform, CL_DEVICE_TYPE_GPU, 0, NULL, &(platforms[i].numDevices));
        CHECK(ciErrNum);
        if(platforms[i].numDevices > maxPerPlatform)
            maxPerPlatform = platforms[i].numDevices;
        //get these devices supported by this platform
        platforms[i].devices = new cl_device_id[platforms[i].numDevices];
        //get one device is simple
        //clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 1, &cdDevice, NULL);
        ciErrNum = clGetDeviceIDs(platforms[i].platform, CL_DEVICE_TYPE_GPU, platforms[i].numDevices, platforms[i].devices, NULL);
        CHECK(ciErrNum);
    }//for
    /* get platforms and its supporting devices through oclXXX()
    {
    //get platforms
    char cBuffer[1024];
    cl_platform_id cpPlatform = NULL;
    oclGetPlatformID(&cpPlatform);
    clGetPlatformInfo (cpPlatform, CL_PLATFORM_NAME, sizeof(cBuffer), cBuffer, NULL);
    //get devices
    clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 0, NULL, &ciDeviceCount);
    cdDevices = (cl_device_id *)malloc(ciDeviceCount * sizeof(cl_device_id) );
    clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, ciDeviceCount, cdDevices, NULL);
    clGetDeviceInfo(cdDevices[i], CL_DEVICE_NAME, sizeof(cBuffer), &cBuffer, NULL);
    oclPrintDevInfo(LOGBOTH, cdDevices[i]);
    //record into log
    std::string sProfileString = "oclDeviceQuery,[Platform]Device Name = ";
    sProfileString += ", Device = ";
    sProfileString += cBuffer;
    sProfileString += "\n";
    shrLogEx(LOGBOTH | MASTER, 0, sProfileString.c_str());
    }
    */
    // maxPerPlatform is unsigned, so the only "nothing to test" value is 0.
    if(maxPerPlatform == 0)
        return 1;

    // Value-initialised so that slots no test wrote to print as 0 instead of garbage.
    double* dpTime = new double[ 10 * numPlatforms * maxPerPlatform](); //item <= 10
    unsigned long item = 0;
    int flag = FloatTest(dpTime, &item, &ciErrNum, platforms, numPlatforms);
    if(flag == 0)
    {
        for(int j = 0; j < 5; j++)
            printf("%.3lf %.3lf GLOPS\n", dpTime[j] / 1000000000.0, dpTime[j+5] / 1000000000.0);
    }

    // Free host allocations on both the success and the failure path.
    delete[] dpTime;
    for(i = 0; i < numPlatforms; i++)
        delete[] platforms[i].devices;
    delete[] platforms;
    return flag != 0 ? 1 : 0;
}
/*
@ double* dpTime, 返回数据的数组;
@ unsigned long* item, 测试项的个数,在每个平台下的每个平台上都会测这几个测试项,例如Add、Mul、Mad;
@ cl_int* pciErrNum,错误代号;
@ PD* platforms,自定义的PD类型的platforms[];
@ cl_uint numPlatforms,用来说明平台个数,即platforms[]数组元素个数;
@ __int64 counterFreq_main,计时函数中需要用到的一个参数;
*/
int FloatTest(double* dpTime, unsigned long* item, cl_int* pciErrNum, PD* platforms, cl_uint numPlatforms)
{
cl_uint pi,di;
for(pi = 0; pi
{
cxGPUContext = clCreateContext(0, platforms[pi].numDevices, platforms[pi].devices, NULL, NULL, &*pciErrNum);
/*
//-----------reate context---------NO clCreateContextFromType(),.x and later driver doesn't support it well
cxGPUContext = clCreateContextFromType(0, CL_DEVICE_TYPE_GPU, NULL, NULL, &*pciErrNum); //Jun: cxGPUContext = clCreateContext(0, pInfoDlg->m_iCLDeviceBMCount, pInfoDlg->m_idCLDeviceListBM, NULL, NULL, &ciErrNum);
//--------choose device----------
//Above, we use platform info to get devices info.
//Here, we could get devices info from context.We could get devices under certain platform(context <-> platform), or, we get all devices(just one context).
size_t nDeviceBytes;
*pciErrNum |= clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, 0, NULL, &nDeviceBytes);
ciDeviceCount = (cl_uint)nDeviceBytes/sizeof(cl_device_id);
for(unsigned int i = 0; i < ciDeviceCount; ++i)
{
// get and print the device for this queue
device = oclGetDev(cxGPUContext, i);
oclPrintDevName(LOGBOTH, device);
}
*/
//program setup, create the program, build the program, create d_data
int flag = subFloatTest(pciErrNum);
if(flag != 0) return 1; //one fail, then return.
for(di = 0; di
{
device = (platforms[pi].devices)[di];
//Attention: some global var may be changed during one iteration, so we assign it the initial value.
//If the value will not be changed, I suggest to declare it as "const"
ops_cnt = 1 * 1024 * 1024 * 1024UL;
runOnDevice(dpTime, item, pciErrNum);
}
}
//release
clReleaseProgram(cpProgram);
clReleaseProgram(cpProgram1);
clReleaseProgram(cpProgram2);
clReleaseProgram(cpProgram3);
clReleaseMemObject(d_data);
clReleaseContext(cxGPUContext);
}
/*
 * Per-context setup: creates the four programs from their in-memory source
 * strings (a .h char[] replaces the .cl file), builds them, and allocates the
 * d_data buffer in cxGPUContext.
 * The OpenCL error code is stored in *pciErrNum.
 * Returns 0 on success, 1 at the first call that does not report CL_SUCCESS.
 */
int subFloatTest(cl_int* pciErrNum)
{
    enum { PROGRAM_COUNT = 4 };
    // Parallel tables: program handle, its source string, its source length.
    cl_program* const programSlots[PROGRAM_COUNT] =
        { &cpProgram, &cpProgram1, &cpProgram2, &cpProgram3 };
    const char** const sourceTexts[PROGRAM_COUNT] =
        { (const char **)&FloatTestCLsource, (const char **)&kernelMul,
          (const char **)&kernelMAD, (const char **)&kernelSF };
    const size_t* const sourceLengths[PROGRAM_COUNT] =
        { &program_length, &program_length1, &program_length2, &program_length3 };

    int idx;

    // Step 1: create every program from source.
    for(idx = 0; idx < PROGRAM_COUNT; ++idx)
    {
        *programSlots[idx] = clCreateProgramWithSource(cxGPUContext, 1, sourceTexts[idx], sourceLengths[idx], pciErrNum);
        if(*pciErrNum != CL_SUCCESS)
            return 1;
    }
    /*
    //if we use .cl
    char* source_path = "FloatTest.cl";
    char *source = oclLoadProgSource(source_path, "", &program_length);
    cpProgram = clCreateProgramWithSource(cxGPUContext, 1,
    (const char **)&source, &program_length, &*pciErrNum);
    if(CL_SUCCESS != *pciErrNum ) return 1;
    */

    // Step 2: build every program with empty build options.
    for(idx = 0; idx < PROGRAM_COUNT; ++idx)
    {
        *pciErrNum = clBuildProgram(*programSlots[idx], 0, NULL, "", NULL, NULL);
        if(*pciErrNum != CL_SUCCESS)
            return 1;
    }

    // Step 3: allocate the read/write buffer the kernels receive as argument 0.
    d_data = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, mem_size, NULL, pciErrNum);
    if(*pciErrNum != CL_SUCCESS)
        return 1;
    //cl_mem h_A = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,mem_size_A, h_A_data, &ciErrNum);
    return 0;
}
/*
 * Runs the five float kernels on the current global `device` and appends one
 * throughput value (operations per second) per kernel to AddIOPS, using the
 * file-level index k.
 *
 * @ AddIOPS    result array; written at AddIOPS[k++] for each kernel whose
 *              best time is > 0. The caller must provide enough room.
 * @ item       receives the value of k after the first call (number of items
 *              recorded for the first device); later calls leave it untouched.
 * @ pciErrNum  receives the OpenCL error code of the failing call, if any.
 *
 * Returns 0 on success, 1 as soon as one kernel run fails.
 */
int runOnDevice(double* AddIOPS, unsigned long* item, cl_int* pciErrNum)
{
    // Best (minimum) time in ms of the last kernel run; ops / (ms / 1000) = ops per second.
    double final = 0.0;

    // ms() already stores the OpenCL error in *pciErrNum and returns 0/1, so the
    // status is tested directly instead of overwriting the error code with it.
    if(ms(&final,"floatAddTest",pciErrNum,1024, 0) != 0)
        return 1;
    if(final > 0.0)
        AddIOPS[k++] = ops_cnt / (final /1000.0);

    if(ms(&final,"floatMulTest",pciErrNum,1024,1) != 0)
        return 1;
    if(final > 0.0)
        AddIOPS[k++] = ops_cnt / (final /1000.0);

    if(ms(&final,"floatMADTest",pciErrNum,1024,2) != 0)
        return 1;
    if(final > 0.0)
        AddIOPS[k++] = ops_cnt / (final /1000.0) * 2.0; // presumably a MAD counts as two operations

    // The special-function tests use a smaller operation count and coefficient.
    ops_cnt = 128 * 1024 * 1024UL;

    if(ms(&final,"floatSFTest",pciErrNum,16,3) != 0)
        return 1;
    if(final > 0.0)
        AddIOPS[k++] = ops_cnt / (final /1000.0);

    if(ms(&final,"floatNativeSFTest",pciErrNum,16,3) != 0)
        return 1;
    if(final > 0.0)
        AddIOPS[k++] = ops_cnt / (final /1000.0);

    // Report the item count only once, after the first device.
    if(itemFlag == true)
    {
        *item = k;
        itemFlag = false;
    }
    return 0;
}
int ms(double *final,const char* kernelName, cl_int *pciErrNum, int coef,int id)
{
// create a command-queue
cqCommandQueue = clCreateCommandQueue(cxGPUContext, device, 0, &*pciErrNum);
if(CL_SUCCESS != *pciErrNum )
return 1;
//which kernel
switch(id)
{
case 0: kernel = clCreateKernel(cpProgram, kernelName, &*pciErrNum); break;
case 1: kernel = clCreateKernel(cpProgram1, kernelName, &*pciErrNum); break;
case 2: kernel = clCreateKernel(cpProgram2, kernelName, &*pciErrNum); break;
case 3: kernel = clCreateKernel(cpProgram3, kernelName, &*pciErrNum); break;
}
if(CL_SUCCESS != *pciErrNum )
return 1;
//setup execution parameter
clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(max_item), max_item, NULL);
localWorkSize = max_item[0] < 256 ? max_item[0] : 256;
globalWorkSize = ops_cnt / coef;
//set kernel arg
*pciErrNum = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&d_data);
if(CL_SUCCESS != *pciErrNum )
return 1;
/*
//input data in device memory. clCreateBuffer + clEnqueueCopyBuffer()
d_A[i] = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, workSize[i] * sizeof(float) * WA, NULL,NULL);
clEnqueueCopyBuffer(commandQueue[i], h_A, d_A[i], workOffset[i] * sizeof(float) * WA, 0, workSize[i] * sizeof(float) * WA, 0, NULL, NULL);
//or, we do like this: create + copy
d_B[i] = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
mem_size_B, h_B_data, NULL);
//Asynchronous write of data to GPU device
clEnqueueWriteBuffer(cqCommandQueue, cmDevSrcA, CL_FALSE, 0, sizeof(cl_float) * szGlobalWorkSize * 4, srcA, 0, NULL, NULL);
*/
/*
//warmup so we don't time driver startup
*pciErrNum = clEnqueueNDRangeKernel(cqCommandQueue, kernel, 1, 0,&globalWorkSize, &localWorkSize, 0, NULL, NULL);
if(CL_SUCCESS != *pciErrNum ) return 1;
*/
//event
//cl_event kernel_completion;
//foreach testnum
for(int i = 0; i< testnum; i++)
{
// shrDeltaT(0); //us
InitTimer();
*pciErrNum = clEnqueueNDRangeKernel(cqCommandQueue, kernel, 1, NULL, &globalWorkSize, &localWorkSize, 0, NULL, NULL); //&kernel_completion
if(CL_SUCCESS != *pciErrNum ) return 1;
// clWaitForEvents(1, &kernel_completion);
*pciErrNum = clFinish(cqCommandQueue);
if(CL_SUCCESS != *pciErrNum ) return 1;
// time[i] = shrDeltaT(0) * 1000.0; //ms
time[i] = GetTimer(); //ms
}
//clReleaseEvent(kernel_completion); //we use clFinish() or event to synchronize CPU and GPU
// Read back results and check accumulated errors
//clEnqueueReadBuffer(cqCommandQueue, cmDevDst, CL_TRUE, 0, sizeof(cl_float) * szGlobalWorkSize, dst, 0, NULL, NULL);
//release
clReleaseKernel(kernel);
clReleaseCommandQueue(cqCommandQueue);
//get ms
*final = GetTimeMin(testnum);
return 0;
}
/*
 * Returns the smallest of the first `len` entries of the global time[] array.
 * For len <= 1 the result is time[0]. The caller must keep `len` within the
 * size of time[].
 */
double GetTimeMin(int len)
{
    double tem = time[0];
    // The loop header was truncated in the published listing; it scans entries 1..len-1.
    for(int i = 1; i < len; i++)
    {
        if(time[i] < tem)
            tem = time[i];
    }
    return tem;
}
__inline void InitTimer(void)
{
counterT0 = __rdtsc();
}
__inline double GetTimer(void)
{
return (__rdtsc() - counterT0) * 1000.0 / counterFreq;
}
/*
 * Estimates how many time-stamp-counter ticks elapse per second.
 * Reads the counter, busy-waits for nFreq / 5 performance-counter ticks
 * (nFreq comes from QueryPerformanceFrequency), reads the counter again and
 * multiplies the difference by 5.
 * Side effect: stores the result in the global counterFreq.
 * Returns the same value that was stored in counterFreq.
 * Uses MSVC-style inline assembly (_asm / _emit).
 */
unsigned __int64 GetCPUSpeed(void)
{
unsigned __int64 start, stop;
unsigned __int64 nCtr, nFreq, nCtrStop;
QueryPerformanceFrequency((LARGE_INTEGER *)&nFreq);
// Bytes 0x0F 0x31 encode the RDTSC instruction; its result in edx:eax is
// copied into the low and high halves of `start`.
_asm _emit 0x0F
_asm _emit 0x31
_asm mov DWORD PTR start, eax
_asm mov DWORD PTR [start + 4], edx
QueryPerformanceCounter((LARGE_INTEGER *)&nCtrStop);
// Wait until the performance counter has advanced by nFreq / 5 ticks.
nCtrStop += nFreq / 5;
do
{
QueryPerformanceCounter((LARGE_INTEGER *)&nCtr);
}while (nCtr < nCtrStop);
// Second RDTSC read, stored in `stop` the same way.
_asm _emit 0x0F
_asm _emit 0x31
_asm mov DWORD PTR stop, eax
_asm mov DWORD PTR [stop + 4], edx
// The wait covered one fifth of nFreq, so scale the tick difference by 5.
counterFreq = (stop - start) * 5;
return counterFreq;
}
//--------------------.h---------------
#ifndef INT32TEST_H
#define INT32TEST_H
#define CHECK(i) if((i)!=CL_SUCCESS) return 1;
const char *FloatTestCLsource = " /
__kernel void floatAddTest(__global float *dummy_buf) /
{ /
} /
";
typedef struct
{
cl_platform_id platform;
cl_device_id* devices;
cl_uint numDevices;
}PD;
__int64 counterT0, counterFreq;
int FloatTest(double* dpTime, unsigned long* item, cl_int* pciErrNum, PD* platforms, cl_uint numPlatforms);
int subFloatTest(cl_int* pciErrNum);
int runOnDevice(double* AddIOPS, unsigned long* item, cl_int* pciErrNum);
int ms(double *final,const char* kernelName, cl_int *pciErrNum, int coef, int id);
double GetTimeMin(int len);
__inline void InitTimer(void);
__inline double GetTimer(void);
unsigned __int64 GetCPUSpeed(void);
#endif
/*
//some log functions
shrCheckCmdLineFlag(argc, (const char**)argv, "noprompt");
shrSetLogFileName("log.txt");
shrLog("Hello World!!!\n\n");
shrLog(" CL_PLATFORM_NAME: \t%s\n", cBuffer);
shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR);
oclLogBuildInfo(cpProgram, oclGetFirstDev(cxGPUContext));
oclLogPtx(cpProgram, oclGetFirstDev(cxGPUContext), "oclDotProduct.ptx");
//some checking functions
oclCheckError(ciErrNum, CL_SUCCESS);
shrCheckErrorEX (ciErrNum, CL_SUCCESS, pCleanup);
//some filling and result-checking functions
void* srcA = (void *)malloc(sizeof(cl_float4) * szGlobalWorkSize);
shrFillArray((float*)srcA, 4 * iNumElements);
shrDiffArray((const float*)dst, (const float*)Golden, iNumElements);
shrCompareL2fe(reference, h_C, size_C, 1e-6f);
//print WINDOWS systemInfo:
#ifdef _WIN32
SYSTEM_INFO stProcInfo; // processor info struct
OSVERSIONINFO stOSVerInfo; // Win OS info struct
SYSTEMTIME stLocalDateTime; // local date / time struct
// processor
SecureZeroMemory(&stProcInfo, sizeof(SYSTEM_INFO));
GetSystemInfo(&stProcInfo);
// OS
SecureZeroMemory(&stOSVerInfo, sizeof(OSVERSIONINFO));
stOSVerInfo.dwOSVersionInfoSize = sizeof(OSVERSIONINFO);
GetVersionEx(&stOSVerInfo);
// date and time
GetLocalTime(&stLocalDateTime);
// write time and date to logs
shrLog(" Local Time/Date = %i:%i:%i, %i/%i/%i\n",
stLocalDateTime.wHour, stLocalDateTime.wMinute, stLocalDateTime.wSecond,
stLocalDateTime.wMonth, stLocalDateTime.wDay, stLocalDateTime.wYear);
// write proc and OS info to logs
shrLog(" CPU Arch: %i\n CPU Level: %i\n # of CPU processors: %u\n Windows Build: %u\n Windows Ver: %u.%u\n\n\n",
stProcInfo.wProcessorArchitecture, stProcInfo.wProcessorLevel, stProcInfo.dwNumberOfProcessors,
stOSVerInfo.dwBuildNumber, stOSVerInfo.dwMajorVersion, stOSVerInfo.dwMinorVersion);
#endif
//oclMatrixMul use event to time:
double executionTime(cl_event &event)
{
cl_ulong start, end;
clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL);
clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL);
return (double)1.0e-9 * (end - start); // convert nanoseconds to seconds on return
}
*/