opencl学习(三)

opencl学习任务紧迫,只看文字资料速度太慢,还是跟着大家一起尝试不同的代码,边做边学。昨天把上次写的一维数组相加的代码做了修改,变成二维数组相加。初学者就是这么无聊,大家见谅!程序还是在freescale的i.MX6q上跑的,这个芯片的GPU中work group大小最大为1024,因此一维数组元素个数最大就是1024,二维中也只能width * height <= 1024。另外,此代码中通过事件的方式,利用opencl的API进行了kernel运算计时。借此可以慢慢了解一些事件相关的使用。

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <iostream>
#include <CL/cl.h>

using namespace std;

int main()
{
	int width = 32, height = 32;
	
	float *buf1 = 0;
	float *buf2 = 0;
	float *buf  = 0;
	
	buf1 = (float *)malloc(width * height * sizeof(float));
	buf2 = (float *)malloc(width * height * sizeof(float));
	buf  = (float *)malloc(width * height * sizeof(float));

	for(int i=0; i<32*32; i++)
	{
		buf1[i] = i*1.2;
		buf2[i] = i*1.5;
	}
	
	cl_int ret;
	//get platform id
	cl_uint numplatforms;
	cl_platform_id platform;
	cl_platform_id *platforms;
	
	ret = clGetPlatformIDs(0, NULL, &numplatforms);	
	if( ret != CL_SUCCESS )
	{
		printf("get platformID failed!");
		return 0;
	}
	platforms = new cl_platform_id[numplatforms];
	ret = clGetPlatformIDs(numplatforms, platforms, NULL);
	platform = platforms[0];//here we just use the first one of the platforms
	
	delete[] platforms;

	//get device id
	cl_uint numdevices;
	cl_device_id device;
	cl_device_id *devices;
	
	ret = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, NULL, &numdevices);
	if( ret != CL_SUCCESS )
	{
		printf("get deviceID failed!");
		return 0;
	}
	devices = new cl_device_id[numdevices];
	ret = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, numdevices, devices, NULL);
	device = devices[0];
	delete[] devices;
	
	//create context
	cl_context context = clCreateContext(NULL, 1, &device, NULL, NULL, NULL);	
	
	//create commandqueue
	cl_command_queue queue = clCreateCommandQueue( context, device, CL_QUEUE_PROFILING_ENABLE, &ret );
	if( ret != CL_SUCCESS )
	{
		printf(" queue creation failure!\n");
		return 0;
	}

	//create opencl memory objects
	//and copy buf1 to clbuf1 implicitly
	//while copy buf2 to clbuf2 explicitly
	cl_mem clbuf1 = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, 
									width*height*sizeof(cl_float), buf1, NULL);
	cl_mem clbuf2 = clCreateBuffer(context, CL_MEM_READ_ONLY, width*height*sizeof(cl_float), NULL, NULL);	
	
	ret = clEnqueueWriteBuffer(queue, clbuf2, 1, 0, width*height*sizeof(cl_float), buf2, 0, 0, NULL);// &writeEvt);

	cl_mem clbuf = clCreateBuffer(context, CL_MEM_WRITE_ONLY, width*height*sizeof(cl_float), NULL, NULL);
	
	const char* source = "	__kernel void vecadd(__global const float *a, \
							__global const float *b, __global const float *c)\
							{\
								int x = get_global_id(0);\
								int y = get_global_id(1);\
								int width = get_global_size(0);\
								int height = get_global_size(1);\
								c[x+y*width] = a[x+y*width]+b[x+y*width];\
							}";
	//create program object
	cl_program program = clCreateProgramWithSource(context,	1, &source, NULL, &ret);
	if( ret != CL_SUCCESS )
	{
		printf(" program creation failure!\n");
		return 0;
	}
	//build program object
	ret = clBuildProgram( program, 1, &device, NULL, NULL, NULL);
	if( ret != CL_SUCCESS )
	{
		printf(" program build failure!\n");
		return 0;
	}

	//create kernel
	cl_kernel kernel = clCreateKernel(program, "vecadd", &ret);
	if( ret != CL_SUCCESS )
	{
		printf("kernel creation failure!\n");
		return 0;
	}
	
	//set kernel argument
	cl_int num = width*height;
	clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&clbuf1);
	clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&clbuf2);
	clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&clbuf);

	//create kernel, rang 2dim, work itmes width*height
	size_t localx, localy;
	if(width/8 >4)
		localx = 16;
	else if(width<8)
		localx = width;
	else localx = 8;

	if(height/8 >4)
		localy = 16;
	else if(height<8)
		localy = height;
	else localy = 8;

	size_t globalThreads[] = {width, height};	
	size_t localThreads[] = {localx, localy};

	cl_event ev;
	//execu kernel
	ret = clEnqueueNDRangeKernel(queue, kernel, 2, 0, globalThreads, 
								localThreads, 0, NULL, &ev);
	if( ret != CL_SUCCESS )
	{
		if(ret == CL_INVALID_KERNEL) printf("invalid kernel!\n");
		printf("%d\n", ret);
		printf("execution failure!\n");
		return 0;
	}
	clFinish( queue );
	//calc the execu time
	cl_ulong startTime = 0, endTime = 0;
	clGetEventProfilingInfo(ev, CL_PROFILING_COMMAND_START, 
							sizeof(cl_ulong), &startTime, NULL);	
	clGetEventProfilingInfo(ev, CL_PROFILING_COMMAND_END, 
							sizeof(cl_ulong), &endTime, NULL);	
	cl_ulong time = endTime - startTime;
	printf("simple kernel start time: %8.6f ms\n", startTime*1e-6);
	printf("simple kernel end time: %8.6f ms\n", endTime*1e-6);
	printf("simple kernel exec time: %8.6f ms\n", time*1e-6);
	return 0;
}

本来想把同样的计时方式添加到上篇中一维数组的计算当中,但不论怎么修改,程序运行虽然都正常,计时却始终显示为0,不能得到有效时间。有感兴趣的初学者朋友可以试试,ok的话给留个言。多谢。

刚说完,问题的原因就找到了:在创建命令队列的时候有一个参数需要使能。多亏一起学习的哥们帮忙,看来常用的API每一个参数的功能还是要好好看。

创建命令队列的函数原型为:

cl_command_queue  clCreateCommandQueue  ( cl_context  context , 
cl_device_id device , 
cl_command_queue_properties  properties, 
cl_int *errcode_ret ) 

第一个参数为一个有效的上下文context。

第二个参数为与该上下文绑定的设备device。

第三个参数为要建立的队列的属性,有两个备选的属性,CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE和CL_QUEUE_PROFILING_ENABLE。前者决定该命令队列中的命令是按顺序执行还是乱序执行:如果设置了,则该命令队列将乱序执行,否则按顺序执行。后者决定是否对命令进行剖析(profiling):如果设置则剖析命令,否则不剖析。是否使能剖析,直接决定了能否获取命令的执行时间。详情请查询opencl手册。

第四个参数返回错误码。

具体问题如下:

此篇中,二维数组命令队列建立语句为:

//create commandqueue
cl_command_queue queue = clCreateCommandQueue( context, device, CL_QUEUE_PROFILING_ENABLE, &ret );

上篇中,一维数组由于没想使用事件,命令队列的建立语句为:

//Command-queue  
queue = clCreateCommandQueue( context, device, 0, &errNum );

其中第三个参数在一维数组的计算中没有设置剖析命令,因此无法获取命令执行时间。设置为CL_QUEUE_PROFILING_ENABLE即可计算运行时间。

对比发现,在freescale的i.MX6q上,两个含有1024个元素的一维数组相加耗时,是两个32*32的二维数组相加耗时的至少两倍。

你可能感兴趣的:(OpenCL)