1,OpenCL程序包含两部分:
在设备(Device,如GPU)上执行的部分,以及在主机(Host,如CPU)上运行的部分。
需要使用OpenCL语言编写Kernel函数。
2,
(1)Kernel:设备程序执行的入口点,是唯一可以从主机上调用执行的函数。
eg:
Function:
/*
 * Host (CPU) reference version of the vector addition.
 * Stores src_a[k] + src_b[k] into res[k] for every k in [0, num);
 * nothing is written when num <= 0.
 */
void vector_add_cpu (const float* src_a, const float* src_b, float* res, const int num)
{
    int k = 0;
    while (k < num) {
        res[k] = src_a[k] + src_b[k];
        ++k;
    }
}
OpenCL Kernel:
/*
 * OpenCL kernel: element-wise sum of two float vectors.
 * Each work-item handles at most one element, selected by its global ID.
 */
__kernel void vector_add_gpu (__global const float* src_a, __global const float* src_b, __global float* res, const int num)
{
    /* get_global_id(0) returns the ID of the work-item running this code.
       Many work-items start executing the same kernel at the same time;
       each one receives a different ID and so performs a different computation. */
    const int gid = get_global_id(0);

    /* A work-item whose ID lies outside the vector range has nothing to do. */
    if (gid >= num)
        return;

    res[gid] = src_a[gid] + src_b[gid];
}
如何编写Kernel,如何表达并行性,执行模型是什么样的?
SIMT: Single Instruction, Multiple Threads(单指令多线程)
work-item(工作项): 最小的执行单元。当一个Kernel开始执行,可定义数量的work-item开始运行,每个都执行相同的代码。其中,每个work-item有一个id,可以在kernel中被访问,运行在work-item上的kernel通过id找到work-item需要处理的数据。
work-group(工作组):允许work-item之间的通信和协作。也有唯一的可被kernel读取的id。
ND-range:下一个组织级别,定义work-group的组织形式。
Attention:
a. 关键字__kernel定义函数为kernel, 必须返回void。
b. 关键字__global位于参数前面,定义了参数内存的存放位置。
c. 所有Kernel必须写在".cl"文件中,".cl"文件必须只包含OpenCL代码。
(2)Host(主机)
首先建立基本的OpenCL运行环境:
Platform: Host加OpenCL框架管理下的若干设备构成Platform。通过Platform,应用程序可以与设备共享资源并在设备上执行Kernel。Platform通过cl_platform_id表现。
Platform初始化:
// return error code cl_int oclGetPlatformID(cl_platform_id *platforms) //Pointer to the platform object
Device: 通过cl_device_id表现。
// Return the error code cl_int clGetDeviceIDs(cl_platform_id platform, cl_device_type device_type, //Bitfield identifying the type. For GPU we use CL_DEVICE_TYPE_GPU cl_uint num_entries, //Number of Devices, typically 1 cl_device_id *devices, //Pointer to the device object cl_unit *num_devices) //Puts here the number of devices matching the device_type
Context: 定义整个OpenCL环境,包括OpenCL kernel、设备、内存管理、命令队列等。Context使用cl_context表现。
// Returns the contxt cl_context cl_CreateContext(onst cl_context_properties *properties, //Bitwise with the properties(ee specification) cl_unit num_devices, //Number of devices const cl_device_id *devices, //Pointer to devices object void (*pfn_notify)(const char *errinfo, const void *private_info, size_t cb, void *user_data), void *user_data, cl_int *errcode_ret) //error code result
Command_Queue: 存储需要在设备上执行的OpenCL指令的队列。Command_Queue建立在一个Context指定的设备上,多个Command_Queue允许应用程序在不需要同步的情况下执行多条无关联的指令。
cl_command_queue clCreateCommandQueue(cl_context context, cl_device_id device, cl_command_queue_properties properties, //Bitwise with the properties cl_int *errcode_ret) //Error code result
环境初始化方法:
cl_int error = 0; // Used to handle error codes cl_platform_id platform; cl_context context; cl_command_queue queue; cl_device_id device; // Platform error = oclGetPlatformID(&platform); if (error != CL_SUCCESS) { cout << "Error getting platform id: " << errorMessage(error) << endl; exit(error); } // Device error = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL); if (err != CL_SUCCESS) { cout << "Error getting device ids: " << errorMessage(error) << endl; exit(error); } // Context context = clCreateContext(0, 1, &device, NULL, NULL, &error); if (error != CL_SUCCESS) { cout << "Error creating context: " << errorMessage(error) << endl; exit(error); } // Command-queue queue = clCreateCommandQueue(context, device, 0, &error); if (error != CL_SUCCESS) { cout << "Error creating command queue: " << errorMessage(error) << endl; exit(error); }
接着,分配内存。
针对介绍Kernel时举的例子,需要分配三个向量的内存空间,且至少将其中的两个初始化。
const int size = 1234567 float* src_a_h = new float[size]; float* src_b_h = new float[size]; float* res_h = new float[size]; // Initialize both vectors for (int i = 0; i < size; i++) { src_a_h = src_b_h = (float) i; }
而在Devices上分配内存,我们需要使用cl_mem类型:
//Returns the cl_mem object referencing the memory allocated on the device cl_mem clCreateBuffer(cl_context context, //the context where the memory will be allocated cl_mem_flags flags, size_t size // Size in bytes void *host_ptr, cl_int *errcode_ret)
其中,flags是位域(bit-field),可以用按位或(|)组合多个选项,选项包括:
CL_MEM_READ_WRITE
CL_MEM_WRITE_ONLY
CL_MEM_READ_ONLY
CL_MEM_USE_HOST_PTR
CL_MEM_ALLOC_HOST_PTR
CL_MEM_COPY_HOST_PTR – 从 host_ptr处拷贝数据
而clCreateBuffer的使用方法为:
const int mem_size = sizeof(float)*size; // Allocates a buffer of size mem_size and copies mem_size bytes from src_a_h cl_mem src_a_d = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, mem_size, src_a_h, &error); cl_mem src_b_d = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, mem_size, src_b_h, &error); cl_mem res_d = clCreateBuffer(context, CL_MEM_WRITE_ONLY, mem_size, NULL, &error);
(3)程序与Kernel
怎么调用Kernel?编译器怎么将代码放在Device上?怎么编译Kernel?
对比OpenCL Program与OpenCL Kernel:
Kernel:本质是一个可以在Host上调用,在Device上运行的函数,是在运行时编译的。所有运行在Device上的代码,包括Kernel以及Kernel调用的其他函数都是在运行时编译的。
Program:OpenCL Program是由Kernel函数/其他函数/声明组成。通过cl_program表示。当创建一个Program时,必须指定其是由哪些文件组成的,然后编译它。
Create a Program:
// Returns the OpenCL program cl_program clCreateProgramWithSource(cl_context context, cl_uint count, //number of files const char **strings, //array of strings, each one is a file const size_t *lengths, //array specifying the file lengths cl_int *errcode_ret) //error code to be returned
在创建了Program后,执行编译操作:
cl_int clBuildProgram(cl_program program, cl_uint num_devices, const cl_device_id *device_list, const char *options, //Compiler options, see the specifications for more details void (*pfn_notify)(cl_program, void *user_data), void *user_data)
编译后,我们使用如下函数查看编译log:
cl_int clGetProgramBuildInfo(cl_program program, cl_device_id device, cl_program_build_info param_name, //The parameter we want to know size_t param_value_size, void *param_value, // The answer size_t *param_value_size_ret)
最后提取Program的入口,使用cl_kernel:
cl_kernel cl_CreateKernel(cl_pogram program, const char *kernel_name, //The name of the kernel, i.e.the name of the kernel function as it's declared in the code cl_int *errcode_ret)
我们可以创建多个Program,而每个Program中可以包含多个Kernel。创建Program:
// Create a program size_t src_size = 0; const char* path = shrFindFilePath("vector_add_gpu.cl", NULL); const char* source = oclLoadProgSource(path, "", &src_size); cl_program program = clCreateProgramWithSource(context, 1 &source, &src_size, &error); assert(error == CL_SUCCESS) //Build the program error = clBuildProgram(program, 1, &device, NULL, NULL, NULL); assert(error == CL_SUCCESS) //Show the log char* build_log; size_t log_size; //First call to know the proper size clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); build_log = new char[log_size + 1]; //Second call to get the log clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL); build_log[log_size] = ''\0"; cout << build_log << endl; delete[] build_log; //Extracting the kernel cl_kernel vector_add_kernel = clCreateKernel(program, "vector_add_gpu", &error); assert(error == CL_SUCCESS);
我们将Kernel建立好后,在运行Kernel前,需要先设置Kernel的参数。
cl_int clSetKernelArg(cl_kernel kernel, //Which Kernel cl_uint arg_index, //Which argument size_t arg_size, //Size of the next argument(not the value pointed by it) const void *arg_value) //Value
对于Kernel中的每个参数,都需要调用上述的函数来为参数设置Value。所有参数设置完毕后,我们可以调用Kernel:
cl_int clWnqueueNDRangeKernel(cl_command_queue conmmand_queue, cl_kernel kernel, cl_uint work_dim, //Choose if using 1D, 2D or 3D work-items and work-groups const size_t *global_work_offset, const size_t *global_work_size, //The total number of work-items(must have work-dim dimensions) const size_t *local_work_size, //The number of work-items per work-group(must have work-dim dimensions) cl_uint num_events_in_wait_list, const cl_event *event_wait_list, cl_event *event)
最终,设置参数及调用Kernel:
//Enqueuing parameters //Note that we inform the size of the cl_mem object, not the size of the memory pointed by it error = clSetKernelArg(vector_add_k, 0, sizeof(cl_mem), &src_a_d); error |= clSetKernelArg(vector_add_k, 1, sizeof(cl_mem), &src_b_d); error |= clSetKernelArg(vector_add_k, 2, sizeof(cl_mem), &res_d); error |= clSetKernelArg(vector_add_k, 3, sizeof(size_t), &size); assert(error == CL_SUCCESS) //Luauching Kernel const size_t local_ws = 512; //Number of work-items per work-group //shrRoundUp returns the smallest multiple of local_ws bigger than size const size_t global_ws = shrRoundUp(local_ws, size); //Total number of work-items error = clEnqueueNDRangeKernel(queue, vector_add_k, 1, NULL, &global_ws, &local_ws, 0, NULL, NULL); assert(error == CL_SUCCESS);
(4)读取结果
cl_int clEnqueueReadBuffer(cl_command_queue command_queue, cl_mem buffer, //From which buffer cl_bool offset //Offset from the beginning size_t cb, //Size to be read(in bytes) void *ptr, //Pointer to the host memory cl_uint num_events_in_wait_list, const cl_event *event_wait_list, cl_event *event)
使用方法如下:
//Reading back float* check = net float[size]; clEnQueueReadBuffer(queue, res_d, CL_TRUE, 0, mem_size, check, 0, NULL, NULL);
(5)清除内存
使用clCreate申请的(缓冲区/Kernel/队列等)内存必须使用clRelease释放。
// Cleaning memory delete[] src_a_h; delete[] src_b_h; delete[] res_h; delete[] check; clReleaseKernel(vector_add_k); clReleaseCommandQueue(queue); clReleaseContext(context); clReleaseMemObject(src_a_d); clReleaseMemObject(src_ab_d); clReleaseMemObject(res_d);