Platform: the host together with a number of devices managed by the OpenCL framework constitutes a platform. Through the platform, an application can share resources with the devices and execute kernels on them.
A platform is represented by cl_platform_id and can be initialized with the following call:
cl_int oclGetPlatformID (cl_platform_id *platforms); // Pointer to the platform object
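Note that oclGetPlatformID is a convenience helper from the NVIDIA OpenCL SDK, not part of the OpenCL standard itself. Using only the standard API, a rough equivalent that simply takes the first available platform looks like this (a minimal sketch):
cl_uint num_platforms = 0;
clGetPlatformIDs(0, NULL, &num_platforms); // First call: how many platforms are available?
cl_platform_id platform = NULL;
if (num_platforms > 0)
    clGetPlatformIDs(1, &platform, NULL); // Second call: take the first platform found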
Device: represented by cl_device_id and obtained with clGetDeviceIDs
cl_int clGetDeviceIDs (cl_platform_id platform,
cl_device_type device_type, // Bitfield identifying the type. For the GPU we use CL_DEVICE_TYPE_GPU
cl_uint num_entries, // Number of devices, typically 1
cl_device_id *devices, // Pointer to the device object
cl_uint *num_devices // Puts here the number of devices matching the device_type
);
Context: defines the whole OpenCL environment, including the OpenCL kernels, the devices, memory management, command queues, and so on. A context is represented by cl_context and created with clCreateContext
cl_context clCreateContext(const cl_context_properties *properties, // Bitwise with the properties (see specification)
cl_uint num_devices,
const cl_device_id *devices, // Pointer to the devices object
void (*pfn_notify)(const char *errinfo, const void *private_info, size_t cb, void *user_data), // (don't worry about this)
void *user_data, // (don't worry about this)
cl_int *errcode_ret // error code result
);
Command Queue: as the name suggests, this is a queue that stores the OpenCL commands to be executed on a device.
A command queue is created for a specific device within a context; multiple command queues allow an application to issue independent commands without requiring synchronization between them.
cl_command_queue clCreateCommandQueue (
cl_context context,
cl_device_id device,
cl_command_queue_properties properties, // Bitwise with the properties
cl_int *errcode_ret // error code result
);
Here is a simple example:
cl_int error = 0; // Used to handle error codes
cl_platform_id platform;
cl_context context;
cl_command_queue queue;
cl_device_id device;
// Platform
error = oclGetPlatformID(&platform);
if (error != CL_SUCCESS) {
cout << "Error getting platform id: " << errorMessage(error) << endl;
exit(error);
}
// Device
error = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
if (error != CL_SUCCESS) {
cout << "Error getting device ids: " << errorMessage(error) << endl;
exit(error);
}
// Context
context = clCreateContext(0, 1, &device, NULL, NULL, &error);
if (error != CL_SUCCESS) {
cout << "Error creating context: " << errorMessage(error) << endl;
exit(error);
}
// Command-queue
queue = clCreateCommandQueue(context, device, 0, &error);
if (error != CL_SUCCESS) {
cout << "Error creating command queue: " << errorMessage(error) << endl;
exit(error);
}
To allocate memory on the device, we use the cl_mem type, created with clCreateBuffer:
cl_mem clCreateBuffer (cl_context context, // The context where the memory will be allocated
cl_mem_flags flags,
size_t size, // The size in bytes
void *host_ptr,
cl_int *errcode_ret
);
flags is a bit-field; the options are:
CL_MEM_READ_WRITE
CL_MEM_WRITE_ONLY
CL_MEM_READ_ONLY
CL_MEM_USE_HOST_PTR
CL_MEM_ALLOC_HOST_PTR
CL_MEM_COPY_HOST_PTR – copies the data from host_ptr
Here is a simple memory-allocation example. It allocates three buffers: two holding the input vectors a and b, and one for the vector that stores the result of their addition. The _d suffix on a variable name indicates memory allocated on the device (and _h indicates host memory).
const int mem_size = sizeof(float)*size;
// Allocates a buffer of size mem_size and copies mem_size bytes from src_a_h
cl_mem src_a_d = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, mem_size, src_a_h, &error);
cl_mem src_b_d = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, mem_size, src_b_h, &error);
cl_mem res_d = clCreateBuffer(context, CL_MEM_WRITE_ONLY, mem_size, NULL, &error);
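The snippet above assumes the host arrays src_a_h and src_b_h (and the element count size) were set up beforehand; that setup is not shown in this article. A minimal sketch of what it might look like (the names and the element count are only for illustration):
const int size = 1234567; // Number of elements (arbitrary)
float* src_a_h = new float[size];
float* src_b_h = new float[size];
float* res_h = new float[size]; // Optional host-side copy of the result
for (int i = 0; i < size; i++) {
    src_a_h[i] = src_b_h[i] = (float)i; // Some test data
}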
Kernel: a kernel is essentially a function, callable from the host, that runs on the device. All code that runs on the device, the kernels as well as the other functions the kernels call,
is compiled at run time. This brings us to the next concept, the program.
Program: an OpenCL program consists of kernel functions, other functions, and declarations. It is represented by cl_program. When creating a program you specify which source files it is built from, and then you compile it.
Creating the program:
cl_program clCreateProgramWithSource (
cl_context context,
cl_uint count, // Number of source strings (files)
const char **strings, // Array of strings, each one holding the contents of one source file
const size_t *lengths, // Array with the length of each source string
cl_int *errcode_ret // Error code result
);
Building the program:
cl_int clBuildProgram (cl_program program,
cl_uint num_devices, // Number of devices
const cl_device_id *device_list, // List of device IDs
const char *options, // Compiler options
void (*pfn_notify)(cl_program, void *user_data), // Callback (can be NULL)
void *user_data
);
To inspect the build log, use the following function:
cl_int clGetProgramBuildInfo (cl_program program,
cl_device_id device,
cl_program_build_info param_name, // The parameter we want to query, e.g. CL_PROGRAM_BUILD_LOG
size_t param_value_size, // Size of the buffer that receives the result
void *param_value, // Where the result is written
size_t *param_value_size_ret // Actual size of the result
);
To extract an entry point (kernel) of the program, we use cl_kernel:
cl_kernel clCreateKernel (
cl_program program, // The program where the kernel is
const char *kernel_name, // The name of the kernel, i.e. the name of the kernel function as it's declared in the code
cl_int *errcode_ret
);
Here is a simple example:
// Creates the program
// Uses NVIDIA helper functions to get the code string and its size (in bytes)
size_t src_size = 0;
const char* path = shrFindFilePath("vector_add_gpu.cl", NULL);
const char* source = oclLoadProgSource(path, "", &src_size);
// Create the program
cl_program program = clCreateProgramWithSource(context, 1, &source, &src_size, &error);
assert(error == CL_SUCCESS);
// Build the program
error = clBuildProgram(program, 1, &device, NULL, NULL, NULL);
assert(error == CL_SUCCESS);
// Show the build log
char* build_log;
size_t log_size;
// First call to get the size of the build log
clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
build_log = new char[log_size+1];
// Second call to get the contents of the build log
clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL);
build_log[log_size] = '\0';
cout << build_log << endl;
delete[] build_log;
// Extract the kernel
// "vector_add_gpu" is the name of the kernel function vector_add_gpu(...) declared in the .cl file
cl_kernel vector_add_k = clCreateKernel(program, "vector_add_gpu", &error);
assert(error == CL_SUCCESS);
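The contents of vector_add_gpu.cl are not shown in this article; a kernel consistent with the host code below might look like this (a sketch, assuming the element count is passed as an int):
__kernel void vector_add_gpu(__global const float* src_a,
                             __global const float* src_b,
                             __global float* res,
                             const int num)
{
    // get_global_id(0) returns the global index of the work-item being executed
    const int idx = get_global_id(0);
    // The global work size may be rounded up past num, so guard against out-of-range work-items
    if (idx < num)
        res[idx] = src_a[idx] + src_b[idx];
}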
Running the kernel
Once our kernel is built, we can run it.
First, we have to set the kernel's arguments:
cl_int clSetKernelArg (
cl_kernel kernel, // Which kernel
cl_uint arg_index, // Which argument (0, 1, 2, ... for the 1st, 2nd, 3rd, ... argument)
size_t arg_size, // Size of the next argument (not of the value pointed by it!)
const void *arg_value // Value
);
This function has to be called once for each argument.
Once all the arguments are set, we can enqueue the kernel:
cl_int clEnqueueNDRangeKernel(
cl_command_queue command_queue,
cl_kernel kernel,
cl_uint work_dim, // Choose if we are using 1D, 2D or 3D work-items and work-groups
const size_t *global_work_offset,
const size_t *global_work_size, // The total number of work-items (must have work_dim dimensions)
const size_t *local_work_size, // The number of work-items per work-group (must have work_dim dimensions)
cl_uint num_events_in_wait_list,
const cl_event *event_wait_list,
cl_event *event);
// Enqueuing parameters
// Note that we pass the size of the cl_mem object, not the size of the memory it points to
error = clSetKernelArg(vector_add_k, 0, sizeof(cl_mem), &src_a_d);
error |= clSetKernelArg(vector_add_k, 1, sizeof(cl_mem), &src_b_d);
error |= clSetKernelArg(vector_add_k, 2, sizeof(cl_mem), &res_d);
error |= clSetKernelArg(vector_add_k, 3, sizeof(int), &size); // size is an int on the host, so pass sizeof(int)
assert(error == CL_SUCCESS);
// Launching kernel
const size_t local_ws = 512; // Number of work-items per work-group
// shrRoundUp returns the smallest multiple of local_ws bigger than size
const size_t global_ws = shrRoundUp(local_ws, size); // Total number of work-items
error = clEnqueueNDRangeKernel(queue, vector_add_k, 1, NULL, &global_ws, &local_ws, 0, NULL, NULL);
assert(error == CL_SUCCESS);
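shrRoundUp (like shrFindFilePath and oclLoadProgSource earlier) is a helper from the NVIDIA OpenCL SDK samples; if you are not using that SDK, an equivalent is easy to write (a sketch):
// Smallest multiple of group_size that is >= global_size
size_t round_up(size_t group_size, size_t global_size) {
    size_t r = global_size % group_size;
    return (r == 0) ? global_size : global_size + group_size - r;
}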
Reading back the results
Reading the results is very easy. Analogously to the write into device memory discussed earlier, we now enqueue a read-buffer operation. Because the read below is blocking (CL_TRUE) and the default command queue executes in order, the kernel is guaranteed to have finished before the data arrives in host memory.
cl_int clEnqueueReadBuffer (
cl_command_queue command_queue,
cl_mem buffer, // The buffer to read from
cl_bool blocking_read, // Whether the read is blocking or not
size_t offset, // Offset from the beginning of the buffer
size_t cb, // Number of bytes to read
void *ptr, // Pointer to the host memory that receives the data
cl_uint num_events_in_wait_list,
const cl_event *event_wait_list,
cl_event *event);
Usage example:
float* check = new float[size];
clEnqueueReadBuffer(queue, res_d, CL_TRUE, 0, mem_size, check, 0, NULL, NULL);
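A quick sanity check of the data read back (assuming the host arrays from the allocation sketch above):
// Compare the GPU result with the sum computed on the host
for (int i = 0; i < size; i++) {
    assert(fabs(check[i] - (src_a_h[i] + src_b_h[i])) < 1e-6f);
}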
Cleaning up
Everything we created with a clCreate* call (buffers, the kernel, the program, the command queue, the context) must be released with the matching clRelease* call:
delete[] src_a_h;
delete[] src_b_h;
delete[] res_h;
delete[] check;
clReleaseKernel(vector_add_k);
clReleaseProgram(program);
clReleaseMemObject(src_a_d);
clReleaseMemObject(src_b_d);
clReleaseMemObject(res_d);
clReleaseCommandQueue(queue);
clReleaseContext(context); // release the context last