http://www.drdobbs.com/parallel/a-gentle-introduction-to-opencl/231002854?queryText=openCL
Remember: CUDA has better performance than OpenCL
openCL是一个通用标准接口,类似于MPI,各个组织都可以实现它,比如intel、AMD、Apple、NVidia都有自己的实现,所以它可以在CPU和GPU上运行。并且它是一个跨平台的编程架构,可以同时支持不同的device一起混合运行。可以这么讲,openCL是一个编写kernel的语言。
每个openCL程序有一个主程序(host),然后host把工作kernel分配到device上,每个host APP需要5个data structure
cl_device_id:对应于运行的硬件设备。
cl_kernel:需要计算的单元模块。
cl_program:程序本身
cl_command_queue:顾名思义,很容易理解
cl_context:运行环境
openCL的kernel运行和内存分派模式
内存的结构
下面是一个简单的加法例子
下面这部分是host部分,重点看main中添加了数字标号的注释部分,理解前面介绍的host需要的5个data structure。
#define PROGRAM_FILE "add_numbers.cl" #define KERNEL_FUNC "add_numbers" #define ARRAY_SIZE 64 #include <math.h> #include <stdio.h> #include <stdlib.h> #include <string.h> #include <time.h> #ifdef MAC #include <OpenCL/cl.h> #else #include <CL/cl.h> #endif /* Find a GPU or CPU associated with the first available platform */ cl_device_id create_device() { cl_platform_id platform; cl_device_id dev; int err; /* Identify a platform */ err = clGetPlatformIDs(1, &platform, NULL); if(err < 0) { perror("Couldn't identify a platform"); exit(1); } /* Access a device */ err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &dev, NULL); if(err == CL_DEVICE_NOT_FOUND) { err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 1, &dev, NULL); } if(err < 0) { perror("Couldn't access any devices"); exit(1); } return dev; } /* Create program from a file and compile it */ cl_program build_program(cl_context ctx, cl_device_id dev, const char* filename) { cl_program program; FILE *program_handle; char *program_buffer, *program_log; size_t program_size, log_size; int err; /* Read program file and place content into buffer */ program_handle = fopen(filename, "r"); if(program_handle == NULL) { perror("Couldn't find the program file"); exit(1); } fseek(program_handle, 0, SEEK_END); program_size = ftell(program_handle); rewind(program_handle); program_buffer = (char*)malloc(program_size + 1); program_buffer[program_size] = '\0'; fread(program_buffer, sizeof(char), program_size, program_handle); fclose(program_handle); /* Create program from file */ program = clCreateProgramWithSource(ctx, 1, (const char**)&program_buffer, &program_size, &err); if(err < 0) { perror("Couldn't create the program"); exit(1); } free(program_buffer); /* Build program */ err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if(err < 0) { /* Find size of log and print to std output */ clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); program_log = (char*) malloc(log_size + 1); 
program_log[log_size] = '\0'; clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG, log_size + 1, program_log, NULL); printf("%s\n", program_log); free(program_log); exit(1); } return program; } int main() { /* OpenCL structures */ cl_device_id device; cl_context context; cl_program program; cl_kernel kernel; cl_command_queue queue; cl_int i, j, err; size_t local_size, global_size; /* Data and buffers */ float data[ARRAY_SIZE]; float sum[2], total, actual_sum; cl_mem input_buffer, sum_buffer; cl_int num_groups; /* Initialize data */ for(i=0; i<ARRAY_SIZE; i++) { data[i] = 1.0f*i; } /* Create device and context */ device = create_device(); // 1. get device context = clCreateContext(NULL, 1, &device, NULL, NULL, &err); //2. create context if(err < 0) { perror("Couldn't create a context"); exit(1); } /* Build program */ program = build_program(context, device, PROGRAM_FILE); //3. build program /* Create data buffer */ global_size = 8; local_size = 4; num_groups = global_size/local_size; input_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, ARRAY_SIZE * sizeof(float), data, &err); sum_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, num_groups * sizeof(float), sum, &err); if(err < 0) { perror("Couldn't create a buffer"); exit(1); }; /* Create a command queue */ queue = clCreateCommandQueue(context, device, 0, &err); // 4. 
create command queue if(err < 0) { perror("Couldn't create a command queue"); exit(1); }; /* Create a kernel */ kernel = clCreateKernel(program, KERNEL_FUNC, &err); // 5.create kernel if(err < 0) { perror("Couldn't create a kernel"); exit(1); }; /* Create kernel arguments */ err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input_buffer); err |= clSetKernelArg(kernel, 1, local_size * sizeof(float), NULL); err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &sum_buffer); if(err < 0) { perror("Couldn't create a kernel argument"); exit(1); } /* Enqueue kernel */ err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global_size, &local_size, 0, NULL, NULL); if(err < 0) { perror("Couldn't enqueue the kernel"); exit(1); } /* Read the kernel's output */ err = clEnqueueReadBuffer(queue, sum_buffer, CL_TRUE, 0, sizeof(sum), sum, 0, NULL, NULL); if(err < 0) { perror("Couldn't read the buffer"); exit(1); } /* Check result */ total = 0.0f; for(j=0; j<num_groups; j++) { total += sum[j]; } actual_sum = 1.0f * ARRAY_SIZE/2*(ARRAY_SIZE-1); printf("Computed sum = %.1f.\n", total); if(fabs(total - actual_sum) > 0.01*fabs(actual_sum)) printf("Check failed.\n"); else printf("Check passed.\n"); /* Deallocate resources */ clReleaseKernel(kernel); clReleaseMemObject(sum_buffer); clReleaseMemObject(input_buffer); clReleaseCommandQueue(queue); clReleaseProgram(program); clReleaseContext(context); return 0; }
下面就是一个kernel的例子。
/*
 * Sum the input in two stages: every work-item adds up two consecutive
 * float4 elements of `data`, then work-item 0 of each work-group adds the
 * group's partial sums and stores the total in `group_result`.
 *
 * data:         input values, read as float4 vectors.
 * local_result: per-group scratch array, one float per work-item.
 * group_result: output array, one float per work-group.
 */
__kernel void add_numbers(__global float4* data,
                          __local float* local_result,
                          __global float* group_result)
{
    /* Each work-item consumes the float4 pair starting at twice its global id */
    const uint first = get_global_id(0) * 2;
    const float4 pair_sum = data[first] + data[first + 1];

    /* Collapse the four lanes into this work-item's partial sum */
    const uint slot = get_local_id(0);
    local_result[slot] = pair_sum.s0 + pair_sum.s1 + pair_sum.s2 + pair_sum.s3;

    /* Wait until every work-item in the group has stored its partial sum */
    barrier(CLK_LOCAL_MEM_FENCE);

    /* Only the first work-item of each group performs the final accumulation */
    if (get_local_id(0) != 0) {
        return;
    }

    float group_total = 0.0f;
    int idx = 0;
    while (idx < get_local_size(0)) {
        group_total += local_result[idx];
        idx++;
    }
    group_result[get_group_id(0)] = group_total;
}