看了一些 OpenCL 结构介绍之后,开始跟着别人的足迹写代码。先写大家入门时最常用的例子——简单的一维数组相加的 OpenCL 代码,以此熟悉 OpenCL 程序的基本结构。此代码运行在 Freescale 的 i.MX6Q 上,使用交叉编译工具链编译,编译时需要加上 OpenCL 库和头文件等编译选项。
#include <CL/cl.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <iostream>
#include <fstream>
#include <vector>

using namespace std;

namespace {

// Owns every OpenCL handle created by main() and releases them in the
// destructor, so each early "return 1" cleans up without repeating the
// release calls. Objects are released before the queue and context they
// were created from.
struct ClResources
{
    cl_context       context;
    cl_command_queue queue;
    cl_program       program;
    cl_kernel        kernel;
    cl_mem           memObjects[3];

    ClResources() : context(0), queue(0), program(0), kernel(0)
    {
        memObjects[0] = 0;
        memObjects[1] = 0;
        memObjects[2] = 0;
    }

    ~ClResources()
    {
        if (kernel)  clReleaseKernel(kernel);
        if (program) clReleaseProgram(program);
        for (int i = 0; i < 3; ++i) {
            if (memObjects[i]) clReleaseMemObject(memObjects[i]);
        }
        if (queue)   clReleaseCommandQueue(queue);
        if (context) clReleaseContext(context);
    }

private:
    // Non-copyable: copying would release every handle twice.
    ClResources(const ClResources&);
    ClResources& operator=(const ClResources&);
};

// Prints the compiler build log of `program` for `device` to stdout.
// Silently does nothing if the log cannot be queried.
void printBuildLog(cl_program program, cl_device_id device)
{
    // First call: ask only for the size of the log.
    size_t logSize = 0;
    if (clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,
                              0, NULL, &logSize) != CL_SUCCESS) {
        return;
    }

    // Second call: fetch the log itself. One extra byte guarantees
    // NUL termination regardless of what the driver writes.
    std::vector<char> buildLog(logSize + 1, '\0');
    if (clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,
                              logSize, &buildLog[0], NULL) != CL_SUCCESS) {
        return;
    }
    cout << &buildLog[0] << endl;
}

} // namespace

// Adds two 1024-element float arrays on the first GPU device of the first
// OpenCL platform and prints the result.
// Returns 0 on success, 1 if any OpenCL step fails (after printing a message).
int main(int argc, char** argv)
{
    (void)argc;
    (void)argv;

    ClResources cl;
    cl_platform_id platform = 0;
    cl_device_id device = 0;
    cl_int errNum = 0;
    cl_uint numPlatforms = 0;

    // Platform: take the first one reported.
    errNum = clGetPlatformIDs(1, &platform, &numPlatforms);
    if (errNum != CL_SUCCESS || numPlatforms == 0) {
        cout << "Error getting platform id:" << endl;
        return 1;
    }

    // Device: first GPU on that platform.
    errNum = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
    if (errNum != CL_SUCCESS) {
        cout << "Error getting device ids:" << endl;
        return 1;
    }

    // Context
    cl.context = clCreateContext(0, 1, &device, NULL, NULL, &errNum);
    if (errNum != CL_SUCCESS) {
        cout << "Error creating context:" << endl;
        return 1;
    }

    // Command queue (default properties: in-order execution).
    cl.queue = clCreateCommandQueue(cl.context, device, 0, &errNum);
    if (errNum != CL_SUCCESS) {
        cout << "Error creating command queue:" << endl;
        return 1;
    }

    // Host arrays holding the kernel inputs (a, b) and output (result).
    // They live on the stack and therefore must NOT be passed to delete[].
    const int ARRAY_SIZE = 1024;
    float result[ARRAY_SIZE];
    float a[ARRAY_SIZE];
    float b[ARRAY_SIZE];
    for (int i = 0; i < ARRAY_SIZE; i++) {
        a[i] = static_cast<float>(i);
        b[i] = static_cast<float>(i) * 2.0f;
    }

    // Device buffers: the two inputs are copied from the host at creation
    // time; the output buffer is left uninitialised.
    cl.memObjects[0] = clCreateBuffer(cl.context,
                                      CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                      ARRAY_SIZE * sizeof(float), a, &errNum);
    cl.memObjects[1] = clCreateBuffer(cl.context,
                                      CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                      ARRAY_SIZE * sizeof(float), b, &errNum);
    cl.memObjects[2] = clCreateBuffer(cl.context,
                                      CL_MEM_READ_WRITE,
                                      ARRAY_SIZE * sizeof(float), NULL, &errNum);
    if (cl.memObjects[0] == NULL || cl.memObjects[1] == NULL || cl.memObjects[2] == NULL) {
        cout << "Error creating memory object." << endl;
        return 1;
    }

    // Kernel source: one work-item per element, result[gid] = a[gid] + b[gid].
    const char* source =
        "__kernel void helloworld(__global const float *a,\n"
        "                         __global const float *b,\n"
        "                         __global float *result)\n"
        "{\n"
        "    int gid = get_global_id(0);\n"
        "    result[gid] = a[gid] + b[gid];\n"
        "}\n";

    // Pass &errNum so the check below reflects THIS call rather than a stale
    // value left over from buffer creation.
    cl.program = clCreateProgramWithSource(cl.context, 1, &source, NULL, &errNum);
    if (errNum != CL_SUCCESS) {
        cout << "Error creating program!" << endl;
        return 1;
    }

    // Build the program. The log is printed whether or not the build
    // succeeded: it is most useful precisely when the build fails.
    errNum = clBuildProgram(cl.program, 1, &device, NULL, NULL, NULL);
    printBuildLog(cl.program, device);
    if (errNum != CL_SUCCESS) {
        cout << "Error building program!" << endl;
        return 1;
    }

    // Extract the kernel entry point.
    cl.kernel = clCreateKernel(cl.program, "helloworld", &errNum);
    if (errNum != CL_SUCCESS) {
        // Print a one-digit tag identifying which documented error occurred.
        switch (errNum) {
            case CL_INVALID_PROGRAM:            printf("1"); break;
            case CL_INVALID_PROGRAM_EXECUTABLE: printf("2"); break;
            case CL_INVALID_KERNEL_NAME:        printf("3"); break;
            case CL_INVALID_KERNEL_DEFINITION:  printf("4"); break;
            case CL_INVALID_VALUE:              printf("5"); break;
            case CL_OUT_OF_RESOURCES:           printf("6"); break;
            case CL_OUT_OF_HOST_MEMORY:         printf("7"); break;
        }
        cout << "Error create kernel." << endl;
        return 1;
    }

    // Bind the kernel arguments in declaration order: (a, b, result).
    for (cl_uint argIndex = 0; argIndex < 3; ++argIndex) {
        errNum = clSetKernelArg(cl.kernel, argIndex, sizeof(cl_mem),
                                &cl.memObjects[argIndex]);
        if (errNum != CL_SUCCESS) {
            cout << "Error setting kernel arguments " << (argIndex + 1) << "." << endl;
            return 1;
        }
    }

    size_t globalWorkSize[1] = { ARRAY_SIZE };
    // NOTE(review): a work-group size of 1 is valid but probably slow; passing
    // NULL instead would let the runtime choose - confirm on the target GPU.
    size_t localWorkSize[1] = { 1 };

    // Queue the kernel for execution across the whole array.
    errNum = clEnqueueNDRangeKernel(cl.queue, cl.kernel, 1, NULL,
                                    globalWorkSize, localWorkSize,
                                    0, NULL, NULL);
    if (errNum != CL_SUCCESS) {
        cout << "Error queuing kernel for execution." << endl;
        return 1;
    }

    // Blocking read (CL_TRUE) of the output buffer back to the host.
    errNum = clEnqueueReadBuffer(cl.queue, cl.memObjects[2], CL_TRUE, 0,
                                 ARRAY_SIZE * sizeof(float), result,
                                 0, NULL, NULL);
    if (errNum != CL_SUCCESS) {
        // Print a numeric tag identifying which documented error occurred.
        switch (errNum) {
            case CL_INVALID_COMMAND_QUEUE:                     printf("1");  break;
            case CL_INVALID_CONTEXT:                           printf("2");  break;
            case CL_INVALID_MEM_OBJECT:                        printf("3");  break;
            case CL_INVALID_VALUE:                             printf("4");  break;
            case CL_INVALID_EVENT_WAIT_LIST:                   printf("5");  break;
            case CL_MISALIGNED_SUB_BUFFER_OFFSET:              printf("6");  break;
            case CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST: printf("7");  break;
            case CL_MEM_OBJECT_ALLOCATION_FAILURE:             printf("8");  break;
            case CL_OUT_OF_RESOURCES:                          printf("9");  break;
            case CL_OUT_OF_HOST_MEMORY:                        printf("10"); break;
        }
        printf("\n");
        cout << "Error reading result buffer." << endl;
        return 1;
    }
    printf("after output\n");

    // Output the result buffer.
    for (int i = 0; i < ARRAY_SIZE; i++) {
        cout << result[i] << " ";
    }
    cout << endl;
    cout << "executed program successfully." << endl;

    // All OpenCL handles are released by ~ClResources(); the host arrays are
    // automatic storage and need no explicit deallocation.
    return 0;
}