该例子是将CPU设备拆分成两个子设备,使用的函数为clCreateSubDevices,拆分方式为CL_DEVICE_PARTITION_BY_COUNTS。在该例子中只创建了一个输入缓冲区,由其中一个子设备写入初值,之后两个子设备同时进行计算:一个子设备调用Add内核,另一个子设备调用Sub内核。
首先看下内核实现:
/* Add: each work-item stores input[i] + 1 into output[i], where i is the
 * work-item's global id in dimension 0. */
__kernel void Add(__global int* input, __global int* output)
{
    size_t gid = get_global_id(0);
    int value = input[gid];
    output[gid] = value + 1;
}

/* Sub: each work-item stores input[i] - 1 into output[i], where i is the
 * work-item's global id in dimension 0. */
__kernel void Sub(__global int* input, __global int* output)
{
    size_t gid = get_global_id(0);
    int value = input[gid];
    output[gid] = value - 1;
}
int DeviceFission::setupDeviceFission() { // Make sure length is multiple of group size * numSubDevices unsigned int mulFactor = (unsigned int)groupSize * numSubDevices; length = (length < mulFactor) ? mulFactor : length; length = (length / mulFactor) * mulFactor; // Calculate half length half_length = length >> 1; // Get allocate memory for input buffer input = (cl_int*)malloc(half_length * sizeof(cl_int)); CHECK_ALLOCATION(input, "Failed to allocate host memory. (input)"); // Random initialisation of input fillRandom<cl_int>(input, half_length, 1, 1, 8); // Unless sampleArgs->quiet mode has been enabled, print the INPUT array if(!sampleArgs->quiet) { printArray<cl_int>("Input:", input, half_length, 1); } // Get allocate memory for subOutput buffer subOutput = (cl_int*)malloc(length * sizeof(cl_int)); CHECK_ALLOCATION(subOutput, "Failed to allocate host memory. (subOutput)"); return SDK_SUCCESS; }输入量初始化部分。
int DeviceFission::setupCLPlatform() { cl_int status = CL_SUCCESS; /* * Have a look at the available platforms and pick either * the AMD one if available or a reasonable default. */ cl_platform_id platform = NULL; int retValue = getPlatform(platform, sampleArgs->platformId, sampleArgs->isPlatformEnabled()); CHECK_ERROR(retValue, SDK_SUCCESS, "getPlatform(rootplatform) failed"); // Display available devices. retValue = displayDevices(platform, CL_DEVICE_TYPE_ALL); CHECK_ERROR(retValue, SDK_SUCCESS, "displayDevices(rootplatform) failed"); /* * If we could find our platform, use it. Otherwise use just available platform. */ cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0 }; rContext = clCreateContextFromType(platform ? cps : NULL, CL_DEVICE_TYPE_ALL, NULL, NULL, &status); CHECK_OPENCL_ERROR( status, "clCreateContextFromType failed."); // getting devices on which to run the sample status = getDevices(rContext, &Devices, 0, sampleArgs->isDeviceIdEnabled()); CHECK_ERROR(status, SDK_SUCCESS, "getDevices() failed"); // Set deviceListSize from clGetContextInfo status = clGetContextInfo(rContext, CL_CONTEXT_DEVICES, 0, 0, &deviceListSize); CHECK_ERROR(status, SDK_SUCCESS, "clGetContextInfo failed. (deviceListSize)"); // Get GPU device and CPU devices by the deviceInfo. for (cl_uint i = 0 ; i < deviceListSize / sizeof(cl_device_id) ; i++) { retValue = deviceInfo.setDeviceInfo(Devices[i]); CHECK_ERROR(retValue, 0, "SDKDeviceInfo::setDeviceInfo() failed"); if (deviceInfo.dType == CL_DEVICE_TYPE_CPU) { cpuDevice = Devices[i]; } } // Get allocate memory for subDevices subDevices = (cl_device_id*)malloc(numSubDevices * sizeof(cl_device_id)); CHECK_ALLOCATION(subDevices, "Failed to allocate memory. (subDevices)"); // Get allocate memory for subKernel subKernel = (cl_kernel*)malloc(numSubDevices * sizeof(cl_kernel)); CHECK_ALLOCATION(subKernel, "Failed to allocate memory. 
(subKernel)"); // Get maxSubDevices from clGetDeviceInfo cl_uint maxSubDevices; status = clGetDeviceInfo(cpuDevice, CL_DEVICE_PARTITION_MAX_SUB_DEVICES, sizeof(maxSubDevices), &maxSubDevices, NULL); CHECK_OPENCL_ERROR(status, "clGetDeviceInfo failed. (maxSubDevices)") if(maxSubDevices <= 1) { std::cout<<"Error: The CPU should have more than one core to run this sample."<<std::endl; return SDK_FAILURE; } // Initialize required partition property cl_device_partition_property partitionPrty[5] = { CL_DEVICE_PARTITION_BY_COUNTS, maxSubDevices / 2, maxSubDevices / 2, CL_DEVICE_PARTITION_BY_COUNTS_LIST_END, 0 }; // Create sub-devices status = clCreateSubDevices(cpuDevice, partitionPrty, numSubDevices, subDevices, NULL); CHECK_OPENCL_ERROR( status, "clCreateSubDevices failed."); return SDK_SUCCESS; }这个函数就是设备拆分的关键之一了,前面没什么好讲的就是获取平台信息然后取一个CPU设备以及创建Context等。之后就是根据需要创建的子设备数分配足够的subDivices和subKernel空间。然后通过clGetDeviceInfo首先通过CL_DEVICE_PARTITION_MAX_SUB_DEVICES查找到maxSubDevices参数并判断是否可以拆分。获取之后就可以进行设备的创建了。就是将CPU平均分成两份。那个cl_device_partition_property格式挺别致的,注意下写法。之后就是用clCreateSubDevices创建子设备列表并存储在subDevices中。为了程序的完整,这里将setupRuntime也贴一下吧:
int DeviceFission::setupCLRuntime() { cl_int status = CL_SUCCESS; // Create a CL program using the kernel source buildProgramData buildData; buildData.kernelName = std::string("DeviceFission_Kernels.cl"); buildData.devices = Devices; buildData.deviceId = sampleArgs->deviceId; buildData.flagsStr = std::string(""); if(sampleArgs->isLoadBinaryEnabled()) { buildData.binaryName = std::string(sampleArgs->loadBinary.c_str()); } if(sampleArgs->isComplierFlagsSpecified()) { buildData.flagsFileName = std::string(sampleArgs->flags.c_str()); } // Get allocate memory for subCmdQueue subCmdQueue = (cl_command_queue*)malloc(numSubDevices * sizeof( cl_command_queue)); CHECK_ALLOCATION(subCmdQueue,"Failed to allocate memory. (subCmdQueue)"); // Create command queue subCmdQueue for(cl_uint i = 0; i < numSubDevices; i++) { // Create command queue subCmdQueue[i] = clCreateCommandQueue(rContext, subDevices[i], 0, &status); CHECK_OPENCL_ERROR(status, "clCreateCommandQueue failed. (subCmdQueue)"); } // Create memory objects for input InBuf = clCreateBuffer(rContext, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, length * sizeof(cl_int), NULL, &status); CHECK_OPENCL_ERROR(status, "clCreateBuffer failed. (InBuf)"); // Get allocate memory for sub devices output subOutBuf = (cl_mem*)malloc(numSubDevices * sizeof(cl_mem)); for(cl_uint i = 0; i < numSubDevices; i++) { // Create memory objects for sub devices output subOutBuf[i] = clCreateBuffer(rContext, CL_MEM_WRITE_ONLY, half_length * sizeof(cl_int) , NULL, &status); CHECK_OPENCL_ERROR(status, "clCreateBuffer failed. 
(subOutBuf)"); } SDKFile kernelFile; std::string kernelPath = getPath(); char * source = NULL; size_t sourceSize[] = {0}; char * binary = NULL; size_t binarySize = 0; if(sampleArgs->isLoadBinaryEnabled()) { kernelPath += sampleArgs->loadBinary; if(kernelFile.readBinaryFromFile(kernelPath.c_str())) { std::cout << "Failed to load kernel file : " << kernelPath << std::endl; return SDK_FAILURE; } // Get binaries and binary sizes for CPU devices char** subBinaries = (char**)malloc(numSubDevices * sizeof(char*)); if(subBinaries == NULL) { error("Failed to allocate memory(subBinaries)"); return SDK_FAILURE; } size_t* subBinariesSize = (size_t*)malloc(numSubDevices * sizeof(size_t*)); if(subBinariesSize == NULL) { error("Failed to allocate memory(subBinariesSize)"); return SDK_FAILURE; } for(cl_uint i = 0; i < numSubDevices; ++i) { subBinaries[i] = (char*)kernelFile.source().c_str(); subBinariesSize[i] = kernelFile.source().size(); } subProgram = clCreateProgramWithBinary(rContext, numSubDevices, subDevices, (const size_t *)subBinariesSize, (const unsigned char**)subBinaries, NULL, &status); CHECK_OPENCL_ERROR(status, "clCreateProgramWithBinary failed.(subProgram)"); free(subBinaries); free(subBinariesSize); subBinariesSize = NULL; subBinaries = NULL; } else { kernelPath.append("DeviceFission_Kernels.cl"); if(!kernelFile.open(kernelPath.c_str()))//bool { std::cout << "Failed to load kernel file: " << kernelPath << std::endl; return SDK_FAILURE; } const char * source = kernelFile.source().c_str(); size_t sourceSize[] = {strlen(source)}; // Create a CL program for sub-devices using the kernel source subProgram = clCreateProgramWithSource(rContext, 1, (const char**)&source, sourceSize, &status); CHECK_OPENCL_ERROR(status, "clCreateProgramWithSource failed.(subProgram)"); // Create a CL program for GPU device using the kernel source gpuProgram = clCreateProgramWithSource(rContext, 1, (const char**)&source, sourceSize, &status); CHECK_OPENCL_ERROR(status, 
"clCreateProgramWithSource failed.(gpuProgram)"); } // Get build options const char *flags; SDKFile flagsFile; std::string flagsPath = getPath(); if(buildData.flagsFileName.size() != 0) { flagsPath.append(buildData.flagsFileName.c_str()); if(!flagsFile.open(flagsPath.c_str())) { std::cout << "Failed to load flags file: " << flagsPath << std::endl; return SDK_FAILURE; } flagsFile.replaceNewlineWithSpaces(); flags = flagsFile.source().c_str(); if(strlen(flags) != 0) { std::cout << "Build Options are : " << flags << std::endl; } } else { flags = NULL; } // Create a cl program executable for all sub-devices status = clBuildProgram(subProgram, numSubDevices, subDevices, flags, NULL, NULL); CHECK_OPENCL_ERROR(status, "clBuildProgram failed.(subProgram)"); if(status != CL_SUCCESS) { if(status == CL_BUILD_PROGRAM_FAILURE) { cl_int logStatus; char * buildLog = NULL; size_t buildLogSize = 0; logStatus = clGetProgramBuildInfo(subProgram, subDevices[0], CL_PROGRAM_BUILD_LOG, buildLogSize, buildLog, &buildLogSize); if(!checkVal(logStatus, CL_SUCCESS, "clGetProgramBuildInfo failed.")) { return SDK_FAILURE; } buildLog = (char*)malloc(buildLogSize); if(NULL == buildLog) { error("Failed to allocate host memory.(buildLog)"); return SDK_FAILURE; } memset(buildLog, 0, buildLogSize); logStatus = clGetProgramBuildInfo(subProgram, subDevices[0], CL_PROGRAM_BUILD_LOG, buildLogSize, buildLog, NULL); if(!checkVal(logStatus, CL_SUCCESS, "clGetProgramBuildInfo failed.")) { free(buildLog); return SDK_FAILURE; } std::cout << " \n\t\t\tBUILD LOG(SUB-DEVICES)\n"; std::cout << " ************************************************\n"; std::cout << buildLog << std::endl; std::cout << " ************************************************\n"; free(buildLog); } if(!checkVal(status, CL_SUCCESS, "clBuildProgram failed. 
(SUB-DEVICES)")) { return SDK_FAILURE; } } // Get a kernel object handle for a kernel with the given name subKernel[0] = clCreateKernel(subProgram, "Add", &status); CHECK_OPENCL_ERROR(status, "clCreateKernel failed.(subKernel[0])"); // Get a kernel object handle for a kernel with the given name subKernel[1] = clCreateKernel(subProgram, "Sub", &status); CHECK_OPENCL_ERROR(status, "clCreateKernel failed.(subKernel[1])"); return SDK_SUCCESS; }这里可以看到对于每一个子设备都创建了对应的CommandQueue,并且创建了两个内核,一个对应于Add操作,一个对应于Sub操作。最后看下执行函数:
int DeviceFission::runCLALLKerenls() { cl_int status; cl_event writeEvent; cl_event rangeEvent[2]; // Set global and local work items size_t globalThreads[] = {half_length}; size_t localThreads[] = {groupSize}; // Enqueue write Buffer to the first sub device queue status = clEnqueueWriteBuffer(subCmdQueue[0], InBuf, CL_FALSE, 0, half_length* sizeof(cl_int), input, 0, NULL, &writeEvent); CHECK_OPENCL_ERROR(status, "clEnqueueWriteBuffer failed"); cl_uint rangeEventNum = 0; rangeEvent[0] = rangeEvent[1] = writeEvent; rangeEventNum++; for(cl_uint i = 0; i < numSubDevices; ++i) { // Set subOutBuf as second argument status = clSetKernelArg(subKernel[i], 1, sizeof(cl_mem), (void*)&subOutBuf[i]); CHECK_OPENCL_ERROR(status, "clSetKernelArg failed. (subOutBuf)"); // Set InBuf as first argument status = clSetKernelArg(subKernel[i], 0, sizeof(cl_mem),(void*)&InBuf); CHECK_OPENCL_ERROR(status, "clSetKernelArg failed. (InBuf)"); // Enqueue kernel status = clEnqueueNDRangeKernel(subCmdQueue[i], subKernel[i], 1, NULL, globalThreads, localThreads, rangeEventNum, &rangeEvent[i], NULL); CHECK_OPENCL_ERROR(status, "clEnqueueNDRangeKernel failed.(subCmdQueue)"); // Enqueue readBuffer status = clEnqueueReadBuffer(subCmdQueue[i], subOutBuf[i], CL_FALSE, 0, half_length * sizeof(cl_int), subOutput + half_length * i, 0, NULL, NULL); CHECK_OPENCL_ERROR(status, "clEnqueueReadBuffer failed. (subCmdQueue)"); } // Flush all queues together for(cl_uint i = 0; i < numSubDevices; ++i) { status = clFlush(subCmdQueue[i]); CHECK_OPENCL_ERROR(status, "clFlush failed. (subCmdQueue)"); } // Finish all queues status = clFinish(subCmdQueue[0]); CHECK_OPENCL_ERROR(status, "clFinish failed. (subCmdQueue[0])"); status = clFinish(subCmdQueue[1]); CHECK_OPENCL_ERROR(status, "clFinish failed. (subCmdQueue[1])"); status = clReleaseEvent(writeEvent); CHECK_OPENCL_ERROR(status, "clReleaseEvent failed. (writeEvent)"); return SDK_SUCCESS; }