AMD OpenCL例子阅读笔记系列之DeviceFission

1. 什么是DeviceFission?

        DeviceFission就是设备拆分,目前仅支持CPU的拆分。在《OpenCL异构计算》中的第11章有部分内容。其中举得例子是AMD的6核Instanbul x86 CPU,该CPU有6个核,在默认时是作为一个设备看待的,但是通过设备拆分可以将6个核各自看做一个OpenCL设备。每一个设备都可以有一个或多个命令队列,这些命令队列时异步的并且各自在自己的线程种运行。这样可以用设备拆分来创建一个简单强大的基于任务并行的多线程应用程序。

2.例子的功能介绍

    该例子的是将CPU设备分成两个,实现的函数为clCreateSubDevices,使用的特性为CL_DEVICE_PARTITION_BY_COUNTS特性。在该例子中只创建了一个缓冲区。由其中的一个子设备来写入初值,之后两个设备同时进行计算,一个子设备调用add内核,一个子设备调用sub内核。

3.内核实现函数

    首先看下内核实现:

__kernel
void 
Add(__global int* input, __global int* output)
{
    size_t xPos = get_global_id(0);
    output[xPos] = input[xPos] + 1;
}

__kernel
void 
Sub(__global int* input, __global int* output)
{
    size_t xPos = get_global_id(0);
    output[xPos] = input[xPos] - 1;
}

    这两个内核都很简单,这里不再解释。

4.主机关键部分解析

   

int
DeviceFission::setupDeviceFission()
{
    // Make sure length is multiple of group size * numSubDevices
    unsigned int mulFactor = (unsigned int)groupSize * numSubDevices;
    length = (length < mulFactor) ? mulFactor : length;
    length = (length / mulFactor) * mulFactor;

    // Calculate half length
    half_length = length >> 1;

    // Get allocate memory for input buffer
    input = (cl_int*)malloc(half_length * sizeof(cl_int));
    CHECK_ALLOCATION(input, "Failed to allocate host memory. (input)");

    // Random initialisation of input
    fillRandom<cl_int>(input, half_length, 1, 1, 8);

    // Unless sampleArgs->quiet mode has been enabled, print the INPUT array
    if(!sampleArgs->quiet)
    {
        printArray<cl_int>("Input:", input, half_length, 1);
    }

    // Get allocate memory for subOutput buffer
    subOutput = (cl_int*)malloc(length * sizeof(cl_int));
    CHECK_ALLOCATION(subOutput, "Failed to allocate host memory. (subOutput)");

    return SDK_SUCCESS;
}
    输入量初始化部分。

int
DeviceFission::setupCLPlatform()
{
    cl_int status = CL_SUCCESS;

    /*
     * Have a look at the available platforms and pick either
     * the AMD one if available or a reasonable default.
     */
    cl_platform_id platform = NULL;
    int retValue = getPlatform(platform, sampleArgs->platformId,
                               sampleArgs->isPlatformEnabled());
    CHECK_ERROR(retValue, SDK_SUCCESS, "getPlatform(rootplatform) failed");

    // Display available devices.
    retValue = displayDevices(platform, CL_DEVICE_TYPE_ALL);
    CHECK_ERROR(retValue, SDK_SUCCESS, "displayDevices(rootplatform) failed");

    /*
     * If we could find our platform, use it. Otherwise use just available platform.
     */
    cl_context_properties cps[3] =
    {
        CL_CONTEXT_PLATFORM,
        (cl_context_properties)platform,
        0
    };

    rContext = clCreateContextFromType(platform ? cps : NULL,
                                       CL_DEVICE_TYPE_ALL,
                                       NULL,
                                       NULL,
                                       &status);
    CHECK_OPENCL_ERROR( status, "clCreateContextFromType failed.");

    // getting devices on which to run the sample
    status = getDevices(rContext, &Devices, 0, sampleArgs->isDeviceIdEnabled());
    CHECK_ERROR(status, SDK_SUCCESS, "getDevices() failed");

    // Set deviceListSize from clGetContextInfo
    status = clGetContextInfo(rContext, CL_CONTEXT_DEVICES, 0, 0, &deviceListSize);
    CHECK_ERROR(status, SDK_SUCCESS, "clGetContextInfo failed. (deviceListSize)");

    // Get GPU device and CPU devices by the deviceInfo.
    for (cl_uint i = 0 ; i < deviceListSize / sizeof(cl_device_id) ; i++)
    {
        retValue = deviceInfo.setDeviceInfo(Devices[i]);
        CHECK_ERROR(retValue, 0, "SDKDeviceInfo::setDeviceInfo() failed");
        if (deviceInfo.dType == CL_DEVICE_TYPE_CPU)
        {
            cpuDevice = Devices[i];
        }
    }

    // Get allocate memory for subDevices
    subDevices = (cl_device_id*)malloc(numSubDevices * sizeof(cl_device_id));
    CHECK_ALLOCATION(subDevices, "Failed to allocate memory. (subDevices)");

    // Get allocate memory for subKernel
    subKernel = (cl_kernel*)malloc(numSubDevices * sizeof(cl_kernel));
    CHECK_ALLOCATION(subKernel, "Failed to allocate memory. (subKernel)");

    // Get maxSubDevices from clGetDeviceInfo
    cl_uint maxSubDevices;
    status = clGetDeviceInfo(cpuDevice, CL_DEVICE_PARTITION_MAX_SUB_DEVICES,
                             sizeof(maxSubDevices), &maxSubDevices, NULL);
    CHECK_OPENCL_ERROR(status, "clGetDeviceInfo failed. (maxSubDevices)")

    if(maxSubDevices <= 1)
    {
        std::cout<<"Error: The CPU should have more than one core to run this sample."<<std::endl;
        return SDK_FAILURE;
    }

    // Initialize required partition property
    cl_device_partition_property partitionPrty[5] =
    {
        CL_DEVICE_PARTITION_BY_COUNTS,
        maxSubDevices / 2, maxSubDevices / 2,
        CL_DEVICE_PARTITION_BY_COUNTS_LIST_END,
        0
    };

    // Create sub-devices
    status = clCreateSubDevices(cpuDevice, partitionPrty, numSubDevices, subDevices,
                                NULL);
    CHECK_OPENCL_ERROR( status, "clCreateSubDevices failed.");

    return SDK_SUCCESS;
}
    这个函数就是设备拆分的关键之一了,前面没什么好讲的就是获取平台信息然后取一个CPU设备以及创建Context等。之后就是根据需要创建的子设备数分配足够的subDivices和subKernel空间。然后通过clGetDeviceInfo首先通过CL_DEVICE_PARTITION_MAX_SUB_DEVICES查找到maxSubDevices参数并判断是否可以拆分。获取之后就可以进行设备的创建了。就是将CPU平均分成两份。那个cl_device_partition_property格式挺别致的,注意下写法。之后就是用clCreateSubDevices创建子设备列表并存储在subDevices中。为了程序的完整,这里将setupRuntime也贴一下吧:

int
DeviceFission::setupCLRuntime()
{
    cl_int status = CL_SUCCESS;

    // Create a CL program using the kernel source
    buildProgramData buildData;
    buildData.kernelName = std::string("DeviceFission_Kernels.cl");
    buildData.devices = Devices;
    buildData.deviceId = sampleArgs->deviceId;
    buildData.flagsStr = std::string("");
    if(sampleArgs->isLoadBinaryEnabled())
    {
        buildData.binaryName = std::string(sampleArgs->loadBinary.c_str());
    }

    if(sampleArgs->isComplierFlagsSpecified())
    {
        buildData.flagsFileName = std::string(sampleArgs->flags.c_str());
    }

    // Get allocate memory for subCmdQueue
    subCmdQueue = (cl_command_queue*)malloc(numSubDevices * sizeof(
            cl_command_queue));
    CHECK_ALLOCATION(subCmdQueue,"Failed to allocate memory. (subCmdQueue)");

    // Create command queue subCmdQueue
    for(cl_uint i = 0; i < numSubDevices; i++)
    {
        // Create command queue
        subCmdQueue[i] = clCreateCommandQueue(rContext,
                                              subDevices[i],
                                              0,
                                              &status);
        CHECK_OPENCL_ERROR(status, "clCreateCommandQueue failed. (subCmdQueue)");
    }

    // Create memory objects for input
    InBuf = clCreateBuffer(rContext,
                           CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR,
                           length * sizeof(cl_int),
                           NULL,
                           &status);
    CHECK_OPENCL_ERROR(status, "clCreateBuffer failed. (InBuf)");

    // Get allocate memory for sub devices output
    subOutBuf = (cl_mem*)malloc(numSubDevices * sizeof(cl_mem));

    for(cl_uint i = 0; i < numSubDevices; i++)
    {
        // Create memory objects for sub devices output
        subOutBuf[i] = clCreateBuffer(rContext,
                                      CL_MEM_WRITE_ONLY,
                                      half_length * sizeof(cl_int) ,
                                      NULL,
                                      &status);
        CHECK_OPENCL_ERROR(status, "clCreateBuffer failed. (subOutBuf)");
    }

    SDKFile kernelFile;
    std::string kernelPath = getPath();

    char * source = NULL;
    size_t sourceSize[] = {0};
    char * binary = NULL;
    size_t binarySize = 0;

    if(sampleArgs->isLoadBinaryEnabled())
    {
        kernelPath += sampleArgs->loadBinary;

        if(kernelFile.readBinaryFromFile(kernelPath.c_str()))
        {
            std::cout << "Failed to load kernel file : " << kernelPath << std::endl;
            return SDK_FAILURE;
        }

        // Get binaries and binary sizes for CPU devices
        char** subBinaries = (char**)malloc(numSubDevices * sizeof(char*));
        if(subBinaries == NULL)
        {
            error("Failed to allocate memory(subBinaries)");
            return SDK_FAILURE;
        }

        size_t* subBinariesSize = (size_t*)malloc(numSubDevices * sizeof(size_t*));
        if(subBinariesSize == NULL)
        {
            error("Failed to allocate memory(subBinariesSize)");
            return SDK_FAILURE;
        }

        for(cl_uint i = 0; i < numSubDevices; ++i)
        {
            subBinaries[i] = (char*)kernelFile.source().c_str();
            subBinariesSize[i] = kernelFile.source().size();
        }

        subProgram = clCreateProgramWithBinary(rContext,
                                               numSubDevices,
                                               subDevices,
                                               (const size_t *)subBinariesSize,
                                               (const unsigned char**)subBinaries,
                                               NULL,
                                               &status);
        CHECK_OPENCL_ERROR(status, "clCreateProgramWithBinary failed.(subProgram)");

        free(subBinaries);
        free(subBinariesSize);
        subBinariesSize = NULL;
        subBinaries = NULL;
    }
    else
    {
        kernelPath.append("DeviceFission_Kernels.cl");
        if(!kernelFile.open(kernelPath.c_str()))//bool
        {
            std::cout << "Failed to load kernel file: " << kernelPath << std::endl;
            return SDK_FAILURE;
        }
        const char * source = kernelFile.source().c_str();
        size_t sourceSize[] = {strlen(source)};

        // Create a CL program for sub-devices using the kernel source
        subProgram = clCreateProgramWithSource(rContext,
                                               1,
                                               (const char**)&source,
                                               sourceSize,
                                               &status);
        CHECK_OPENCL_ERROR(status, "clCreateProgramWithSource failed.(subProgram)");

        // Create a CL program for GPU device using the kernel source
        gpuProgram = clCreateProgramWithSource(rContext,
                                               1,
                                               (const char**)&source,
                                               sourceSize,
                                               &status);
        CHECK_OPENCL_ERROR(status, "clCreateProgramWithSource failed.(gpuProgram)");
    }

    // Get build options
    const char *flags;
    SDKFile flagsFile;
    std::string flagsPath = getPath();
    if(buildData.flagsFileName.size() != 0)
    {
        flagsPath.append(buildData.flagsFileName.c_str());
        if(!flagsFile.open(flagsPath.c_str()))
        {
            std::cout << "Failed to load flags file: " << flagsPath << std::endl;
            return SDK_FAILURE;
        }
        flagsFile.replaceNewlineWithSpaces();
        flags = flagsFile.source().c_str();
        if(strlen(flags) != 0)
        {
            std::cout << "Build Options are : " << flags << std::endl;
        }
    }
    else
    {
        flags = NULL;
    }

    // Create a cl program executable for all sub-devices
    status = clBuildProgram(subProgram,
                            numSubDevices,
                            subDevices,
                            flags,
                            NULL,
                            NULL);
    CHECK_OPENCL_ERROR(status, "clBuildProgram failed.(subProgram)");
    if(status != CL_SUCCESS)
    {
        if(status == CL_BUILD_PROGRAM_FAILURE)
        {
            cl_int logStatus;
            char * buildLog = NULL;
            size_t buildLogSize = 0;
            logStatus = clGetProgramBuildInfo(subProgram,
                                              subDevices[0],
                                              CL_PROGRAM_BUILD_LOG,
                                              buildLogSize,
                                              buildLog,
                                              &buildLogSize);
            if(!checkVal(logStatus,
                         CL_SUCCESS,
                         "clGetProgramBuildInfo failed."))
            {
                return SDK_FAILURE;
            }

            buildLog = (char*)malloc(buildLogSize);
            if(NULL == buildLog)
            {
                error("Failed to allocate host memory.(buildLog)");
                return SDK_FAILURE;
            }
            memset(buildLog, 0, buildLogSize);

            logStatus = clGetProgramBuildInfo(subProgram,
                                              subDevices[0],
                                              CL_PROGRAM_BUILD_LOG,
                                              buildLogSize,
                                              buildLog,
                                              NULL);
            if(!checkVal(logStatus,
                         CL_SUCCESS,
                         "clGetProgramBuildInfo failed."))
            {
                free(buildLog);
                return SDK_FAILURE;
            }

            std::cout << " \n\t\t\tBUILD LOG(SUB-DEVICES)\n";
            std::cout << " ************************************************\n";
            std::cout << buildLog << std::endl;
            std::cout << " ************************************************\n";
            free(buildLog);
        }

        if(!checkVal(status,
                     CL_SUCCESS,
                     "clBuildProgram failed. (SUB-DEVICES)"))
        {
            return SDK_FAILURE;
        }
    }

    // Get a kernel object handle for a kernel with the given name
    subKernel[0] = clCreateKernel(subProgram,
                                  "Add",
                                  &status);
    CHECK_OPENCL_ERROR(status, "clCreateKernel failed.(subKernel[0])");

    // Get a kernel object handle for a kernel with the given name
    subKernel[1] = clCreateKernel(subProgram,
                                  "Sub",
                                  &status);
    CHECK_OPENCL_ERROR(status, "clCreateKernel failed.(subKernel[1])");

    return SDK_SUCCESS;
}
     这里可以看到对于每一个子设备都创建了对应的CommandQueue,并且创建了两个内核,一个对应于Add操作,一个对应于Sub操作。最后看下执行函数:

int
DeviceFission::runCLALLKerenls()
{
    cl_int status;
    cl_event writeEvent;
    cl_event rangeEvent[2];

    // Set global and local work items
    size_t globalThreads[] = {half_length};
    size_t localThreads[] = {groupSize};

    // Enqueue write Buffer to the first sub device queue
    status = clEnqueueWriteBuffer(subCmdQueue[0],
                                  InBuf,
                                  CL_FALSE,
                                  0,
                                  half_length* sizeof(cl_int),
                                  input,
                                  0,
                                  NULL,
                                  &writeEvent);
    CHECK_OPENCL_ERROR(status, "clEnqueueWriteBuffer failed");

    cl_uint rangeEventNum = 0;

    rangeEvent[0] = rangeEvent[1] = writeEvent;
    rangeEventNum++;

    for(cl_uint i = 0; i < numSubDevices; ++i)
    {
        // Set subOutBuf as second argument
        status = clSetKernelArg(subKernel[i], 1, sizeof(cl_mem), (void*)&subOutBuf[i]);
        CHECK_OPENCL_ERROR(status, "clSetKernelArg failed. (subOutBuf)");

        // Set InBuf as first argument
        status = clSetKernelArg(subKernel[i], 0, sizeof(cl_mem),(void*)&InBuf);
        CHECK_OPENCL_ERROR(status, "clSetKernelArg failed. (InBuf)");

        // Enqueue kernel
        status = clEnqueueNDRangeKernel(subCmdQueue[i],
                                        subKernel[i],
                                        1,
                                        NULL,
                                        globalThreads,
                                        localThreads,
                                        rangeEventNum,
                                        &rangeEvent[i],
                                        NULL);
        CHECK_OPENCL_ERROR(status, "clEnqueueNDRangeKernel failed.(subCmdQueue)");

        // Enqueue readBuffer
        status = clEnqueueReadBuffer(subCmdQueue[i],
                                     subOutBuf[i],
                                     CL_FALSE,
                                     0,
                                     half_length * sizeof(cl_int),
                                     subOutput + half_length * i,
                                     0,
                                     NULL,
                                     NULL);
        CHECK_OPENCL_ERROR(status, "clEnqueueReadBuffer failed. (subCmdQueue)");
    }

    // Flush all queues together

    for(cl_uint i = 0; i < numSubDevices; ++i)
    {
        status = clFlush(subCmdQueue[i]);
        CHECK_OPENCL_ERROR(status, "clFlush failed. (subCmdQueue)");
    }

    // Finish all queues
    status = clFinish(subCmdQueue[0]);
    CHECK_OPENCL_ERROR(status, "clFinish failed. (subCmdQueue[0])");

    status = clFinish(subCmdQueue[1]);
    CHECK_OPENCL_ERROR(status, "clFinish failed. (subCmdQueue[1])");

    status = clReleaseEvent(writeEvent);
    CHECK_OPENCL_ERROR(status, "clReleaseEvent failed. (writeEvent)");

    return SDK_SUCCESS;
}

  可以看到InBuf的大小为subOutBuf大小的一半。需要注意的是clEnqueueNDRangeKernel是需要等待clEnqueueWriteBuffer写完输入数据才会开始进行计算。



你可能感兴趣的:(OpenCL)