元气少女缘结神

OpenCL的学习

环境:CentOS7 显卡：NVIDIA 按照http://blog.csdn.net/fly_yr/article/details/49796649配置好OpenCL 然后按照：https://docs.google.com/document/pub?id=1NPo1TK30IOYZxI53t_V3uenSHTMSFYs5cupVDniqVK4 运行这个小例子测试是否配置好了本来我一直报错说：failed to load kernel. 把代码中这一句fp = fopen("vector_add_kernel.cl", "r");改成绝对路径比如我的是fp = fopen("/home/.../vector_add_kernel.cl", "r");即可哦中间还报了一个小错误说头文件‘CL/cl.h’没有这个目录或头文件解决办法：yum install opencl-headers就不报错了。出来是两个非负整数相加等于1000的所有可能。

1、我是先对着上面那个程序然后看这个博客 http://blog.csdn.net/leonwei/article/category/1410041 初步了解OpenCL

2、对着http://blog.csdn.net/lien0906/article/category/2925315 http://blog.csdn.net/Augusdi/article/category/1687179 进一步熟悉 OpenCV的OCL模块就是支持OpenCL的哦如果只是用OpenCL进行图像开发就用这个应该就够了吧但目前好像上面包含的函数比较少哎相同部分可跳过

3、对着http://www.cnblogs.com/mikewolf2002/category/343145.html 学习

4、上面都看完后按照http://www.cnblogs.com/mikewolf2002/archive/2012/09/11/2680689.html 这位大神的OpenCL系列有很多篇。我看到图像旋转这里：

这之前几篇还有一个向量相加的小程序：因为我是linux 改了点：

#include 
#include 
#ifdef __APPLE__
#include 
#else
#include 
#endif
#define MAX_SOURCE_SIZE (0x100000)
int main(void)
{
        // Create the two input vectors
        int i;
        const int LIST_SIZE = 100000;
        int *A = (int*) malloc(sizeof(int) * LIST_SIZE);
        int *B = (int*) malloc(sizeof(int) * LIST_SIZE);
        for (i = 0; i < LIST_SIZE; i++)
        {
                A[i] = i;
                B[i] = LIST_SIZE - i;
        }
        /////////////////////////////////////// Load the kernel source code into the array source_str
        FILE *fp;
        char *source_str;
        size_t source_size;
        fp = fopen("/home/jumper/OpenCL_projects/HelloOpenCL/src/vector_add_kernel.cl", "r");
        if (!fp)
        	{
                fprintf(stderr, "Failed to load kernel.\n");
                exit(1);
        	}
        source_str = (char*) malloc(MAX_SOURCE_SIZE);
        source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp);
        fclose(fp);
        ///////////////////////////////////////////////////////////// Get platform and device information
        cl_platform_id platform_id = NULL;
        cl_device_id device_id = NULL;
        cl_uint ret_num_devices;
        cl_uint ret_num_platforms;
        cl_int ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
        ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id,
                        &ret_num_devices);
        ///////////////////////////////////////////////////////////// Create an OpenCL context
        cl_context context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret);
        ///////////////////////////////// Create a command queue
        cl_command_queue command_queue = clCreateCommandQueue(context, device_id, 0,
                        &ret);
        ///////////////////////////////// Create memory buffers on the device for each vector
        cl_mem a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY,
                        LIST_SIZE * sizeof(int), NULL, &ret);
        cl_mem b_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY,
                        LIST_SIZE * sizeof(int), NULL, &ret);
        cl_mem c_mem_obj = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
                        LIST_SIZE * sizeof(int), NULL, &ret);
        // Copy the lists A and B to their respective memory buffers
        ret = clEnqueueWriteBuffer(command_queue, a_mem_obj, CL_TRUE, 0,
                        LIST_SIZE * sizeof(int), A, 0, NULL, NULL);
        ret = clEnqueueWriteBuffer(command_queue, b_mem_obj, CL_TRUE, 0,
                        LIST_SIZE * sizeof(int), B, 0, NULL, NULL);
        // Create a program from the kernel source
        cl_program program = clCreateProgramWithSource(context, 1,
                        (const char **) &source_str, (const size_t *) &source_size, &ret);
        // Build the program
        ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
        // Create the OpenCL kernel
        cl_kernel kernel = clCreateKernel(program, "vector_add", &ret);
        // Set the arguments of the kernel
        ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a_mem_obj);
        ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b_mem_obj);
        ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &c_mem_obj);
        // Execute the OpenCL kernel on the list
        size_t global_item_size = LIST_SIZE; // Process the entire lists
        size_t local_item_size = 1; // Process one item at a time
        ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL,
                        &global_item_size, &local_item_size, 0, NULL, NULL);
        // Read the memory buffer C on the device to the local variable C
        int *C = (int*) malloc(sizeof(int) * LIST_SIZE);
        ret = clEnqueueReadBuffer(command_queue, c_mem_obj, CL_TRUE, 0,
                        LIST_SIZE * sizeof(int), C, 0, NULL, NULL);
        // Display the result to the screen
        for (i = 0; i < LIST_SIZE; i++)
                printf("%d + %d = %d\n", A[i], B[i], C[i]);
        // Clean up
        ret = clFlush(command_queue);
        ret = clFinish(command_queue);
        ret = clReleaseKernel(kernel);
        ret = clReleaseProgram(program);
        ret = clReleaseMemObject(a_mem_obj);
        ret = clReleaseMemObject(b_mem_obj);
        ret = clReleaseMemObject(c_mem_obj);
        ret = clReleaseCommandQueue(command_queue);
        ret = clReleaseContext(context);
        free(A);
        free(B);
        free(C);
        return 0;
}

结果：

再看这个图像旋转的：

#include 
#include 
#include 
#include 
#include 
#include 
#include 
#include "gclFile.h"
//#include "gclTimer.h"
#include "gFreeImage.h"
using namespace std;

int waitForEventAndRelease(cl_event *event)
{
	cl_int status = CL_SUCCESS;
	cl_int eventStatus = CL_QUEUED;
	while(eventStatus != CL_COMPLETE)
	{
		status = clGetEventInfo(*event,CL_EVENT_COMMAND_EXECUTION_STATUS,sizeof(cl_int),&eventStatus,NULL);
	}
	//status = clReleaseEvent(*event); //it causes error:No source available "0*0"
	return 0;
}


void cpu_rotate(unsigned char* inbuf, unsigned char* outbuf, int w, int h,float sinTheta, float cosTheta)
{
	int i, j;
	int xc = w/2;
	int yc = h/2;

	for(i = 0; i < h; i++)
	{
		for(j=0; j< w; j++)
		{
			int xpos =  ( j-xc)*cosTheta - (i-yc)*sinTheta+xc;    
			int ypos =  (j-xc)*sinTheta + ( i-yc)*cosTheta+yc; 

			if(xpos>=0&&ypos>=0&&xpos

他自己写了个类gFreeImage  方便读存图片 结果： 
   
  

 
  5、后来发现这本书 《OpenCL Parallel Programming Development CookBook》就下载下来了  真的很好  而且提供了源代码 边运行代码边看书上的讲解 真的感觉不那么难了。 但我没全部照着运行 改动了的    
  6、《OpenCL Parallel Programming Development CookBook》-Ch1 
  书上几个例子综合一下： 
   
  #include 
#include 
#include 
#include 
#include 
#include 
#ifdef APPLE
#include 
#else
#include 
#endif

void displayPlatformInfo(cl_platform_id id,
                         cl_platform_info param_name,
                         const char* paramNameAsStr) {
    cl_int error = 0;
    size_t paramSize = 0;
    error = clGetPlatformInfo( id, param_name, 0, NULL, ¶mSize );
    char* moreInfo = (char*)alloca( sizeof(char) * paramSize);
    error = clGetPlatformInfo( id, param_name, paramSize, moreInfo, NULL );
    if (error != CL_SUCCESS ) {
        perror("Unable to find any OpenCL platform information");
        return;
    }
    printf("%s: %s\n", paramNameAsStr, moreInfo);
}

void displayDeviceDetails(cl_device_id id,
                          cl_device_info param_name,
                          const char* paramNameAsStr) {
  cl_int error = 0;
  size_t paramSize = 0;

  error = clGetDeviceInfo( id, param_name, 0, NULL, ¶mSize );
  if (error != CL_SUCCESS ) {
    perror("Unable to obtain device info for param\n");
    return;
  }

  /* the cl_device_info are preprocessor directives defined in cl.h */
  switch (param_name) {
    case CL_DEVICE_TYPE: {
            cl_device_type* devType = (cl_device_type*) alloca(sizeof(cl_device_type) * paramSize);
            error = clGetDeviceInfo( id, param_name, paramSize, devType, NULL );
            if (error != CL_SUCCESS ) {
                perror("Unable to obtain device info for param\n");
                return;
            }
            switch (*devType) {
              case CL_DEVICE_TYPE_CPU : printf("CPU detected\n");break;
              case CL_DEVICE_TYPE_GPU : printf("GPU detected\n");break;
              case CL_DEVICE_TYPE_ACCELERATOR : printf("Accelerator detected\n");break;
              case CL_DEVICE_TYPE_DEFAULT : printf("default detected\n");break;
            }
            }break;
    case CL_DEVICE_VENDOR_ID :
    case CL_DEVICE_MAX_COMPUTE_UNITS :
    case CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS : {
            cl_uint* ret = (cl_uint*) alloca(sizeof(cl_uint) * paramSize);
            error = clGetDeviceInfo( id, param_name, paramSize, ret, NULL );
            if (error != CL_SUCCESS ) {
                perror("Unable to obtain device info for param\n");
                return;
            }
            switch (param_name) {
                case CL_DEVICE_VENDOR_ID: printf("\tVENDOR ID: 0x%x\n", *ret); break;
                case CL_DEVICE_MAX_COMPUTE_UNITS: printf("\tMaximum number of parallel compute units: %d\n", *ret); break;
                case CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS: printf("\tMaximum dimensions for global/local work-item IDs: %d\n", *ret); break;
            }
         }break;
    case CL_DEVICE_MAX_WORK_ITEM_SIZES : {
            cl_uint maxWIDimensions;
            size_t* ret = (size_t*) alloca(sizeof(size_t) * paramSize);
            error = clGetDeviceInfo( id, param_name, paramSize, ret, NULL );

            error = clGetDeviceInfo( id, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(cl_uint), &maxWIDimensions, NULL );
            if (error != CL_SUCCESS ) {
                perror("Unable to obtain device info for param\n");
                return;
            }
            printf("\tMaximum number of work-items in each dimension: ( ");
            for(cl_int i =0; i < maxWIDimensions; ++i ) {
                printf("%d ", ret[i]);
            }
            printf(" )\n");
            }break;
    case CL_DEVICE_MAX_WORK_GROUP_SIZE : {
            size_t* ret = (size_t*) alloca(sizeof(size_t) * paramSize);
            error = clGetDeviceInfo( id, param_name, paramSize, ret, NULL );
            if (error != CL_SUCCESS ) {
                perror("Unable to obtain device info for param\n");
                return;
            }
            printf("\tMaximum number of work-items in a work-group: %d\n", *ret);
            }break;
    case CL_DEVICE_NAME :
    case CL_DEVICE_VENDOR : {
            char data[48];
            error = clGetDeviceInfo( id, param_name, paramSize, data, NULL );
            if (error != CL_SUCCESS ) {
                perror("Unable to obtain device name/vendor info for param\n");
                return;
            }
            switch (param_name) {
                case CL_DEVICE_NAME : printf("\tDevice name is %s\n", data);break;
                case CL_DEVICE_VENDOR : printf("\tDevice vendor is %s\n", data);break;
            }
    } break;
    case CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE: {
            cl_uint* size = (cl_uint*) alloca(sizeof(cl_uint) * paramSize);
            error = clGetDeviceInfo( id, param_name, paramSize, size, NULL );
            if (error != CL_SUCCESS ) {
                perror("Unable to obtain device name/vendor info for param\n");
                return;
            }
            printf("\tDevice global cacheline size: %d bytes\n", (*size)); break;
    } break;
    case CL_DEVICE_GLOBAL_MEM_SIZE:
    case CL_DEVICE_MAX_MEM_ALLOC_SIZE: {
            cl_ulong* size = (cl_ulong*) alloca(sizeof(cl_ulong) * paramSize);
            error = clGetDeviceInfo( id, param_name, paramSize, size, NULL );
            if (error != CL_SUCCESS ) {
                perror("Unable to obtain device name/vendor info for param\n");
                return;
            }
            switch (param_name) {
                case CL_DEVICE_GLOBAL_MEM_SIZE: printf("\tDevice global mem: %ld mega-bytes\n", (*size)>>20); break;
                case CL_DEVICE_MAX_MEM_ALLOC_SIZE: printf("\tDevice max memory allocation: %ld mega-bytes\n", (*size)>>20); break;
            }
    } break;

  } //end of switch

}

void displayDeviceInfo(cl_platform_id id,
                       cl_device_type dev_type) {
    /* OpenCL 1.1 device types */
    cl_int error = 0;
    cl_uint numOfDevices = 0;

    /* Determine how many devices are connected to your platform */
    error = clGetDeviceIDs(id, dev_type, 0, NULL, &numOfDevices);
    if (error != CL_SUCCESS ) {
        perror("Unable to obtain any OpenCL compliant device info");
        exit(1);
    }
    cl_device_id* devices = (cl_device_id*) alloca(sizeof(cl_device_id) * numOfDevices);

    /* Load the information about your devices into the variable 'devices' */
    error = clGetDeviceIDs(id, dev_type, numOfDevices, devices, NULL);
    if (error != CL_SUCCESS ) {
        perror("Unable to obtain any OpenCL compliant device info");
        exit(1);
    }
    printf("Number of detected OpenCL devices: %d\n", numOfDevices);
    /* We attempt to retrieve some information about the devices. */
    for(int i = 0; i < numOfDevices; ++ i ) {
        displayDeviceDetails( devices[i], CL_DEVICE_TYPE, "CL_DEVICE_TYPE" );
        displayDeviceDetails( devices[i], CL_DEVICE_NAME, "CL_DEVICE_NAME" );
        displayDeviceDetails( devices[i], CL_DEVICE_VENDOR, "CL_DEVICE_VENDOR" );
        displayDeviceDetails( devices[i], CL_DEVICE_VENDOR_ID, "CL_DEVICE_VENDOR_ID" );
        displayDeviceDetails( devices[i], CL_DEVICE_MAX_MEM_ALLOC_SIZE, "CL_DEVICE_MAX_MEM_ALLOC_SIZE" );
        displayDeviceDetails( devices[i], CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE, "CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE" );
        displayDeviceDetails( devices[i], CL_DEVICE_GLOBAL_MEM_SIZE, "CL_DEVICE_GLOBAL_MEM_SIZE" );
        displayDeviceDetails( devices[i], CL_DEVICE_MAX_COMPUTE_UNITS, "CL_DEVICE_MAX_COMPUTE_UNITS" );
        displayDeviceDetails( devices[i], CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, "CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS" );
        displayDeviceDetails( devices[i], CL_DEVICE_MAX_WORK_ITEM_SIZES, "CL_DEVICE_MAX_WORK_ITEM_SIZES" );
        displayDeviceDetails( devices[i], CL_DEVICE_MAX_WORK_GROUP_SIZE, "CL_DEVICE_MAX_WORK_GROUP_SIZE" );
    }
}

void createAndReleaseContext(cl_platform_id id,
                             cl_device_type dev_type) {
    /* OpenCL 1.1 device types */
    cl_int error = 0;
    cl_uint numOfDevices = 0;

    /* Determine how many devices are connected to your platform */
    error = clGetDeviceIDs(id, dev_type, 0, NULL, &numOfDevices);
    if (error != CL_SUCCESS ) {
        perror("Unable to obtain any OpenCL compliant device info");
        exit(1);
    }
    cl_device_id* devices = (cl_device_id*) alloca(sizeof(cl_device_id) * numOfDevices);

    /* Load the information about your devices into the variable 'devices' */
    error = clGetDeviceIDs(id, dev_type, numOfDevices, devices, NULL);
    if (error != CL_SUCCESS ) {
        perror("Unable to obtain any OpenCL compliant device info");
        exit(1);
    }
    printf("Number of detected OpenCL devices: %d\n", numOfDevices);

    /*
       We attempt to create contexts for each device we find, report it
       and release the context. Once a context is created, its context is implicitly
       retained and so you don't have to invoke 'clRetainContext'
     */
    for(int i = 0; i < numOfDevices; ++ i ) {
        cl_context context = clCreateContext(NULL, 1, &devices[i], NULL, NULL, &error);
        cl_uint ref_cnt = 0;
        if (error != CL_SUCCESS) {
            perror("Can't create a context");
            exit(1);
        }
        error = clGetContextInfo(context, CL_CONTEXT_REFERENCE_COUNT, sizeof(ref_cnt), &ref_cnt, NULL);
        if (error != CL_SUCCESS) {
            perror("Can't obtain context information");
            exit(1);
        }
        printf("Reference count of device is %d\n", ref_cnt);

        // Release the context
        clReleaseContext(context);
    }
}

int main() {
	//Get all the platforms that support OpenCL!
   cl_platform_id* platforms;
   cl_uint numOfPlatforms;
   cl_int  error;
   error = clGetPlatformIDs(0, NULL, &numOfPlatforms);
   if(error != CL_SUCCESS) {
      perror("Unable to find any OpenCL platforms");
      exit(1);
   }
   printf("Number of OpenCL platforms found: %d\n", numOfPlatforms);

   //Get the addresses of every platform. Allocate memory for the number of installed platforms.
   // alloca(...) occupies some stack space but is automatically freed on return
   platforms = (cl_platform_id*) alloca(sizeof(cl_platform_id) * numOfPlatforms);
   error = clGetPlatformIDs(numOfPlatforms, platforms, NULL);
   if(error != CL_SUCCESS) {
      perror("Unable to find any OpenCL platforms");
      exit(1);
   }

   //visit every platform by their addresses.
   for(cl_uint i = 0; i < numOfPlatforms; ++i) {
	   //platforms information
        displayPlatformInfo( platforms[i], CL_PLATFORM_PROFILE, "CL_PLATFORM_PROFILE" );
        displayPlatformInfo( platforms[i], CL_PLATFORM_VERSION, "CL_PLATFORM_VERSION" );
        displayPlatformInfo( platforms[i], CL_PLATFORM_NAME,    "CL_PLATFORM_NAME" );
        displayPlatformInfo( platforms[i], CL_PLATFORM_VENDOR,  "CL_PLATFORM_VENDOR" );
        displayPlatformInfo( platforms[i], CL_PLATFORM_EXTENSIONS, "CL_PLATFORM_EXTENSIONS" );
        //devices information
        displayDeviceInfo( platforms[i], CL_DEVICE_TYPE_ALL );
        //contexts information with devices
        createAndReleaseContext( platforms[i], CL_DEVICE_TYPE_ALL );
   }

   return 0;
}结果： 
   
  

 
  书上有句话解释了  不同的SDK对OpenCL platform和device的支持不一样  NVIDIA只支持GPU  AMD公司的支持CPU和GPU 等 
      学完第一章，我试着写了个： 
   
  #include 
#include 
#include 
#include 
#include 
#include "gclFile.h"

void loadProgramSource(const char** files,
                       size_t length,
                       char** buffer,
                       size_t* sizes) {
	   /* Read each source file (*.cl) and store the contents into a temporary datastore */
	   for(size_t i=0; i < length; i++) {
	      FILE* file = fopen(files[i], "r");
	      if(file == NULL) {
	         perror("Couldn't read the program file");
	         exit(1);
	      }
	      fseek(file, 0, SEEK_END);
	      sizes[i] = ftell(file);
	      rewind(file); // reset the file pointer so that 'fread' reads from the front
	      buffer[i] = (char*)malloc(sizes[i]+1);
	      buffer[i][sizes[i]] = '\0';
	      fread(buffer[i], sizeof(char), sizes[i], file);
	      fclose(file);
	   }
}

int main(){

	//first : platform
	cl_uint platformnum;
	cl_int error;
	error=clGetPlatformIDs(0,NULL,&platformnum);
	if(error!=CL_SUCCESS){
		printf("no suitable platforms for OpenCL\n");
		return -1;
	}
	cl_platform_id* platformIDs;
	platformIDs=(cl_platform_id*)alloca(sizeof(cl_platform_id)*platformnum);
	error=clGetPlatformIDs(platformnum,platformIDs,NULL);
	if(error!=CL_SUCCESS){
		printf("platforms addresses error\n");
		return -1;
	}


	//second:device
	cl_device_id device;
	char platformName[100];
	error = clGetPlatformInfo(platformIDs[0],CL_PLATFORM_VENDOR,sizeof(platformName),platformName,NULL);
	if(error!=CL_SUCCESS){
		printf("wrong when getting the platform in use.\n");
		return -1;
	}
	cl_platform_id platformInUse = platformIDs[0];
	clGetDeviceIDs(platformInUse,CL_DEVICE_TYPE_GPU,1,&device,NULL);


	//third: context and CommandQueue
	cl_context context=clCreateContext(NULL,1,&device,NULL,NULL,NULL);
	cl_command_queue queue = clCreateCommandQueue( context,device,CL_QUEUE_PROFILING_ENABLE, &error );
	if (error != CL_SUCCESS) {
		perror("Unable to create command-queue");
		exit(1);
	}


	//forth: load kernel(more than one) and create program to compile it!!!
	const char *file_names[] = {"simple.cl", "simple_2.cl"};
	const int NUMBER_OF_FILES = 2;
	char* buffer[NUMBER_OF_FILES];  /*remember:kernel buffer must be released!*/
	size_t sizes[NUMBER_OF_FILES];
	loadProgramSource(file_names, NUMBER_OF_FILES, buffer, sizes);
	cl_program program = clCreateProgramWithSource(context, NUMBER_OF_FILES, (const char**)buffer, sizes, &error);
	if(error != CL_SUCCESS) {
	  perror("Can't create the OpenCL program object");
	  return -1;
	}
	error = clBuildProgram(program, 1, &device, NULL, NULL, NULL);
					/* build the program using build-options statically:
							 const char options[] = "-cl-finite-math-only -cl-no-signed-zeros";
							 error = clBuildProgram(program, 1, &device, options, NULL, NULL);
						 There is dynamic solution to build the program by using argv in command line. such as:
							 error = clBuildProgram(program, 1, &device, argv[1], NULL, NULL);
					 */
	char *program_log;
	size_t log_size;
	if(error != CL_SUCCESS) {
	  clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
	  program_log = (char*) malloc(log_size+1);
	  program_log[log_size] = '\0';
	  clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,log_size+1, program_log, NULL);
	  printf("\n=== ERROR ===\n\n%s\n=============\n", program_log);
	  free(program_log);
	  return -1;
	}
					/* Query the program as to how many kernels were detected and how many variables in every kernel function */
	cl_uint numOfKernels;
	error = clCreateKernelsInProgram(program, 0, NULL, &numOfKernels);
	if (error != CL_SUCCESS) {
		perror("Unable to retrieve kernel count from program");
		exit(1);
	}
	cl_kernel* kernels = (cl_kernel*) alloca(sizeof(cl_kernel) * numOfKernels);/*once I saw "alloca() can be free automatically",but here?only cl_kernel cannot*/
	error = clCreateKernelsInProgram(program, numOfKernels, kernels, NULL);
	for(cl_uint i = 0; i < numOfKernels; i++) {
		char kernelName[32];
		cl_uint argCnt;
		clGetKernelInfo(kernels[i], CL_KERNEL_FUNCTION_NAME, sizeof(kernelName), kernelName, NULL);
		clGetKernelInfo(kernels[i], CL_KERNEL_NUM_ARGS, sizeof(argCnt), &argCnt, NULL);
		printf("Kernel name: %s with arity: %d\n", kernelName, argCnt);
	}


	//fifth: Buffer (prepare the real arguments of the kernel function.for READ Buffer :Received real arguments from host):
	const cl_int mem_size=900;
	cl_mem a_in = clCreateBuffer(context, CL_MEM_READ_ONLY,mem_size*sizeof(cl_int), NULL, NULL);
	cl_mem a_in2 = clCreateBuffer(context, CL_MEM_WRITE_ONLY,mem_size*sizeof(cl_int), NULL, NULL);
	cl_mem b_in = clCreateBuffer(context, CL_MEM_READ_ONLY,mem_size*sizeof(cl_int), NULL, NULL);
	cl_mem b_in2 = clCreateBuffer(context, CL_MEM_READ_ONLY,mem_size*sizeof(cl_int), NULL, NULL);
	cl_mem c_in = clCreateBuffer(context, CL_MEM_WRITE_ONLY,mem_size*sizeof(cl_int), NULL, NULL);
	cl_mem c_in2 = clCreateBuffer(context, CL_MEM_WRITE_ONLY,mem_size*sizeof(cl_int), NULL, NULL);
					/*real arguments */
	cl_int real_a[mem_size],real_a2[mem_size];
	cl_int real_b[mem_size],real_b2[mem_size];
	for(cl_int i=0;i
同时让2个kernels运行的 我不太会 所以上面是试着写的 果然出来有点问题C[]和C2[]的结果那里 怎么是随机的数值呢 应该是900啊 难道在kernel里没有运行？看： 
   
  
 应该哪里我写得有点问题  可能等看了第二章就明白了。 
  7、《OpenCL Parallel Programming Development CookBook》-Ch2
 
  第一例子 自己写： 
  #include 
#include 
#include 
#include 
#include 

void loadProgramSource(const char** files,
                       size_t length,
                       char** buffer,
                       size_t* sizes) {
	   /* Read each source file (*.cl) and store the contents into a temporary datastore */
	   for(size_t i=0; i < length; i++) {
	      FILE* file = fopen(files[i], "r");
	      if(file == NULL) {
	         perror("Couldn't read the program file");
	         exit(1);
	      }
	      fseek(file, 0, SEEK_END);
	      sizes[i] = ftell(file);
	      rewind(file); // reset the file pointer so that 'fread' reads from the front
	      buffer[i] = (char*)malloc(sizes[i]+1);
	      buffer[i][sizes[i]] = '\0';
	      fread(buffer[i], sizeof(char), sizes[i], file);
	      fclose(file);
	   }
}

typedef struct UserData {
    int x;
    int y;
    int z;
    int w;
} UserData;

int main(){

	cl_uint platformNum;
	cl_int status;
	status=clGetPlatformIDs(0,NULL,&platformNum);
	if(status!=CL_SUCCESS){
		printf("cannot get platforms number.\n");
		return -1;
	}
	cl_platform_id* platforms;
	platforms=(cl_platform_id*)alloca(sizeof(cl_platform_id)*platformNum);
	status=clGetPlatformIDs(platformNum,platforms,NULL);
	if(status!=CL_SUCCESS){
		printf("cannot get platforms addresses.\n");
		return -1;
	}
	cl_platform_id platformInUse=platforms[0];
	cl_device_id device;
	clGetDeviceIDs(platformInUse,CL_DEVICE_TYPE_GPU,1,&device,NULL);
	cl_context context=clCreateContext(NULL,1,&device,NULL,NULL,NULL);
	cl_command_queue queue=clCreateCommandQueue(context,device,CL_QUEUE_PROFILING_ENABLE, &status);
	const char* files[]={"user_test.cl"};
	const int kernelnum=1;
	char *buffer[kernelnum];
	size_t sizes[kernelnum];
	loadProgramSource(files, kernelnum, buffer, sizes);
	cl_program program=clCreateProgramWithSource(context,kernelnum,(const char**)buffer,sizes,&status);
	if(status != CL_SUCCESS) {
	  perror("Can't create the OpenCL program object");
	  return -1;
	}
	status=clBuildProgram(program,1,&device,NULL,NULL,NULL);
	if(status != CL_SUCCESS) {
	  perror("Can't build the OpenCL program object");
	  return -1;
	}
	UserData realdata;
	realdata.x=3;
	realdata.y=4;
	realdata.z=5;
	realdata.w=6;
	cl_mem datain=clCreateBuffer(context,CL_MEM_READ_WRITE,sizeof(UserData),NULL,NULL);
	status=clEnqueueWriteBuffer(queue,datain,CL_TRUE,0,sizeof(UserData),&realdata,0,NULL,NULL);
	if(status!=CL_SUCCESS){
		perror("cannot write data into buffer.\n");
		return -1;
	}
	cl_kernel kernel = clCreateKernel(program, "hello", &status);
	status=clSetKernelArg(kernel,0,sizeof(cl_mem),(void*)&datain);

	size_t global_item_size=1;
	size_t local_item_size=1;
	status = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global_item_size, &local_item_size, 0, NULL, NULL);

	UserData *databack = (UserData*) alloca(sizeof(UserData)) ;
	status = clEnqueueReadBuffer(queue, datain, CL_TRUE, 0, sizeof(UserData), databack, 0, NULL, NULL);
	printf("x:%d  y: %d  z:%d w:%d\n", databack->x,databack->y,databack->z,databack->w);

	clReleaseCommandQueue(queue);
	clReleaseKernel(kernel);
	free(buffer[0]);
	clReleaseProgram(program);
	clReleaseContext(context);
	status = clReleaseMemObject(datain);
	return 0;
}结果是对的： 
   
  
 
  刚看了下书上这个例子的源码  它的和我的略不一样 在编译kernel的那个函数 还有实参它有多个实例化后的结构体 我只有一个 到目前为止 我发现我的这个例子虽然对了 但对workItem和workgroup以及device上lobal local的设置还不太会。可能后面就会介绍吧书上。 
  第二个例子：我觉得书上的是错的 因为本该UserData.w是在kernel上计算出来的 所以无论在host上w被初始化成什么经过kernel拷贝回来后一定是正确的x+y+z的结果，不然kernel执行就没有意义了：书上的例子源码： 
   
  #include 
#include 
#include 
#include 
typedef struct UserData {
    int x;
    int y;
    int z;
    int w;
} UserData;

void loadProgramSource(const char** files, size_t length,char** buffer,size_t* sizes) {
	   /* Read each source file (*.cl) and store the contents into a temporary datastore */
	   for(size_t i=0; i < length; i++) {
	      FILE* file = fopen(files[i], "r");
	      if(file == NULL) {
	         perror("Couldn't read the program file");
	         exit(1);
	      }
	      fseek(file, 0, SEEK_END);
	      sizes[i] = ftell(file);
	      rewind(file); // reset the file pointer so that 'fread' reads from the front
	      buffer[i] = (char*)malloc(sizes[i]+1);
	      buffer[i][sizes[i]] = '\0';
	      fread(buffer[i], sizeof(char), sizes[i], file);
	      fclose(file);
	   }
}

void displayBufferDetails(cl_mem memobj) {
    cl_mem_object_type objT;
    cl_mem_flags flags;
    size_t memSize;

    clGetMemObjectInfo(memobj, CL_MEM_TYPE, sizeof(cl_mem_object_type), &objT, 0);
    clGetMemObjectInfo(memobj, CL_MEM_FLAGS, sizeof(cl_mem_flags), &flags, 0);
    clGetMemObjectInfo(memobj, CL_MEM_SIZE, sizeof(size_t), &memSize, 0);

    char* str = '\0';
    switch (objT) {
        case CL_MEM_OBJECT_BUFFER: str = "Buffer or Sub-buffer";break;
        case CL_MEM_OBJECT_IMAGE2D: str = "2D Image Object";break;
        case CL_MEM_OBJECT_IMAGE3D: str = "3D Image Object";break;
    }

    char flagStr[128] = {'\0'};
    if(flags & CL_MEM_READ_WRITE)     strcat(flagStr, "Read-Write|");
    if(flags & CL_MEM_WRITE_ONLY)     strcat(flagStr, "Write Only|");
    if(flags & CL_MEM_READ_ONLY)      strcat(flagStr, "Read Only|");
    if(flags & CL_MEM_COPY_HOST_PTR)  strcat(flagStr, "Copy from Host|");
    if(flags & CL_MEM_USE_HOST_PTR)   strcat(flagStr, "Use from Host|");
    if(flags & CL_MEM_ALLOC_HOST_PTR) strcat(flagStr, "Alloc from Host|");

    printf("\tOpenCL Buffer's details =>\n\t size: %lu MB,\n\t object type is: %s,\n\t flags:0x%lx (%s) \n", memSize >> 20, str, flags, flagStr);
}

int main(){
	cl_uint platformsNum;
	cl_int status;
	status=clGetPlatformIDs(0,NULL,&platformsNum);
	cl_platform_id *platforms=(cl_platform_id*)alloca(sizeof(cl_platform_id)*platformsNum);
	status=clGetPlatformIDs(platformsNum,platforms,NULL);
	cl_device_id device;
	clGetDeviceIDs(platforms[0],CL_DEVICE_TYPE_GPU,1,&device,NULL);
	cl_context context=clCreateContext(NULL,1,&device,NULL,NULL,NULL);
	cl_command_queue queue=clCreateCommandQueue(context,device,CL_QUEUE_PROFILING_ENABLE,&status);
	const char* files[]={"user_test.cl"};
	const int filesNum=1;
	char* buffer[filesNum];
	size_t sizes[filesNum];
	loadProgramSource(files,filesNum,buffer,sizes);
	cl_program program=clCreateProgramWithSource(context,filesNum,(const char**)buffer,sizes,&status);
	status=clBuildProgram(program,1,&device,NULL,NULL,NULL);

	const int DATA_SIZE=5;
	UserData* ud_in = (UserData*) malloc( sizeof(UserData) * DATA_SIZE); // input to device
   UserData* ud_out = (UserData*) malloc( sizeof(UserData) * DATA_SIZE); // output from device
   for( int i = 0; i < DATA_SIZE; ++i) {
	  (ud_in + i)->x = i;
	  (ud_in + i)->y = i;
	  (ud_in + i)->z = i;
	  (ud_in + i)->w = i;//the result is wrong which seems the kernel function is not significant.
	  //(ud_in + i)->w = 3*i;
   }

   cl_kernel* kernels = (cl_kernel*) alloca(sizeof(cl_kernel) * 1);
   status = clCreateKernelsInProgram(program, 1, kernels, NULL);
   cl_mem UDObj = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(UserData) * DATA_SIZE, (void *)ud_in, &status);
   displayBufferDetails(UDObj);
   status = clSetKernelArg(kernels[0], 0, sizeof(cl_mem), &UDObj);
   status = clEnqueueTask(queue, kernels[0], 0, NULL, NULL);
   status = clEnqueueReadBuffer(queue, UDObj, CL_TRUE,  0,  sizeof(UserData) * DATA_SIZE,ud_out, 0, NULL, NULL);
   for(int i=0;i!=DATA_SIZE;i++){
	   printf("x:%d + y: %d + z:%d = w:%d\n", (ud_out[i]).x,(ud_out[i]).y,(ud_out[i]).z,(ud_out[i]).w);
   }

	clReleaseCommandQueue(queue);
	status = clReleaseMemObject(UDObj);
	clReleaseKernel(kernels[0]);
	free(buffer[0]);
	clReleaseProgram(program);
	clReleaseContext(context);
	free(ud_in);
   free(ud_out);
	return 0;
}结果： 
   
  是错的 kernel根本执行没有了意义  如果用原来的3*i也是 
  我改成了下面这样： 
   
  #include 
#include 
#include 
#include 
typedef struct UserData {
    int x;
    int y;
    int z;
    int w;
} UserData;

void displayBufferDetails(cl_mem memobj);
void loadProgramSource(const char** files, size_t length,char** buffer,size_t* sizes);

int main(){
	cl_uint platformsNum;
	cl_int status;
	status=clGetPlatformIDs(0,NULL,&platformsNum);
	cl_platform_id *platforms=(cl_platform_id*)alloca(sizeof(cl_platform_id)*platformsNum);
	status=clGetPlatformIDs(platformsNum,platforms,NULL);
	cl_device_id device;
	clGetDeviceIDs(platforms[0],CL_DEVICE_TYPE_GPU,1,&device,NULL);
	cl_context context=clCreateContext(NULL,1,&device,NULL,NULL,NULL);
	cl_command_queue queue=clCreateCommandQueue(context,device,CL_QUEUE_PROFILING_ENABLE,&status);
	const char* files[]={"user_test.cl"};
	const int filesNum=1;
	char* buffer[filesNum];
	size_t sizes[filesNum];
	loadProgramSource(files,filesNum,buffer,sizes);
	cl_program program=clCreateProgramWithSource(context,filesNum,(const char**)buffer,sizes,&status);
	status=clBuildProgram(program,1,&device,NULL,NULL,NULL);

	const int DATA_SIZE=5;
	UserData ud_in[DATA_SIZE] ; // input to device
   UserData ud_out[DATA_SIZE]; // output from device
   for( int i = 0; i < DATA_SIZE; ++i) {
	  ud_in[ i].x = i;
	  ud_in[ i].y = i;
	  ud_in[ i].z = i;
	  ud_in[ i].w=  i; //the code will always be correct whatever w is.
   }

   cl_kernel kernel=clCreateKernel(program,"hello",&status);
   cl_mem UDObj = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(UserData) * DATA_SIZE, (void *)ud_in, &status);
   displayBufferDetails(UDObj);
   status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &UDObj);
   status=clEnqueueWriteBuffer(queue,UDObj,CL_TRUE,0,sizeof(UserData)*DATA_SIZE,ud_in,0,NULL,NULL);
   size_t global_item_size=DATA_SIZE;
	size_t local_item_size=1;
	status = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global_item_size, &local_item_size, 0, NULL, NULL);
   //status = clEnqueueTask(queue, kernel, 0, NULL, NULL);
	status = clEnqueueReadBuffer(queue, UDObj, CL_TRUE, 0, sizeof(UserData)*DATA_SIZE, ud_out, 0, NULL, NULL);
	for(int i=0;i!=DATA_SIZE;i++){
		printf("x:%d + y: %d + z:%d = w:%d\n", (ud_out[i]).x,(ud_out[i]).y,(ud_out[i]).z,(ud_out[i]).w);
	}

	clReleaseCommandQueue(queue);
	status = clReleaseMemObject(UDObj);
	clReleaseKernel(kernel);
	free(buffer[0]);
	clReleaseProgram(program);
	clReleaseContext(context);
	return 0;
}结果： 
   
  
 证明是对的  即使在host上被初始化为i  执行kernel后还是正确了  这才有意义嘛 
   
  下一个例子是sub-buffer的 我写了下运行了下 和源码对比 原来只有platform或device有多个时才能sub-buffer  不然会报错 我的电脑就是。 
  再下一个例子是events：延迟command queue中的命令 但这个例子： 
  #include 
#include 
#include 
#include 

void loadProgramSource(const char** files, size_t length,char** buffer,size_t* sizes) {
	   /* Read each source file (*.cl) and store the contents into a temporary datastore */
	   for(size_t i=0; i < length; i++) {
	      FILE* file = fopen(files[i], "r");
	      if(file == NULL) {
	         perror("Couldn't read the program file");
	         exit(1);
	      }
	      fseek(file, 0, SEEK_END);
	      sizes[i] = ftell(file);
	      rewind(file); // reset the file pointer so that 'fread' reads from the front
	      buffer[i] = (char*)malloc(sizes[i]+1);
	      buffer[i][sizes[i]] = '\0';
	      fread(buffer[i], sizeof(char), sizes[i], file);
	      fclose(file);
	   }
}

void CL_CALLBACK postProcess(cl_event event, cl_int status, void *data) {
	printf("%s\n", (char*)data);
}

int main(){
	cl_uint platformsNum;
	cl_int status;
	status=clGetPlatformIDs(0,NULL,&platformsNum);
	cl_platform_id *platforms=(cl_platform_id*)alloca(sizeof(cl_platform_id)*platformsNum);
	status=clGetPlatformIDs(platformsNum,platforms,NULL);
	cl_device_id device;
	clGetDeviceIDs(platforms[0],CL_DEVICE_TYPE_GPU,1,&device,NULL);
	cl_context context=clCreateContext(NULL,1,&device,NULL,NULL,NULL);
	cl_command_queue queue=clCreateCommandQueue(context,device,CL_QUEUE_PROFILING_ENABLE,&status);
	const char* files[]={"sample_kernel.cl"};
	const int filesnum=1;
	char* buffer[filesnum];
	size_t sizes[filesnum];
	loadProgramSource(files,filesnum,buffer,sizes);
	cl_program program=clCreateProgramWithSource(context,filesnum,(const char**)buffer,sizes,&status);
	status=clBuildProgram(program,1,&device,NULL,NULL,NULL);

	int i, j;
	float *A;
	float *B;
	float *C;
	A = (float *)alloca(4*4*sizeof(float));
	B = (float *)alloca(4*4*sizeof(float));
	C = (float *)alloca(4*4*sizeof(float));
	for (i=0; i<4; i++) {
		for (j=0; j<4; j++) {
			A[i*4+j] = i*4+j+1;
			B[i*4+j] = j*4+i+1;
		}
	}
	cl_kernel kernel=clCreateKernel(program,"sample",&status);
	cl_mem a_buffer=clCreateBuffer(context,CL_MEM_READ_WRITE,sizeof(float)*4*4,NULL,&status);
	cl_mem b_buffer=clCreateBuffer(context,CL_MEM_READ_WRITE,sizeof(float)*4*4,NULL,&status);
	cl_mem c_buffer=clCreateBuffer(context,CL_MEM_READ_WRITE,sizeof(float)*4*4,NULL,&status);

	/*
	 * Creating an user event
	 * As a user event is created, its execution status is set to be CL_SUBMITTED
	 * and we tag the event to a callback so when event reaches CL_COMPLETE, it will
	 * execute postProcess
	 */
	cl_event event1 = clCreateUserEvent(context, &status);
	char* eventfarg4="Looks like its done.";
	clSetEventCallback(event1, CL_COMPLETE, &postProcess, eventfarg4);

	status=clSetKernelArg(kernel,0,sizeof(cl_mem),&a_buffer);
	status=clSetKernelArg(kernel,0,sizeof(cl_mem),&b_buffer);
	status=clSetKernelArg(kernel,0,sizeof(cl_mem),&c_buffer);
	status=clEnqueueWriteBuffer(queue,a_buffer,CL_TRUE,0,sizeof(float)*4*4,A,0,NULL,NULL);
	//status=clEnqueueWriteBuffer(queue,b_buffer,CL_TRUE,0,sizeof(float)*4*4,B,0,NULL,NULL);
	status=clEnqueueWriteBuffer(queue,b_buffer,CL_TRUE,0,sizeof(float)*4*4,B,1,&event1,NULL);

	clSetUserEventStatus(event1, CL_COMPLETE);
	size_t global_item_size = 4;
	size_t local_item_size = 1;
	status = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global_item_size, &local_item_size, 0, NULL, NULL);

	status = clEnqueueReadBuffer(queue, c_buffer, CL_TRUE, 0, 4*4*sizeof(float), C, 0, NULL, NULL);

	for (i=0; i<4; i++) {
		for (j=0; j<4; j++) {
			printf("%7.2f ", C[i*4+j]);
		}
		printf("\n");
	}

	clReleaseCommandQueue(queue);
	clReleaseKernel(kernel);
	clReleaseProgram(program);
	clReleaseMemObject(a_buffer);
	clReleaseMemObject(b_buffer);
	clReleaseMemObject(c_buffer);
	clReleaseContext(context);
	clReleaseEvent(event1);
	return 0;
}但这个在写入A以后，就一直在等待event延迟，可是什么时候结束呢 ？我等了一会儿还是没有结束 我没看到这个CL_CALLBACK怎么定义结束的 看运行结果也是在写入A后感觉像卡死在event里了？！ 
   
  下一个例子是创建2D buffer_memory的 第一次看到这种不用读kernel的 那这种有什么意义呢用OpenCL 怎么去加速呢？是不是只要有buffer就可以实现加速了？

 
  #include 
#include 
#include 

int main(){
	cl_uint platformsNum;
	cl_int status;
	status=clGetPlatformIDs(0,NULL,&platformsNum);
	cl_platform_id *platforms=(cl_platform_id*)alloca(sizeof(cl_platform_id)*platformsNum);
	status=clGetPlatformIDs(platformsNum,platforms,NULL);
	cl_device_id device;
	clGetDeviceIDs(platforms[0],CL_DEVICE_TYPE_GPU,1,&device,NULL);
	cl_context context=clCreateContext(NULL,1,&device,NULL,NULL,NULL);
	cl_command_queue queue=clCreateCommandQueue(context,device,CL_QUEUE_PROFILING_ENABLE,&status);

	const int NUM_BUFFER_ELEMENTS=16;
	cl_int hostBuffer[NUM_BUFFER_ELEMENTS] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};
	//cannot be CL_MEM_READ_ONLY  ,the result is not correct!
	cl_mem DObj=clCreateBuffer(context,CL_MEM_READ_ONLY| CL_MEM_COPY_HOST_PTR,sizeof(int)*NUM_BUFFER_ELEMENTS,hostBuffer,&status);
	cl_int outputPtr[16] = {-1, -1, -1, -1,-1, -1, -1, -1,-1, -1, -1, -1,-1, -1, -1, -1};
	for(int idx = 0; idx < 4; ++idx) {
		size_t buffer_origin[3] = {idx*2*sizeof(int), idx, 0};
		size_t host_origin[3] = {idx*2*sizeof(int), idx, 0};
		size_t region[3] = {2*sizeof(int), 2, 1};
		//here????????????????????has no kernel and so on but it can get correct results! 
		status = clEnqueueReadBufferRect(queue,DObj,CL_TRUE, buffer_origin, host_origin, region, 
			   0, // buffer_row_pitch
			   0, // buffer_slice_pitch
			   0, // host_row_pitch
			   0, // host_slice_pitch
			   outputPtr, 0, NULL, NULL);
	}
	for(int i = 0; i < 16; i++)
		printf("%d\n", outputPtr[i]);

	clReleaseCommandQueue(queue);
	status = clReleaseMemObject(DObj);
	clReleaseContext(context);
	return 0;
}
结果是对的 但下面问号部分我是按书上源代码  我是有疑问的：这种二维的  我知道buffer_origin和host_origin 是点对点传 第三维度是0 也就是只有二维 但region和拷贝回来那里我没怎么理解 
   
  

 
  下个例子我补的是《OpenCL编程思想》的卷积：也是二维的例子 可以看成图像： 
  int main(){
	cl_uint platformsNum;
	cl_int status;
	status=clGetPlatformIDs(0,NULL,&platformsNum);
	cl_platform_id *platforms=(cl_platform_id*)alloca(sizeof(cl_platform_id)*platformsNum);
	status=clGetPlatformIDs(platformsNum,platforms,NULL);
	cl_device_id device;
	clGetDeviceIDs(platforms[0],CL_DEVICE_TYPE_GPU,1,&device,NULL);
	cl_context context=clCreateContext(NULL,1,&device,NULL,NULL,NULL);
	cl_command_queue queue=clCreateCommandQueue(context,device,CL_QUEUE_PROFILING_ENABLE,&status);

	const char* filelist[]={"convolution.cl"};
	const int filenum=1;
	char* buffer[filenum];
	size_t sizes[filenum];
	loadProgramSource(filelist,filenum,buffer,sizes);
	cl_program program=clCreateProgramWithSource(context,filenum,(const char**)buffer,sizes,&status);
	status=clBuildProgram(program,1,&device,NULL,NULL,NULL);

	const uint inputSignalWidth=8,inputSignalHeight=8,outputSignalWidth=6,outputSignalHeight=6,maskWidth=3,maskHeight=3;
	cl_uint inputsignal[inputSignalWidth][inputSignalHeight]={{3,1,1,4,8,2,1,3},{4,2,1,1,2,1,2,3},{4,4,4,4,3,2,2,2},{9,8,6,4,2,3,4,4},{1,1,6,4,0,0,0,0},{0,9,0,5,3,0,5,5},{8,6,4,3,3,3,1,1,},{5,6,0,0,0,0,6,2}};
	cl_uint mask[maskWidth][maskHeight]={{1,1,1},{1,0,1},{1,1,1}};

	cl_kernel kernel=clCreateKernel(program,"convolve",&status);
	cl_mem input_mem = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(int) *inputSignalWidth* inputSignalHeight,inputsignal, &status);
	cl_mem mask_mem =clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(int) *maskWidth* maskHeight, mask, &status);
	cl_mem output_mem =clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(int) *outputSignalWidth* outputSignalHeight, NULL, &status);
	status=clEnqueueWriteBuffer(queue,input_mem,CL_TRUE,0,sizeof(int) *inputSignalWidth* inputSignalHeight,inputsignal,0,NULL,NULL);
	status=clEnqueueWriteBuffer(queue,mask_mem,CL_TRUE,0,sizeof(int) *maskWidth* maskHeight,mask,0,NULL,NULL);
	status=clSetKernelArg(kernel,0,sizeof(cl_mem),&input_mem);
	status=clSetKernelArg(kernel,1,sizeof(cl_mem),&mask_mem);
	status=clSetKernelArg(kernel,2,sizeof(cl_mem),&output_mem);
	status=clSetKernelArg(kernel,3,sizeof(cl_uint),&inputSignalWidth);
	status=clSetKernelArg(kernel,4,sizeof(cl_uint),&maskWidth);

	size_t global_item_size=outputSignalWidth* outputSignalHeight;
	size_t local_item_size=1;
	status = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global_item_size, &local_item_size, 0, NULL, NULL);

	cl_uint outputsignal[outputSignalWidth][outputSignalHeight];
	status=clEnqueueReadBuffer(queue, output_mem, CL_TRUE, 0,sizeof(cl_uint) *outputSignalWidth* outputSignalHeight, outputsignal, 0, NULL, NULL);
	for(int y=0;y	clReleaseCommandQueue(queue);
	status = clReleaseMemObject(input_mem);
	status = clReleaseMemObject(mask_mem);
	status = clReleaseMemObject(output_mem);
	clReleaseKernel(kernel);
	clReleaseProgram(program);
	clReleaseContext(context);
	return 0;
}结果是对的 上面的我也理解  但我不怎么理解的是它的kernel函数： 
   
  
 
   
  __kernel void convolve(const __global uint* const input,__constant uint* const mask,__global uint* output,const int inputWidth,const int maskWidth){
	const int x=get_global_id(0);
	const int y=get_global_id(1);
	uint sum=0;
	for(int r=0;r
是不是0代表的是第一个形参的地址 1代表的是第二个形参 所以通过get_global_id是可以确定的谁是谁的 当有多个时？是这样理解吧。要不然x是谁的首地址 y是谁的怎么区分。我知道r*maskWidth+c是计算mask窗口的第几个数的下标 后面的我没看明白？ 
  
 
  现在看到《OpenCL编程指南》第9章 上面这个核函数的问题我自己已明白 通过第8章的一个例子：高斯滤波  改动： 
   
  #include 
#include 
#include 
#include 
#include 
#include 
#include 
#include "FreeImage.h"
#include "gFreeImage.h"
cl_mem LoadImage(cl_context context, char *fileName, int &width, int &height)
{
    FREE_IMAGE_FORMAT format = FreeImage_GetFileType(fileName, 0);
    FIBITMAP* image = FreeImage_Load(format, fileName);
    // Convert to 32-bit image
    FIBITMAP* temp = image;
    image = FreeImage_ConvertTo32Bits(image);
    FreeImage_Unload(temp);
    width = FreeImage_GetWidth(image);
    height = FreeImage_GetHeight(image);
    char *buffer = new char[width * height * 4];
    memcpy(buffer, FreeImage_GetBits(image), width * height * 4);
    FreeImage_Unload(image);
    // Create OpenCL image
    cl_image_format clImageFormat;
    clImageFormat.image_channel_order = CL_RGBA;
    clImageFormat.image_channel_data_type = CL_UNORM_INT8;
    cl_int errNum;
    cl_mem clImage;
    clImage = clCreateImage2D(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,&clImageFormat,width,height, 0,buffer,&errNum);
    if (errNum != CL_SUCCESS)
    {
        std::cerr << "Error creating CL image object" << std::endl;
        return 0;
    }
    return clImage;
}
size_t RoundUp(int groupSize, int globalSize)
{
    int r = globalSize % groupSize;
    if(r == 0)
    {
     	return globalSize;
    }
    else
    {
     	return globalSize + groupSize - r;
    }
}
bool SaveImage(char *fileName, char *buffer, int width, int height)
{
    FREE_IMAGE_FORMAT format = FreeImage_GetFIFFromFilename(fileName);
    FIBITMAP *image = FreeImage_ConvertFromRawBits((BYTE*)buffer, width,
                        height, width * 4, 32,
                        0xFF000000, 0x00FF0000, 0x0000FF00);
    return (FreeImage_Save(format, image, fileName) == TRUE) ? true : false;
}
int main(){
		cl_uint platformNum;
		cl_int status;
		status=clGetPlatformIDs(0,NULL,&platformNum);
		if(status!=CL_SUCCESS){
			printf("cannot get platforms number.\n");
			return -1;
		}
		cl_platform_id* platforms;
		platforms=(cl_platform_id*)alloca(sizeof(cl_platform_id)*platformNum);
		status=clGetPlatformIDs(platformNum,platforms,NULL);
		if(status!=CL_SUCCESS){
			printf("cannot get platforms addresses.\n");
			return -1;
		}
		cl_platform_id platformInUse=platforms[0];
		cl_device_id device;
		clGetDeviceIDs(platformInUse,CL_DEVICE_TYPE_GPU,1,&device,NULL);
		cl_context context=clCreateContext(NULL,1,&device,NULL,NULL,NULL);
		cl_command_queue queue=clCreateCommandQueue(context,device,CL_QUEUE_PROFILING_ENABLE, &status);
		//Create Input Image Object
		char file[]={"sample_raw.png"};
		int imgwidth,imgheight;
		cl_mem image=LoadImage(context,file,imgwidth,imgheight);
		cl_image_format clImageFormat;
		//Create Output Image Object
		clImageFormat.image_channel_order = CL_RGBA;
		clImageFormat.image_channel_data_type = CL_UNORM_INT8;
		cl_mem outimg;
		outimg = clCreateImage2D(context, CL_MEM_WRITE_ONLY,&clImageFormat,imgwidth,imgheight, 0,NULL,&status);
		//Create Image Sampler
		cl_sampler sampler=clCreateSampler(context,CL_FALSE, CL_ADDRESS_CLAMP_TO_EDGE,CL_FILTER_NEAREST,&status);
		std::ifstream srcFile("ImageFilter2D.cl");
		std::string srcProg(std::istreambuf_iterator(srcFile),(std::istreambuf_iterator()));
		const char * src = srcProg.c_str();
		size_t length = srcProg.length();
		cl_program program=clCreateProgramWithSource(context,1,&src,&length,&status);
		status=clBuildProgram(program,1,&device,NULL,NULL,NULL);
		cl_kernel kernel=clCreateKernel(program,"gaussian_filter",NULL);
		status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &image);
		status |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &outimg);
		status |= clSetKernelArg(kernel, 2, sizeof(cl_sampler), &sampler);
		status |= clSetKernelArg(kernel, 3, sizeof(cl_int), &imgwidth);
		status |= clSetKernelArg(kernel, 4, sizeof(cl_int), &imgheight);
		size_t localWorkSize[2] = { 16, 16 };
		size_t globalWorkSize[2] =  { RoundUp(localWorkSize[0], imgwidth),RoundUp(localWorkSize[1], imgheight) };

		status=clEnqueueNDRangeKernel(queue,kernel,2, NULL,globalWorkSize, localWorkSize,0, NULL, NULL);
//		char *buffer = (char*)alloca[imgwidth * imgheight * 4]; //wrong !!!!
		char *buffer = (char*)alloca(imgwidth * imgheight * 4);
		size_t origin[3] = { 0, 0, 0 };
		size_t region[3] = { imgwidth, imgheight, 1};
		status = clEnqueueReadImage(queue, outimg, CL_TRUE,origin, region, 0, 0, buffer,0, NULL, NULL);
		char imgname[]={"filteredimg.png"};
		bool flag=SaveImage(imgname, buffer, imgwidth, imgheight);
		clReleaseCommandQueue(queue);
		clReleaseContext(context);
		clReleaseProgram(program);
		clReleaseKernel(kernel);
		clReleaseMemObject(image);
		clReleaseMemObject(outimg);
		clReleaseSampler(sampler);

		return 0;
} 
   
  它的kernel： 
   
  __kernel void gaussian_filter(__read_only image2d_t srcImg,__write_only image2d_t dstImg,sampler_t sampler,int width, int height)
{
    float kernelWeights[9] = { 1.0f, 2.0f, 1.0f,
                               2.0f, 4.0f, 2.0f,
                               1.0f, 2.0f, 1.0f };
    int2 startImageCoord = (int2) (get_global_id(0) - 1, get_global_id(1) - 1);
    int2 endImageCoord   = (int2) (get_global_id(0) + 1, get_global_id(1) + 1);
    int2 outImageCoord = (int2) (get_global_id(0), get_global_id(1));
    if (outImageCoord.x < width && outImageCoord.y < height)
    {
        int weight = 0;
        float4 outColor = (float4)(0.0f, 0.0f, 0.0f, 0.0f);
        for( int y = startImageCoord.y; y <= endImageCoord.y; y++)
        {
            for( int x = startImageCoord.x; x <= endImageCoord.x; x++)
            {
                outColor += (read_imagef(srcImg, sampler, (int2)(x, y)) * (kernelWeights[weight] / 16.0f));
                weight += 1;
            }
        }
        // Write the output value to image
        write_imagef(dstImg, outImageCoord, outColor);
    }
}其实就是这个意思： 
   
  
 所以之前没有想明白的问题 也就是之前那个卷积的kernel我也想明白了  还有我查了下 get_global_size(0)是什么意思了 原来是指每个工作组x方向上占多大即多少个workitems。这个高斯滤波奇怪之处在于我开始用一幅图处理结果竟然是个白纸 ！有问题 但我换了一幅图就好了。即使直接用书上源码 不用我上面那个也是这样！奇怪！ 
   
  看到《OpenCL编程指南》第9.5节 其实第9章我没怎么理解  因为没有什么实例供我学习。但看到这里  我知道我之前的那个event的工程为什么错了 首先源代码中的意思是向buffer中传入B参数时host创建了一个event 这个事件会回调一个异步函数 等这个event标志被host调为CL_COMPLETE后就继续执行写入B_buffer和执行kernel。它位置放错了set CL_COMPLETE位置放在后面了 所以那句写B等待一直读不到CL_COMPLETE所以一直卡在那里！还有一个就是我粗心写错的 就是SetKernel那里 我改过来了。 
   
  #include 
#include 
#include 
#include 
void loadProgramSource(const char** files, size_t length,char** buffer,size_t* sizes) {
	   /* Read each source file (*.cl) and store the contents into a temporary datastore */
	   for(size_t i=0; i < length; i++) {
	      FILE* file = fopen(files[i], "r");
	      if(file == NULL) {
	         perror("Couldn't read the program file");
	         exit(1);
	      }
	      fseek(file, 0, SEEK_END);
	      sizes[i] = ftell(file);
	      rewind(file); // reset the file pointer so that 'fread' reads from the front
	      buffer[i] = (char*)malloc(sizes[i]+1);
	      buffer[i][sizes[i]] = '\0';
	      fread(buffer[i], sizeof(char), sizes[i], file);
	      fclose(file);
	   }
}
void CL_CALLBACK postProcess(cl_event event, cl_int status, void *data) {
	printf("%s\n", (char*)data);
	for(int i=0;i<10;i++)
		printf("%d ",i);
	printf("\nevent call back has being done!\n");
}

int main(){
	cl_uint platformsNum;
	cl_int status;
	status=clGetPlatformIDs(0,NULL,&platformsNum);
	cl_platform_id *platforms=(cl_platform_id*)alloca(sizeof(cl_platform_id)*platformsNum);
	status=clGetPlatformIDs(platformsNum,platforms,NULL);
	cl_device_id device;
	clGetDeviceIDs(platforms[0],CL_DEVICE_TYPE_GPU,1,&device,NULL);
	cl_context context=clCreateContext(NULL,1,&device,NULL,NULL,NULL);
	cl_command_queue queue=clCreateCommandQueue(context,device,CL_QUEUE_PROFILING_ENABLE,&status);
	const char* files[]={"sample_kernel.cl"};
	const int filesnum=1;
	char* buffer[filesnum];
	size_t sizes[filesnum];
	loadProgramSource(files,filesnum,buffer,sizes);
	cl_program program=clCreateProgramWithSource(context,filesnum,(const char**)buffer,sizes,&status);
	status=clBuildProgram(program,1,&device,NULL,NULL,NULL);
	int i, j;
	float *A = (float *)alloca(4*4*sizeof(float));
	float *B = (float *)alloca(4*4*sizeof(float));
	float *C = (float *)alloca(4*4*sizeof(float));
	for (i=0; i<4; i++) {
		for (j=0; j<4; j++) {
			A[i*4+j] = i*4+j+1;
			B[i*4+j] = j*4+i+1;
			printf("%7.2f,%7.2f  ", A[i*4+j],B[i*4+j]);
		}
		printf("\n");
	}
	cl_kernel kernel=clCreateKernel(program,"sample",&status);
	cl_mem a_buffer=clCreateBuffer(context,CL_MEM_READ_WRITE,sizeof(float)*4*4,NULL,&status);
	cl_mem b_buffer=clCreateBuffer(context,CL_MEM_READ_WRITE,sizeof(float)*4*4,NULL,&status);
	cl_mem c_buffer=clCreateBuffer(context,CL_MEM_READ_WRITE,sizeof(float)*4*4,NULL,&status);

	/*
	 * Creating an user event
	 * As a user event is created, its execution status is set to be CL_SUBMITTED
	 * and we tag the event to a callback so when event reaches CL_COMPLETE, it will
	 * execute postProcess
	 */
	cl_event event1 = clCreateUserEvent(context, &status);
	char* eventfarg4="Looks like its done.";
	clSetEventCallback(event1, CL_COMPLETE, &postProcess, eventfarg4);
	status=clSetKernelArg(kernel,0,sizeof(cl_mem),&a_buffer);
	status=clSetKernelArg(kernel,1,sizeof(cl_mem),&b_buffer);
	status=clSetKernelArg(kernel,2,sizeof(cl_mem),&c_buffer);
	status=clEnqueueWriteBuffer(queue,a_buffer,CL_TRUE,0,sizeof(float)*4*4,A,0,NULL,NULL);
	printf("have written a\n");
	//status=clEnqueueWriteBuffer(queue,b_buffer,CL_TRUE,0,sizeof(float)*4*4,B,0,NULL,NULL);
	clSetUserEventStatus(event1, CL_COMPLETE);
	status=clEnqueueWriteBuffer(queue,b_buffer,CL_TRUE,0,sizeof(float)*4*4,B,1,&event1,NULL);
	printf("have written b\n");
	size_t global_item_size = 4;
	size_t local_item_size = 1;
	status = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global_item_size, &local_item_size, 0, NULL, NULL);
	status = clEnqueueReadBuffer(queue, c_buffer, CL_TRUE, 0, 4*4*sizeof(float), C, 0, NULL, NULL);

	for (i=0; i<4; i++) {
		for (j=0; j<4; j++) {
			printf("%7.2f ", C[i*4+j]);
		}
		printf("\n");
	}
	clReleaseCommandQueue(queue);
	clReleaseKernel(kernel);
	clReleaseProgram(program);
	clReleaseMemObject(a_buffer);
	clReleaseMemObject(b_buffer);
	clReleaseMemObject(c_buffer);
	clReleaseContext(context);
	clReleaseEvent(event1);
	return 0;
}这下就解决了，结果如下： 
   
  
 通过这个例子 我理解了host创建event 回调函数显示控制commandqueue中的命令顺序 ！但第9章所说的跨不同context和queue的交互我没怎么懂、同步点那里也没怎么懂！没实例！ 
  还有一个例子 就是之前我说clEnqueueReadBufferRect()中第7、8、9和10个参数我不懂的 看了书上后懂了意思，书上给了计算公式 套公式就行。这个例子是不进入kernel而是直接在buffer进行计算： 
   
  #include 
#include 
#include 
//?????????????????
int main(){
	cl_uint platformsNum;
	cl_int status;
	status=clGetPlatformIDs(0,NULL,&platformsNum);
	cl_platform_id *platforms=(cl_platform_id*)alloca(sizeof(cl_platform_id)*platformsNum);
	status=clGetPlatformIDs(platformsNum,platforms,NULL);
	cl_device_id device;
	clGetDeviceIDs(platforms[0],CL_DEVICE_TYPE_GPU,1,&device,NULL);
	cl_context context=clCreateContext(NULL,1,&device,NULL,NULL,NULL);
	cl_command_queue queue=clCreateCommandQueue(context,device,CL_QUEUE_PROFILING_ENABLE,&status);

	const int NUM_BUFFER_ELEMENTS=16;
	cl_int hostBuffer[NUM_BUFFER_ELEMENTS] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};
	//cannot be CL_MEM_READ_ONLY  ,the result is not correct!
	cl_mem DObj=clCreateBuffer(context,CL_MEM_READ_ONLY| CL_MEM_COPY_HOST_PTR,sizeof(int)*NUM_BUFFER_ELEMENTS,hostBuffer,&status);
	cl_int outputPtr[16] = {-1, -1, -1, -1,-1, -1, -1, -1,-1, -1, -1, -1,-1, -1, -1, -1};
	for(int idx = 0; idx < 4; ++idx) {
		size_t buffer_origin[3] = {idx*2*sizeof(int), idx, 0};
		size_t host_origin[3] = {idx*2*sizeof(int), idx, 0};
		size_t region[3] = {2*sizeof(int), 2, 1};
		//here ,has no kernel and so on!
		status = clEnqueueReadBufferRect(queue,DObj,CL_TRUE, buffer_origin, host_origin, region,
				0, // buffer_row_pitch
			   0, // buffer_slice_pitch
			   0, // host_row_pitch
			   0, // host_slice_pitch
			   outputPtr, 0, NULL, NULL);
	}
	for(int i = 0; i < 16; i++)
		printf("%d ", outputPtr[i]);
	printf("\n");

	int ptr[4]={-1,-1,-1,-1};
	size_t origin_buffer[3]={1*sizeof(int),1,0};
	size_t origin_host[3]={0,0,0};
	size_t regions[3]={2*sizeof(int),2,1};
	status = clEnqueueReadBufferRect(queue,DObj,CL_TRUE, origin_buffer, origin_host, regions,
					4*sizeof(int), // buffer_row_pitch
				   0, // buffer_slice_pitch
				   0, // host_row_pitch
				   2*sizeof(int), // host_slice_pitch
				   ptr, 0, NULL, NULL);
	for(int i = 0; i < 4; i++)
			printf("%d ", ptr[i]); //I can calculate "5" ,but why the sequence is 6 9 10?????????
		printf("\n");


	//do not use clEnqueueReadBuffer() or clEnqueueReadBufferRect() but clEnqueueMapBuffer() and clEnqueueUnmapMemObject()
	cl_int *mapptr=(cl_int*)clEnqueueMapBuffer(queue,DObj,CL_TRUE,CL_MAP_WRITE,0,sizeof(cl_int)*NUM_BUFFER_ELEMENTS,0,NULL,NULL,&status);
	for(uint i=0;i
结果： 
   
   
  第一行和第三行的结果直接套公式就知道。但第二行的结果 我套公式只能算出5， 那个6 9 10怎么计算来的 我没懂？？？？？？？？ 
  看到性能评估这一章： 
   
  #include 
#include 
#include 
#include 
void loadProgramSource(const char** files, size_t length,char** buffer,size_t* sizes);
void CL_CALLBACK eventCallBack(cl_event ev,cl_int event_status,void* user_data);

int main(){
	cl_uint platformsNum;
	cl_int status;
	status=clGetPlatformIDs(0,NULL,&platformsNum);
	cl_platform_id *platforms=(cl_platform_id*)alloca(sizeof(cl_platform_id)*platformsNum);
	status=clGetPlatformIDs(platformsNum,platforms,NULL);
	cl_device_id device;
	clGetDeviceIDs(platforms[0],CL_DEVICE_TYPE_GPU,1,&device,NULL);
	cl_context context=clCreateContext(NULL,1,&device,NULL,NULL,NULL);
	cl_command_queue queue=clCreateCommandQueue(context,device,CL_QUEUE_PROFILING_ENABLE,&status);
	const char* files[]={"sample_kernel.cl"};
	const int filesnum=1;
	char* buffer[filesnum];
	size_t sizes[filesnum];
	loadProgramSource(files,filesnum,buffer,sizes);
	cl_program program=clCreateProgramWithSource(context,filesnum,(const char**)buffer,sizes,&status);
	status=clBuildProgram(program,1,&device,NULL,NULL,NULL);

	int i, j;
	float *A = (float *)alloca(4*4*sizeof(float));
	float *B = (float *)alloca(4*4*sizeof(float));
	float *C = (float *)alloca(4*4*sizeof(float));

	for (i=0; i<4; i++) {
		for (j=0; j<4; j++) {
			A[i*4+j] = i*4+j+1;
			B[i*4+j] = j*4+i+1;
			printf("%7.2f,%7.2f  ", A[i*4+j],B[i*4+j]);
		}
		printf("\n");
	}
	cl_kernel kernel=clCreateKernel(program,"sample",&status);
	cl_mem a_buffer=clCreateBuffer(context,CL_MEM_READ_WRITE,sizeof(float)*4*4,NULL,&status);
	cl_mem b_buffer=clCreateBuffer(context,CL_MEM_READ_WRITE,sizeof(float)*4*4,NULL,&status);
	cl_mem c_buffer=clCreateBuffer(context,CL_MEM_READ_WRITE,sizeof(float)*4*4,NULL,&status);

	status=clSetKernelArg(kernel,0,sizeof(cl_mem),&a_buffer);
	status=clSetKernelArg(kernel,1,sizeof(cl_mem),&b_buffer);
	status=clSetKernelArg(kernel,2,sizeof(cl_mem),&c_buffer);
	status=clEnqueueWriteBuffer(queue,a_buffer,CL_TRUE,0,sizeof(float)*4*4,A,0,NULL,NULL);
	printf("have written a\n");
	status=clEnqueueWriteBuffer(queue,b_buffer,CL_TRUE,0,sizeof(float)*4*4,B,0,NULL,NULL);
	printf("have written b\n");
	size_t global_item_size = 4;
	size_t local_item_size = 1;

	cl_event usr_callback_event=clCreateUserEvent(context,&status);
	cl_event prof_event;
	status = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global_item_size, &local_item_size, 1, &usr_callback_event, &prof_event);
	int ID=0;
	clSetEventCallback(prof_event, CL_COMPLETE, &eventCallBack, (void*)ID);
	status=clSetUserEventStatus(usr_callback_event,CL_COMPLETE);

	status = clEnqueueReadBuffer(queue, c_buffer, CL_TRUE, 0, 4*4*sizeof(float), C, 0, NULL, NULL);
	for (i=0; i<4; i++) {
		for (j=0; j<4; j++) {
			printf("%7.2f ", C[i*4+j]);
		}
		printf("\n");
	}
	clReleaseCommandQueue(queue);
	clReleaseKernel(kernel);
	clReleaseProgram(program);
	clReleaseMemObject(a_buffer);
	clReleaseMemObject(b_buffer);
	clReleaseMemObject(c_buffer);
	clReleaseContext(context);
	clReleaseEvent(usr_callback_event);
	clReleaseEvent(prof_event);

	return 0;
}结果： 
   
  

 但有个问题：之前我写过一个event的工程 先试卡在事件那里 后来我把seteventstatus放在执行commandqueue前面 就可以了 但这个程序中是放在后面的 为什么不卡呢？？ 
  status=clSetUserEventStatus(usr_callback_event,CL_COMPLETE);这句明明是在后面 为什么没有像我之前那样卡住  一直等待CL_COMPLETE  我的之前的放在后面就一直等待？？？？ 
   
  《OpenCL编程指南》我跨过了与3D渲染有关的：OpenCL、Direct3D的交互以及嵌入式。这些我暂时用不到。 
  《OpenCL编程指南》第二部分 案例研究 
  一、图像直方图计算 
  我开始自己写了个： 
   
  #include 
#include 
#include 
#include 
#include 
#include 
#include 
#include "FreeImage.h"
#include "gFreeImage.h"

cl_mem LoadImage(cl_context context, char *fileName, int &width, int &height)
{
    FREE_IMAGE_FORMAT format = FreeImage_GetFileType(fileName, 0);
    FIBITMAP* image = FreeImage_Load(format, fileName);
    // Convert to 32-bit image
    FIBITMAP* temp = image;
    image = FreeImage_ConvertTo32Bits(image);
    FreeImage_Unload(temp);
    width = FreeImage_GetWidth(image);
    height = FreeImage_GetHeight(image);
    char *buffer = new char[width * height * 4];
    memcpy(buffer, FreeImage_GetBits(image), width * height * 4);
    FreeImage_Unload(image);
    // Create OpenCL image
    cl_image_format clImageFormat;
    clImageFormat.image_channel_order = CL_RGBA;
    clImageFormat.image_channel_data_type = CL_UNORM_INT8;
    cl_int errNum;
    cl_mem clImage;
    clImage = clCreateImage2D(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,&clImageFormat,width,height, 0,buffer,&errNum);
    if (errNum != CL_SUCCESS)
    {
        std::cerr << "Error creating CL image object" << std::endl;
        return 0;
    }
    delete [] buffer;
    return clImage;
}

int main(){
		cl_uint platformNum;
		cl_int status;
		status=clGetPlatformIDs(0,NULL,&platformNum);
		if(status!=CL_SUCCESS){
			printf("cannot get platforms number.\n");
			return -1;
		}
		cl_platform_id* platforms;
		platforms=(cl_platform_id*)alloca(sizeof(cl_platform_id)*platformNum);
		status=clGetPlatformIDs(platformNum,platforms,NULL);
		if(status!=CL_SUCCESS){
			printf("cannot get platforms addresses.\n");
			return -1;
		}
		cl_platform_id platformInUse=platforms[0];
		cl_device_id device;
		clGetDeviceIDs(platformInUse,CL_DEVICE_TYPE_GPU,1,&device,NULL);
		cl_context context=clCreateContext(NULL,1,&device,NULL,NULL,NULL);
		cl_command_queue queue=clCreateCommandQueue(context,device,CL_QUEUE_PROFILING_ENABLE, &status);

		//Create Input Image Object
		char file[]={"/home/jumper/OpenCL_projects/Book_ch14_Sample_histogram/lenna.jpg"};
		int imgwidth,imgheight;
		cl_mem image=LoadImage(context,file,imgwidth,imgheight);
		cl_sampler sampler=clCreateSampler(context,CL_FALSE, CL_ADDRESS_CLAMP_TO_EDGE,CL_FILTER_NEAREST,&status);
		cl_mem outmem=clCreateBuffer(context, CL_MEM_WRITE_ONLY,sizeof(int)*256*3,NULL,NULL);

		std::ifstream srcFile("/home/jumper/OpenCL_projects/Book_ch14_Sample_histogram/histogramcl.cl");
		std::string srcProg(std::istreambuf_iterator(srcFile),(std::istreambuf_iterator()));
		const char * src = srcProg.c_str();
		size_t length = srcProg.length();
		cl_program program=clCreateProgramWithSource(context,1,&src,&length,&status);
		status=clBuildProgram(program,1,&device,NULL,NULL,NULL);

		cl_kernel kernel=clCreateKernel(program,"calculateHistogram",NULL);
		status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &image);
		status |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &outmem);
		status |= clSetKernelArg(kernel, 2, sizeof(cl_sampler), &sampler);
		status |= clSetKernelArg(kernel, 3, sizeof(cl_int), &imgwidth);
		status |= clSetKernelArg(kernel, 4, sizeof(cl_int), &imgheight);

		size_t localWorkSize[2] = { 16, 16 };
		size_t globalWorkSize[2] =  {imgwidth,imgheight};
		clEnqueueNDRangeKernel(queue,kernel,2, NULL,globalWorkSize, localWorkSize,0, NULL, NULL);

		int* outHistogram=(int*)alloca(sizeof(int)*256*3);
		status=clEnqueueReadBuffer(queue,outmem,CL_TRUE,0,sizeof(int)*256*3,outHistogram,0,NULL,NULL);

		int j=0;
		for(int ind=0;ind<=(255*3);ind=ind+3){
			printf("R:%d  G:%d  B:%d",outHistogram[ind],outHistogram[ind+1],outHistogram[ind+2]);
			printf("    %d\n",j);
			j=j+1;
		}

		clReleaseKernel(kernel);
		clReleaseProgram(program);
		clReleaseCommandQueue(queue);
		clReleaseMemObject(image);
		clReleaseMemObject(outmem);
		clReleaseContext(context);

		return 0;

} 
   
  其中我的kernel是： 
   
  __kernel void calculateHistogram(__read_only image2d_t srcimg,__global const int *outmatrix,sample_t sampler,__global const int width,__global const int height){
	int2 srcposition=(int2)(get_global_id(0),get_global_id(1));
	if (srcposition.x < width && srcposition.y < height)
    {
        int4 positionValue = (int4)(0, 0, 0, 0);
        positionValue = read_imagef(srcImg, sampler, (int2)(x, y)) ;
        int RValue=position.x;
        int GValue=position.y;
        int BValue=position.z;
        outmatrix[RValue*2+RValue]=outmatrix[RValue*2+RValue]+1;
        outmatrix[GValue*3+GValue]=outmatrix[GValue*3+GValue]+1;
        outmatrix[BValue*4+BValue]=outmatrix[BValue*4+BValue]+1;
        
    }
           
} 
  
 但结果出来不对  又出现这种无意义的数： 
   
  
 
  对于这种无意义的数  应该是和前几次我出现无意义的数一样的问题  只是我没找到而已 ？？？？？？？ 
  还有我看了博客http://www.cnblogs.com/mikewolf2002/archive/2012/10/22/2734462.html  大神的计算直方图  出来好奇怪： 
   
  #include 
#include 
#include 
#include 
#include 
#include 
#include 
#include "gclFile.h"
#include "gFreeImage.h"

using namespace std;

//#pragma comment (lib,"OpenCL.lib")
//#pragma comment(lib,"freeimage.lib")

cl_int binSize;            //bins�Ĵ�С�����ڻҶ�ͼ��һ��Ϊ256
cl_int groupSize;       //workgroup��С
cl_int subHistgCnt;   //��ͼ��ֿ�����ֱ��ͼ������ٺϲ�������ͼ���ֱ��ͼ
cl_uint *data;            //ͼ������
cl_int width;              //ͼ����
cl_int height;            //ͼ��߶�
cl_uint *hostBin;       //cpu����õ�������ֱ��ͼ���
cl_uint *midDeviceBin;      //gpu�����ӿ��ֱ��ͼ�����Ҳ��kernel����Ľ�����ϲ���deviceBin�С�
cl_uint *deviceBin;         //�豸(gpu)���������ս��

cl_mem dataBuf;                 //ͼ��device memory
cl_mem midDeviceBinBuf;         //�ӿ�ֱ��ͼ���device memory���������������һ��������kernel������

//cpu��ֱ��ͼ
void cpu_histgo()
	{
	int i, j;
	for(i = 0; i < height; ++i)
		{
		for(j = 0; j < width; ++j)
			{
			//printf("data: %d\n", data[i * width + j] );
			hostBin[data[i * width + j]]++;
			//printf("hostbin %d=%d\n", data[i * width + j], hostBin[data[i * width + j]]);
			}
		}
	}

int waitForEventAndRelease(cl_event *event)
	{
	cl_int status = CL_SUCCESS;
	cl_int eventStatus = CL_QUEUED;
	while(eventStatus != CL_COMPLETE)
		{
		status = clGetEventInfo(
			*event, 
			CL_EVENT_COMMAND_EXECUTION_STATUS, 
			sizeof(cl_int),
			&eventStatus,
			NULL);
		}

	status = clReleaseEvent(*event);

	return 0;
	}

int main0(int argc, char* argv[])
{
	unsigned char *src_image=0;
	gFreeImage img;
	if(!img.LoadImageGrey("/home/jumper/OpenCL_projects/Book_ch14_Sample_histogram/sample_raw.png"))
		{
		printf("can not load lenna.jpg\n");
		exit(0);
		}
	src_image = img.getImageDataGrey(width,height);

	binSize = 256;
	groupSize = 128;
	subHistgCnt = (width *height)/(binSize * groupSize);

	//width��binSize����������height��groupsize��������
	width = (width / binSize ? width / binSize: 1) * binSize;
	height = (height / groupSize ? height / groupSize: 1) * groupSize;

	// ����ͼ������
	data = (cl_uint*)malloc(width * height * sizeof(cl_uint));
	if(!data)
		{
		printf("malloc error\n");
			return 0;
		}
	memset(data, 0,width * height * sizeof(cl_uint));
	 
	int i, j;
	//��ÿ������ֵ����һ��float���飬��Ҫ�Ǽ��㷽�㣬Ҳ������kernel��ֱ����uchar
	for(i = 0; i < width * height; i++)
		{
		data[i] =  (cl_uint)src_image[i]; 
		//printf("%d\n", data[i]);
		//printf("src= %d\n", src_image[i]);
		}

	hostBin = (cl_uint*)malloc(binSize * sizeof(cl_uint));
	if(!hostBin)
		{
		printf("malloc error\n");
			return 0;
		}

	memset(hostBin, 0, binSize * sizeof(cl_uint));

	midDeviceBin = (cl_uint*)malloc(binSize * subHistgCnt * sizeof(cl_uint));
	if(!midDeviceBin)
		{
		printf("malloc error\n");
			return 0;
		}

	memset(midDeviceBin, 0, binSize * subHistgCnt * sizeof(cl_uint));

	deviceBin = (cl_uint*)malloc(binSize * sizeof(cl_uint));
	if(!deviceBin)
		{
		printf("malloc error\n");
			return 0;
		}

	memset(deviceBin, 0, binSize * sizeof(cl_uint));

	//����driver dump il��isa�ļ�
	//_putenv("GPU_DUMP_DEVICE_KERNEL=3");

	cl_uint status;
	cl_platform_id platform;

	//����ƽ̨����
	//status = clGetPlatformIDs( 1, &platform, NULL );
	cl_uint numPlatforms;
	std::string platformVendor; 
	status = clGetPlatformIDs(0, NULL, &numPlatforms);
	if(status != CL_SUCCESS)
		{
		return 0;
		}
	if (0 < numPlatforms) 
		{
		cl_platform_id* platforms = new cl_platform_id[numPlatforms];
		status = clGetPlatformIDs(numPlatforms, platforms, NULL);

		char platformName[100];
		for (unsigned i = 0; i < numPlatforms; ++i) 
			{
			status = clGetPlatformInfo(platforms[i],
				CL_PLATFORM_VENDOR,
				sizeof(platformName),
				platformName,
				NULL);

			platform = platforms[i];
			platformVendor.assign(platformName);

			if (!strcmp(platformName, "Advanced Micro Devices, Inc.")) 
				{
				break;
				}
			}

		std::cout << "Platform found : " << platformName << "\n";
		delete[] platforms;
		}
	cl_device_id device;

	//����GPU�豸
	clGetDeviceIDs( platform, CL_DEVICE_TYPE_GPU,
		1,
		&device,
		NULL);
	//����context
	cl_context context = clCreateContext( NULL,
		1,
		&device,
		NULL, NULL, NULL);
	//�����������
	cl_command_queue queue = clCreateCommandQueue( context,
		device,
		CL_QUEUE_PROFILING_ENABLE, NULL );

	//����2��OpenCL�ڴ����
	dataBuf = clCreateBuffer(
		context, 
		CL_MEM_READ_ONLY,
		sizeof(cl_uint) * width  * height,
		NULL, 
		0);

	midDeviceBinBuf = clCreateBuffer(
		context, 
		CL_MEM_WRITE_ONLY,
		sizeof(cl_uint) * binSize * subHistgCnt,
		NULL, 
		0);

	//��ͼ������д��device memory��
	cl_event writeEvt;
	status = clEnqueueWriteBuffer(queue,
		dataBuf,
		CL_FALSE,
		0,
		width * height * sizeof(cl_uint),
		data,
		0,
		NULL,
		&writeEvt);

	status = clFlush(queue);
	waitForEventAndRelease(&writeEvt);

	//kernel�ļ�Ϊhisto.cl
	gclFile kernelFile;
	if(!kernelFile.open("/home/jumper/OpenCL_projects/Book_ch14_Sample_histogram/src/blog_histo.cl"))
		{
		printf("Failed to load kernel file \n");
		exit(0);
		}
	const char * source = kernelFile.source().c_str();
	size_t sourceSize[] = {strlen(source)};
	//�����������
	cl_program program = clCreateProgramWithSource(context, 1, &source,sourceSize,NULL);
	//����������
	status = clBuildProgram( program, 1, &device, NULL, NULL, NULL );
	if(status != 0)
		{
		printf("clBuild failed:%d\n", status);
		char tbuf[0x10000];
		clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0x10000, tbuf, NULL);
		printf("\n%s\n", tbuf);
		return -1;
		}

	//����Kernel����
	cl_kernel kernel = clCreateKernel( program, "histogram256", NULL );
	//����Kernel����
	cl_mem pos=clCreateBuffer( context,CL_MEM_WRITE_ONLY,groupSize * binSize * sizeof(cl_uchar),NULL,0);
	status = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&dataBuf); 
	status = clSetKernelArg(kernel, 1, groupSize * binSize * sizeof(cl_uchar), NULL); //local memroy size, lds for amd
	status = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&midDeviceBinBuf); 

	cl_event ev;
	size_t globalThreads;
	size_t localThreads ;

	globalThreads = (width * height) / binSize ;
	localThreads = groupSize;
	printf("global_work_size =%d, local_work_size=%d\n", globalThreads, localThreads);

	clEnqueueNDRangeKernel( queue,kernel,1,NULL,&globalThreads,&localThreads, 0, NULL, &ev);
	clFlush( queue );
	waitForEventAndRelease(&ev);

//	cl_ulong startTime, endTime;
//	clGetEventProfilingInfo(ev, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &startTime, NULL); //error accured when run this code sentence!???????
//	clGetEventProfilingInfo(ev,  CL_PROFILING_COMMAND_END,sizeof(cl_ulong), &endTime, NULL);
//	cl_ulong kernelExecTimeNs = endTime-startTime;
//	printf("kernal exec time :%8.6f ms\n ", kernelExecTimeNs*1e-6 );

	cl_event readEvt;
	status = clEnqueueReadBuffer(queue, midDeviceBinBuf, CL_FALSE,0,subHistgCnt * binSize * sizeof(cl_uint),midDeviceBin,0,NULL,&readEvt);
	clWaitForEvents(1, &readEvt);

	for(i = 0; i < subHistgCnt; ++i)
	{
		for( j = 0; j < binSize; ++j)
			{
			deviceBin[j] += midDeviceBin[i * binSize + j];
			}
	}
	cpu_histgo();

	bool result = true;
	for(i = 0; i < binSize; ++i)
	{
		printf("host%d=%d  device%d=%d\n",i,hostBin[i], i, deviceBin[i]);
//		if(hostBin[i] != deviceBin[i])
//			{
//			printf("host%d:%d, device%d:%d\n",i, hostBin[i], i, deviceBin[i]);
//			result = false;
//			break;
//			}
	}

	if(result)
	{
		printf("passed\n");
	}
	else
	{
		printf("failed\n");
	}

	free(data);
	free(hostBin);
	free(midDeviceBin);
	free(deviceBin);
	//ɾ��OpenCL��Դ����
	clReleaseProgram(program);
	clReleaseCommandQueue(queue);
	clReleaseContext(context);
	return 0;
}他的kernel： 
   
   
  #define LINEAR_MEM_ACCESS
#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable 

#define BIN_SIZE 256

__kernel void histogram256(__global const uint* data,__local uchar* sharedArray,__global uint* binResult)
{
    size_t localId = get_local_id(0);
    size_t globalId = get_global_id(0);
    size_t groupId = get_group_id(0);
    size_t groupSize = get_local_size(0);

    for(int i = 0; i < BIN_SIZE; ++i)
        sharedArray[localId * BIN_SIZE + i] = 0;

    barrier(CLK_LOCAL_MEM_FENCE);
    
    for(int i = 0; i < BIN_SIZE; ++i)
    {
#ifdef LINEAR_MEM_ACCESS
        uint value = data[groupId * groupSize * BIN_SIZE + i * groupSize + localId];
#else
        uint value = data[globalId * BIN_SIZE + i];
#endif // LINEAR_MEM_ACCESS
        sharedArray[localId * BIN_SIZE + value]++;
    }
    
    barrier(CLK_LOCAL_MEM_FENCE); 
    
    for(int i = 0; i < BIN_SIZE / groupSize; ++i)
    {
        uint binCount = 0;
        for(int j = 0; j < groupSize; ++j)
            binCount += sharedArray[j * BIN_SIZE + i * groupSize + localId];
            
        binResult[groupId * BIN_SIZE + i * groupSize + localId] = binCount;
    }
}出来的结果是： 
   
  

 看一会儿正确一会儿不正确！？？？！！！不管是用终端还是用eclipse 我没改动过 同样的工程和图片   我和同事看了下发现第二个参数他传的NULL进来，同事说可能这里的问题导致的？： 
   
  status = clSetKernelArg(kernel, 1, groupSize * binSize * sizeof(cl_uchar), NULL);这里传了NULL 而kernel里对第2个参数进行了寻址  所以导致了上面忽对忽错的结果？？？？ 
   
  
 发现 https://chenxiaowei.gitbooks.io/heterogeneous-computing-with-opencl2-0/content/content/chapter1/1.1-chinese.html   介绍同步、事件机制很好  比之前懂一些了。

OpenCL的学习

你可能感兴趣的:(OpenCL)