openCL在GPU与CPU设备上的区别--计算E值

最近老师布置了个作业,让考虑计算E的不同并行算法和不同语言的实现,于是考虑到了最近openCL的实现方式。之前也考虑了openMP及多线程的实现,这里只讨论openCL的实现及发现的问题,之后整理一下自己的文章,把所有的并行语言的计算E值的实现贴出来。

一、首先介绍两种计算E值并行结构

      openCL在GPU与CPU设备上的区别--计算E值_第1张图片

openCL在GPU与CPU设备上的区别--计算E值_第2张图片

openCL在GPU与CPU设备上的区别--计算E值_第3张图片

二、代码实现

  1.多项式分段kernel

   

/*
 * CaluE: per-work-item partial sum of  e - 1 = sum_{i=1..StepNum} 1/i!
 * with the terms dealt out round-robin.
 *
 * Work-item `id` sums the terms i = id+1, id+1+MaxItem, id+1+2*MaxItem, ...
 * up to and including StepNum, and stores the partial sum in result[id].
 *
 *   result  - output array, one float per work-item
 *   StepNum - index of the last series term
 *   MaxItem - stride between two consecutive terms handled by one work-item
 *
 * Only single precision is used (note the `f` suffixes): an unsuffixed
 * literal such as 1.0 is a double, which needs fp64 support on the device.
 */
__kernel
void CaluE(__global float* result,
           int StepNum,
           int MaxItem
 )
{
    const int id = get_global_id(0);
    float fact = 1.0f;   // running factorial; equals i! after each inner loop
    float e = 0.0f;
    for(int i = id+1; i <= StepNum; i += MaxItem)
    {
        // Extend the factorial of the previous term to i! by multiplying in
        // the factors i, i-1, ... between them; `j<i` stops the product at 1
        // for the very first term.
        for(int j = 0; j < MaxItem && j < i; j++)
        {
            fact *= (float)(i-j);
        }
        // Once the factorial overflows to +inf it stays there, so this and
        // every later term would add exactly 0; stop instead of spinning.
        if(isinf(fact))
        {
            break;
        }
        e += (1.0f/fact);
    }
    result[id] = e;
}

 2.提取公因式kernel

/*
 * CaluE_2: "factor out the common factorial" variant.
 *
 * The terms 1..StepNum are cut into MaxItem contiguous chunks of
 * StepNum/MaxItem terms; the last work-item (id == MaxItem-1) additionally
 * takes the StepNum % MaxItem leftover terms. For its chunk [first, last]
 * work-item `id` writes two floats:
 *
 *   result[2*id]   = sum_{i=first..last} 1 / (first * (first+1) * ... * i)
 *   result[2*id+1] = first * (first+1) * ... * last
 *
 * so neither value contains the factorial of the terms before the chunk.
 *
 *   result  - output array, two floats per work-item
 *   StepNum - index of the last series term
 *   MaxItem - number of chunks
 *
 * Only single precision is used (note the `f` suffixes): an unsuffixed
 * literal such as 1.0 is a double, which needs fp64 support on the device.
 */
__kernel
void CaluE_2(__global float* result,
           int StepNum,
           int MaxItem
 )
{
    const int id = get_global_id(0);
    const int chunk = StepNum / MaxItem;

    // Chunk bounds are kept as integers: a float cannot represent every
    // index above 2^24 exactly.
    const int first = id*chunk + 1;
    const int last  = (id == MaxItem-1) ? StepNum : (id+1)*chunk;

    float res = 0.0f;
    float fact = 1.0f;   // product first * ... * i of the chunk so far

    for(int i = first; i <= last; i++)
    {
        fact *= (float)i;
        // After overflow to +inf the product stays +inf and every remaining
        // term is exactly 0, so the outputs are already final.
        if(isinf(fact))
        {
            break;
        }
        res += (1.0f/fact);
    }

    // Hand both values back to the host
    result[id*2] = res;
    result[id*2+1] = fact;
}


 3.主机端程序

/*
   项目:openCL的计算E
   作者:刘荣
   时间:2012.11.20

   在本次运行中,采用了两种计算E的方法,
      1.采用分多项式
	  2.提取多项式
*/
#include <math.h>
#include <time.h>

#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>

#include <CL/cl.h>
using namespace std;
// Reads the whole file `filename` (the OpenCL kernel source) into a string.
//
// filename: path of the file to load.
// Returns the file contents; an empty string if the file is empty or its
// size cannot be determined.
// If the file cannot be opened, prints a message and terminates the process
// with exit code 1.
std::string
convertToString(const char *filename)
{
    std::fstream f;
    if(filename != NULL)
    {
        f.open(filename, (std::fstream::in | std::fstream::binary));
    }

    if(!f.is_open())
    {
        std::cout << "\nFile containg the kernel code(\".cl\") not found. Please copy the required file in the folder containg the executable.\n";
        exit(1);
    }

    // Measure the file by seeking to its end, then rewind for the read.
    f.seekg(0, std::fstream::end);
    const std::streamoff length = f.tellg();
    f.seekg(0, std::fstream::beg);

    // tellg() reports -1 on failure; treat that like an empty file rather
    // than turning it into a huge allocation.
    if(length <= 0)
    {
        return std::string();
    }

    // Read straight into the string's own storage; no manual new[]/delete[].
    std::string text(static_cast<size_t>(length), '\0');
    f.read(&text[0], length);
    text.resize(static_cast<size_t>(f.gcount()));   // keep only what was actually read
    return text;
}

double method1(int MaxItem, int StepNum, int dev_id=0)
{
	//int MaxItem=10;
   //int StepNum = 1000;
	//
	double start,end,time1,time2;
	//查询平台
	cl_int ciErrNum;
	cl_platform_id platform;
	ciErrNum = clGetPlatformIDs(1, &platform, NULL);
	if(ciErrNum != CL_SUCCESS)
	{
		cout<<"获取设备失败"<<endl;
		return 0;
	}
	//获取设备信息
	cl_device_id device;
	cl_int   status;
    cl_uint maxDims;
    cl_event events[3];
    size_t globalThreads[1];
    size_t localThreads[1];
    size_t maxWorkGroupSize;
    size_t maxWorkItemSizes[3];
  //创建设备
	if(dev_id==0)
	{
		ciErrNum = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 1, &device, NULL);
	}
	else
	{
        ciErrNum = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
	}
	
	//创建上下文
	cl_context_properties cps[3] = {CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0};
	cl_context ctx = clCreateContext(cps, 1, &device, NULL, NULL, &ciErrNum);
	if(ciErrNum != CL_SUCCESS)
	{
		cout<<"创建上下文失败"<<endl;
		return 0;
	}
	cl_command_queue myqueue = clCreateCommandQueue(ctx,device,0,&ciErrNum);
	if(ciErrNum != CL_SUCCESS)
	{
		cout<<"命令队列失败"<<endl;
		return 0;
	}
	//声明buffer,传输数据
	float *C = NULL; // 输出数组
	
	size_t datasize = sizeof(float)*MaxItem;

	// 分配内存空间
	C = (float*)malloc(datasize);

	// 初始化输入数组
	cl_mem bufferC = clCreateBuffer(ctx,CL_MEM_WRITE_ONLY,datasize*sizeof(float),NULL,&ciErrNum);

	//运行时kernel编译
	const char * filename  = "CaluE.cl";
    std::string  sourceStr = convertToString(filename);
    const char * source    = sourceStr.c_str();
    size_t sourceSize[]    = { strlen(source) };
	//直接将CL文件读到记忆体
    cl_program myprog = clCreateProgramWithSource(
                  ctx, 
                  1, 
                  &source,
                  sourceSize,
                  &ciErrNum);
	//cl_program myprog = clCreateProgramWithSource(ctx,1,(const char**)&programSource,NULL,&ciErrNum);
	if(ciErrNum != 0)
	{
		cout<<"createprogram failed"<<endl;
	}
	ciErrNum = clBuildProgram(myprog,0,NULL,NULL,NULL,NULL);
	if(ciErrNum != 0)
	{
		cout<<"clBuildProgram failed"<<endl;
	}

	cl_kernel mykernel = clCreateKernel(myprog,"CaluE",&ciErrNum);
	if(ciErrNum != 0)
	{
		cout<<"clCreateKernel failed"<<endl;
	}
	//运行程序,设置参数
	clSetKernelArg(mykernel,0,sizeof(cl_mem),(void*)&bufferC);
	clSetKernelArg(mykernel,1,sizeof(int),&StepNum);
	clSetKernelArg(mykernel,2,sizeof(int),&MaxItem);
    
	size_t globalWorkSize[1];
	globalWorkSize[0] = MaxItem;
	//
	start = clock();
	ciErrNum = clEnqueueNDRangeKernel(myqueue,mykernel,1,NULL,globalWorkSize,NULL,0,NULL,&events[0]);
	if(ciErrNum != 0)
	{
		cout<<"clEnqueueNDRangeKernel failed"<<endl;
	}
	//时间同步
	status = clWaitForEvents(1, &events[0]);
    if(status != CL_SUCCESS) 
    { 
        std::cout <<
            "Error: Waiting for kernel run to finish. \
            (clWaitForEvents0)\n";
        return 0;
    }
   status = clReleaseEvent(events[0]);
	//将结果拷贝到主机端
	end = clock();
	time1=end-start;
	cout<<"method1 time: "<<time1<<endl;
	ciErrNum = clEnqueueReadBuffer(myqueue,bufferC,CL_TRUE,0,datasize,C,0,NULL,&events[1]);

	status = clWaitForEvents(1, &events[1]);
    if(status != CL_SUCCESS) 
    { 
        std::cout <<
            "Error: Waiting for read buffer call to finish. \
            (clWaitForEvents1)n";
        return 0;
    }
    status = clReleaseEvent(events[1]);
    if(status != CL_SUCCESS) 
    { 
        std::cout <<
            "Error: Release event object. \
            (clReleaseEvent)\n";
        return 0;
    }
	double e=0;
	//
	for(int i=0; i<MaxItem; i++)
	{ 
		e += C[i];
	}
	printf("method1 e = %1.22f  \n",e+1);
	return time1;
}
double method2(int MaxItem, int StepNum,int dev_id=0)
{
	//
	double start,end,time1,time2;
	//查询平台
	cl_int ciErrNum;
	cl_platform_id platform;
	ciErrNum = clGetPlatformIDs(1, &platform, NULL);
	if(ciErrNum != CL_SUCCESS)
	{
		cout<<"获取设备失败"<<endl;
		return 0;
	}
	//获取设备信息
	cl_device_id device;
	cl_int   status;
    cl_uint maxDims;
    cl_event events[3];
    size_t globalThreads[1];
    size_t localThreads[1];
    size_t maxWorkGroupSize;
    size_t maxWorkItemSizes[3];
  //创建设备
	if(dev_id==0)
	{
		ciErrNum = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 1, &device, NULL);
	}
	else
	{
        ciErrNum = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
	}
  
	
	//创建上下文
	cl_context_properties cps[3] = {CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0};
	cl_context ctx = clCreateContext(cps, 1, &device, NULL, NULL, &ciErrNum);
	if(ciErrNum != CL_SUCCESS)
	{
		cout<<"创建上下文失败"<<endl;
		return 0;
	}
	cl_command_queue myqueue = clCreateCommandQueue(ctx,device,0,&ciErrNum);
	if(ciErrNum != CL_SUCCESS)
	{
		cout<<"命令队列失败"<<endl;
		return 0;
	}
	//声明buffer,传输数据
	float *C = NULL; // 输出数组
	//int MaxItem=100;
//	int StepNum = 1000000000;
	size_t datasize = sizeof(float)*MaxItem*2;

	// 分配内存空间
	C = (float*)malloc(datasize);

	// 初始化输入数组
	cl_mem bufferC = clCreateBuffer(ctx,CL_MEM_WRITE_ONLY,datasize,NULL,&ciErrNum);

	//运行时kernel编译
	const char * filename  = "CaluE_2.cl";
    std::string  sourceStr = convertToString(filename);
    const char * source    = sourceStr.c_str();
    size_t sourceSize[]    = { strlen(source) };
	//直接将CL文件读到记忆体
    cl_program myprog = clCreateProgramWithSource(
                  ctx, 
                  1, 
                  &source,
                  sourceSize,
                  &ciErrNum);
	//cl_program myprog = clCreateProgramWithSource(ctx,1,(const char**)&programSource,NULL,&ciErrNum);
	if(ciErrNum != 0)
	{
		cout<<"createprogram failed"<<endl;
	}
	ciErrNum = clBuildProgram(myprog,0,NULL,NULL,NULL,NULL);
	if(ciErrNum != 0)
	{
		cout<<"clBuildProgram failed"<<endl;
	}

	cl_kernel mykernel = clCreateKernel(myprog,"CaluE_2",&ciErrNum);
	if(ciErrNum != 0)
	{
		cout<<"clCreateKernel failed"<<endl;
	}
	//运行程序,设置参数
	clSetKernelArg(mykernel,0,sizeof(cl_mem),(void*)&bufferC);
	clSetKernelArg(mykernel,1,sizeof(int),&StepNum);
	clSetKernelArg(mykernel,2,sizeof(int),&MaxItem);
    
	size_t globalWorkSize[1];
	globalWorkSize[0] = MaxItem;
	//
	start = clock();
	ciErrNum = clEnqueueNDRangeKernel(myqueue,mykernel,1,NULL,globalWorkSize,NULL,0,NULL,&events[0]);
	if(ciErrNum != 0)
	{
		cout<<"clEnqueueNDRangeKernel failed"<<endl;
	}
	//时间同步
	status = clWaitForEvents(1, &events[0]);
    if(status != CL_SUCCESS) 
    { 
        std::cout <<
            "Error: Waiting for kernel run to finish. \
            (clWaitForEvents0)\n";
        return 0;
    }
    status = clReleaseEvent(events[0]);
	//将结果拷贝到主机端
	end = clock();
	time1=end-start;
	cout<<"method2 时间: "<<time1<<endl;
	ciErrNum = clEnqueueReadBuffer(myqueue,bufferC,CL_TRUE,0,datasize,C,0,NULL,&events[1]);

	status = clWaitForEvents(1, &events[1]);
    if(status != CL_SUCCESS) 
    { 
        std::cout <<
            "Error: Waiting for read buffer call to finish. \
            (clWaitForEvents1)n";
        return 0;
    }
    status = clReleaseEvent(events[1]);
    if(status != CL_SUCCESS) 
    { 
        std::cout <<
            "Error: Release event object. \
            (clReleaseEvent)\n";
        return 0;
    }
	double e=0;
	double result = 0;
	double temp = 1;
	//
	for(int i=0; i<MaxItem; i++)
	{ 
		result = C[i*2];
		e += (1/temp)*result;
		temp = C[i*2+1];
	}
	printf("method2 e = %1.18f \n",e+1);
	return time1;

}
/**
 * Single-threaded reference: computes e = sum_{i=0..StepNum} 1/i! in double
 * precision, prints the value and the elapsed clock() ticks.
 *
 * The last term is 1/StepNum!, the same upper bound the OpenCL kernels use,
 * so the serial and parallel results are comparable.
 *
 * @param StepNum  index of the last series term; values <= 0 yield e = 1
 * @return clock() ticks spent in the summation loop
 */
double serial(int StepNum)
{
	double fact = 1;
	double e = 1;   // the i = 0 term, 1/0!
	const clock_t start = clock();
	// k runs over 0..StepNum-1 and contributes the term for i = k+1, which
	// keeps the loop counter from overflowing when StepNum is INT_MAX.
	for(int k=0; k < StepNum; k++)
	{
		fact *= (k + 1.0);
		e += (1.0/fact);
	}
	const clock_t end = clock();
	// Keep the tick count in a wide type; clock_t may be wider than int.
	const long elapsed = static_cast<long>(end - start);
	printf("串行结果:%1.12f \n",e);
	printf("串行时间: %ld \n",elapsed);
	return elapsed;
}
// Test driver: reads the device choice, MaxItem and StepNum from stdin, runs
// the serial reference and both OpenCL methods, and prints the speedup of
// each method over the serial run.
// Returns 0 on success, 1 if the input is not usable.
int main()
{
	int MaxItem = 100;
	int StepNum = 1000000;
	int dev_id = 0;
	cout<<"选择kernel设备(0-cpu;1-gpu)"<<endl;// results differ depending on which device runs the kernel
	cin>>dev_id;
	cout<<"输入MaxItem(1-128)"<<endl;
	cin>>MaxItem;
	cout<<"输入StepNum"<<endl;
	cin>>StepNum;

	// MaxItem is used as a divisor and as the work size, so it must be
	// positive; a failed read leaves the stream in a failed state.
	if(!cin || MaxItem < 1 || StepNum < 1)
	{
		cout<<"Invalid input: MaxItem and StepNum must be positive integers"<<endl;
		return 1;
	}

	const double time1 = serial(StepNum);
	const double time2 = method1(MaxItem, StepNum,dev_id);
	const double time3 = method2(MaxItem, StepNum, dev_id);

	// Both methods return 0 on failure, and a very short run can also take 0
	// ticks; a ratio is only meaningful for a positive denominator.
	if(time2 > 0)
	{
		printf("方法1与串行加速比:%f \n",time1/time2);
	}
	else
	{
		printf("method1 speedup unavailable (failed or finished in under one clock tick)\n");
	}
	if(time3 > 0)
	{
		printf("方法2与串行加速比:%f \n",time1/time3);
	}
	else
	{
		printf("method2 speedup unavailable (failed or finished in under one clock tick)\n");
	}

	return 0;
}


三、测试结果

      本人cpu为amd两核两线程,显卡为amd hd 4000

     1.基于cpu

      在测试中

  

      (  3.2对应方法2,3.3对应方法1)

     2.基于gpu

      在本机上测试,当步数变大时,电脑就死机了。只测试了步数较小的情况,加速不明显。

四、总结

     由于临近考试,时间有限,没有做进一步的分析,希望大牛给予解释。另外,在GPU上运行的kernel中不能使用double,很是郁闷,求解决。

你可能感兴趣的:(openCL在GPU与CPU设备上的区别--计算E值)