最近老师布置了个作业,让我们考虑计算E的不同并行算法和不同语言的实现,于是想到了openCL的实现方式,之前也考虑了openMP及多线程的实现。这里只讨论openCL的实现及发现的问题,之后再整理一下自己的文章,把所有并行语言计算E值的实现贴出来。
一、首先介绍两种计算E值并行结构
二、代码实现
1.多项式分段kernel
/*
 * CaluE -- strided partial sums of the series e = 1 + sum_{i>=1} 1/i!.
 *
 * Work-item `id` adds the terms i = id+1, id+1+MaxItem, id+1+2*MaxItem, ...
 * up to and including StepNum, and stores its partial sum in result[id].
 * The leading 1 of the series is NOT included; the caller adds it.
 *
 * result   one float per work-item (at least get_global_size(0) elements)
 * StepNum  index of the last series term to include
 * MaxItem  stride between consecutive terms handled by one work-item;
 *          expected to equal the global work size
 */
__kernel void CaluE(__global float* result, int StepNum, int MaxItem)
{
    const int id = get_global_id(0);

    /* A non-positive stride would make the loop below never advance. */
    if (MaxItem <= 0) {
        result[id] = 0.0f;
        return;
    }

    float fact = 1.0f;   /* running factorial: equals i! after the inner loop */
    float e = 0.0f;

    for (int i = id + 1; i <= StepNum; i += MaxItem) {
        /* Advance the factorial from (i-MaxItem)! to i! by multiplying in
         * the (at most MaxItem) factors i, i-1, ..., i-MaxItem+1. */
        for (int j = 0; j < MaxItem && j < i; j++) {
            fact *= (float)(i - j);
        }

        /* Once the factorial overflows to +inf every remaining term is
         * 1/inf == 0, so stopping here yields the same sum while keeping
         * the run time bounded for very large StepNum. */
        if (isinf(fact)) {
            break;
        }

        /* Single-precision literal: a bare 1.0 is a double constant and
         * requires fp64 support on the device. */
        e += 1.0f / fact;
    }

    result[id] = e;
}
2.提取公因式kernel
/*
 * CaluE_2 -- contiguous-chunk partial sums of e = 1 + sum_{i>=1} 1/i!
 * with the common factor pulled out.
 *
 * Work-item `id` handles the terms start..end-1 where
 *     start = id*(StepNum/MaxItem) + 1,   end = (id+1)*(StepNum/MaxItem) + 1
 * and computes, with the factorial restarted from 1 at `start`:
 *     result[2*id]   = sum_{i=start}^{end-1} 1 / (start*(start+1)*...*i)
 *     result[2*id+1] = start*(start+1)*...*(end-1)
 * The caller must divide result[2*id] by (start-1)!, i.e. by the product of
 * result[2*k+1] over all k < id, before adding it to the total.
 *
 * result   two floats per work-item
 * StepNum  index of the last series term to include
 * MaxItem  number of chunks; expected to equal the global work size
 */
__kernel void CaluE_2(__global float* result, int StepNum, int MaxItem)
{
    const int id = get_global_id(0);

    /* Avoid dividing by zero below; an empty chunk contributes sum 0 and
     * product 1. */
    if (MaxItem <= 0) {
        result[id * 2] = 0.0f;
        result[id * 2 + 1] = 1.0f;
        return;
    }

    const int chunk = StepNum / MaxItem;

    /* Chunk bounds are kept as integers: a float cannot represent every
     * integer above 2^24, which would blur the bounds for large StepNum. */
    const int start = id * chunk + 1;
    int end = (id + 1) * chunk + 1;

    /* The last chunk also takes the StepNum % MaxItem leftover terms so the
     * whole range 1..StepNum is covered. */
    if (id == MaxItem - 1) {
        end = StepNum + 1;
    }

    float res = 0.0f;
    float fact = 1.0f;
    for (int i = start; i < end; i++) {
        fact *= (float)i;

        /* After overflow to +inf every further term is 0 and the product
         * stays +inf, so the outputs would not change any more. */
        if (isinf(fact)) {
            break;
        }

        /* Single-precision literal: a bare 1.0 is a double constant and
         * requires fp64 support on the device. */
        res += 1.0f / fact;
    }

    /* Hand both the local sum and the local product back to the host. */
    result[id * 2] = res;
    result[id * 2 + 1] = fact;
}
3.主机端程序
/* 项目:openCL的计算E 作者:刘荣 时间:2012.11.20 在本次运行中,采用了两种计算E的方法, 1.采用分多项式 2.提取多项式 */ #include <iostream> #include<time.h> #include <string> #include<math.h> #include <vector> #include <CL/cl.h> #include <fstream> using namespace std; //kernel函数 std::string convertToString(const char *filename)//将kernel源码,即自己写的并行化的函数,转化成字符串 { size_t size; char* str; std::string s; std::fstream f(filename, (std::fstream::in | std::fstream::binary)); if(f.is_open()) { size_t fileSize; f.seekg(0, std::fstream::end); size = fileSize = (size_t)f.tellg(); f.seekg(0, std::fstream::beg); str = new char[size+1]; if(!str) { f.close(); std::cout << "Memory allocation failed"; return NULL; } f.read(str, fileSize); f.close(); str[size] = '\0'; s = str; delete[] str; return s; } else { std::cout << "\nFile containg the kernel code(\".cl\") not found. Please copy the required file in the folder containg the executable.\n"; exit(1); } return NULL; } double method1(int MaxItem, int StepNum, int dev_id=0) { //int MaxItem=10; //int StepNum = 1000; // double start,end,time1,time2; //查询平台 cl_int ciErrNum; cl_platform_id platform; ciErrNum = clGetPlatformIDs(1, &platform, NULL); if(ciErrNum != CL_SUCCESS) { cout<<"获取设备失败"<<endl; return 0; } //获取设备信息 cl_device_id device; cl_int status; cl_uint maxDims; cl_event events[3]; size_t globalThreads[1]; size_t localThreads[1]; size_t maxWorkGroupSize; size_t maxWorkItemSizes[3]; //创建设备 if(dev_id==0) { ciErrNum = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 1, &device, NULL); } else { ciErrNum = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL); } //创建上下文 cl_context_properties cps[3] = {CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0}; cl_context ctx = clCreateContext(cps, 1, &device, NULL, NULL, &ciErrNum); if(ciErrNum != CL_SUCCESS) { cout<<"创建上下文失败"<<endl; return 0; } cl_command_queue myqueue = clCreateCommandQueue(ctx,device,0,&ciErrNum); if(ciErrNum != CL_SUCCESS) { cout<<"命令队列失败"<<endl; return 0; } //声明buffer,传输数据 float *C = NULL; // 输出数组 
size_t datasize = sizeof(float)*MaxItem; // 分配内存空间 C = (float*)malloc(datasize); // 初始化输入数组 cl_mem bufferC = clCreateBuffer(ctx,CL_MEM_WRITE_ONLY,datasize*sizeof(float),NULL,&ciErrNum); //运行时kernel编译 const char * filename = "CaluE.cl"; std::string sourceStr = convertToString(filename); const char * source = sourceStr.c_str(); size_t sourceSize[] = { strlen(source) }; //直接将CL文件读到记忆体 cl_program myprog = clCreateProgramWithSource( ctx, 1, &source, sourceSize, &ciErrNum); //cl_program myprog = clCreateProgramWithSource(ctx,1,(const char**)&programSource,NULL,&ciErrNum); if(ciErrNum != 0) { cout<<"createprogram failed"<<endl; } ciErrNum = clBuildProgram(myprog,0,NULL,NULL,NULL,NULL); if(ciErrNum != 0) { cout<<"clBuildProgram failed"<<endl; } cl_kernel mykernel = clCreateKernel(myprog,"CaluE",&ciErrNum); if(ciErrNum != 0) { cout<<"clCreateKernel failed"<<endl; } //运行程序,设置参数 clSetKernelArg(mykernel,0,sizeof(cl_mem),(void*)&bufferC); clSetKernelArg(mykernel,1,sizeof(int),&StepNum); clSetKernelArg(mykernel,2,sizeof(int),&MaxItem); size_t globalWorkSize[1]; globalWorkSize[0] = MaxItem; // start = clock(); ciErrNum = clEnqueueNDRangeKernel(myqueue,mykernel,1,NULL,globalWorkSize,NULL,0,NULL,&events[0]); if(ciErrNum != 0) { cout<<"clEnqueueNDRangeKernel failed"<<endl; } //时间同步 status = clWaitForEvents(1, &events[0]); if(status != CL_SUCCESS) { std::cout << "Error: Waiting for kernel run to finish. \ (clWaitForEvents0)\n"; return 0; } status = clReleaseEvent(events[0]); //将结果拷贝到主机端 end = clock(); time1=end-start; cout<<"method1 time: "<<time1<<endl; ciErrNum = clEnqueueReadBuffer(myqueue,bufferC,CL_TRUE,0,datasize,C,0,NULL,&events[1]); status = clWaitForEvents(1, &events[1]); if(status != CL_SUCCESS) { std::cout << "Error: Waiting for read buffer call to finish. \ (clWaitForEvents1)n"; return 0; } status = clReleaseEvent(events[1]); if(status != CL_SUCCESS) { std::cout << "Error: Release event object. 
\ (clReleaseEvent)\n"; return 0; } double e=0; // for(int i=0; i<MaxItem; i++) { e += C[i]; } printf("method1 e = %1.22f \n",e+1); return time1; } double method2(int MaxItem, int StepNum,int dev_id=0) { // double start,end,time1,time2; //查询平台 cl_int ciErrNum; cl_platform_id platform; ciErrNum = clGetPlatformIDs(1, &platform, NULL); if(ciErrNum != CL_SUCCESS) { cout<<"获取设备失败"<<endl; return 0; } //获取设备信息 cl_device_id device; cl_int status; cl_uint maxDims; cl_event events[3]; size_t globalThreads[1]; size_t localThreads[1]; size_t maxWorkGroupSize; size_t maxWorkItemSizes[3]; //创建设备 if(dev_id==0) { ciErrNum = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 1, &device, NULL); } else { ciErrNum = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL); } //创建上下文 cl_context_properties cps[3] = {CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0}; cl_context ctx = clCreateContext(cps, 1, &device, NULL, NULL, &ciErrNum); if(ciErrNum != CL_SUCCESS) { cout<<"创建上下文失败"<<endl; return 0; } cl_command_queue myqueue = clCreateCommandQueue(ctx,device,0,&ciErrNum); if(ciErrNum != CL_SUCCESS) { cout<<"命令队列失败"<<endl; return 0; } //声明buffer,传输数据 float *C = NULL; // 输出数组 //int MaxItem=100; // int StepNum = 1000000000; size_t datasize = sizeof(float)*MaxItem*2; // 分配内存空间 C = (float*)malloc(datasize); // 初始化输入数组 cl_mem bufferC = clCreateBuffer(ctx,CL_MEM_WRITE_ONLY,datasize,NULL,&ciErrNum); //运行时kernel编译 const char * filename = "CaluE_2.cl"; std::string sourceStr = convertToString(filename); const char * source = sourceStr.c_str(); size_t sourceSize[] = { strlen(source) }; //直接将CL文件读到记忆体 cl_program myprog = clCreateProgramWithSource( ctx, 1, &source, sourceSize, &ciErrNum); //cl_program myprog = clCreateProgramWithSource(ctx,1,(const char**)&programSource,NULL,&ciErrNum); if(ciErrNum != 0) { cout<<"createprogram failed"<<endl; } ciErrNum = clBuildProgram(myprog,0,NULL,NULL,NULL,NULL); if(ciErrNum != 0) { cout<<"clBuildProgram failed"<<endl; } cl_kernel mykernel = 
clCreateKernel(myprog,"CaluE_2",&ciErrNum); if(ciErrNum != 0) { cout<<"clCreateKernel failed"<<endl; } //运行程序,设置参数 clSetKernelArg(mykernel,0,sizeof(cl_mem),(void*)&bufferC); clSetKernelArg(mykernel,1,sizeof(int),&StepNum); clSetKernelArg(mykernel,2,sizeof(int),&MaxItem); size_t globalWorkSize[1]; globalWorkSize[0] = MaxItem; // start = clock(); ciErrNum = clEnqueueNDRangeKernel(myqueue,mykernel,1,NULL,globalWorkSize,NULL,0,NULL,&events[0]); if(ciErrNum != 0) { cout<<"clEnqueueNDRangeKernel failed"<<endl; } //时间同步 status = clWaitForEvents(1, &events[0]); if(status != CL_SUCCESS) { std::cout << "Error: Waiting for kernel run to finish. \ (clWaitForEvents0)\n"; return 0; } status = clReleaseEvent(events[0]); //将结果拷贝到主机端 end = clock(); time1=end-start; cout<<"method2 时间: "<<time1<<endl; ciErrNum = clEnqueueReadBuffer(myqueue,bufferC,CL_TRUE,0,datasize,C,0,NULL,&events[1]); status = clWaitForEvents(1, &events[1]); if(status != CL_SUCCESS) { std::cout << "Error: Waiting for read buffer call to finish. \ (clWaitForEvents1)n"; return 0; } status = clReleaseEvent(events[1]); if(status != CL_SUCCESS) { std::cout << "Error: Release event object. 
\ (clReleaseEvent)\n"; return 0; } double e=0; double result = 0; double temp = 1; // for(int i=0; i<MaxItem; i++) { result = C[i*2]; e += (1/temp)*result; temp = C[i*2+1]; } printf("method2 e = %1.18f \n",e+1); return time1; } double serial(int StepNum) { int start,end,time; double fact = 1; double e = 1; start = clock(); for(int i=1; i < StepNum; i++) { fact *= i; e += (1.0/fact); } end = clock(); time = end - start; printf("串行结果:%1.12f \n",e); printf("串行时间: %d \n",time); return time; } //测试 int main() { double time1,time2,time3; int MaxItem=100; int StepNum = 1000000; int dev_id; cout<<"选择kernel设备(0-cpu;1-gpu)"<<endl;//kernel运行的设备不一样,结果是不一样的 cin>>dev_id; cout<<"输入MaxItem(1-128)"<<endl; cin>>MaxItem; cout<<"输入StepNum"<<endl; cin>>StepNum; time1 = serial(StepNum); time2 = method1(MaxItem, StepNum,dev_id); time3 = method2(MaxItem, StepNum, dev_id); printf("方法1与串行加速比:%f \n",time1/time2); printf("方法2与串行加速比:%f \n",time1/time3); return 0; }
三、测试结果
本人cpu amd两核两线程,显卡amd hd 4000
1.基于cpu
在测试中
( 3.2对应方法2,3.3对应方法1)
2.基于gpu
在本机上测试,当步数变大时,电脑就死机了。因此只测试了步数较小的情况,此时加速不明显
四、总结
由于临近考试,时间有限,没有做进一步的分析,希望大牛帮忙解释。另外,在GPU上运行的kernel中不能使用double,很是郁闷,求解决