This article explains how to set up an OpenCL development environment with an NVIDIA GPU and VS2019, and how to get started with OpenCL programming quickly.
Download CUDA from the NVIDIA website and double-click the installer to run it:
https://developer.nvidia.com/cuda-downloads?target_os=Windows&target_arch=x86_64&target_version=7&target_type=exelocal
Then locate the following resources in the installation path:
1. .\lib\x64\OpenCL.lib
2. The CL header files (the headers under the CL include directory)
3. OpenCL64.dll, found in the graphics card's default driver path
Create two directories, OpenCL_inc and OpenCL_lib, and copy the resources found in the previous step into them.
OpenCL_inc holds the CL header files; OpenCL_lib holds OpenCL.lib together with OpenCL.dll / OpenCL64.dll.
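For example, assuming F:\OPENCL\code as the project directory (the path used in the VS2019 settings below), the layout would be:

F:\OPENCL\code\
    OpenCL_inc\CL\     (cl.h, opencl.h, and the other CL headers)
    OpenCL_lib\        (OpenCL.lib, OpenCL.dll, OpenCL64.dll)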
VS2019 environment configuration:
1. Project -> (project name) Properties -> C/C++ -> General -> Additional Include Directories -> F:\OPENCL\code\OpenCL_inc
2. Project -> (project name) Properties -> Linker -> General -> Additional Library Directories -> F:\OPENCL\code\OpenCL_lib
3. Project -> (project name) Properties -> Linker -> Input -> Additional Dependencies -> OpenCL.lib
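To check that the include and library paths are picked up, a minimal sketch that only counts the installed platforms is enough; if it links and runs, the environment is ready:

#include <iostream>
#include "CL\opencl.h"

int main() {
    cl_uint num_platforms = 0;
    // First call asks only for the number of available platforms.
    cl_int err = clGetPlatformIDs(0, NULL, &num_platforms);
    if (err != CL_SUCCESS) {
        std::cout << "clGetPlatformIDs failed: " << err << std::endl;
        return err;
    }
    std::cout << "Found " << num_platforms << " OpenCL platform(s)." << std::endl;
    return 0;
}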
Next, instead of loading the vector_add_gpu.cl kernel from a file, we embed it directly in the host code as a string, and after the kernel is built (i.e., after clBuildProgram) read the PROGRAM SOURCE back with:
error = clGetProgramInfo(program, CL_PROGRAM_SOURCE, bufSize, programBuffer, &program_size_ret);
This shows that it works without a separate .cl file. Note, however, that when printing the source retrieved after the build, the buffer must be one byte longer than the source text: CL_PROGRAM_SOURCE is returned as a null-terminated string, and the reported size includes the trailing '\0'.
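Rather than guessing the buffer size with strlen(programSource) + 1, the more robust pattern is to let the runtime report the required size first. A sketch, assuming program has already been built with clBuildProgram:

size_t src_size = 0;
// First call reports the size of the stored source, including the trailing '\0'.
clGetProgramInfo(program, CL_PROGRAM_SOURCE, 0, NULL, &src_size);
char* src = (char*)malloc(src_size);
clGetProgramInfo(program, CL_PROGRAM_SOURCE, src_size, src, NULL);
printf("%s\n", src);
free(src);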
#include <iostream>
#include <string>
#include <cstdio>    // printf
#include <cstdlib>   // malloc, free, exit
#include <cstring>   // strlen
//#include <CL/cl.h>
#include "CL\opencl.h"
#include <windows.h> // GetTickCount64, ULONGLONG
using namespace std;
// Query the platform name: the first call gets the required buffer size,
// the second call fills the buffer.
string getPlatformName(const cl_platform_id pid) {
    size_t param_value_size;
    clGetPlatformInfo(pid, CL_PLATFORM_NAME, 0, NULL, &param_value_size);
    char* param_value = new char[param_value_size];
    clGetPlatformInfo(pid, CL_PLATFORM_NAME, param_value_size, param_value, NULL);
    string name(param_value);
    delete[] param_value; // avoid leaking the temporary buffer
    return name;
}
// Round numItems up to the next multiple of localWorkSize, so that the
// global work size divides evenly by the local work size.
size_t shrRoundUp(size_t localWorkSize, size_t numItems) {
    size_t result = localWorkSize;
    while (result < numItems)
        result += localWorkSize;
    return result;
}
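// An equivalent closed-form round-up (a sketch; assumes localWorkSize > 0):
//     return ((numItems + localWorkSize - 1) / localWorkSize) * localWorkSize;
// Either way, the global work size becomes an exact multiple of the local
// size, which OpenCL 1.x requires when an explicit local size is passed
// to clEnqueueNDRangeKernel.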
void vector_add_cpu(const float* const src_a,
                    const float* const src_b,
                    float* const res,
                    const int size)
{
    for (int i = 0; i < size; i++) {
        res[i] = src_a[i] + src_b[i];
    }
}
int main() {
    cl_uint num_platforms;
    cl_int error = 0;
    cl_context context;
    cl_command_queue queue;
    cl_device_id device;
    cl_platform_id platformNvidia;
    // Platform: first call gets the count, second call fills the array
    clGetPlatformIDs(0, NULL, &num_platforms);
    cl_platform_id* platforms = new cl_platform_id[num_platforms];
    clGetPlatformIDs(num_platforms, platforms, NULL);
    for (cl_uint i = 0; i < num_platforms; i++) {
        string platname = getPlatformName(platforms[i]);
        cout << "<" << i << "> " << "Platform name is: " << platname << endl;
    }
    // NOTE: index 1 happens to be the NVIDIA platform on this machine;
    // pick the index printed above for your NVIDIA platform instead.
    platformNvidia = platforms[1];
    string platname1 = getPlatformName(platformNvidia);
    cout << "Chosen platform <1>: " << platname1 << endl;
    // Device
    cl_uint num_devices;
    error = clGetDeviceIDs(platformNvidia, CL_DEVICE_TYPE_GPU, 1, &device, &num_devices);
    if (error != CL_SUCCESS) {
        cout << "Error getting device ids: " << error << endl;
        exit(error);
    }
    cout << "num of devices is: " << num_devices << endl;
    // Context
    context = clCreateContext(0, 1, &device, NULL, NULL, &error);
    if (error != CL_SUCCESS) {
        cout << "Error creating context: " << error << endl;
        exit(error);
    }
    // Command-queue
    queue = clCreateCommandQueue(context, device, 0, &error);
    if (error != CL_SUCCESS) {
        cout << "Error creating command queue: " << error << endl;
        exit(error);
    }
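    // Note: clCreateCommandQueue is deprecated as of OpenCL 2.0 (replaced by
    // clCreateCommandQueueWithProperties), but it remains available and is
    // fine for this example on NVIDIA's implementation.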
    /////// memory
    const int size = 123456;
    float* src_a_h = new float[size];
    float* src_b_h = new float[size];
    float* res_h = new float[size];
    //cl_mem_ion_host_ptr ion_src_a;
    // init vectors
    for (int i = 0; i < size; i++)
    {
        src_a_h[i] = src_b_h[i] = (float)i;
    }
    const int mem_size = sizeof(float) * size;
    // allocate device buffers
    cl_mem src_a_d = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, mem_size, src_a_h, &error);
    cl_mem src_b_d = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, mem_size, src_b_h, &error);
    cl_mem res_d = clCreateBuffer(context, CL_MEM_WRITE_ONLY, mem_size, NULL, &error);
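    // CL_MEM_COPY_HOST_PTR tells the runtime to copy the host arrays into the
    // device buffers when they are created, so no explicit clEnqueueWriteBuffer
    // is needed for src_a_d and src_b_d.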
    // create the program: the kernel is embedded as a C string instead of a .cl file
    const char* programSource =
        "__kernel void vector_add_gpu(__global const float* src_a,  \n"
        "                             __global const float* src_b,  \n"
        "                             __global float* res,          \n"
        "                             const int num)                \n"
        "{                                                          \n"
        "    /* get_global_id(0) returns the ID of the work-item    \n"
        "       currently executing. Many work-items start running  \n"
        "       the same kernel at the same time; each receives a   \n"
        "       different ID, so each necessarily performs a        \n"
        "       different part of the computation. */               \n"
        "    const int idx = get_global_id(0);                      \n"
        "                                                           \n"
        "    /* Each work-item checks whether its id lies within    \n"
        "       the bounds of the vectors. If it does, the          \n"
        "       work-item performs the corresponding addition. */   \n"
        "    if (idx < num)                                         \n"
        "        res[idx] = src_a[idx] + src_b[idx];                \n"
        "}                                                          \n";
    printf("programSource:\n%s\n", programSource);
    cl_program program = clCreateProgramWithSource(context, 1, &programSource, NULL, &error);
    if (error != CL_SUCCESS) {
        cout << "Error creating program: " << error << endl;
        exit(error);
    }
    // build the program for the chosen device
    error = clBuildProgram(program, 1, &device, NULL, NULL, NULL);
    if (error != CL_SUCCESS) {
        cout << "Error building program: " << error << endl;
        exit(error);
    }
    // check the kernel source stored in the program object
    size_t bufSize = strlen(programSource) + 1; // +1: CL_PROGRAM_SOURCE is returned null-terminated, and its size includes the trailing '\0'
    char* programBuffer = (char*)malloc(bufSize);
    size_t program_size_ret;
    error = clGetProgramInfo(program, CL_PROGRAM_SOURCE, bufSize, programBuffer, &program_size_ret);
    if (error != CL_SUCCESS)
    {
        cout << "Error clGetProgramInfo: " << error << endl;
    }
    printf("program_size_ret = %zu\n", program_size_ret);
    printf("bufSize = %zu\n", bufSize);
    printf("Program source read back:\n%s\n", programBuffer);
    // show the build log (useful when clBuildProgram fails)
    char* build_log;
    size_t log_size;
    // 1st: get the log size
    clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
    build_log = new char[log_size + 1];
    cout << "log_size = " << log_size << endl;
    // 2nd: get the log itself
    clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL);
    build_log[log_size] = '\0';
    cout << "build log: " << build_log << endl;
    delete[] build_log;
    // extracting the kernel
    cl_kernel vector_add_k = clCreateKernel(program, "vector_add_gpu", &error);
    if (error != CL_SUCCESS) {
        cout << "Error extracting the kernel: " << error << endl;
        exit(error);
    }
    // Setting the kernel arguments
    error = clSetKernelArg(vector_add_k, 0, sizeof(cl_mem), &src_a_d);
    if (error != CL_SUCCESS)
    {
        cout << "Error setting kernel arg 0: " << error << endl;
    }
    error |= clSetKernelArg(vector_add_k, 1, sizeof(cl_mem), &src_b_d);
    if (error != CL_SUCCESS)
    {
        cout << "Error setting kernel arg 1: " << error << endl;
    }
    error |= clSetKernelArg(vector_add_k, 2, sizeof(cl_mem), &res_d);
    if (error != CL_SUCCESS)
    {
        cout << "Error setting kernel arg 2: " << error << endl;
    }
    error |= clSetKernelArg(vector_add_k, 3, sizeof(int), &size);
    if (error != CL_SUCCESS)
    {
        cout << "Error setting kernel arg 3: " << error << endl;
    }
    // launching the kernel
    const size_t local_ws = 512; // number of work-items per work-group
    const size_t global_ws = shrRoundUp(local_ws, size); // total work-items, rounded up
    ULONGLONG t1 = GetTickCount64();
    error = clEnqueueNDRangeKernel(queue, vector_add_k, 1, NULL, &global_ws, &local_ws, 0, NULL, NULL);
    // clEnqueueNDRangeKernel only queues the work; wait for it to complete
    // before reading the clock, otherwise we only measure the enqueue overhead.
    clFinish(queue);
    ULONGLONG t2 = GetTickCount64();
    cout << "GPU cost time: " << (t2 - t1) << " ms" << endl;
    if (error != CL_SUCCESS)
    {
        cout << "Error enqueuing NDRangeKernel: " << error << endl;
    }
    float* check = new float[size];
    // blocking read (CL_TRUE): returns once the result buffer is copied to host
    clEnqueueReadBuffer(queue, res_d, CL_TRUE, 0, mem_size, check, 0, NULL, NULL);
    // Checking against the CPU results
    ULONGLONG t3 = GetTickCount64();
    vector_add_cpu(src_a_h, src_b_h, res_h, size);
    ULONGLONG t4 = GetTickCount64();
    cout << "CPU cost time: " << (t4 - t3) << " ms" << endl;
    bool ok = true;
    for (int i = 0; i < size; i++)
    {
        if (check[i] != res_h[i])
        {
            ok = false;
            printf("calc ERROR: GPU value is %f, CPU value is %f\n", check[i], res_h[i]);
        }
    }
    if (ok)
        cout << "Congratulations, it's working!\n" << endl;
    // Cleaning up
    delete[] src_a_h;
    delete[] src_b_h;
    delete[] res_h;
    delete[] check;
    delete[] platforms;
    free(programBuffer);
    clReleaseKernel(vector_add_k);
    clReleaseProgram(program);
    clReleaseMemObject(src_a_d);
    clReleaseMemObject(src_b_d);
    clReleaseMemObject(res_d);
    clReleaseCommandQueue(queue);
    clReleaseContext(context);
    return 0;
}
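To build outside the IDE, the same file can also be compiled from a Developer Command Prompt; a sketch, using the directory layout from the setup section (the paths are illustrative, adjust them to your machine):

cl /EHsc main.cpp /I F:\OPENCL\code\OpenCL_inc /link /LIBPATH:F:\OPENCL\code\OpenCL_lib OpenCL.lib

Note that GetTickCount64 has a resolution of roughly 10 to 16 ms, so for only 123456 additions both the GPU and CPU timings may well print 0; the point of this example is verifying correctness, not benchmarking.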