Installing OpenCL for VS2019 and a Quick Start

This article explains how to set up an OpenCL development environment and get started with programming, based on an NVIDIA GPU and VS2019.

Table of Contents

    • 1. Installing OpenCL for VS2019
      • 1. Preparing the installation resources
      • 2. Installation steps
    • 2. OpenCL Quick Start
      • 1. Original article and translation
      • 2. Code changes and debugging
    • 3. Tested code

1. Installing OpenCL for VS2019

1. Preparing the installation resources

Download the CUDA Toolkit from the NVIDIA website and double-click the installer to run it:

https://developer.nvidia.com/cuda-downloads?target_os=Windows&target_arch=x86_64&target_version=7&target_type=exelocal

Then locate the following resources in the installation path:
1. .\lib\x64\OpenCL.lib
2. The CL header files (the CL include directory)
3. OpenCL64.dll, which can be found in the graphics card's default driver path

2. Installation steps

Create two new directories, OpenCL_inc and OpenCL_lib, and copy the resources found in the previous step into them:
OpenCL_inc holds the CL header files, while OpenCL_lib holds OpenCL.lib together with OpenCL.dll and OpenCL64.dll.

VS2019 project configuration (a minimal sanity check is sketched after this list):
1. Project --> (project name) Properties --> C/C++ --> General --> Additional Include Directories --> F:\OPENCL\code\OpenCL_inc
2. Project --> (project name) Properties --> Linker --> General --> Additional Library Directories --> F:\OPENCL\code\OpenCL_lib
3. Project --> (project name) Properties --> Linker --> Input --> Additional Dependencies --> OpenCL.lib
Since OpenCL.lib comes from lib\x64, make sure the project is built for the x64 platform.
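
To check that these settings are picked up correctly, a minimal program that only queries the platform count is enough. The sketch below is not part of the original article; it simply assumes the include and library paths configured above.

#include <iostream>
#include "CL\opencl.h"

// Minimal setup check: if this compiles, links, and prints at least one
// platform, the include path, library path, and OpenCL.lib dependency are correct.
int main() {
	cl_uint num_platforms = 0;
	cl_int err = clGetPlatformIDs(0, NULL, &num_platforms);	// query the platform count only
	if (err != CL_SUCCESS) {
		std::cout << "clGetPlatformIDs failed with error " << err << std::endl;
		return err;
	}
	std::cout << "Found " << num_platforms << " OpenCL platform(s)." << std::endl;
	return 0;
}

If the linker reports an unresolved external symbol for clGetPlatformIDs, re-check steps 2 and 3; if CL\opencl.h cannot be found, re-check step 1.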

2. OpenCL Quick Start

1. Original article and translation

Original article
Reference translation

2. Code changes and debugging

Instead of keeping the kernel in a separate vector_add_gpu.cl file, the kernel source is embedded directly in the host code as a string. After the kernel is built (i.e. after clBuildProgram runs), the program source is read back with CL_PROGRAM_SOURCE:

error = clGetProgramInfo(program, CL_PROGRAM_SOURCE, bufSize, programBuffer, &program_size_ret);

This shows that skipping the .cl file works. Note, however, that when printing the source read back after the build, the buffer must be one byte longer than strlen(programSource): CL_PROGRAM_SOURCE returns the source as a null-terminated string, so the reported size includes the terminating '\0'.
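
A more defensive way to size the buffer, instead of hardcoding strlen(programSource) + 1, is to ask clGetProgramInfo for the size first and fetch the source in a second call. The snippet below sketches that two-call pattern; it is not part of the original tested program and assumes program has already been created with clCreateProgramWithSource.

// Query the size of CL_PROGRAM_SOURCE first, then read the source itself.
size_t source_size = 0;
cl_int err = clGetProgramInfo(program, CL_PROGRAM_SOURCE, 0, NULL, &source_size);
if (err == CL_SUCCESS && source_size > 0) {
	char* source = (char*)malloc(source_size);	// the reported size already includes the terminating '\0'
	err = clGetProgramInfo(program, CL_PROGRAM_SOURCE, source_size, source, NULL);
	if (err == CL_SUCCESS)
		printf("Program source (%zu bytes):\n%s\n", source_size, source);
	free(source);
}

This avoids relying on the host-side string length matching what the runtime stored for the program.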

3. Tested code

#include <iostream>
#include <string>
#include <cstdio>
#include <cstring>
#include <cstdlib>

//#include <CL/cl.h>
#include "CL\opencl.h"
#include <Windows.h>

using namespace std;
// Query CL_PLATFORM_NAME twice: first to get the required size, then to read the name.
string getPlatformName(const cl_platform_id pid) {
	size_t param_value_size;
	clGetPlatformInfo(pid, CL_PLATFORM_NAME, 0, NULL, &param_value_size);
	char* param_value = new char[param_value_size];
	clGetPlatformInfo(pid, CL_PLATFORM_NAME, param_value_size, param_value, NULL);
	string name(param_value);
	delete[] param_value;	// release the temporary buffer instead of leaking it
	return name;
}


// Round numItems up to the next multiple of localWorkSize.
size_t shrRoundUp(size_t localWorkSize, size_t numItems) {
	size_t result = localWorkSize;
	while (result < numItems)
		result += localWorkSize;
	return result;
}

// Reference CPU implementation used to verify the GPU results.
void vector_add_cpu(const float* const src_a,
	const float* const src_b,
	float* const res,
	const int size)
{
	for (int i = 0; i < size; i++) {
		res[i] = src_a[i] + src_b[i];
	}
}

int main() {
	
	cl_uint num_platforms;
	cl_int error = 0;
	cl_context context;
	cl_command_queue queue;
	cl_device_id device;
	cl_platform_id platformNvidia;

	// Platform
	clGetPlatformIDs(0, NULL, &num_platforms);
	cl_platform_id* platforms = new cl_platform_id[num_platforms];
	clGetPlatformIDs(num_platforms, platforms, NULL);
	for (cl_uint i = 0; i < num_platforms; i++) {
		string platname = getPlatformName(platforms[i]);
		cout << "<" << i << "> " << "Platform name is :" << platname << endl;
	}
	// NOTE: index 1 is hardcoded because the NVIDIA platform was listed second
	// on this machine; adjust it to match the platform list printed above.
	platformNvidia = platforms[1];

	string platname1 = getPlatformName(platformNvidia);
	cout << "<1> " << "chosen platform: " << platname1 << endl;

	//Device
	cl_uint num_devices;
	error = clGetDeviceIDs(platformNvidia, CL_DEVICE_TYPE_GPU, 1, &device, &num_devices);
	if (error != CL_SUCCESS) {
		cout << "Error getting device ids: " << error << endl;
		exit(error);
	}
	cout << "num of devices is : "<< num_devices << endl;

	//context
	context = clCreateContext(0, 1, &device, NULL, NULL, &error);
	if (error != CL_SUCCESS) {
		cout << "Error creating context: " << error << endl;
		exit(error);
	}


	// Command-queue
	queue = clCreateCommandQueue(context, device, 0, &error);
	if (error != CL_SUCCESS) {
		cout << "Error creating command queue: " << error << endl;
		exit(error);
	}


	///////memory
	const int size = 123456;
	float* src_a_h = new float[size];
	float* src_b_h = new float[size];
	float* res_h = new float[size];
	
	//cl_mem_ion_host_ptr ion_src_a;
	
	// init vectors
	for (int i = 0; i < size; i++)
	{
		src_a_h[i] = src_b_h[i] = (float) i;
	}

	const int mem_size = sizeof(float) * size;

	//allocate device buffer
	cl_mem src_a_d = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, mem_size, src_a_h, &error );
	cl_mem src_b_d = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, mem_size, src_b_h, &error );
	cl_mem res_d = clCreateBuffer(context, CL_MEM_WRITE_ONLY, mem_size, NULL, &error);

	// create the program
	
	const char* programSource =
		"__kernel void vector_add_gpu(__global const float* src_a,             \n"
		"	__global const float* src_b,                                       \n"
		"	__global float* res,                                               \n"
		"	const int num)                                                     \n"
		"{                                                                     \n"
		"	/* get_global_id(0) returns the ID of the thread that is           \n"
		"	currently executing. Many threads start executing the same         \n"
		"	kernel at the same time; each thread receives a different ID,      \n"
		"	so each one necessarily performs a different computation. */       \n"
		"	const int idx = get_global_id(0);                                  \n"
		"                                                                      \n"
		"	/* Each work-item checks whether its ID lies within the range of   \n"
		"	the vectors. If it does, the work-item performs the addition. */   \n"
		"	if (idx < num)                                                     \n"
		"		res[idx] = src_a[idx] + src_b[idx];                            \n"
		"}                                                                     \n";


	printf("programSource\n %s \n", programSource);

	cl_program program = clCreateProgramWithSource(context, 1, reinterpret_cast<const char**>(&programSource), NULL, &error);
	if (error != CL_SUCCESS) {
		cout << "Error creating program: " << error << endl;
		exit(error);
	}

	//builds the program
	error = clBuildProgram(program, 1, &device, NULL, NULL, NULL);
	if (error != CL_SUCCESS) {
		cout << "Error Build program: " << error << endl;
		exit(error);
	}
	
	// check the kernel code
	size_t bufSize = strlen(programSource) + 1;   // +1 because CL_PROGRAM_SOURCE is returned as a null-terminated string
	char* programBuffer = (char*)malloc(bufSize);
	size_t program_size_ret;
	error = clGetProgramInfo(program, CL_PROGRAM_SOURCE, bufSize, programBuffer, &program_size_ret);
	if (error != CL_SUCCESS)
	{
		cout << "Error clGetProgramInfo:" << error << endl;
	}
	
	printf("  program_size_ret %ld\n", program_size_ret);

	printf("bufSize = %ld  \n", bufSize);
	printf("Print Program Source:\n");
	printf("\n %s \n", programBuffer);
	
	// shows the log
	char* build_log;
	size_t log_size;
	// 1st get the log_size
	clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
	build_log = new char[log_size + 1];
	cout << "log_size: " << log_size << endl;
	//2nd get log
	clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL);
	build_log[log_size] = '\0';
	cout << "build log: " << build_log << endl;
	delete[] build_log;

	// extracting the kernel
	cl_kernel vector_add_k = clCreateKernel(program, "vector_add_gpu", &error);
	if (error != CL_SUCCESS) {
		cout << "Error extacting the kernel: " << error << endl;
		exit(error);
	}
	// Enqueuing parameters
	error = clSetKernelArg(vector_add_k, 0, sizeof(cl_mem), &src_a_d);
	if (error != CL_SUCCESS)
	{
		cout << "Error Enqueuing 0 parameters:" << error << endl;
	}
	error |= clSetKernelArg(vector_add_k, 1, sizeof(cl_mem), &src_b_d);
	if (error != CL_SUCCESS)
	{
		cout << "Error Enqueuing 1 parameters:" << error << endl;
	}
	error |= clSetKernelArg(vector_add_k, 2, sizeof(cl_mem), &res_d);
	if (error != CL_SUCCESS)
	{
		cout << "Error Enqueuing 2 parameters:" << error << endl;
	}
	error |= clSetKernelArg(vector_add_k, 3, sizeof(int), &size);
	if (error != CL_SUCCESS)
	{
		cout << "Error Enqueuing 3 parameters:" << error << endl;
	}

	//launching the kernel
	const size_t local_ws = 512;  // Number of work-items per work-group

	const size_t global_ws = shrRoundUp(local_ws, size);
	
	ULONGLONG t1 = GetTickCount64();
	error = clEnqueueNDRangeKernel(queue, vector_add_k, 1, NULL, &global_ws, &local_ws, 0, NULL, NULL);
	clFinish(queue);	// the kernel runs asynchronously; wait for it so the timing covers execution, not just the enqueue call
	ULONGLONG t2 = GetTickCount64();
	cout << "GPU cost time :" << (t2 - t1) << endl;

	if (error != CL_SUCCESS)
	{
		cout << "Error Enqueuing EnqueueNDRangeKernel:" << error << endl;
	}

	float* check = new float[size];
	
	clEnqueueReadBuffer(queue, res_d, CL_TRUE, 0, mem_size, check, 0, NULL, NULL);
	
	// Checking with the CPU results;
	ULONGLONG t3 = GetTickCount64();
	vector_add_cpu(src_a_h, src_b_h, res_h, size);	
	ULONGLONG t4 = GetTickCount64();
	cout << "CPU cost time :" << (t4 - t3) << endl;
	
	for (int i = 0; i < size; i++)
	{
		if (check[i] != res_h[i])
		{
			printf("calc ERROR GPU value is %f CPU value is %f  \n", check[i], res_h[i]);
		}
	}
	
	cout << "Congratulations, it's working! \n" << endl;

	// Cleaning up
	delete[] src_a_h;
	delete[] src_b_h;
	delete[] res_h;
	delete[] check;
	delete[] platforms;
	free(programBuffer);
	clReleaseKernel(vector_add_k);
	clReleaseProgram(program);
	clReleaseCommandQueue(queue);
	clReleaseMemObject(src_a_d);
	clReleaseMemObject(src_b_d);
	clReleaseMemObject(res_d);
	clReleaseContext(context);
	return 0;
}
