[菜鸟每天来段CUDA_C]多GPU的使用

单个GPU具有强大的并行计算的能力,当把多个GPU同时用来执行同一个任务的时候,计算的性能将会得到更大的提升。本文在两块GPU上实现大数据量的向量点积运算。主要步骤为:

1.      获得设备数量;

2.      任务分配;

3.      为每个任务创建一个线程;

4.      启动每个线程进行运算;

5.      合并每个GPU得到的结果。


程序代码:

主程序:main.cpp

#include "main.h"
#include 

extern "C" void runDotProduct(float *dev_a, float *dev_b, float *dev_partial_c, int size);

/*
 * Thread entry point: computes one GPU's share of the dot product.
 *
 * pvoidData points to a GPUPlan (declared in main.h) carrying the
 * device ID to bind, the host input slices a/b, their element count,
 * and a returnValue field that receives this slice's partial sum.
 *
 * NOTE(review): the tail of this function was corrupted by HTML
 * extraction in the published article (everything between '<' and the
 * next '>' was swallowed). The reduction loop, the cudaFree/free
 * cleanup and the returnValue store below are reconstructed from the
 * surviving fragment "for (int i=0; i...returnValue = c;" and the
 * canonical "CUDA by Example" multi-GPU dot-product listing — confirm
 * against the original source.
 */
void* worker(void *pvoidData)
{
	GPUPlan *plan = (GPUPlan*) pvoidData;
	/* Bind this host thread to the GPU assigned by the plan. */
	HANDLE_ERROR(cudaSetDevice(plan->deviceID));

	int size = plan->size;
	float *a, *b, c, *partial_c;
	float *dev_a, *dev_b, *dev_partial_c;

	a = plan->a;
	b = plan->b;
	/* The kernel writes one partial sum per block. */
	partial_c = (float*)malloc(blockPerGrid*sizeof(float));

	HANDLE_ERROR(cudaMalloc((void**)&dev_a, size*sizeof(float)));
	HANDLE_ERROR(cudaMalloc((void**)&dev_b, size*sizeof(float)));
	HANDLE_ERROR(cudaMalloc((void**)&dev_partial_c, blockPerGrid*sizeof(float)));

	HANDLE_ERROR(cudaMemcpy(dev_a, a, size*sizeof(float), cudaMemcpyHostToDevice));
	HANDLE_ERROR(cudaMemcpy(dev_b, b, size*sizeof(float), cudaMemcpyHostToDevice));

	/* Launch the dot-product kernel (wrapper defined in kernel.cu). */
	runDotProduct(dev_a, dev_b, dev_partial_c, size);

	/* Blocking copy: also synchronizes with the kernel launch above. */
	HANDLE_ERROR(cudaMemcpy(partial_c, dev_partial_c, blockPerGrid*sizeof(float), cudaMemcpyDeviceToHost));

	/* Finish the reduction on the host: sum the per-block partials. */
	c = 0;
	for (int i=0; i<blockPerGrid; i++)
		c += partial_c[i];

	/* Reconstructed cleanup — the original fragment shows this span was
	 * eaten between the sum loop and the returnValue store. */
	HANDLE_ERROR(cudaFree(dev_a));
	HANDLE_ERROR(cudaFree(dev_b));
	HANDLE_ERROR(cudaFree(dev_partial_c));
	free(partial_c);

	plan->returnValue = c;
	return 0;
}



// Host driver: times an N-element dot product split across two GPUs,
// then repeats the same work on a single GPU for comparison.
//
// NOTE(review): large spans of this function were destroyed by the
// blog's HTML extraction (everything between a '<' and the next '>'
// was swallowed). The corrupted lines are flagged below. The missing
// code presumably initialized the inputs, filled two GPUPlan structs
// (one per device, each with half of N), spawned a worker thread per
// device, and timed both runs with CUDA events — confirm against the
// original article before reuse.
int main()
{
	//on two GPUs
	int i;
	int deviceCount;
	HANDLE_ERROR(cudaGetDeviceCount(&deviceCount));

	// The multi-GPU path needs at least two CUDA devices.
	if (deviceCount < 2)
	{
		printf("No more than 2 device with compute 1.0 or greater."
			"only %d devices found", deviceCount);
		return 0;
	}

	// Full-size host inputs; each worker gets a slice via its GPUPlan.
	float *a = (float*)malloc(sizeof(float)*N);
	HANDLE_NULL(a);
	float *b = (float*)malloc(sizeof(float)*N);
	HANDLE_NULL(b);

	// NOTE(review): corrupted line — originally the input-init loop,
	// the GPUPlan setup, thread launch/join, and cudaEvent timing,
	// ending in a printf of elapsedTime. Only the fragments survive:
	for (i=0; i\n", elapsedTime);

	// Combine the two per-GPU partial results into the final value.
	printf("value calculated: %f\n", plan[0].returnValue + plan[1].returnValue);

	HANDLE_ERROR(cudaEventDestroy(start));
	HANDLE_ERROR(cudaEventDestroy(stop));
	free(a);
	free(b);

	// on one GPU
	float *host_a;
	float *host_b;
	float *partial_c;
	host_a = (float*)malloc(N*sizeof(float));
	host_b = (float*)malloc(N*sizeof(float));
	partial_c = (float*)malloc(blockPerGrid*sizeof(float));

	// NOTE(review): corrupted line — originally the single-GPU path:
	// input init, device allocation and copies, kernel launch, and
	// cudaEvent timing, ending in a printf of elapsedTime:
	for (int i=0; i\n", elapsedTime);

	HANDLE_ERROR(cudaMemcpy(partial_c, dev_partial_c, blockPerGrid*sizeof(float), cudaMemcpyDeviceToHost));

	// Final host-side reduction of the single-GPU per-block partials.
	float res = 0;
	// NOTE(review): corrupted line — originally summed partial_c into
	// res, printed it, freed the host/device buffers, and returned:
	for (int i=0; i

核函数:kernel.cu

// kernel.cu — NOTE(review): this listing was almost entirely destroyed
// by HTML extraction; only "#define imin(a,b) (a" and the launch
// fragment ">>(dev_a, dev_b, dev_partial_c, size);" survived. The code
// below is reconstructed from those fragments and the canonical
// "CUDA by Example" dot-product kernel — verify against the original
// article before reuse.

#define imin(a,b) (a<b?a:b)

// Assumes threadsPerBlock here matches the host side, and that
// blockPerGrid comes from a shared header (main.h) — TODO confirm.
const int threadsPerBlock = 256;

// Grid-stride dot product: each thread accumulates a private partial
// sum over strided elements, the block tree-reduces those partials in
// shared memory, and thread 0 writes one float per block into c.
// Expected launch: <<<blockPerGrid, threadsPerBlock>>>; blockDim.x
// must equal threadsPerBlock and be a power of two for the reduction.
__global__ void dot(float *a, float *b, float *c, int size)
{
	__shared__ float cache[threadsPerBlock];
	int tid = threadIdx.x + blockIdx.x * blockDim.x;
	int cacheIndex = threadIdx.x;

	// Grid-stride loop: correct for any size, not just multiples of
	// the grid width.
	float temp = 0;
	while (tid < size)
	{
		temp += a[tid] * b[tid];
		tid += blockDim.x * gridDim.x;
	}

	cache[cacheIndex] = temp;
	__syncthreads();   // all partials visible before the reduction

	// Shared-memory tree reduction; halve the active range each pass.
	int i = blockDim.x / 2;
	while (i != 0)
	{
		if (cacheIndex < i)
			cache[cacheIndex] += cache[cacheIndex + i];
		__syncthreads();   // barrier kept outside the divergent branch
		i /= 2;
	}

	if (cacheIndex == 0)
		c[blockIdx.x] = cache[0];
}

// C-linkage wrapper so main.cpp (compiled as C++) can trigger the
// kernel launch without seeing CUDA <<<>>> syntax.
extern "C" void runDotProduct(float *dev_a, float *dev_b, float *dev_partial_c, int size)
{
	dot<<<blockPerGrid, threadsPerBlock>>>(dev_a, dev_b, dev_partial_c, size);
}


本文试图将同样的数据在单个GPU上计算,比较计算时间来突出多GPU在计算性能上的提升。但实际情况是多GPU的计算时间却比单GPU更长。初步考虑是觉得核函数太简单,使得GPU执行的性能提升不足以弥补设备分配以及线程调度等带来的开销。所以多GPU也许更适合在大量复杂计算的场景下使用~

你可能感兴趣的:(CUDA编程)