【CUDA编程笔记】(2)CPU与GPU之间的参数传递

利用GPU并行计算的总体思路是:在CPU(Host)中创建数据,将数据传到GPU(Device)中进行计算,再将计算结果传回到CPU中。

最简单的例子:将CPU中的两个数字在GPU中进行相加,并在CPU中输出:


#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>


// Device-only helper: callable from kernel code, not from the host.
__device__ int add_gpu(int a, int b) {
	int sum = a + b;
	return sum;
}


// Single-thread kernel: computes a + b on the device and stores the
// result through the device pointer c. Expects a <<<1, 1>>> launch.
__global__ void add(int a, int b, int *c) {
	*c = add_gpu(a, b);
}


int main()
{
	// Host-side inputs and a host variable to receive the result.
	// NOTE: the original `int c,int *ptr;` was a syntax error (a second
	// type name cannot follow a comma in one declaration).
	int a = 3, b = 5;
	int c = 0;
	int *ptr = NULL;    // device pointer that will hold the sum on the GPU

	// Allocate room for one int in GPU global memory.
	cudaMalloc((void **)&ptr, sizeof(int));

	// Launch the kernel: one block of one thread is enough for a single add.
	add<<<1, 1>>>(a, b, ptr);

	// Copy the result back to the host. cudaMemcpy on the default stream
	// blocks until the kernel has finished, so no explicit sync is needed.
	cudaMemcpy(&c, ptr, sizeof(int), cudaMemcpyDeviceToHost);
	printf("%d + %d = %d\n", a, b, c);

	// Release the device allocation.
	cudaFree(ptr);
	return 0;
}


//3 + 5 = 8

上面的例子只有一个数相加,因此不需要多线程,下面我们加大数据量,引入多线程并行计算。

用10个线程对两个长度为10的数组相加:


#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>

const int N = 10;    // array length (matches the <<<1, 10>>> launch below)
const int Mem = N * sizeof(int);    // size of each array in bytes

// One thread per element: thread i of the single block sums a[i] + b[i].
// The guard protects against launches with more than N threads.
__global__ void add(int *a, int *b, int *c) {
	int i = threadIdx.x;
	if (i >= N)
		return;
	c[i] = a[i] + b[i];
}


int main()
{
	// Host arrays: two 10-element inputs and one output.
	int a[N] = { 1,2,3,4,5,6,7,8,9,10 };
	int b[N] = { 1,3,5,7,9,11,13,15,17,19 };
	int c[N];

	// Matching buffers in GPU global memory.
	int *d_a, *d_b, *d_c;
	cudaMalloc((void **)&d_a, Mem);
	cudaMalloc((void **)&d_b, Mem);
	cudaMalloc((void **)&d_c, Mem);

	// Copy both inputs from the host to the device.
	cudaMemcpy(d_a, a, Mem, cudaMemcpyHostToDevice);
	cudaMemcpy(d_b, b, Mem, cudaMemcpyHostToDevice);

	// One block of 10 threads: one thread per array element.
	add << <1, 10 >> > (d_a, d_b, d_c);

	// Copy the result back (blocks until the kernel completes).
	cudaMemcpy(c, d_c, Mem, cudaMemcpyDeviceToHost);

	// Print each elementwise sum.
	for (int i = 0; i < N; i++)
		printf("%d + %d = %d\n", a[i], b[i], c[i]);

	// Free the device buffers.
	cudaFree(d_a);
	cudaFree(d_b);
	cudaFree(d_c);

	return 0;
}


/*
1 + 1 = 2
2 + 3 = 5
3 + 5 = 8
4 + 7 = 11
5 + 9 = 14
6 + 11 = 17
7 + 13 = 20
8 + 15 = 23
9 + 17 = 26
10 + 19 = 29
*/

当然也可以用10个线程块,每个线程块分配一个线程的方式来实现,输出结果是一样的,代码如下:


#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>

// Array length. This must be 10 here: the initializers in main() provide
// only 10 values and the <<<10, 1>>> launch computes only elements 0-9,
// so the original N of 1024 printed 1014 lines of uninitialized memory.
#define N 10
const int Mem = N * sizeof(int);    // size of each array in bytes

// One single-thread block per element: blockIdx.x selects the index.
// The guard protects against launches with more than N blocks.
__global__ void add(int *a, int *b, int *c) {
	int i = blockIdx.x;
	if (i >= N)
		return;
	c[i] = a[i] + b[i];
}


int main()
{
	// Host arrays: two inputs and one output.
	int a[N] = { 1,2,3,4,5,6,7,8,9,10 };
	int b[N] = { 1,3,5,7,9,11,13,15,17,19 };
	int c[N];

	// Matching buffers in GPU global memory.
	int *d_a, *d_b, *d_c;
	cudaMalloc((void **)&d_a, Mem);
	cudaMalloc((void **)&d_b, Mem);
	cudaMalloc((void **)&d_c, Mem);

	// Copy both inputs from the host to the device.
	cudaMemcpy(d_a, a, Mem, cudaMemcpyHostToDevice);
	cudaMemcpy(d_b, b, Mem, cudaMemcpyHostToDevice);

	// Ten blocks of one thread each: one block per array element.
	add << <10, 1 >> > (d_a, d_b, d_c);

	// Copy the result back (blocks until the kernel completes).
	cudaMemcpy(c, d_c, Mem, cudaMemcpyDeviceToHost);

	// Print each elementwise sum.
	for (int i = 0; i < N; i++)
		printf("%d + %d = %d\n", a[i], b[i], c[i]);

	// Free the device buffers.
	cudaFree(d_a);
	cudaFree(d_b);
	cudaFree(d_c);

	return 0;
}

实际情况中,数据量很大,我们不可能用一个线程只处理一个数据,而是用一个线程处理多个数据。


#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>

#define N 1024    // array length (larger than the 10-block grid on purpose)
const int Mem = N * sizeof(int);    // size of each array in bytes


// Block-stride loop: with one thread per block, each of the gridDim.x
// blocks handles every gridDim.x-th element, so a grid of any size
// covers all N elements (here each block processes multiple elements).
__global__ void add(int *a, int *b, int *c) {
	for (int i = blockIdx.x; i < N; i += gridDim.x) {
		c[i] = a[i] + b[i];
	}
}


int main()
{
	// Host arrays; this time the inputs are filled programmatically.
	int a[N], b[N], c[N];
	int *d_a, *d_b, *d_c;
	for (int i = 0; i < N; i++) {
		a[i] = i + 1;
		b[i] = 2 * i;
	}

	// Device buffers in GPU global memory.
	cudaMalloc((void **)&d_a, Mem);
	cudaMalloc((void **)&d_b, Mem);
	cudaMalloc((void **)&d_c, Mem);

	// Copy both inputs from the host to the device.
	cudaMemcpy(d_a, a, Mem, cudaMemcpyHostToDevice);
	cudaMemcpy(d_b, b, Mem, cudaMemcpyHostToDevice);

	// Only 10 single-thread blocks: the kernel's stride loop lets each
	// block process N / gridDim.x elements.
	add << <10, 1 >> > (d_a, d_b, d_c);

	// Copy the result back (blocks until the kernel completes).
	cudaMemcpy(c, d_c, Mem, cudaMemcpyDeviceToHost);

	// Print each elementwise sum.
	for (int i = 0; i < N; i++)
		printf("%d + %d = %d\n", a[i], b[i], c[i]);

	// Free the device buffers.
	cudaFree(d_a);
	cudaFree(d_b);
	cudaFree(d_c);

	return 0;
}

你可能感兴趣的:(CUDA编程,c语言,c++,算法)