gpu cuda 数组求和优化

欢迎关注更多精彩
关注我,学习常用算法与数据结构,一题多解,降维打击。

问题描述

给定1个数组,利用gpu求和并返回结果。

cpu 算法


#include <iostream>
#include <cstdio>
#include <cstdlib>
#include <ctime>

using namespace std;

#define BLOCK_SIZE 1024
#define N (102300 * 1024)

int arr[N];

// Sum all N elements of `in` into `out`, overwriting any previous value.
// The accumulator is a plain int, so a sufficiently large input wraps around.
void cpu_sum(int *in, int &out) {
	int total = 0;
	for (int idx = N - 1; idx >= 0; --idx) {
		total += in[idx];
	}
	out = total;
}


// Fills the global array with 0..N-1, times the CPU reduction, and prints
// the result together with the elapsed time in milliseconds.
int main()
{
	// Initialize the input with a known sequence so the sum is reproducible.
	for (int y = 0; y < N; ++y) {
		arr[y] = y;
	}

	int cpu_result = 0;

	// clock() returns ticks, not milliseconds; convert via CLOCKS_PER_SEC
	// so the printed number is portable (on many platforms a tick != 1 ms).
	clock_t begin = clock();
	cpu_sum(arr, cpu_result);
	float time_cpu_ms = 1000.0f * (float)(clock() - begin) / CLOCKS_PER_SEC;

	// NOTE(review): cpu_result is a plain int; with N ~ 1e8 the true sum
	// (~5.5e15) exceeds INT_MAX, so the printed value is the wrapped sum.
	// The GPU version accumulates in int too, so the values still match.
	printf("CPU result: %d, CPU time: %.3f ms\n", cpu_result, time_cpu_ms);

	return 0;
}

cpu耗时

请添加图片描述
耗时20毫秒

gpu 算法

算法过程

  1. 先对数据分成1024*1024组,每组用1个线程进行汇总。
  2. 再把线程分成1024组,每组1024个,组内用共享内存进行快速求和。
  3. 将每组结果汇总。

共享内存求和过程:
gpu cuda 数组求和优化_第1张图片
设数组A为共享内存,共享内存有大小限制,一共1024个元素。
每次由前一半的线程进行操作。
第1次由前512个线程进行求和,结果放入前512个元素中。
第2次再缩减一半,直到剩下1个元素。

 for(int s = 1024/2; s; s >>= 1) { // s = number of threads doing useful work this pass
 	if(tid < s)                // tid = this thread's index within the block
 	    A[tid] += A[tid+s];
 	__syncthreads();           // every pass must finish before the next halving starts
 }

代码


#include "cuda_runtime.h"
#include <vector>
#include "device_launch_parameters.h"

#include <cstdio>

using namespace std;

// Forward declaration: uploads `a`, runs the GPU reduction, and copies the
// device result buffer back into `sum` (the kernel accumulates the total
// into the first element).
cudaError_t addWithCuda(vector<int> &a, vector<int> &sum);

// Naive reduction for comparison: every thread performs one atomicAdd of its
// element into *dev_out, so all threads contend on a single address.
// NOTE(review): there is no `myId < n` bounds guard, so this kernel is only
// safe when the launch geometry covers exactly the array size — confirm at
// any call site. (The host code in this file launches addKernel_sheard
// instead, so this kernel is unused here.)
__global__ void addKernel(int *dev_a, int *dev_out)
{
	int myId = threadIdx.x + blockDim.x*blockIdx.x;
	atomicAdd(dev_out, dev_a[myId]);
}

// Shared-memory sum reduction over dev_a[0..len).
// Phase 1: each thread accumulates a grid-stride slice of the input.
// Phase 2: the block tree-reduces those per-thread partials in shared memory.
// Phase 3: thread 0 folds the block's partial sum into dev_out[0] atomically.
//
// Launch requirements: blockDim.x must be a power of two (the halving loop
// assumes it) and the dynamic shared memory size must be at least
// blockDim.x * sizeof(int).
__global__ void addKernel_sheard(int *dev_a, int *dev_out, int len)
{
	extern __shared__ int partial[];

	const int tid = threadIdx.x;
	const int stride = blockDim.x * gridDim.x;

	// Phase 1: grid-stride accumulation into a register, then one store.
	int acc = 0;
	for (int i = threadIdx.x + blockDim.x * blockIdx.x; i < len; i += stride) {
		acc += dev_a[i];
	}
	partial[tid] = acc;
	__syncthreads();

	// Phase 2: halve the number of active threads each pass. The barrier is
	// outside the divergent `if` so every thread in the block reaches it.
	for (unsigned int half = blockDim.x / 2; half > 0; half >>= 1) {
		if (tid < half) {
			partial[tid] += partial[tid + half];
		}
		__syncthreads();
	}

	// Phase 3: a single atomic per block combines the partial sums.
	if (tid == 0) {
		atomicAdd(dev_out, partial[0]);
	}
}

// Builds the input 0..kCount-1, computes a CPU reference sum, runs the GPU
// reduction via addWithCuda, and prints both sums to stderr for comparison.
int main()
{
	// Single named constant instead of repeating 102300 * 1024 — keeps the
	// vector size and the fill loop from drifting apart.
	const int kCount = 102300 * 1024;

	vector<int> a(kCount);
	vector<int> sum(1024, 0);  // result buffer; the kernel writes only sum[0]

	// CPU reference. NOTE(review): `s` is a plain int and the true sum
	// (~5.5e15) wraps; the kernel also accumulates in int, so both values
	// wrap identically and the two printed numbers still match.
	int s = 0;
	for (int i = 0; i < kCount; ++i)
	{
		s += i;
		a[i] = i;
	}

	fprintf(stderr, "%d\n", s);

	// Run the GPU reduction.
	cudaError_t cudaStatus = addWithCuda(a, sum);
	if (cudaStatus != cudaSuccess) {
		fprintf(stderr, "addWithCuda failed!");
		return 1;
	}

	// Release device state; also flushes data for profiling tools.
	cudaStatus = cudaDeviceReset();
	if (cudaStatus != cudaSuccess) {
		fprintf(stderr, "cudaDeviceReset failed!");
		return 1;
	}

	// Only sum[0] was written by the kernel; the remaining elements are
	// still zero, so summing the whole vector yields the same total.
	s = 0;
	for (auto n : sum)s += n;
	fprintf(stderr, "%d\n", s);

	return 0;
}

// Helper function for using CUDA to add vectors in parallel.
cudaError_t addWithCuda(vector<int> &a, vector<int> &sum)
{
	int *dev_a = 0;
	int *dev_out = 0;
	cudaError_t cudaStatus;

	// Choose which GPU to run on, change this on a multi-GPU system.
	cudaStatus = cudaSetDevice(0);
	if (cudaStatus != cudaSuccess) {
		fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");
		goto Error;
	}

	// Allocate GPU buffers for three vectors (two input, one output)    .
	cudaStatus = cudaMalloc((void**)&dev_a, a.size() * sizeof(int));
	if (cudaStatus != cudaSuccess) {
		fprintf(stderr, "cudaMalloc failed!");
		goto Error;
	}

	cudaStatus = cudaMalloc((void**)&dev_out, sum.size() * sizeof(int));
	if (cudaStatus != cudaSuccess) {
		fprintf(stderr, "cudaMalloc failed!");
		goto Error;
	}


	// Copy input vectors from host memory to GPU buffers.
	cudaStatus = cudaMemcpy(dev_a, a.data(), a.size() * sizeof(int), cudaMemcpyHostToDevice);
	if (cudaStatus != cudaSuccess) {
		fprintf(stderr, "cudaMemcpy failed!");
		goto Error;
	}

	cudaStatus = cudaMemcpy(dev_out, sum.data(), sum.size() * sizeof(int), cudaMemcpyHostToDevice);
	if (cudaStatus != cudaSuccess) {
		fprintf(stderr, "cudaMemcpy failed!");
		goto Error;
	}

	// Launch a kernel on the GPU with one thread for each element.
	for (int i = 0; i < 1; ++i)addKernel_sheard << <1024, 1024, 1024 * sizeof(int) >> > (dev_a, dev_out, a.size());

	// Check for any errors launching the kernel
	cudaStatus = cudaGetLastError();
	if (cudaStatus != cudaSuccess) {
		fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
		goto Error;
	}

	// cudaDeviceSynchronize waits for the kernel to finish, and returns
	// any errors encountered during the launch.
	cudaStatus = cudaDeviceSynchronize();
	if (cudaStatus != cudaSuccess) {
		fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
		goto Error;
	}

	// Copy output vector from GPU buffer to host memory.
	cudaStatus = cudaMemcpy(sum.data(), dev_out, sum.size() * sizeof(int), cudaMemcpyDeviceToHost);
	if (cudaStatus != cudaSuccess) {
		fprintf(stderr, "cudaMemcpy failed!");
		goto Error;
	}

Error:
	cudaFree(dev_a);
	cudaFree(dev_out);

	return cudaStatus;
}


耗时分析

gpu cuda 数组求和优化_第2张图片
耗时1.4毫秒,提升了10多倍。


本人码农,希望通过自己的分享,让大家更容易学懂计算机知识。创作不易,帮忙点击公众号的链接。

在这里插入图片描述

你可能感兴趣的:(高阶算法,gpu,面试技巧,cuda,算法)