CUDA Parallel Computing Framework Programming + Parallel Matrix Multiplication

Today's GPGPU (General-Purpose computing on the Graphics Processing Unit) is programmed through CUDA (Compute Unified Device Architecture), i.e., the graphics processor is used for general-purpose computation.

For the installation process, see my other blog post: https://blog.csdn.net/pnan222/article/details/79449923

CUDA software architecture: (1) development libraries (CUDA Libraries), (2) the runtime environment (CUDA Runtime), (3) the driver (CUDA Driver).
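As a small illustration of the runtime layer (a sketch I added, not part of the original post), the runtime API can report both the runtime version and the highest CUDA version the installed driver supports:

#include <stdio.h>
#include "cuda_runtime.h"

int main()
{
	int runtimeVersion = 0, driverVersion = 0;
	// Both calls belong to the CUDA Runtime API; the runtime itself sits on top of the driver.
	cudaRuntimeGetVersion(&runtimeVersion);
	cudaDriverGetVersion(&driverVersion);
	printf("Runtime version: %d, driver supports up to: %d\n", runtimeVersion, driverVersion);
	return 0;
}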

CUDA's thread hierarchy: Kernel --> Grid --> Block --> Thread (launching a kernel creates one grid; the grid consists of blocks, and each block consists of threads).
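A minimal sketch of how this hierarchy shows up in code (my own illustration, not from the original post): a kernel launch creates one grid of blocks, and inside the kernel the built-in variables blockIdx, blockDim and threadIdx identify each thread:

__global__ void scaleDemo(float* data, int n)
{
	// Global index of this thread within the whole grid
	int idx = blockIdx.x * blockDim.x + threadIdx.x;
	if (idx < n)
		data[idx] *= 2.0f;
}

// Launch configuration: gridSize blocks per grid, blockSize threads per block
// dim3 blockSize(256);
// dim3 gridSize((n + blockSize.x - 1) / blockSize.x);
// scaleDemo<<<gridSize, blockSize>>>(d_data, n);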

A core component of the GPU hardware is the SM (Streaming Multiprocessor). An SM can execute hundreds of threads concurrently; each block is assigned to exactly one SM, while one SM can host multiple blocks.

The grid is only a logical layer; the SM is the actual physical layer. The block size should generally be set to a multiple of 32 (the warp size).
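To see the warp size (32) and the number of SMs on your own card, the runtime API exposes the device properties; this is a sketch I added for illustration:

#include <stdio.h>
#include "cuda_runtime.h"

int main()
{
	cudaDeviceProp prop;
	cudaGetDeviceProperties(&prop, 0); // properties of device 0
	// warpSize is 32 on current hardware, which is why block sizes are chosen as multiples of 32
	printf("SMs: %d, warp size: %d, max threads per block: %d\n",
		prop.multiProcessorCount, prop.warpSize, prop.maxThreadsPerBlock);
	return 0;
}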

For configuring the project in Visual Studio, see this blog post: https://blog.csdn.net/kyocen/article/details/51424161

Code:

#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <math.h>
#include <windows.h>   // DWORD, timeGetTime
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#pragma comment(lib, "winmm.lib")   // timeGetTime lives in winmm.lib

using namespace std;

// Each thread computes one element of the output vector.
__global__ void add(float* a, float* b, float* c, int N)
{
	int i = blockIdx.x * blockDim.x + threadIdx.x;
	if (i < N)
	{
		c[i] = a[i] + b[i];
	}
}


int main() {
	int N = 1 << 15;
	int nBytes = N * sizeof(float);
	// Allocate host memory
	float *x, *y, *z;
	x = (float*)malloc(nBytes);
	y = (float*)malloc(nBytes);
	z = (float*)malloc(nBytes);

	// Initialize host data
	for (int i = 0; i < N; i++)
	{
		x[i] = 10.0;
		y[i] = 20.0;
	}

	// Allocate device memory
	float *d_x, *d_y, *d_z;
	cudaMalloc((void**)&d_x, nBytes);
	cudaMalloc((void**)&d_y, nBytes);
	cudaMalloc((void**)&d_z, nBytes);

	// Copy host data to the device
	cudaMemcpy((void*)d_x, (void*)x, nBytes, cudaMemcpyHostToDevice);
	cudaMemcpy((void*)d_y, (void*)y, nBytes, cudaMemcpyHostToDevice);


	// Define the kernel execution configuration
	dim3 blockSize(256);
	dim3 gridSize((int)((N + blockSize.x - 1) / blockSize.x));

	// Start the timer (timeGetTime returns milliseconds)
	DWORD t1, t2;
	t1 = timeGetTime();
	
	

	// Launch the kernel
	add<<<gridSize, blockSize>>>(d_x, d_y, d_z, N);
	// Kernel launches are asynchronous: wait for completion before stopping the timer
	cudaDeviceSynchronize();

	t2 = timeGetTime();
	printf("Use Time: %f (s)\n", (t2 - t1) * 1.0 / 1000);

	// Copy the result from the device back to the host
	cudaMemcpy((void*)z, (void*)d_z, nBytes, cudaMemcpyDeviceToHost);

	// Check the result
	float maxError = 0.0;
	for (int i = 0; i < N; i++)
	{
		maxError = fmax(maxError, fabs(z[i] - 30.0));
	}
	cout << "Max error: " << maxError << endl;

	// Free device memory
	cudaFree(d_x);
	cudaFree(d_y);
	cudaFree(d_z);

	// Free host memory
	free(x);
	free(y);
	free(z);
	
	system("pause");
	return 0;
}
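If you build outside Visual Studio (an assumption on my part; the post itself targets the VS setup linked above), the same source should also compile from the command line with nvcc, e.g. (vector_add.cu is a hypothetical file name):

nvcc vector_add.cu -o vector_add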

Code for parallel matrix multiplication. Each thread computes one output element: P[y][x] is the dot product of row y of M and column x of N, stored in row-major order at P[y * width + x]:

#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

using namespace std;

// Each thread computes one element of P: the dot product of row y of M and column x of N.
__global__ void MatrixMuiOnDevice(int *M, int *N, int *P, int width)
{
	int x = threadIdx.x;
	int y = threadIdx.y; // Position of this thread within the (single) block

	float Pervalue = 0;

	for (int i = 0; i < width; i++)
	{
		float Mdlement = M[y * width + i];
		float Ndlement = N[width * i + x];

		Pervalue += Mdlement * Ndlement;
	}

	P[y * width + x] = Pervalue;
}

int main() {

	int a[30][30], b[30][30], c[30][30];
	int *M, *N, *P;
	int width = 30;
	int NUM = 900;
	dim3 dimBlock(30, 30);
	cudaEvent_t start, stop;
	float elapsedTime;
	cudaEventCreate(&start);
	cudaEventCreate(&stop);


	// Allocate device memory
	cudaMalloc((void**)&M, NUM * sizeof(int));
	cudaMalloc((void**)&N, NUM * sizeof(int));
	cudaMalloc((void**)&P, NUM * sizeof(int));
	// Initialize the host matrices
	for (int i = 0; i < 30; i++)
	{
		for (int j = 0; j < 30; j++)
		{
			a[i][j] = 2;
			b[i][j] = 3;
		}
	}
	// Copy the host matrices to the device
	cudaMemcpy(M, a, NUM * sizeof(int), cudaMemcpyHostToDevice);
	cudaMemcpy(N, b, NUM * sizeof(int), cudaMemcpyHostToDevice);
	
	// Time the kernel with CUDA events
	cudaEventRecord(start, 0);
	MatrixMuiOnDevice<<<1, dimBlock>>>(M, N, P, width);
	cudaDeviceSynchronize(); // cudaThreadSynchronize() is deprecated

	cudaEventRecord(stop, 0);
	cudaEventSynchronize(stop);
	cudaEventElapsedTime(&elapsedTime, start, stop);

	printf("%f\n", elapsedTime);

	// Copy the result back to the host
	cudaMemcpy(c, P, NUM * sizeof(int), cudaMemcpyDeviceToHost);
	for (int i = 0; i < 30; i++)
	{
		for (int j = 0; j < 30; j++)
		{
			printf("%d ", c[i][j]);
		}
		cout << endl;
	}
	cudaFree(M);
	cudaFree(N);
	cudaFree(P);
	
	system("pause");
	return 0;
}
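Since every element of a is 2 and every element of b is 3 in this 30x30 example, each element of the product should be 2 * 3 * 30 = 180. A small host-side check I added (not part of the original code), placed right after the cudaMemcpy back into c, could look like this:

	// Verify the GPU result: every element should equal 2 * 3 * 30 = 180
	bool ok = true;
	for (int i = 0; i < 30 && ok; i++)
		for (int j = 0; j < 30 && ok; j++)
			if (c[i][j] != 180) ok = false;
	printf(ok ? "Result correct\n" : "Result wrong\n");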
