当下的GPGPU(General-Purpose computing on GPU,通用图形处理器计算)主要通过CUDA(Compute Unified Device Architecture,统一计算设备架构)平台进行编程。
安装过程可参考我的另一篇blog:https://blog.csdn.net/pnan222/article/details/79449923
CUDA软件架构:(1)开发库(CUDA Library)(2)运行时环境(CUDA Runtime)(3)驱动(CUDA Driver)
CUDA的线程层次结构:Kernel-->Grid-->Block-->Thread
GPU硬件的一个核心组件是SM(Streaming Multiprocessor)流式多处理器,SM可以并发地执行数百个线程;一个block只会被调度到一个SM上执行,而一个SM可以同时驻留多个block
grid只是逻辑层;而SM才是真正的物理层;block的大小一般要设置成32的倍数
在VS上的配置过程可参考blog:https://blog.csdn.net/kyocen/article/details/51424161
Code:
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <windows.h>   // timeGetTime (winmm)
//#include
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#define DWORD unsigned long
#pragma comment(lib,"winmm.lib")
using namespace std;
// Element-wise vector addition: c[i] = a[i] + b[i] for i in [0, N).
//
// Grid-stride loop: each launched thread handles every N-th element,
// so the work is actually divided across the grid. (The original body
// looped over all N elements in every thread, so each of the launched
// threads redundantly computed the entire array — correct output, but
// no parallelism at all.) Works for any launch configuration,
// including a single-thread <<<1,1>>> debug run.
__global__ void add(float* a, float* b, float* c, int N)
{
    int stride = gridDim.x * blockDim.x;
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += stride)
    {
        c[i] = a[i] + b[i];
    }
}
// Host driver: builds two N-element vectors, adds them on the GPU,
// times the kernel with CUDA events, and verifies the result on the host.
int main() {
    int N = 1 << 15;
    int nBytes = N * sizeof(float);

    // Allocate host memory.
    float *x = (float*)malloc(nBytes);
    float *y = (float*)malloc(nBytes);
    float *z = (float*)malloc(nBytes);
    if (!x || !y || !z) {
        fprintf(stderr, "host allocation failed\n");
        return 1;
    }

    // Initialize input data (result of every element is 10 + 20 = 30).
    for (int i = 0; i < N; i++)
    {
        x[i] = 10.0f;
        y[i] = 20.0f;
    }

    // Allocate device memory, checking each call.
    float *d_x = NULL, *d_y = NULL, *d_z = NULL;
    cudaError_t err;
    if ((err = cudaMalloc((void**)&d_x, nBytes)) != cudaSuccess ||
        (err = cudaMalloc((void**)&d_y, nBytes)) != cudaSuccess ||
        (err = cudaMalloc((void**)&d_z, nBytes)) != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed: %s\n", cudaGetErrorString(err));
        return 1;
    }

    // Copy inputs host -> device.
    cudaMemcpy((void*)d_x, (void*)x, nBytes, cudaMemcpyHostToDevice);
    cudaMemcpy((void*)d_y, (void*)y, nBytes, cudaMemcpyHostToDevice);

    // Launch configuration: 256 threads/block, ceil-div for the grid.
    dim3 blockSize(256);
    dim3 gridSize((N + blockSize.x - 1) / blockSize.x);

    // Time the kernel with CUDA events. Kernel launches are asynchronous:
    // taking host wall-clock timestamps around the launch (as the original
    // timeGetTime() pair did) only measures the launch overhead, because
    // the CPU returns immediately without waiting for the GPU.
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    cudaEventRecord(start, 0);
    add << < gridSize, blockSize >> >(d_x, d_y, d_z, N);
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);  // block until the kernel has finished

    // Surface launch-configuration or execution errors from the kernel.
    err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "kernel failed: %s\n", cudaGetErrorString(err));
        return 1;
    }

    float elapsedMs = 0.0f;
    cudaEventElapsedTime(&elapsedMs, start, stop);
    printf("Use Time:%f (s)\n", elapsedMs / 1000.0f);

    // Copy result device -> host (cudaMemcpy is blocking, so no extra sync
    // is needed before reading z on the host).
    cudaMemcpy((void*)z, (void*)d_z, nBytes, cudaMemcpyDeviceToHost);

    // Verify: every element should be exactly 30.
    float maxError = 0.0f;
    for (int i = 0; i < N; i++)
    {
        maxError = fmax(maxError, fabs(z[i] - 30.0f));
    }
    cout << "最大误差:" << maxError << endl;

    // Release device resources.
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    cudaFree(d_x);
    cudaFree(d_y);
    cudaFree(d_z);
    // Release host memory.
    free(x);
    free(y);
    free(z);
    system("pause");
    return 0;
}
实现矩阵相乘并行计算的代码:
#include
#include
#include
#include
//#include
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#define DWORD unsigned long
#pragma comment(lib,"winmm.lib")
using namespace std;
// Element-wise vector addition: c[i] = a[i] + b[i] for i in [0, N).
//
// Grid-stride loop so the launched threads split the work. The original
// body ran the full 0..N loop in every thread, meaning each thread
// redundantly wrote the whole output array — correct values, but no
// parallel speedup. This form is valid for any grid/block configuration.
__global__ void add(float* a, float* b, float* c, int N)
{
    int stride = gridDim.x * blockDim.x;
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += stride)
    {
        c[i] = a[i] + b[i];
    }
}
// Naive square matrix multiply: P = M * N, all width x width, row-major.
// One thread computes one output element P[y][x].
//
// Uses the global 2D index (blockIdx * blockDim + threadIdx) with a
// bounds guard, so the kernel is correct for multi-block 2D launches as
// well as the single-block <<<1, dim3(width,width)>>> launch used below.
// (The original read only threadIdx, silently assuming one block, and
// accumulated int products in a float.)
__global__ void MatrixMuiOnDevice(int *M, int *N, int *P, int width)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;  // output column
    int y = blockIdx.y * blockDim.y + threadIdx.y;  // output row
    if (x >= width || y >= width)
        return;  // threads past the matrix edge do nothing

    // int accumulator matches the int element type — avoids the
    // int <-> float round trips of the original float accumulator.
    int acc = 0;
    for (int k = 0; k < width; k++)
    {
        acc += M[y * width + k] * N[k * width + x];
    }
    P[y * width + x] = acc;
}
// Host driver: multiplies two 30x30 int matrices (all 2s times all 3s,
// so every output element is 2*3*30 = 180) on the GPU, timing the
// kernel with CUDA events and printing the result matrix.
int main() {
    const int width = 30;
    const int NUM = width * width;  // total element count (was magic 900)
    int a[30][30], b[30][30], c[30][30];
    int *M, *N, *P;
    dim3 dimBlock(width, width);    // one thread per output element

    cudaEvent_t start, stop;
    float elapsedTime = 0.0f;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    // Device buffers, sized from NUM rather than a repeated literal.
    cudaError_t err;
    if ((err = cudaMalloc((void**)&M, NUM * sizeof(int))) != cudaSuccess ||
        (err = cudaMalloc((void**)&N, NUM * sizeof(int))) != cudaSuccess ||
        (err = cudaMalloc((void**)&P, NUM * sizeof(int))) != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed: %s\n", cudaGetErrorString(err));
        return 1;
    }

    // Initialize host inputs.
    for (int i = 0; i < width; i++)
    {
        for (int j = 0; j < width; j++)
        {
            a[i][j] = 2;
            b[i][j] = 3;
        }
    }

    cudaMemcpy(M, a, NUM * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(N, b, NUM * sizeof(int), cudaMemcpyHostToDevice);

    cudaEventRecord(start, 0);
    MatrixMuiOnDevice << <1, dimBlock >> >(M, N, P, width);
    cudaEventRecord(stop, 0);
    // cudaEventSynchronize(stop) already blocks until the kernel and the
    // stop event complete, so the deprecated cudaThreadSynchronize()
    // call the original made is unnecessary.
    cudaEventSynchronize(stop);

    // Surface any launch/execution error from the asynchronous kernel.
    err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "kernel failed: %s\n", cudaGetErrorString(err));
        return 1;
    }

    cudaEventElapsedTime(&elapsedTime, start, stop);
    printf("%f\n", elapsedTime);

    // Blocking copy of the result, then print it row by row.
    cudaMemcpy(c, P, NUM * sizeof(int), cudaMemcpyDeviceToHost);
    for (int i = 0; i < width; i++)
    {
        for (int j = 0; j < width; j++)
        {
            printf("%d ", c[i][j]);
        }
        cout << endl;
    }

    // Release device resources (events were leaked in the original).
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    cudaFree(M);
    cudaFree(N);
    cudaFree(P);
    system("pause");
    return 0;
}