#include "cuda_runtime.h"
#include "device_launch_parameters.h" // threadIdx
#include <math.h>   // fabs
#include <stdio.h>  // io
#include <stdlib.h> // rand
#include <string.h> // memset
#include <time.h>   // time_t
// Evaluates a CUDA runtime call exactly once. If it does not return
// cudaSuccess, prints the source file, line, numeric error code and error
// string, then terminates the process with exit status 1.
//
// The body is wrapped in do { ... } while (0) so that `CHECK(x);` expands to a
// single statement and stays correct under an unbraced if/else. The argument
// is parenthesised so that any expression can be passed.
#define CHECK(call) \
do \
{ \
    const cudaError_t error_code = (call); \
    if (error_code != cudaSuccess) \
    { \
        printf("CUDA Error:\n"); \
        printf(" File: %s\n", __FILE__); \
        printf(" Line: %d\n", __LINE__); \
        printf(" Error code: %d\n", (int)error_code); \
        printf(" Error text: %s\n", \
               cudaGetErrorString(error_code)); \
        exit(1); \
    } \
} while (0)
/// Adds two matrices element-wise on the host: h_c = h_a + h_b.
/// All three matrices are stored linearly, row after row
/// (ny rows of nx floats each).
///
/// h_a  first input matrix
/// h_b  second input matrix
/// h_c  output matrix, receives the sums
/// nx   number of elements in one row
/// ny   number of rows
void sumMatrixOnHost(float* h_a, float* h_b, float* h_c, const int nx, const int ny)
{
    // Offset of the first element of the current row; grows by one row
    // length per outer iteration.
    long long rowStart = 0;
    for (int row = 0; row < ny; ++row, rowStart += nx)
    {
        const float* lhs = h_a + rowStart;
        const float* rhs = h_b + rowStart;
        float* out = h_c + rowStart;

        // Process the current row.
        for (int col = 0; col < nx; ++col)
        {
            out[col] = lhs[col] + rhs[col];
        }
    }
}
// Element-wise matrix addition on the device: d_c = d_a + d_b.
// Replaces the host loops: each thread handles at most one element.
//
// Expected launch layout: a 2D grid of 2D blocks. The x thread coordinate
// selects the column (ix) and the y thread coordinate selects the row (iy).
// The grid should cover at least nx columns and ny rows; threads that fall
// outside the matrix do nothing.
//
// d_a, d_b  input matrices, stored linearly row by row (nx elements per row)
// d_c       output matrix, same layout
// nx        number of elements in one row
// ny        number of rows
__global__ void sumMatrixOnDevice2D(float* d_a, float* d_b, float* d_c, const int nx, const int ny)
{
    // Map the 2D grid / 2D block coordinates to matrix coordinates.
    const unsigned int ix = threadIdx.x + blockIdx.x * blockDim.x;
    const unsigned int iy = threadIdx.y + blockIdx.y * blockDim.y;

    // More threads than elements may have been launched; leave before touching
    // memory. Non-positive sizes are rejected explicitly because comparing an
    // unsigned index with a negative int would otherwise let the guard pass.
    if (nx <= 0 || ny <= 0 || ix >= (unsigned int)nx || iy >= (unsigned int)ny)
    {
        return;
    }

    // Map matrix coordinates (ix, iy) to the linear index: iy full rows of nx
    // elements precede this one. Computed in size_t so the product cannot wrap
    // around in 32 bits for large matrices.
    const size_t idx = (size_t)iy * (size_t)nx + ix;

    d_c[idx] = d_a[idx] + d_b[idx];

    // Debug output from the thread that owns element 0. The format specifiers
    // now match the argument types (unsigned index, float value promoted to
    // double), and the read happens only after the bounds guard above.
    if (idx == 0)
        printf("%u, %f", (unsigned int)idx, d_c[idx]);
}
// Fills p[0 .. N-1] with pseudo-random floats. Each value is the low byte of
// rand() divided by 10, i.e. in the range [0.0, 25.5].
//
// p  destination array with room for at least N floats
// N  number of elements to fill; nothing is written when N <= 0
void initialData(float* p, const int N)
{
    // Seed the generator only once per process. Re-seeding from time() on
    // every call restarts the same sequence whenever two calls fall within
    // the same second, so consecutive arrays would come out identical.
    static int seeded = 0;
    if (!seeded)
    {
        srand((unsigned int)time(NULL));
        seeded = 1;
    }

    for (int i = 0; i < N; i++)
    {
        p[i] = (float)(rand() & 0xFF) / 10.0f; // random value in [0.0, 25.5]
    }
}
// Compares the host result with the result copied back from the device,
// element by element, and prints the verdict to stdout. On the first
// mismatch it prints both values with their index and stops scanning.
//
// hostRef    reference values computed on the host
// deviceRef  values computed on the device, already copied to host memory
// N          number of elements to compare
void checkResult(float* hostRef, float* deviceRef, const int N)
{
    const double eps = 1.0E-8;
    int match = 1;
    for (int i = 0; i < N; i++)
    {
        // Use the absolute difference: a signed difference would miss every
        // element where the device value is larger than the host value.
        const double diff = fabs((double)hostRef[i] - (double)deviceRef[i]);

        // Written as !(diff <= eps) so that a NaN difference is also treated
        // as a mismatch (every comparison with NaN is false).
        if (!(diff <= eps))
        {
            match = 0;
            printf("\nArrays do not match\n");
            printf("host %5.2f gpu %5.2f at current %d\n", hostRef[i], deviceRef[i], i);
            break;
        }
    }
    if (match)
        printf("\nArrays match!\n");
}
// Entry point: adds two nx-by-ny float matrices once on the host and once on
// the device, then compares both results. Every CUDA runtime call is checked,
// so a failure (for example a device allocation that does not fit) stops the
// program at the call that failed instead of surfacing later as an illegal
// memory access inside the kernel.
int main(void)
{
    // get device info
    int device = 0;
    cudaDeviceProp deviceProp;
    CHECK(cudaGetDeviceProperties(&deviceProp, device));
    printf("Using device: %d %s", device, deviceProp.name); // name of device 0
    CHECK(cudaSetDevice(device));                           // select the device

    // set matrix dimension: 2^14 = 16384 rows and columns
    const int nx = 1 << 14, ny = 1 << 14, nxy = nx * ny;
    // Smaller alternative: nx = ny = 1 << 13.

    // Byte count kept in size_t: the multiplication is done in an unsigned
    // 64-bit-capable type instead of being narrowed back to int.
    const size_t nBytes = (size_t)nxy * sizeof(float);

    // malloc host memory
    float* h_a = (float*)malloc(nBytes);
    float* h_b = (float*)malloc(nBytes);
    float* hostRef = (float*)malloc(nBytes); // result computed on the host
    float* gpuRef = (float*)malloc(nBytes);  // result copied back from the device
    if (h_a == NULL || h_b == NULL || hostRef == NULL || gpuRef == NULL)
    {
        fprintf(stderr, "Host allocation of %llu bytes failed\n", (unsigned long long)nBytes);
        free(h_a);
        free(h_b);
        free(hostRef);
        free(gpuRef);
        return EXIT_FAILURE;
    }

    // init data
    initialData(h_a, nxy);
    initialData(h_b, nxy);
    memset(hostRef, 0, nBytes);
    memset(gpuRef, 0, nBytes);

    // add matrix on host side for result checks.
    sumMatrixOnHost(h_a, h_b, hostRef, nx, ny);

    // malloc device memory
    float* d_mat_a = NULL;
    float* d_mat_b = NULL;
    float* d_mat_c = NULL;
    CHECK(cudaMalloc((void**)&d_mat_a, nBytes));
    CHECK(cudaMalloc((void**)&d_mat_b, nBytes));
    CHECK(cudaMalloc((void**)&d_mat_c, nBytes));

    // transfer data from host to device
    CHECK(cudaMemcpy(d_mat_a, h_a, nBytes, cudaMemcpyHostToDevice));
    CHECK(cudaMemcpy(d_mat_b, h_b, nBytes, cudaMemcpyHostToDevice));

    // config
    int dimx = 32;
    int dimy = 32;
    dim3 block(dimx, dimy); // 2D thread block of dimx x dimy threads
    // 2D grid; ceiling division so that partially filled blocks at the right
    // and bottom edges are still launched, e.g. (8 + 4 - 1) / 4 = 2.
    dim3 grid((nx + block.x - 1) / block.x, (ny + block.y - 1) / block.y);

    // invoke kernel
    sumMatrixOnDevice2D<<<grid, block>>>(d_mat_a, d_mat_b, d_mat_c, nx, ny);
    // Check for launch errors (bad configuration) right after the launch ...
    CHECK(cudaGetLastError());
    // ... then wait for the kernel so that errors raised while it runs are
    // reported here. Without these checks a failing kernel is silent.
    CHECK(cudaDeviceSynchronize());

    // copy kernel result back to host side
    CHECK(cudaMemcpy(gpuRef, d_mat_c, nBytes, cudaMemcpyDeviceToHost));

    // check result
    checkResult(hostRef, gpuRef, nxy);

    // free memory
    CHECK(cudaFree(d_mat_a));
    CHECK(cudaFree(d_mat_b));
    CHECK(cudaFree(d_mat_c));
    free(h_a);
    free(h_b);
    free(hostRef);
    free(gpuRef);

    // reset device
    CHECK(cudaDeviceReset());
    return 0;
}
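/*
 * Note: keep the kernel-error check shown below after the kernel launch.
 * Without it, a failure in the kernel neither stops the program nor prints
 * any message.
 *
 *     // check kernel error
 *     CHECK(cudaGetLastError());
 *
 * Error output observed with nx = ny = 1 << 14:
 *
 *     File: D:/zxq/code/cuda/CUDA111/CUDA111/kernel.cu
 *     Line: 144
 *     Error code: 700
 *     Error text: an illegal memory access was encountered
 *
 * With 1 << 14 the host and device sums are reported as not matching; with
 * 1 << 13 the program works. Probable cause (TODO confirm): at 1 << 14 each
 * matrix takes 1 GiB, so the three device buffers need 3 GiB in total and may
 * not fit into the GPU memory that is available.
 */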