Matrix multiplication is one of the CUDA samples, so it is a good, relatively simple program for getting familiar with CUDA. The code below has three parts: a CPU reference implementation, a naive GPU kernel that uses only global memory, and a tiled GPU kernel that uses shared memory. The discussion so far has been fairly general; to understand the details, you still need to read the source code.
```cpp
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
// CPU reference implementation of matrix multiplication
int matrixMutilCPU(float *A, float *B, int rowA, int colA, int rowB, int colB, float *C){
    float tmp;
    if (colA != rowB)
        return -1;
    for (int i = 0; i < rowA; i++){
        for (int j = 0; j < colB; j++){ // loop over the elements of C
            tmp = 0;
            for (int k = 0; k < colA; k++){
                tmp += (A[colA*i+k] * B[colB*k+j]);
            }
            C[i*colB+j] = tmp;
        }
    }
    return 0;
}
// Naive matrix multiplication using global memory only
__global__ void matrixMutilGPU_slow(float *A, float *B, int colA, int colB, float *C){
    int xb = blockIdx.x;
    int yb = blockIdx.y;
    int x = threadIdx.x;
    int y = threadIdx.y;
    // 2D coordinates of this thread's element in the result matrix
    int row = blockDim.y*yb + y; // blockDim.x == blockDim.y == blockSize
    int col = blockDim.x*xb + x;
    float tsum = 0;
    // dot product of row `row` of A and column `col` of B; the bound is the
    // shared dimension colA (the original looped to colB, which only worked
    // because the test matrices are square)
    for (int k = 0; k < colA; k++){
        tsum += (A[row*colA+k] * B[k*colB+col]);
    }
    // write the result
    C[row*colB+col] = tsum;
}
// Tiled matrix multiplication using shared memory
template <int blockSize>
__global__ void matrixMutilGPU(float *A, float *B, int colA, int colB, float *C){
    int xb = blockIdx.x;
    int yb = blockIdx.y;
    int x = threadIdx.x;
    int y = threadIdx.y;
    // This block computes one blockSize x blockSize tile of C by sweeping over
    // the corresponding tiles of A and B. With a 3x3 tiling:
    //   A11 A12 A13     B11 B12 B13
    //   A21 A22 A23  *  B21 B22 B23
    //   A31 A32 A33     B31 B32 B33
    // the tile C22 is computed as A21*B12 + A22*B22 + A23*B32.
    float *BeginA = A + yb*blockSize*colA; // first tile of A for this block
    float *EndA   = BeginA + colA;         // one past the last tile in this row of A
    float *BeginB = B + blockSize*xb;      // first tile of B for this block
    int stepA = blockSize;      // advance A one tile to the right
    int stepB = blockSize*colB; // advance B one tile downward
    float tsum = 0;
    // iterate over the tile pairs of A and B
    for (; BeginA < EndA; BeginA += stepA, BeginB += stepB){
        __shared__ float As[blockSize][blockSize];
        __shared__ float Bs[blockSize][blockSize];
        // each thread loads one element of each tile into shared memory
        As[y][x] = *(BeginA + y*colA + x);
        Bs[y][x] = *(BeginB + y*colB + x);
        __syncthreads(); // wait until both tiles are fully loaded
        for (int k = 0; k < blockSize; k++){
            tsum += As[y][k] * Bs[k][x];
        }
        __syncthreads(); // make sure all threads are done before the tiles are overwritten in the next iteration
    }
    // write the result; note how the global coordinates are reconstructed
    C[yb*blockSize*colB + y*colB + xb*blockSize + x] = tsum;
}
int main(int argc, char **argv){
    // 1. construct Mat A & Mat B
    int rowA, colA, rowB, colB;
    rowA = 1024;
    colA = 1024;
    rowB = 1024;
    colB = 1024;
    float *Ah, *Bh, *Ch;
    int memSizeA, memSizeB, memSizeC;
    memSizeA = sizeof(float)*colA*rowA;
    memSizeB = sizeof(float)*colB*rowB;
    memSizeC = sizeof(float)*colB*rowA; // C is rowA x colB
    Ah = (float *)malloc(memSizeA);
    Bh = (float *)malloc(memSizeB);
    Ch = (float *)malloc(memSizeC);
    for (int k = 0; k < colA*rowA; k++)
        Ah[k] = 1.0f;
    for (int k = 0; k < colB*rowB; k++)
        Bh[k] = 0.1f;
    clock_t t1 = clock();
    matrixMutilCPU(Ah, Bh, rowA, colA, rowB, colB, Ch);
    clock_t t2 = clock();
    double time = (t2 - t1)*1.0 / CLOCKS_PER_SEC;
    printf("CPU cost:%.8lf s\n", time);
    printf("C[0]=%.8lf\n", Ch[0]);
    printf("=======================\n");
    for (int k = 0; k < colB*rowA; k++)
        Ch[k] = 0.0f;
    // 2. set GPU params
    // By default we use device 0; a real program would override this from the command line
    cudaSetDevice(0);
    const int blockSize = 16;
    cudaError_t error;
    dim3 threads(blockSize, blockSize);
    dim3 grid(colB / threads.x, rowA / threads.y);
    float *Ad, *Bd, *Cd;
    error = cudaMalloc(&Ad, memSizeA);
    if (error != cudaSuccess){
        printf("cudaMalloc Ad returned error %s (code %d), line(%d)\n", cudaGetErrorString(error), error, __LINE__);
        exit(EXIT_FAILURE);
    }
    error = cudaMalloc(&Bd, memSizeB);
    if (error != cudaSuccess){
        printf("cudaMalloc Bd returned error %s (code %d), line(%d)\n", cudaGetErrorString(error), error, __LINE__);
        exit(EXIT_FAILURE);
    }
    error = cudaMalloc(&Cd, memSizeC);
    if (error != cudaSuccess){
        printf("cudaMalloc Cd returned error %s (code %d), line(%d)\n", cudaGetErrorString(error), error, __LINE__);
        exit(EXIT_FAILURE);
    }
    // 3. move data from host to device
    cudaMemcpy(Ad, Ah, memSizeA, cudaMemcpyHostToDevice);
    cudaMemcpy(Bd, Bh, memSizeB, cudaMemcpyHostToDevice);
    cudaMemcpy(Cd, Ch, memSizeC, cudaMemcpyHostToDevice);
    // 4. compute
    t1 = clock();
    //matrixMutilGPU_slow<<<grid, threads>>>(Ad, Bd, colA, colB, Cd); // global-memory version
    matrixMutilGPU<blockSize><<<grid, threads>>>(Ad, Bd, colA, colB, Cd); // shared-memory version
    // 5. move data back to host
    cudaMemcpy(Ch, Cd, memSizeC, cudaMemcpyDeviceToHost);
    t2 = clock(); // GPU time, including the device-to-host copy
    time = (t2 - t1)*1.0 / CLOCKS_PER_SEC;
    printf("GPU cost:%.8lf s\n", time);
    printf("C[0]=%.8lf\n", Ch[0]);
    // 6. release memory
    free(Ah);
    free(Bh);
    free(Ch);
    cudaFree(Ad);
    cudaFree(Bd);
    cudaFree(Cd);
    return 0;
}
```
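One caveat on the timing code: clock() measures the kernel together with the final device-to-host copy. For kernel-only numbers, CUDA events are the usual tool. A minimal sketch, not part of the original sample, reusing the variable names from the program above:

```cpp
// Sketch: timing only the kernel with CUDA events.
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start);
matrixMutilGPU<blockSize><<<grid, threads>>>(Ad, Bd, colA, colB, Cd);
cudaEventRecord(stop);
cudaEventSynchronize(stop);             // wait until the kernel has finished
float ms = 0.0f;
cudaEventElapsedTime(&ms, start, stop); // elapsed time in milliseconds
printf("kernel only: %.3f ms\n", ms);
cudaEventDestroy(start);
cudaEventDestroy(stop);
```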
The following table records the running time of a 1024*1024 float matrix multiplication, compiled with the default Visual Studio options in release mode.
The test machine has an Intel i7 CPU and a GTX 1066.
Block size | 4*4 | 16*16 | 32*32 | 64*64 |
---|---|---|---|---|
CPU | 2.386s | 2.386s | 2.386s | 2.386s |
GPU, global-memory version | 0.036s | 0.011s | 0.011s | kernel did not run |
GPU, shared-memory version | 0.027s | 0.005s | 0.005s | kernel did not run |
The GPU speedup is clear, and the raw GPU compute is faster still than these numbers suggest, because the measured time also includes copying the result from device memory back to host memory. My first guess for the 64*64 failures was insufficient device memory, since larger tiles use more shared memory, but that would not explain why the global-memory version fails too. The actual cause is the launch configuration: a 64*64 block contains 4096 threads, which exceeds CUDA's limit of 1024 threads per block, so the kernel launch itself fails for both versions.
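A failed launch like this is silent unless you ask for it. A minimal sketch of checking the launch status right after the kernel call (not part of the original program):

```cpp
// Sketch: surfacing a failed kernel launch. With a 64*64 block this prints
// "invalid configuration argument" instead of failing silently.
matrixMutilGPU<blockSize><<<grid, threads>>>(Ad, Bd, colA, colB, Cd);
cudaError_t launchErr = cudaGetLastError();    // launch-configuration errors appear here
if (launchErr != cudaSuccess)
    printf("launch failed: %s\n", cudaGetErrorString(launchErr));
cudaError_t syncErr = cudaDeviceSynchronize(); // errors raised while the kernel executes
if (syncErr != cudaSuccess)
    printf("kernel execution failed: %s\n", cudaGetErrorString(syncErr));
```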