2.3 CUDA Matrix Multiplication

CPU matrix multiplication

For two matrices to be multiplied, the number of columns of the first matrix must equal the number of rows of the second.

A(N*P) * B(P*M) = C(N*M), where A has N rows and P columns and B has P rows and M columns. In width/height terms: the width of A equals the height of B, and the C matrix is ha * wb (A's height by B's width).

Each element C(i,j) is the dot product of row i of A with column j of B.

// C = A * B
void MatrixMulCPU(float* _C, const float* _A, const float* _B, int _wa, int _ha, int _wb)
{
    float sum = 0;
    for (int i = 0; i < _ha; ++i)
    {
        for (int j = 0; j < _wb; ++j)
        {
            sum = 0;
            for (int k = 0; k < _wa; ++k)
            {
                // i*_wa picks row i of A and +k walks along that row; A's stride is _wa.
                // j picks column j of B and k*_wb walks down that column (row 0's element j,
                // row 1's element j, ..., row _wa-1's element j); B's stride is _wb.
                sum += _A[i*_wa + k] * _B[k*_wb + j];
            }
            _C[i*_wb + j] = sum;
        }
    }
}

Naive matrix multiplication

C(i,j) = sum{ A(i,k) * B(k,j) }, 0 <= k < _wa. The output elements are nearly independent of one another, so we can partition the output and let each thread be responsible for one region.

How should we partition it? The first idea is to let each thread compute one C(i,j); a quick estimate says we then need height_c * width_c, i.e. ha * wb, threads. Going further, we carve the matrix into square blocks: if a block is 16*16, an 80*48 matrix splits into 5 * 3 blocks (80 = 5*16, 48 = 3*16), so the grid is naturally (height_c/16) * (width_c/16) blocks.
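As a minimal host-side sketch of this launch configuration (the device pointers d_A, d_B, d_C and the dimension variables are assumptions, not from the original post), launching the matrix_kernel_0 kernel shown below might look like this:

    // Hypothetical launch sketch; width_c/height_c are C's dimensions.
    dim3 threads(16, 16);
    dim3 grid(width_c / 16, height_c / 16); // exact multiples of 16, as in the 80*48 example
    matrix_kernel_0<<<grid, threads>>>(d_C, d_A, d_B, wa, wb);
    // For sizes that are not multiples of 16, round up ((n + 15) / 16) and add a
    // bounds check inside the kernel, as the lab3 code below does.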

With the partition in place, the kernel code is below; it simply replaces the two outer CPU loops with the thread's row/column indices.

Compute version 0:

__global__ void matrix_kernel_0(float* _C, const float* _A, const float* _B, int _wa, int _wb)
{
    float sum = 0;
    // Locate this thread's row and column: Y maps to the matrix row, X to the column.
    int row = blockIdx.y*blockDim.y + threadIdx.y;
    int col = blockIdx.x*blockDim.x + threadIdx.x;
    // Thread (row, col) computes C(row, col).
    for (int i = 0; i < _wa; ++i)
    {
        sum += _A[row*_wa + i] * _B[i*_wb + col];
    }
    _C[row*_wb + col] = sum;
}

Here is the code from Heterogeneous Parallel Programming lab3: Basic Matrix Matrix Multiplication:

#include <wb.h>

#define wbCheck(stmt)                                                     \
    do {                                                                  \
        cudaError_t err = stmt;                                           \
        if (err != cudaSuccess) {                                         \
            wbLog(ERROR, "Failed to run stmt ", #stmt);                   \
            wbLog(ERROR, "Got CUDA error ... ", cudaGetErrorString(err)); \
            return -1;                                                    \
        }                                                                 \
    } while (0)

#if 0 // This is the C version of matrixMul

// C = A * B
void MatrixMulCPU(float* _C, const float* _A, const float* _B, int _wa, int _ha, int _wb)
{
    float sum = 0;
    for (int i = 0; i < _ha; ++i)
    {
        for (int j = 0; j < _wb; ++j)
        {
            sum = 0;
            for (int k = 0; k < _wa; ++k)
            {
                sum += _A[i*_wa + k] * _B[k*_wb + j];
            }
            _C[i*_wb + j] = sum;
        }
    }
}
#endif

// Compute C = A * B, where C is hA x wB (rowA x columnB)
__global__ void matrixMultiply(float *A, float *B, float *C,
                               int numARows, int numAColumns,
                               int numBRows, int numBColumns,
                               int numCRows, int numCColumns) {
    //@@ Insert code to implement matrix multiplication here
    float sum = 0.0f;
    int row = blockIdx.y*blockDim.y + threadIdx.y;
    int col = blockIdx.x*blockDim.x + threadIdx.x;
    if (row < numCRows && col < numCColumns) {
        for (int i = 0; i < numAColumns; ++i)
        {
            sum += A[row*numAColumns + i] * B[i*numBColumns + col];
        }
        C[row*numBColumns + col] = sum;
        printf("C = %f\n", C[row*numBColumns + col]);
    }
}

int main(int argc, char **argv) {

    wbArg_t args;
    float *hostA; // The A matrix
    float *hostB; // The B matrix
    float *hostC; // The output C matrix
    float *deviceA;
    float *deviceB;
    float *deviceC;
    int numARows;    // number of rows in the matrix A
    int numAColumns; // number of columns in the matrix A
    int numBRows;    // number of rows in the matrix B
    int numBColumns; // number of columns in the matrix B
    int numCRows;    // number of rows in the matrix C (you have to set this)
    int numCColumns; // number of columns in the matrix C (you have to set this)

    args = wbArg_read(argc, argv);

    wbTime_start(Generic, "Importing data and creating memory on host");
    hostA = (float *)wbImport(wbArg_getInputFile(args, 0), &numARows, &numAColumns);
    hostB = (float *)wbImport(wbArg_getInputFile(args, 1), &numBRows, &numBColumns);
    //@@ Set numCRows and numCColumns
    numCRows = 0;
    numCColumns = 0;
    if (numAColumns != numBRows) {
        wbLog(TRACE, "numAColumns != numBRows, Break");
        return 1;
    }
    numCRows = numARows;
    numCColumns = numBColumns;
    unsigned int A_size = numARows * numAColumns * sizeof(float);
    unsigned int B_size = numBRows * numBColumns * sizeof(float);
    unsigned int C_size = numCRows * numCColumns * sizeof(float);
    //@@ Allocate the hostC matrix
    hostC = (float *)malloc(C_size);
    wbTime_stop(Generic, "Importing data and creating memory on host");

    wbLog(TRACE, "The dimensions of A are ", numARows, " x ", numAColumns);
    wbLog(TRACE, "The dimensions of B are ", numBRows, " x ", numBColumns);

    wbTime_start(GPU, "Allocating GPU memory.");
    //@@ Allocate GPU memory here
    wbCheck(cudaMalloc((void **)&deviceA, A_size));
    wbCheck(cudaMalloc((void **)&deviceB, B_size));
    wbCheck(cudaMalloc((void **)&deviceC, C_size));
    wbTime_stop(GPU, "Allocating GPU memory.");

    wbTime_start(GPU, "Copying input memory to the GPU.");
    //@@ Copy memory to the GPU here
    wbCheck(cudaMemcpy(deviceA, hostA, A_size, cudaMemcpyHostToDevice));
    wbCheck(cudaMemcpy(deviceB, hostB, B_size, cudaMemcpyHostToDevice));
    wbCheck(cudaMemcpy(deviceC, hostC, C_size, cudaMemcpyHostToDevice));
    wbTime_stop(GPU, "Copying input memory to the GPU.");

    //@@ Initialize the grid and block dimensions here
    dim3 DimGrid((numCColumns - 1) / 16 + 1, (numCRows - 1) / 16 + 1, 1);
    dim3 DimBlock(16, 16, 1);

    wbTime_start(Compute, "Performing CUDA computation");
    //@@ Launch the GPU Kernel here
    matrixMultiply<<<DimGrid, DimBlock>>>(deviceA, deviceB, deviceC,
                                          numARows, numAColumns,
                                          numBRows, numBColumns,
                                          numCRows, numCColumns);
    cudaDeviceSynchronize();
    wbTime_stop(Compute, "Performing CUDA computation");

    wbTime_start(Copy, "Copying output memory to the CPU");
    //@@ Copy the GPU memory back to the CPU here
    wbCheck(cudaMemcpy(hostC, deviceC, C_size, cudaMemcpyDeviceToHost));
    wbTime_stop(Copy, "Copying output memory to the CPU");

    wbTime_start(GPU, "Freeing GPU Memory");
    //@@ Free the GPU memory here
    wbCheck(cudaFree(deviceA));
    wbCheck(cudaFree(deviceB));
    wbCheck(cudaFree(deviceC));
    wbTime_stop(GPU, "Freeing GPU Memory");

    wbSolution(args, hostC, numCRows, numCColumns);

    free(hostA);
    free(hostB);
    free(hostC);
    return 0;
}


Tiled matrix multiplication

Another approach: rather than having each thread compute a complete C(i,j) on its own, note from C(i,j) = sum{ A(i,k)*B(k,j) } that the sum can be split at a finer granularity:

Csub(i,j) = sum{ A(i, ksub+offsetA) * B(ksub+offsetB, j) }, 0 <= ksub < blockSize

C(i,j) = sum{ Csub(i,j) }

That is, the matrices are carved into sub-blocks (tiles), and each thread block computes the partial product of a tile of A and a tile of B; accumulating the partial sums gives the final result. For example, with blockSize = 16 and _wa = 48, each C(i,j) is accumulated over 3 rounds of 16 products each. The main point is to bring in shared memory to improve efficiency.

The tiled kernel looks like this:

// _wa is the width of A, _wb the width of B
__global__ void matrix_kernel_1(float* _C, const float* _A, const float* _B, int _wa, int _wb)
{
    int bx = blockIdx.x; // current block position in X
    int by = blockIdx.y; // current block position in Y
    int tx = threadIdx.x;
    int ty = threadIdx.y;

    // The tiles of A this block reads advance along the X axis; the tiles of B advance along the Y axis.
    int aBegin = _wa*(by*BLOCK_SIZE); // A(0,by): first address of this block's row of tiles in A
    int aEnd   = aBegin + _wa - 1;    // last address of that row
    int aStep  = BLOCK_SIZE;          // A is read horizontally, so the step is BLOCK_SIZE
    int bBegin = BLOCK_SIZE*bx;       // B(bx,0): first address of this block's column of tiles in B
    int bStep  = BLOCK_SIZE*_wb;      // B is read vertically, so the step is BLOCK_SIZE*_wb

    float cSub = 0;

    // Each thread computes one output element in wa/BLOCK_SIZE rounds; each round computes one
    // A(sub) * B(sub) segment, and the rounds accumulate into C's final value.
    // For n*n matrices, the basic kernel issues 2n global-memory reads per thread; with shared
    // memory this drops to 2n/BLOCK_SIZE, a factor-of-BLOCK_SIZE improvement, because the threads
    // in a block compute out of shared memory, which is much faster.
    for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep)
    {
        __shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
        __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];

        // Each thread copies one element. Per block: with BLOCK_SIZE=16 a block has 16*16 threads,
        // exactly enough to fill the 16*16 tiles As (from A) and Bs (from B) in shared memory.
        As[ty][tx] = _A[a + _wa*ty + tx];
        Bs[ty][tx] = _B[b + _wb*ty + tx];
        __syncthreads(); // make sure the current tile of A and of B is fully loaded before computing

        // Each thread accumulates the partial dot product of this tile pair (width BLOCK_SIZE);
        // after wa/BLOCK_SIZE rounds the accumulation yields C's value.
        for (int k = 0; k < BLOCK_SIZE; ++k)
        {
            cSub += As[ty][k]*Bs[k][tx];
        }
        __syncthreads();
    }

    // Write the result back to global memory: one thread per element, one block per tile.
    int cIndex = (by*BLOCK_SIZE + ty)*_wb + (bx*BLOCK_SIZE + tx);
    _C[cIndex] = cSub;
}
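This kernel assumes BLOCK_SIZE is defined at compile time and that the matrix sides are exact multiples of it. A minimal hypothetical host-side setup (d_A, d_B, d_C and the dimension variables are assumptions, not from the original post):

    #define BLOCK_SIZE 16

    // ha/wa are A's height/width, wb is B's width; all assumed multiples of BLOCK_SIZE.
    dim3 threads(BLOCK_SIZE, BLOCK_SIZE);
    dim3 grid(wb / BLOCK_SIZE, ha / BLOCK_SIZE); // one block per BLOCK_SIZE*BLOCK_SIZE tile of C
    matrix_kernel_1<<<grid, threads>>>(d_C, d_A, d_B, wa, wb);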

Indexing into a 2D matrix:

Suppose we have a 32*48 matrix (32 rows, 48 columns): x ranges over [0,47] and y over [0,31]. In code it is stored as a two-dimensional array in contiguous memory. We want the index of point(23,23):

As a flat pointer offset, that is 23*48 + 23.

But we can equally well carve this matrix up with a grid and blocks:

grid(3,2) (3 is the number of columns, i.e. the X dimension; 2 is the number of rows, the Y dimension), block(16,16). Grid X ranges over [0,2] and Y over [0,1]; the same applies inside a block. Then we can find the index of point(23,23) this way:

point(23,23) lies in grid cell (1,1), at block-local coordinates (7,7); the block width is 16.

point.y = by*block_size + ty = 1*16 + 7 = 23

point.x = bx*block_size + tx = 1*16 + 7 = 23

The index of point(x,y) is y * width + x.
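A quick standalone check of this arithmetic (a sketch added here, not part of the original labs):

    #include <stdio.h>

    int main(void) {
        const int width = 48, block_size = 16;
        int bx = 1, by = 1, tx = 7, ty = 7;  // block and in-block coords of point(23,23)
        int x = bx * block_size + tx;        // 1*16 + 7 = 23
        int y = by * block_size + ty;        // 1*16 + 7 = 23
        printf("x=%d y=%d flat index=%d\n", x, y, y * width + x); // 23*48 + 23 = 1127
        return 0;
    }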

Here is the code from Heterogeneous Parallel Programming lab4, the tiled (shared-memory) matrix-matrix multiplication:

#include <wb.h>

#define wbCheck(stmt)                                                     \
    do {                                                                  \
        cudaError_t err = stmt;                                           \
        if (err != cudaSuccess) {                                         \
            wbLog(ERROR, "Failed to run stmt ", #stmt);                   \
            wbLog(ERROR, "Got CUDA error ... ", cudaGetErrorString(err)); \
            return -1;                                                    \
        }                                                                 \
    } while (0)

#define TILE_WIDTH 32 // tile (block) size; each thread computes one element of a tile

// Compute C = A * B
__global__ void matrixMultiplyShared(float *A, float *B, float *C,
                                     int numARows, int numAColumns,
                                     int numBRows, int numBColumns,
                                     int numCRows, int numCColumns) {
    //@@ Insert code to implement matrix multiplication here
    //@@ You have to use shared memory for this MP
    __shared__ float sharedM[TILE_WIDTH][TILE_WIDTH];
    __shared__ float sharedN[TILE_WIDTH][TILE_WIDTH];
    int bx = blockIdx.x;
    int by = blockIdx.y;
    int tx = threadIdx.x;
    int ty = threadIdx.y;
    int row = by*TILE_WIDTH + ty;
    int col = bx*TILE_WIDTH + tx;
    float v = 0.0;
    for (int i = 0; i < (int)(ceil((float)numAColumns/TILE_WIDTH)); i++)
    {
        if (i*TILE_WIDTH + tx < numAColumns && row < numARows)
            sharedM[ty][tx] = A[row*numAColumns + i*TILE_WIDTH + tx];
        else
            sharedM[ty][tx] = 0.0;
        if (i*TILE_WIDTH + ty < numBRows && col < numBColumns)
            sharedN[ty][tx] = B[(i*TILE_WIDTH + ty)*numBColumns + col];
        else
            sharedN[ty][tx] = 0.0;
        __syncthreads();
        for (int j = 0; j < TILE_WIDTH; j++)
            v += sharedM[ty][j] * sharedN[j][tx];
        __syncthreads();
    }
    if (row < numCRows && col < numCColumns)
        C[row*numCColumns + col] = v;
}

int main(int argc, char **argv) {

    wbArg_t args;
    float *hostA; // The A matrix
    float *hostB; // The B matrix
    float *hostC; // The output C matrix
    float *deviceA;
    float *deviceB;
    float *deviceC;
    int numARows;    // number of rows in the matrix A
    int numAColumns; // number of columns in the matrix A
    int numBRows;    // number of rows in the matrix B
    int numBColumns; // number of columns in the matrix B
    int numCRows;    // number of rows in the matrix C (you have to set this)
    int numCColumns; // number of columns in the matrix C (you have to set this)

    args = wbArg_read(argc, argv);

    wbTime_start(Generic, "Importing data and creating memory on host");
    hostA = (float *)wbImport(wbArg_getInputFile(args, 0), &numARows, &numAColumns);
    hostB = (float *)wbImport(wbArg_getInputFile(args, 1), &numBRows, &numBColumns);
    //@@ Set numCRows and numCColumns
    numCRows = 0;
    numCColumns = 0;
    if (numAColumns != numBRows) {
        wbLog(TRACE, "numAColumns != numBRows, Break");
        return 1;
    }
    numCRows = numARows;
    numCColumns = numBColumns;
    unsigned int A_size = numARows * numAColumns * sizeof(float);
    unsigned int B_size = numBRows * numBColumns * sizeof(float);
    unsigned int C_size = numCRows * numCColumns * sizeof(float);
    //@@ Allocate the hostC matrix
    hostC = (float *)malloc(C_size);
    wbTime_stop(Generic, "Importing data and creating memory on host");

    wbLog(TRACE, "The dimensions of A are ", numARows, " x ", numAColumns);
    wbLog(TRACE, "The dimensions of B are ", numBRows, " x ", numBColumns);

    wbTime_start(GPU, "Allocating GPU memory.");
    //@@ Allocate GPU memory here
    wbCheck(cudaMalloc((void **)&deviceA, A_size));
    wbCheck(cudaMalloc((void **)&deviceB, B_size));
    wbCheck(cudaMalloc((void **)&deviceC, C_size));
    wbTime_stop(GPU, "Allocating GPU memory.");

    wbTime_start(GPU, "Copying input memory to the GPU.");
    //@@ Copy memory to the GPU here
    wbCheck(cudaMemcpy(deviceA, hostA, A_size, cudaMemcpyHostToDevice));
    wbCheck(cudaMemcpy(deviceB, hostB, B_size, cudaMemcpyHostToDevice));
    wbCheck(cudaMemcpy(deviceC, hostC, C_size, cudaMemcpyHostToDevice));
    wbTime_stop(GPU, "Copying input memory to the GPU.");

    //@@ Initialize the grid and block dimensions here
    dim3 DimGrid(ceil(numCColumns / 32.0), ceil(numCRows / 32.0), 1);
    dim3 DimBlock(TILE_WIDTH, TILE_WIDTH, 1);

    wbTime_start(Compute, "Performing CUDA computation");
    //@@ Launch the GPU Kernel here
    matrixMultiplyShared<<<DimGrid, DimBlock>>>(deviceA, deviceB, deviceC,
                                                numARows, numAColumns,
                                                numBRows, numBColumns,
                                                numCRows, numCColumns);
    cudaDeviceSynchronize();
    wbTime_stop(Compute, "Performing CUDA computation");

    wbTime_start(Copy, "Copying output memory to the CPU");
    //@@ Copy the GPU memory back to the CPU here
    wbCheck(cudaMemcpy(hostC, deviceC, C_size, cudaMemcpyDeviceToHost));
    wbTime_stop(Copy, "Copying output memory to the CPU");

    wbTime_start(GPU, "Freeing GPU Memory");
    //@@ Free the GPU memory here
    wbCheck(cudaFree(deviceA));
    wbCheck(cudaFree(deviceB));
    wbCheck(cudaFree(deviceC));
    wbTime_stop(GPU, "Freeing GPU Memory");

    wbSolution(args, hostC, numCRows, numCColumns);

    free(hostA);
    free(hostB);
    free(hostC);
    return 0;
}

