Out of an interest in parallel computing, I tried computing a matrix multiplication three ways: on the CPU, on the GPU with a hand-written CUDA kernel, and on the GPU with CUDA plus cuBLAS.
1. CPU
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
//#pragma GCC optimize(3)
#define R_SIZE (256*8)
int main() {
    int *a = (int*)malloc(R_SIZE*R_SIZE*sizeof(int));
    int *b = (int*)malloc(R_SIZE*R_SIZE*sizeof(int));
    int *c = (int*)malloc(R_SIZE*R_SIZE*sizeof(int));
    int i, j, k;
    clock_t start, end;
    for (i = 0; i < R_SIZE*R_SIZE; i++) {
        a[i] = rand() % 10000;
        b[i] = rand() % 10000;
        c[i] = 0;
    }
    start = clock();
    for (i = 0; i < R_SIZE; i++)
        for (j = 0; j < R_SIZE; j++)
            for (k = 0; k < R_SIZE; k++)
                c[i*R_SIZE+j] += a[i*R_SIZE+k] * b[k*R_SIZE+j];
    end = clock();
    printf("time is %f s\n", ((double)(end - start)) / CLOCKS_PER_SEC);
    free(a);
    free(b);
    free(c);
    return 0;
}
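The triple loop above uses the textbook i-j-k order, which strides through b column-wise and is cache-unfriendly for row-major arrays. Before reaching for the GPU, a common CPU-side improvement is to reorder the loops to i-k-j so that the innermost loop scans b and c contiguously. A minimal sketch of the timed loop in that order (aik is a local temporary introduced here for illustration; the arrays and R_SIZE are as above):

for (i = 0; i < R_SIZE; i++) {
    for (k = 0; k < R_SIZE; k++) {
        int aik = a[i*R_SIZE+k];                    // reused across the whole inner loop
        for (j = 0; j < R_SIZE; j++)
            c[i*R_SIZE+j] += aik * b[k*R_SIZE+j];   // contiguous accesses to b and c
    }
}

On typical hardware this reordering alone often yields a severalfold speedup over the naive order, though it still falls far short of a GPU.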
2. GPU CUDA cuBLAS
#include "cuda_runtime.h"
#include "cublas_v2.h"
#include
#include
#include
#include
#include
using namespace std;
int main()
{
clock_t start,end;
srand(time(0));
int M = 32*256; //矩阵A的行,矩阵C的行
int N = 32*256; //矩阵A的列,矩阵B的行
int K = 32*256; //矩阵B的列,矩阵C的列
float *h_A = (float*)malloc(sizeof(float)*M*N);
float *h_B = (float*)malloc(sizeof(float)*N*K);
float *h_C = (float*)malloc(sizeof(float)*M*K);
for (int i = 0; i < M*N; i++)
{
h_A[i] = rand() % 10;
}
for (int i = 0; i < N*K; i++)
{
h_B[i] = rand() % 10;
}
float *d_A, *d_B, *d_C,*d_CT;
start=clock();
cudaMalloc((void**)&d_A, sizeof(float)*M*N);
cudaMalloc((void**)&d_B, sizeof(float)*N*K);
cudaMalloc((void**)&d_C, sizeof(float)*M*K);
cudaMemcpy(d_A, h_A, M*N * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_B, h_B, N*K * sizeof(float), cudaMemcpyHostToDevice);
float alpha = 1;
float beta = 0;
//C=A*B
cublasHandle_t handle;
cublasCreate(&handle);
cublasSgemm(handle,
CUBLAS_OP_N,
CUBLAS_OP_N,
K, //矩阵B的列数
M, //矩阵A的行数
N, //矩阵A的列数
&alpha,
d_B,
K,
d_A,
N,
&beta,
d_C,
K);
cudaMemcpy(h_C, d_C, M*K * sizeof(float), cudaMemcpyDeviceToHost);
end=clock();
cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C);
free(h_A);
free(h_B);
free(h_C);
printf("time used is %f s\n",((double)end-start)/CLOCKS_PER_SEC);
return 0;
}
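The column-major trick above is easy to get wrong (swapping the leading dimensions silently produces a wrong C), so a cheap sanity check is useful. A minimal sketch, to be placed before the free() calls, that spot-checks a few random entries of h_C against a direct CPU dot product (check_count is an illustrative name introduced here; fabsf comes from <cmath>, included above):

// Verify a handful of random entries of C = A*B on the CPU.
int check_count = 10;
for (int t = 0; t < check_count; t++) {
    int i = rand() % M, j = rand() % K;
    float ref = 0;
    for (int n = 0; n < N; n++)
        ref += h_A[i*N + n] * h_B[n*K + j];
    if (fabsf(h_C[i*K + j] - ref) > 1e-3f * (fabsf(ref) + 1.0f))
        printf("mismatch at (%d,%d): gpu=%f cpu=%f\n", i, j, h_C[i*K + j], ref);
}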
3. GPU CUDA
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include "cuda_runtime.h"
#define BLOCK_NUM 32    // number of blocks
#define THREAD_NUM 256  // threads per block
#define R_SIZE (BLOCK_NUM * THREAD_NUM)
#define M_SIZE (R_SIZE * R_SIZE)
__global__ void mat_mul(int *mat1, int *mat2, int *result) {
    const int bid = blockIdx.x;
    const int tid = threadIdx.x;
    // each thread computes one full row of the result
    const int row = bid * THREAD_NUM + tid;
    for (int c = 0; c < R_SIZE; c++) {
        // accumulate in a register so the device buffer need not be zeroed
        int sum = 0;
        for (int n = 0; n < R_SIZE; n++) {
            sum += mat1[row*R_SIZE+n] * mat2[n*R_SIZE+c];
        }
        result[row*R_SIZE+c] = sum;
    }
}
int main(int argc, char *argv[]) {
    int *mat1, *mat2, *result;
    int *g_mat1, *g_mat2, *g_mat_result;
    clock_t start, end;
    // represent each 2-D matrix as a 1-D array
    mat1 = (int*) malloc(M_SIZE * sizeof(int));
    mat2 = (int*) malloc(M_SIZE * sizeof(int));
    result = (int*) malloc(M_SIZE * sizeof(int));
    // initialize
    for (int i = 0; i < M_SIZE; i++) {
        mat1[i] = rand() / 1000000;
        mat2[i] = rand() / 1000000;
        result[i] = 0;
    }
    cudaMalloc((void **)&g_mat1, sizeof(int) * M_SIZE);
    cudaMalloc((void **)&g_mat2, sizeof(int) * M_SIZE);
    cudaMalloc((void **)&g_mat_result, sizeof(int) * M_SIZE);
    start = clock();
    cudaMemcpy(g_mat1, mat1, sizeof(int) * M_SIZE, cudaMemcpyHostToDevice);
    cudaMemcpy(g_mat2, mat2, sizeof(int) * M_SIZE, cudaMemcpyHostToDevice);
    mat_mul<<<BLOCK_NUM, THREAD_NUM>>>(g_mat1, g_mat2, g_mat_result);
    // this device-to-host copy also synchronizes with the kernel
    cudaMemcpy(result, g_mat_result, sizeof(int) * M_SIZE, cudaMemcpyDeviceToHost);
    end = clock();
    printf("time=%f\n", ((double)(end - start)) / CLOCKS_PER_SEC);
    cudaFree(g_mat1);
    cudaFree(g_mat2);
    cudaFree(g_mat_result);
    free(mat1);
    free(mat2);
    free(result);
    return 0;
}
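One caveat on the measurement: clock() measures host time and, as written, lumps the host-to-device and device-to-host copies in with the kernel. To time only the kernel, CUDA events are the standard tool; a minimal sketch that would replace the clock() calls around the launch above:

// Time just the kernel launch with CUDA events (milliseconds).
cudaEvent_t ev_start, ev_stop;
cudaEventCreate(&ev_start);
cudaEventCreate(&ev_stop);
cudaEventRecord(ev_start);
mat_mul<<<BLOCK_NUM, THREAD_NUM>>>(g_mat1, g_mat2, g_mat_result);
cudaEventRecord(ev_stop);
cudaEventSynchronize(ev_stop);   // block until the kernel has finished
float ms = 0;
cudaEventElapsedTime(&ms, ev_start, ev_stop);
printf("kernel time = %f ms\n", ms);
cudaEventDestroy(ev_start);
cudaEventDestroy(ev_stop);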
Runtime comparison
From top to bottom, the measured runtimes are for CUDA cuBLAS, CUDA, and CPU. Note that the first two multiply a pair of (32×256)×(32×256), i.e. 8192×8192, matrices, while the third multiplies a pair of (256×8)×(256×8), i.e. 2048×2048, matrices. Even with the much smaller problem, the CPU is by far the slowest: in speed, cuBLAS > CUDA >> CPU.