矩阵乘法在不同平台上的运行时间

出于对并行计算的兴趣,我分别用 CPU、GPU(调用 cuBLAS 库)和 GPU(手写 CUDA 核函数)这三种方式实现了矩阵乘法,并比较了它们的运行时间。
一、CPU

#include
#include
#include
//#pragma GCC optimize(3)
/* Number of rows (and columns) of the square matrices.
 * Parenthesized so the macro expands as a single value in any expression
 * (e.g. x / R_SIZE or x % R_SIZE). */
#define R_SIZE (256*8)
/*
 * CPU baseline: multiplies two R_SIZE x R_SIZE integer matrices with the
 * naive triple loop and prints the CPU time spent in the multiplication.
 *
 * Matrices are stored row-major in flat arrays. Returns 0 on success and 1
 * if the host buffers cannot be allocated.
 */
int main(){
  const int n=R_SIZE;
  const size_t count=(size_t)n*(size_t)n;
  int* a=(int*)malloc(count*sizeof(int));
  int* b=(int*)malloc(count*sizeof(int));
  int* c=(int*)malloc(count*sizeof(int));
  size_t idx;
  int i,j,k;
  clock_t start,end;

  if(a==NULL||b==NULL||c==NULL)
      {
      fprintf(stderr,"failed to allocate host matrices\n");
      free(a);
      free(b);
      free(c);
      return 1;
      }

  /* Inputs are kept in [0,100): each product is below 10^4, so a dot product
     of n=2048 terms stays far below INT_MAX. A wider input range makes the
     int accumulation below overflow, which is undefined behaviour. */
  for(idx=0;idx<count;idx++)
      {
      a[idx]=rand()%100;
      b[idx]=rand()%100;
      c[idx]=0;
      }

  /* Only the multiplication itself is timed. */
  start=clock();
  for(i=0;i<n;i++)
    for(j=0;j<n;j++)
       for(k=0;k<n;k++)
          c[i*n+j]+=a[i*n+k]*b[k*n+j];
  end=clock();
  printf("time is %f s\n",(double)(end-start)/CLOCKS_PER_SEC);

  free(a);
  free(b);
  free(c);
  return 0;
}

二、GPU CUDA CUBLAS

#include "cuda_runtime.h"
#include "cublas_v2.h"
#include
#include 
#include 
#include 
#include 

using namespace std;

// Prints a diagnostic and terminates the program when a CUDA runtime call fails.
static void requireCuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess)
    {
        fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Prints a diagnostic and terminates the program when a cuBLAS call fails.
static void requireCublas(cublasStatus_t status, const char *what)
{
    if (status != CUBLAS_STATUS_SUCCESS)
    {
        fprintf(stderr, "%s failed with cuBLAS status %d\n", what, (int)status);
        exit(EXIT_FAILURE);
    }
}

// cuBLAS benchmark: computes C = A * B for row-major float matrices with
// cublasSgemm and prints the time spent on the host->device copies, the GEMM
// and the device->host copy.
//
// Device allocation and cuBLAS handle creation are one-time setup and are
// deliberately left outside the timed region, so the figure is comparable to
// a benchmark that times only transfers plus compute.
int main()
{
    clock_t start, end;
    srand(time(0));
    const int M = 32 * 256;            // rows of A, rows of C
    const int N = 32 * 256;            // columns of A, rows of B
    const int K = 32 * 256;            // columns of B, columns of C

    const size_t countA = (size_t)M * N;
    const size_t countB = (size_t)N * K;
    const size_t countC = (size_t)M * K;

    float *h_A = (float*)malloc(sizeof(float) * countA);
    float *h_B = (float*)malloc(sizeof(float) * countB);
    float *h_C = (float*)malloc(sizeof(float) * countC);
    if (h_A == NULL || h_B == NULL || h_C == NULL)
    {
        fprintf(stderr, "failed to allocate host matrices\n");
        free(h_A);
        free(h_B);
        free(h_C);
        return 1;
    }

    // Single-digit inputs keep every dot product exactly representable in float.
    for (size_t i = 0; i < countA; i++)
    {
        h_A[i] = (float)(rand() % 10);
    }

    for (size_t i = 0; i < countB; i++)
    {
        h_B[i] = (float)(rand() % 10);
    }

    float *d_A = NULL, *d_B = NULL, *d_C = NULL;

    requireCuda(cudaMalloc((void**)&d_A, sizeof(float) * countA), "cudaMalloc(d_A)");
    requireCuda(cudaMalloc((void**)&d_B, sizeof(float) * countB), "cudaMalloc(d_B)");
    requireCuda(cudaMalloc((void**)&d_C, sizeof(float) * countC), "cudaMalloc(d_C)");

    cublasHandle_t handle;
    requireCublas(cublasCreate(&handle), "cublasCreate");

    const float alpha = 1.0f;
    const float beta = 0.0f;

    start = clock();
    requireCuda(cudaMemcpy(d_A, h_A, sizeof(float) * countA, cudaMemcpyHostToDevice),
                "cudaMemcpy(d_A)");
    requireCuda(cudaMemcpy(d_B, h_B, sizeof(float) * countB, cudaMemcpyHostToDevice),
                "cudaMemcpy(d_B)");

    // C = A * B.
    // cuBLAS is column-major. A row-major matrix reinterpreted as column-major
    // is its transpose, so we ask cuBLAS for C^T = B^T * A^T: pass B first and
    // A second, with each leading dimension equal to the row-major row length.
    requireCublas(cublasSgemm(handle,
        CUBLAS_OP_N,
        CUBLAS_OP_N,
        K,                    // rows of C^T    = columns of B
        M,                    // columns of C^T = rows of A
        N,                    // shared dimension = columns of A
        &alpha,
        d_B,
        K,                    // leading dimension of B^T
        d_A,
        N,                    // leading dimension of A^T
        &beta,
        d_C,
        K),                   // leading dimension of C^T
        "cublasSgemm");

    // The blocking copy also waits for the GEMM and surfaces any error it raised.
    requireCuda(cudaMemcpy(h_C, d_C, sizeof(float) * countC, cudaMemcpyDeviceToHost),
                "cudaMemcpy(h_C)");
    end = clock();

    requireCublas(cublasDestroy(handle), "cublasDestroy");
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    free(h_A);
    free(h_B);
    free(h_C);
    printf("time used is %f s\n", (double)(end - start) / CLOCKS_PER_SEC);
    return 0;
}

三、GPU CUDA

#include 
#include
#include
#include

#define BLOCK_NUM 32   // number of thread blocks in the launch
#define THREAD_NUM 256 // number of threads per block
// Matrix dimension (rows == columns). Parenthesized so the macro expands as a
// single value in any expression (e.g. x / R_SIZE).
#define R_SIZE (BLOCK_NUM * THREAD_NUM)
// Total number of elements in one R_SIZE x R_SIZE matrix.
#define M_SIZE (R_SIZE * R_SIZE)

// Computes result = mat1 * mat2 for square R_SIZE x R_SIZE integer matrices
// stored row-major in flat device arrays.
//
// Launch layout: 1-D grid of 1-D blocks; each thread produces one complete row
// of the result, so gridDim.x * blockDim.x must be at least R_SIZE. Surplus
// threads exit immediately. No shared memory is used.
//
// Every output element is overwritten, so `result` does not need to be
// initialized by the caller. Sums are accumulated in int; the caller must keep
// the inputs small enough that an R_SIZE-term dot product fits in int.
__global__ void mat_mul(int *mat1, int *mat2, int *result) {
    const int n = R_SIZE;
    const int row = blockIdx.x * blockDim.x + threadIdx.x;
    if (row >= n) {
        return;
    }
    for (int col = 0; col < n; col++) {
        // Accumulate in a register and store once, instead of a global
        // read-modify-write of result[] for every term.
        int acc = 0;
        for (int k = 0; k < n; k++) {
            acc += mat1[row * n + k] * mat2[k * n + col];
        }
        result[row * n + col] = acc;
    }
}

// Prints a diagnostic and terminates the program when a CUDA runtime call fails.
static void checkCudaCall(cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Hand-written CUDA benchmark: multiplies two R_SIZE x R_SIZE integer matrices
// with the mat_mul kernel and prints the time spent on the host->device
// copies, the kernel and the device->host copy. Device allocation is setup and
// is not timed.
int main(int argc, char *argv[]) {
    (void)argc;
    (void)argv;

    int *mat1, *mat2, *result;
    int *g_mat1 = NULL, *g_mat2 = NULL, *g_mat_result = NULL;
    clock_t start, end;

    const size_t count = M_SIZE;
    const size_t bytes = count * sizeof(int);

    // Two-dimensional matrices are stored as flat one-dimensional arrays.
    mat1 = (int*) malloc(bytes);
    mat2 = (int*) malloc(bytes);
    result = (int*) malloc(bytes);
    if (mat1 == NULL || mat2 == NULL || result == NULL) {
        fprintf(stderr, "failed to allocate host matrices\n");
        free(mat1);
        free(mat2);
        free(result);
        return 1;
    }

    // Inputs are kept in [0,100): each product is below 10^4, so a dot product
    // of R_SIZE = 8192 terms stays far below INT_MAX. A wider input range makes
    // the kernel's int accumulation overflow.
    for (size_t i = 0; i < count; i++) {
        mat1[i] = rand() % 100;
        mat2[i] = rand() % 100;
    }

    checkCudaCall(cudaMalloc((void **)&g_mat1, bytes), "cudaMalloc(g_mat1)");
    checkCudaCall(cudaMalloc((void **)&g_mat2, bytes), "cudaMalloc(g_mat2)");
    checkCudaCall(cudaMalloc((void **)&g_mat_result, bytes), "cudaMalloc(g_mat_result)");
    // cudaMalloc does not clear memory; zero the output so a kernel that
    // accumulates into it starts from a defined value.
    checkCudaCall(cudaMemset(g_mat_result, 0, bytes), "cudaMemset(g_mat_result)");

    start = clock();

    checkCudaCall(cudaMemcpy(g_mat1, mat1, bytes, cudaMemcpyHostToDevice),
                  "cudaMemcpy(g_mat1)");
    checkCudaCall(cudaMemcpy(g_mat2, mat2, bytes, cudaMemcpyHostToDevice),
                  "cudaMemcpy(g_mat2)");

    mat_mul<<<BLOCK_NUM, THREAD_NUM>>>(g_mat1, g_mat2, g_mat_result);
    // A launch returns no status; bad launch configurations show up here.
    checkCudaCall(cudaGetLastError(), "mat_mul launch");

    // The blocking copy waits for the kernel and reports errors raised while it ran.
    checkCudaCall(cudaMemcpy(result, g_mat_result, bytes, cudaMemcpyDeviceToHost),
                  "cudaMemcpy(result)");

    end = clock();
    printf("time=%f\n", (double)(end - start) / CLOCKS_PER_SEC);

    cudaFree(g_mat1);
    cudaFree(g_mat2);
    cudaFree(g_mat_result);
    free(mat1);
    free(mat2);
    free(result);
    return 0;
}




运行时间比较
矩阵乘法在不同平台上的运行时间_第1张图片
上图从上到下依次为 CUDA cuBLAS、CUDA、CPU 三个程序的运行时间。需要注意的是,前两个程序计算的是两个 (32x256)x(32x256)(即 8192x8192)矩阵的乘积,而 CPU 程序计算的是两个 (256x8)x(256x8)(即 2048x2048)矩阵的乘积,运算量只有前者的 1/64。即便如此,CPU 程序仍然最慢,可以看出运行速度 cuBLAS > CUDA >> CPU。

你可能感兴趣的:(c/c++)