【CUDA-C/C++】任意维度矩阵乘

之前已经写过一篇利用 Fortran 实现任意维度矩阵乘的 CUDA 版本,详见:https://blog.csdn.net/xll_bit/article/details/117551476?spm=1001.2014.3001.5501

今天更新一个使用 C/C++ 实现的任意维度矩阵乘的 CUDA 版本,分别包含简单实现版本和利用 tile(分块)技术实现的 shared memory(共享内存)版本,详细代码如下:

#include 
#include 
#include 
using namespace std;

#define TILE_SIZE 16
typedef float my_type;

bool comp(my_type *s, my_type *p, int M, int K){
    for(int m = 0; m < M * K; m ++){
        if( abs(s[m] - p[m]) > 1e-1){
            cout << "Test failed at" << m <<' '<>>(d_a, d_b, d_c, M, N, K);
    cudaEventRecord(start,0);
    for(int i=0; i < NUM; i ++)
        kernel_global<<>>(d_a, d_b, d_c, M, N, K);
    cudaEventRecord(stop,0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&elapsedTime_global,start,stop);
    elapsedTime_global /= NUM;

    cudaEventRecord(start,0);
    for(int i=0; i < NUM; i ++)
        kernel_shared<<>>(d_a, d_b, d_c, M, N, K);
    cudaEventRecord(stop,0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&elapsedTime_shared,start,stop);
    elapsedTime_shared /= NUM;
    cudaMemcpy(p_c,d_c,M*K*sizeof(my_type),cudaMemcpyDeviceToHost);

    clock_t c_start, c_stop;

    c_start = clock();
    for(int m = 0; m < M; m ++){
        for( int k = 0; k < K; k ++){
            my_type tmp = 0;
            for( int n = 0; n < N; n++){
                tmp += h_a[m * N + n] * h_b[n * K + k];
            }
            s_c[m * K + k] = tmp;
        }
    }
    c_stop = clock();
    elapsedTime_cpu = (c_stop - c_start)/ 1000;
    if( !comp(s_c,p_c,M, K))
        print(s_c,p_c,M, K);
    cout<<"cpu time:    "<

以下是加上 -O2 编译选项后的运行结果:

【CUDA-C/C++】任意维度矩阵乘_第1张图片

你可能感兴趣的:(CUDA,c++,性能分析,c语言,c++,算法)