// cuda矩阵相乘_利用CUDA实现矩阵的快速乘法 (CUDA matrix multiplication: fast matrix multiplication implemented with CUDA)

// -------MatrixMul.cu-----

// 2012-03-27

// czg1989

//

// NOTE(review): the header names were stripped from this listing; the set
// below is reconstructed from the identifiers the file uses - confirm
// against the original source.
#include <stdio.h>         // printf, puts, fprintf
#include <stdlib.h>        // malloc, free, rand, srand
#include <time.h>          // time, clock, clock_t, CLOCKS_PER_SEC

#include <cuda_runtime.h>  // CUDA runtime API (cudaMalloc, cudaMemcpy, events, ...)
#include <cutil.h>         // CUT_DEVICE_INIT, CUDA_SAFE_CALL, CUT_EXIT (legacy CUDA SDK)

// Edge length of the square shared-memory tiles declared in the kernel and of
// the thread block used to launch it.
#define TILE_WIDTH 16

// Edge length of the square matrices handled by the host code
// (each matrix holds WIDTH * WIDTH ints).
#define WIDTH  512

// kernel function

// matrix mul

// Tiled integer matrix multiplication: pd = md * nd.
//
// md, nd and pd are square width x width matrices of int, stored row-major in
// device memory. Each thread computes one element of pd; each block walks the
// shared dimension one tile at a time, staging a tile of md and a tile of nd
// in shared memory.
//
// Launch requirements:
//   - 2D square thread blocks with blockDim.x == blockDim.y <= TILE_WIDTH
//     (the shared tiles are TILE_WIDTH x TILE_WIDTH).
//   - a 2D grid with at least ceil(width / blockDim.x) blocks in x and in y.
// Static shared memory: two TILE_WIDTH x TILE_WIDTH int arrays per block.
//
// width does not have to be a multiple of the block edge: out-of-range tile
// entries are loaded as 0 and out-of-range threads do not store a result.
__global__ void Matrix_Mul(int *md, int *nd, int *pd, int width)
{
    __shared__ int d_m[TILE_WIDTH][TILE_WIDTH];
    __shared__ int d_n[TILE_WIDTH][TILE_WIDTH];

    const int tx = threadIdx.x;
    const int ty = threadIdx.y;

    // Edge of the tile actually used by this launch (square blocks assumed).
    const int tile = blockDim.x;

    // Element of pd this thread is responsible for.
    const int row = blockIdx.y * blockDim.y + ty;
    const int col = blockIdx.x * blockDim.x + tx;

    // Number of tiles along the shared dimension. Derived from width rather
    // than gridDim.x so the full dot product is accumulated for any grid.
    const int numTiles = (width + tile - 1) / tile;

    int mulResult = 0;

    for ( int i = 0; i < numTiles; ++i )
    {
        const int mCol = i * tile + tx;   // column of md staged by this thread
        const int nRow = i * tile + ty;   // row of nd staged by this thread

        // Zero-pad outside the matrix so edge tiles add nothing to the sum.
        d_m[ty][tx] = ( row < width && mCol < width ) ? md[row * width + mCol] : 0;
        d_n[ty][tx] = ( nRow < width && col < width ) ? nd[nRow * width + col] : 0;

        // Both tiles must be fully written before any thread reads them.
        __syncthreads();

        for ( int j = 0; j < tile; ++j )
        {
            mulResult += d_m[ty][j] * d_n[j][tx];
        }

        // Do not overwrite the tiles while other threads are still reading.
        __syncthreads();
    }

    // No early return above: every thread must reach both barriers. Only the
    // final store is guarded.
    if ( row < width && col < width )
    {
        pd[row * width + col] = mulResult;
    }
}

//CPU function

// Fills the WIDTH x WIDTH int matrix m with pseudo-random values in [1, 10].
//
// m must point to at least WIDTH * WIDTH ints.
//
// The generator is seeded only on the first call. Re-seeding with time(NULL)
// on every call gave two matrices filled within the same second identical
// contents.
void FillMatrix( int *m )
{
    static int seeded = 0;

    if ( !seeded )
    {
        srand( (unsigned int)time(NULL) );
        seeded = 1;
    }

    for ( int i = 0; i < WIDTH * WIDTH; ++i )
    {
        // rand() is non-negative, so the result is always in 1..10. The
        // previous "i * rand() % 10 + 1" could overflow int in the product
        // (undefined behaviour) and yield values outside that range.
        m[i] = rand() % 10 + 1;
    }
}

// Writes the WIDTH x WIDTH int matrix p to stdout, one matrix row per output
// line with a space after every value, followed by a blank line.
//
// p must point to at least WIDTH * WIDTH ints stored row-major.
void PrintMatrix( int *p )
{
    for ( int row = 0; row < WIDTH; ++row )
    {
        // A line break goes before every row except the first.
        if ( row != 0 )
        {
            printf("\n");
        }

        const int *cells = p + row * WIDTH;
        for ( int col = 0; col < WIDTH; ++col )
        {
            printf("%d ", cells[col]);
        }
    }

    puts("\n");
}

// Size in bytes of one WIDTH x WIDTH matrix of int; used for every host and
// device allocation and copy in this file.
#define MALLOC_INT_SIZE (sizeof(int) * WIDTH * WIDTH)

// Checks whether the result is correct: computes c = a * b on the CPU and
// compares it element by element with p.
//
//   a, b  - input matrices, width x width ints, row-major
//   c     - output buffer for the CPU product, width x width ints
//   width - edge length of the square matrices
//   p     - result to verify (here: the product computed on the GPU)
//
// Prints the CPU multiplication time in seconds, then either
// "not equal,error!" at the first mismatch or "right" when all elements agree.
void GeneralMul(int *a, int *b, int *c, int width, int *p)
{
    // clock() returns clock_t ticks, not time_t.
    clock_t s = clock();

    for ( int i = 0; i < width; ++i )
    {
        for ( int j = 0; j < width; ++j )
        {
            int sum = 0;

            for ( int k = 0; k < width; ++k )
            {
                sum += a[i*width+k] * b[k*width+j];
            }

            c[i*width+j] = sum;
        }
    }

    clock_t e = clock();

    // Convert ticks with CLOCKS_PER_SEC; a hard-coded 1000 is only right on
    // platforms whose clock() ticks are milliseconds.
    printf("%f\n", (double)(e - s) / CLOCKS_PER_SEC);

    for ( int i = 0; i < width * width; ++i )
    {
        if ( c[i] != p[i] )
        {
            printf("not equal,error!\n");
            return ;
        }
    }

    puts("right");
}

// Host driver: fills two WIDTH x WIDTH matrices with random values,
// multiplies them on the GPU with Matrix_Mul, prints the elapsed time and
// verifies the GPU result against the CPU reference in GeneralMul.
//
// argc/argv are forwarded to CUT_DEVICE_INIT.
//
// The timed interval runs from just before the host allocations to just after
// the result has been copied back. It therefore covers allocation, matrix
// fill, both transfers and the kernel.
void Process( int argc, char** argv )
{
    CUT_DEVICE_INIT( argc, argv );

    int *m, *n, *p, *c;
    int *d_m, *d_n, *d_p;
    cudaEvent_t begin, end;

    CUDA_SAFE_CALL( cudaEventCreate(&begin) );
    CUDA_SAFE_CALL( cudaEventCreate(&end) );
    CUDA_SAFE_CALL( cudaEventRecord(begin, 0) );

    // Host buffers: the two inputs, the GPU result and the CPU reference.
    m = (int*)malloc(MALLOC_INT_SIZE);
    n = (int*)malloc(MALLOC_INT_SIZE);
    p = (int*)malloc(MALLOC_INT_SIZE);
    c = (int*)malloc(MALLOC_INT_SIZE);

    if ( m == NULL || n == NULL || p == NULL || c == NULL )
    {
        fprintf(stderr, "host allocation of %lu bytes failed\n",
                (unsigned long)MALLOC_INT_SIZE);

        // free(NULL) is a no-op, so every pointer can be released
        // unconditionally.
        free(m);
        free(n);
        free(p);
        free(c);

        CUDA_SAFE_CALL( cudaEventDestroy( begin ) );
        CUDA_SAFE_CALL( cudaEventDestroy( end ) );
        return;
    }

    CUDA_SAFE_CALL( cudaMalloc((void**)&d_m, MALLOC_INT_SIZE) );
    CUDA_SAFE_CALL( cudaMalloc((void**)&d_n, MALLOC_INT_SIZE) );
    CUDA_SAFE_CALL( cudaMalloc((void**)&d_p, MALLOC_INT_SIZE) );

    FillMatrix( m );
    FillMatrix( n );

    CUDA_SAFE_CALL( cudaMemcpy(d_m, m, MALLOC_INT_SIZE,
                               cudaMemcpyHostToDevice) );
    CUDA_SAFE_CALL( cudaMemcpy(d_n, n, MALLOC_INT_SIZE,
                               cudaMemcpyHostToDevice) );

    // One thread per output element. WIDTH (512) is a multiple of
    // TILE_WIDTH (16), so the integer division is exact.
    dim3 grid( WIDTH / TILE_WIDTH, WIDTH / TILE_WIDTH );
    dim3 blocks( TILE_WIDTH, TILE_WIDTH );

    Matrix_Mul<<<grid, blocks>>>( d_m, d_n, d_p, WIDTH );

    // A kernel launch returns no status itself; fetch launch-configuration
    // errors explicitly.
    CUDA_SAFE_CALL( cudaGetLastError() );

    // Blocking copy on the default stream: waits for the kernel to finish.
    CUDA_SAFE_CALL( cudaMemcpy(p, d_p, MALLOC_INT_SIZE,
                               cudaMemcpyDeviceToHost) );

    CUDA_SAFE_CALL( cudaEventRecord(end, 0) );
    CUDA_SAFE_CALL( cudaEventSynchronize( end ) );

    float elapsedTime;
    CUDA_SAFE_CALL( cudaEventElapsedTime( &elapsedTime, begin, end ) );

    // cudaEventElapsedTime reports milliseconds.
    printf("Time to generate: %fs\n", elapsedTime / 1000.f);

    CUDA_SAFE_CALL( cudaEventDestroy( begin ) );
    CUDA_SAFE_CALL( cudaEventDestroy( end ) );

    //PrintMatrix( m );
    //PrintMatrix( n );
    //PrintMatrix( p );

    // CPU reference product and comparison with the GPU result.
    GeneralMul(m, n, c, WIDTH, p);

    free(m);
    free(n);
    free(p);
    free(c);

    CUDA_SAFE_CALL( cudaFree(d_m) );
    CUDA_SAFE_CALL( cudaFree(d_n) );
    CUDA_SAFE_CALL( cudaFree(d_p) );
}

// Program entry point: runs the matrix-multiplication demo, then hands the
// command-line arguments to the SDK exit macro.
int main( int argc, char** argv)
{
    Process( argc, argv );

    CUT_EXIT( argc, argv );

    return 0;
}

// 你可能感兴趣的:(cuda矩阵相乘) (web page footer: "You may also be interested in: CUDA matrix multiplication")