// -------MatrixMul.cu-----
// 2012-03-27
// czg1989
//
// NOTE(review): the header names were missing from these #include directives;
// they are restored here from the symbols this file uses. Confirm against the
// original build (one further header from the original list is unknown).
#include <stdio.h>        // printf, puts, fprintf
#include <stdlib.h>       // malloc, free, rand, srand
#include <time.h>         // time, clock, CLOCKS_PER_SEC
#include <cuda_runtime.h> // CUDA runtime API (cudaMalloc, cudaMemcpy, events)
#include <cutil.h>        // CUT_DEVICE_INIT, CUDA_SAFE_CALL, CUT_EXIT
#define TILE_WIDTH 16
#define WIDTH 512
// Tiled integer matrix multiplication kernel: pd = md * nd.
//
// md, nd and pd are square width x width matrices stored row-major.
//
// Launch requirements:
//   - 2D thread blocks that are square (blockDim.x == blockDim.y) and no
//     larger than TILE_WIDTH x TILE_WIDTH, because the shared tiles below are
//     statically sized TILE_WIDTH x TILE_WIDTH;
//   - a 2D grid of at least ceil(width / blockDim.x) blocks in both x and y;
//   - static shared memory: 2 * TILE_WIDTH * TILE_WIDTH ints per block.
//
// width does not have to be a multiple of the block size: out-of-range tile
// entries are loaded as 0 and out-of-range threads skip the final store.
__global__ void Matrix_Mul(int *md, int *nd, int *pd, int width)
{
    __shared__ int d_m[TILE_WIDTH][TILE_WIDTH];
    __shared__ int d_n[TILE_WIDTH][TILE_WIDTH];

    const int tx = threadIdx.x;
    const int ty = threadIdx.y;

    // Edge length of the tile actually processed per step (square block).
    const int tile = blockDim.x;

    // Output element owned by this thread.
    const int row = blockIdx.y * blockDim.y + ty;
    const int col = blockIdx.x * blockDim.x + tx;

    // Number of tiles along the shared (inner) dimension. Derived from width
    // rather than gridDim.x so the result does not depend on the grid shape.
    const int numTiles = (width + tile - 1) / tile;

    int mulResult = 0;
    for ( int i = 0; i < numTiles; ++i )
    {
        const int mCol = i * tile + tx;   // column of md loaded by this thread
        const int nRow = i * tile + ty;   // row of nd loaded by this thread

        // Zero-pad anything outside the matrix so it adds nothing to the sum.
        d_m[ty][tx] = ( row < width && mCol < width ) ? md[row * width + mCol] : 0;
        d_n[ty][tx] = ( nRow < width && col < width ) ? nd[nRow * width + col] : 0;

        // Both tiles must be fully written before any thread reads them.
        __syncthreads();

        for ( int j = 0; j < tile; ++j )
        {
            mulResult += d_m[ty][j] * d_n[j][tx];
        }

        // Nobody may overwrite the tiles while another thread still reads them.
        __syncthreads();
    }

    // The guard sits after the loop so every thread reaches each barrier.
    if ( row < width && col < width )
    {
        pd[row * width + col] = mulResult;
    }
}
//CPU function
// Fills the WIDTH x WIDTH host matrix m with pseudo-random integers.
//
// Element i is set to (i * rand()) % 10 + 1, which lies in [1, 10].
// m must point to at least WIDTH * WIDTH ints; a NULL pointer is ignored.
void FillMatrix( int *m )
{
    if ( m == NULL )
    {
        return;
    }

    // Seed only on the first call: re-seeding with time(NULL) on every call
    // makes two calls within the same second produce identical matrices.
    static int seeded = 0;
    if ( !seeded )
    {
        srand((unsigned int)time(NULL));
        seeded = 1;
    }

    for ( int i = 0; i < WIDTH * WIDTH; ++i )
    {
        // Form the product in 64 bits: i * rand() overflows int (undefined
        // behaviour) and could otherwise yield negative elements.
        m[i] = (int)(((long long)i * rand()) % 10) + 1;
    }
}
// Prints the WIDTH x WIDTH host matrix p to stdout, one matrix row per text
// line, each value followed by a single space. Rows are separated by a
// newline and the output ends with puts("\n").
void PrintMatrix( int *p )
{
    for ( int row = 0; row < WIDTH; ++row )
    {
        // Separator goes before every row except the first.
        if ( row != 0 )
        {
            printf("\n");
        }

        const int *line = p + row * WIDTH;
        for ( int col = 0; col < WIDTH; ++col )
        {
            printf("%d ", line[col]);
        }
    }
    puts("\n");
}
#define MALLOC_INT_SIZE (sizeof(int) * WIDTH * WIDTH)
// Checks whether the result is correct.
//
// Computes c = a * b on the CPU with the straightforward triple loop, prints
// the CPU time taken in seconds, then compares c element-wise with p.
// Prints "right" when every element matches, otherwise "not equal,error!".
//
//   a, b  : input matrices, width x width, row-major
//   c     : output buffer for the CPU result, width x width
//   width : matrix edge length
//   p     : result to verify against (e.g. copied back from the device)
void GeneralMul(int *a, int *b, int *c, int width, int *p)
{
    // clock() returns clock_t, not time_t.
    clock_t s = clock();
    for ( int i = 0; i < width; ++i )
    {
        for ( int j = 0; j < width; ++j )
        {
            int sum = 0;
            for ( int k = 0; k < width; ++k )
            {
                sum += a[i*width+k] * b[k*width+j];
            }
            c[i*width+j] = sum;
        }
    }
    clock_t e = clock();

    // Convert ticks to seconds with CLOCKS_PER_SEC; a fixed divisor of 1000
    // is only right on platforms where CLOCKS_PER_SEC happens to be 1000.
    printf("%f\n", (double)(e - s) / CLOCKS_PER_SEC);

    for ( int i = 0; i < width * width; ++i )
    {
        if ( c[i] != p[i] )
        {
            printf("not equal,error!\n");
            return ;
        }
    }
    puts("right");
}
// Runs the whole experiment: allocates host and device matrices, fills the
// two inputs, multiplies them on the GPU with Matrix_Mul, prints the elapsed
// time, and verifies the GPU result against the CPU reference in GeneralMul.
//
// The begin/end events are recorded before the host allocations and after
// the device-to-host copy, so the reported time spans everything in between,
// not just the kernel.
//
// The grid is WIDTH / TILE_WIDTH blocks per side, so WIDTH is expected to be
// a multiple of TILE_WIDTH.
void Process( int argc, char** argv )
{
    CUT_DEVICE_INIT( argc, argv );

    int *m = NULL, *n = NULL, *p = NULL, *c = NULL;
    int *d_m = NULL, *d_n = NULL, *d_p = NULL;
    cudaEvent_t begin, end;

    CUDA_SAFE_CALL( cudaEventCreate(&begin) );
    CUDA_SAFE_CALL( cudaEventCreate(&end) );
    CUDA_SAFE_CALL( cudaEventRecord(begin, 0) );

    m = (int*)malloc(MALLOC_INT_SIZE);
    n = (int*)malloc(MALLOC_INT_SIZE);
    p = (int*)malloc(MALLOC_INT_SIZE);
    if ( m == NULL || n == NULL || p == NULL )
    {
        // free(NULL) is a no-op, so whatever did succeed is released safely.
        fprintf(stderr, "host allocation failed\n");
        free(m);
        free(n);
        free(p);
        CUDA_SAFE_CALL( cudaEventDestroy( begin ) );
        CUDA_SAFE_CALL( cudaEventDestroy( end ) );
        return;
    }

    CUDA_SAFE_CALL( cudaMalloc((void**)&d_m, MALLOC_INT_SIZE) );
    CUDA_SAFE_CALL( cudaMalloc((void**)&d_n, MALLOC_INT_SIZE) );
    CUDA_SAFE_CALL( cudaMalloc((void**)&d_p, MALLOC_INT_SIZE) );

    FillMatrix( m );
    FillMatrix( n );

    CUDA_SAFE_CALL( cudaMemcpy(d_m, m, MALLOC_INT_SIZE,
                               cudaMemcpyHostToDevice) );
    CUDA_SAFE_CALL( cudaMemcpy(d_n, n, MALLOC_INT_SIZE,
                               cudaMemcpyHostToDevice) );

    dim3 grid( WIDTH / TILE_WIDTH, WIDTH / TILE_WIDTH );
    dim3 blocks( TILE_WIDTH, TILE_WIDTH );
    Matrix_Mul<<<grid, blocks>>>( d_m, d_n, d_p, WIDTH );
    // A launch returns no status itself; a bad configuration only shows up
    // through cudaGetLastError().
    CUDA_SAFE_CALL( cudaGetLastError() );

    // Blocking copy: waits for the kernel to finish before reading d_p.
    CUDA_SAFE_CALL( cudaMemcpy(p, d_p, MALLOC_INT_SIZE,
                               cudaMemcpyDeviceToHost) );

    CUDA_SAFE_CALL( cudaEventRecord(end,0) );
    CUDA_SAFE_CALL( cudaEventSynchronize( end ) );
    float elapsedTime;
    CUDA_SAFE_CALL( cudaEventElapsedTime( &elapsedTime, begin, end ) );
    // cudaEventElapsedTime reports milliseconds; print seconds.
    printf("Time to generate: %fs\n", elapsedTime / 1000.f);
    CUDA_SAFE_CALL( cudaEventDestroy( begin ) );
    CUDA_SAFE_CALL( cudaEventDestroy( end ) );

    //PrintMatrix( m );
    //PrintMatrix( n );
    //PrintMatrix( p );

    // CPU reference run and comparison against the GPU result.
    c = (int*)malloc(MALLOC_INT_SIZE);
    if ( c != NULL )
    {
        GeneralMul(m, n, c, WIDTH, p);
    }
    else
    {
        fprintf(stderr, "host allocation failed, skipping verification\n");
    }

    free(m);
    free(n);
    free(p);
    free(c);
    CUDA_SAFE_CALL( cudaFree(d_m) );
    CUDA_SAFE_CALL( cudaFree(d_n) );
    CUDA_SAFE_CALL( cudaFree(d_p) );
}
// Program entry point: runs the matrix-multiplication experiment and then
// invokes the CUT_EXIT macro (supplied by the cutil helpers, not defined in
// this file) with the command-line arguments.
int main( int argc, char** argv)
{
    Process( argc, argv );

    CUT_EXIT( argc, argv );

    // Same value main() yields implicitly when control reaches its end.
    return 0;
}