// includes, system
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <cuda_runtime.h>
#define RADIUS 1
#define BLOCKDIM 16
#define N 13
// Stores each thread's flat 1-D grid position into the matching slot of d_a.
//   n   : number of elements of d_a that may be written
//   d_a : device array, at least n ints
// Threads whose position is >= n do nothing.
__global__ static void set_global_idx(int n, int *d_a)
{
    const int gid = threadIdx.x + blockDim.x*blockIdx.x;

    // Surplus threads of the last block fall outside the array.
    if (gid >= n)
        return;

    d_a[gid] = gid;
}
// Stores the x-index of the owning block into every element that block covers.
//   n   : number of elements of d_a that may be written
//   d_a : device array, at least n ints
// Threads whose flat position is >= n do nothing.
__global__ static void set_block_idx(int n, int *d_a)
{
    const int gid = threadIdx.x + blockDim.x*blockIdx.x;

    // Surplus threads of the last block fall outside the array.
    if (gid >= n)
        return;

    d_a[gid] = blockIdx.x;
}
// Stores each thread's x-index within its block into the element it covers.
//   n   : number of elements of d_a that may be written
//   d_a : device array, at least n ints
// Threads whose flat position is >= n do nothing.
__global__ static void set_thread_idx(int n, int *d_a)
{
    const int gid = threadIdx.x + blockDim.x*blockIdx.x;

    // Surplus threads of the last block fall outside the array.
    if (gid >= n)
        return;

    d_a[gid] = threadIdx.x;
}
// 1-D stencil that reads its whole window straight from global memory.
//
// Every thread with flat index idx < n adds up the 2*RADIUS + 1 consecutive
// inputs in[idx] .. in[idx + 2*RADIUS] and stores the total in out[idx].
//
//   n   : number of output elements
//   in  : input; indices up to (n - 1) + 2*RADIUS are read, so it must hold
//         at least n + 2*RADIUS ints
//   out : output, at least n ints
//
// Launch with a 1-D grid providing at least n threads in total. Threads with
// idx >= n neither read nor write anything.
__global__ static void stencil_naive(int n, int *in, int *out)
{
    int idx = blockIdx.x*blockDim.x + threadIdx.x;

    // Tail threads must not touch out[]: out only holds n elements.
    if (idx >= n)
        return;

    int value = 0;
    for (int i = -RADIUS; i <= RADIUS; ++i)
    {
        // +RADIUS shifts the window so that offset -RADIUS lands on in[idx].
        value += in[idx + i + RADIUS];
    }
    out[idx] = value;
}
// 1-D stencil that stages each block's input window in shared memory.
//
// Produces the same result as summing in[idx] .. in[idx + 2*RADIUS] into
// out[idx] for every idx < n, but each block first copies the
// blockDim.x + 2*RADIUS inputs it needs into a shared tile.
//
//   n   : number of output elements
//   in  : input; must hold at least n + 2*RADIUS ints
//   out : output, at least n ints
//
// Preconditions: 1-D launch with blockDim.x == BLOCKDIM (the tile has a fixed
// size and the right halo is placed BLOCKDIM slots after the left one), and
// BLOCKDIM >= RADIUS so that enough threads exist to load both halos.
__global__ static void stencil(int n, int *in, int *out)
{
    __shared__ int shared[BLOCKDIM + 2*RADIUS];

    const int padded_n = n + 2*RADIUS;      // number of valid elements in in[]
    int idx = blockIdx.x*blockDim.x + threadIdx.x;
    int idx_local = threadIdx.x + RADIUS;   // this thread's slot inside the tile

    // Centre element. Slots that would map past the input are filled with 0;
    // they are only ever read by threads that do not produce an output.
    int centre = idx + RADIUS;
    shared[idx_local] = (centre < padded_n) ? in[centre] : 0;

    // The first RADIUS threads of the block also fetch the two halos.
    if (threadIdx.x < RADIUS)
    {
        int left = idx;
        int right = idx + BLOCKDIM + RADIUS;
        shared[idx_local - RADIUS] = (left < padded_n) ? in[left] : 0;
        shared[idx_local + BLOCKDIM] = (right < padded_n) ? in[right] : 0;
    }

    // Reached by every thread of the block: no thread returns before it.
    __syncthreads();

    // Only threads that own a real output element accumulate and store.
    if (idx < n)
    {
        int value = 0;
        for (int i = -RADIUS; i <= RADIUS; i++)
        {
            value += shared[idx_local + i];
        }
        out[idx] = value;
    }
}
// Naive product of two N x N row-major int matrices: out = in_a * in_b.
//
// One thread computes one output element; the thread's x coordinate selects
// the row and its y coordinate selects the column.
//
//   n    : number of ints in out; no element at flat index >= n is written
//   in_a : left operand, N*N ints
//   in_b : right operand, N*N ints
//   out  : result, at least min(n, N*N) ints
//
// Launch with a 2-D grid covering at least N threads in x and in y. Threads
// outside the N x N matrix do nothing.
__global__ static void matrix_multiply_naive(int n, int *in_a, int *in_b, int *out)
{
    int row = threadIdx.x + blockIdx.x*blockDim.x;
    int col = threadIdx.y + blockIdx.y*blockDim.y;

    // The grid is rounded up to whole blocks, so it can overshoot the matrix
    // in either direction; both coordinates have to be checked separately.
    if (row >= N || col >= N)
        return;

    int idx = row*N + col;
    if (idx >= n)
        return;

    int sum = 0;
    for (int i = 0; i < N; ++i) {
        sum += in_a[row*N+i]*in_b[i*N+col];
    }
    out[idx] = sum;
}
// Start the main SDK sample here
// Terminates the program with a message when a CUDA runtime call or kernel
// launch reports an error.
static void check_cuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess)
    {
        fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Prints count ints right-aligned in the given field width, each followed by
// a space. When per_line > 0 a newline is emitted before every per_line-th
// element (including the first one).
static void print_ints(const int *values, int count, int width, int per_line)
{
    for (int i = 0; i < count; ++i)
    {
        if (per_line > 0 && i%per_line == 0)
            printf("\n");
        printf("%*d ", width, values[i]);
    }
}

// Runs three small demos in sequence and prints their results:
//   1. the index kernels (global / block / thread index),
//   2. the shared-memory stencil on an 80-element vector,
//   3. the naive N x N matrix multiply.
// Returns 0 on success, 1 if host memory cannot be allocated; any CUDA error
// terminates the program through check_cuda().
int main(int argc, char** argv)
{
    (void)argc;
    (void)argv;

    printf(" ###########################\n");
    printf(" #test for cuda toolkit ...#\n");
    printf(" ###########################\n");
    int dim = 16;
    int mem_size = dim*sizeof(int);
    int *d_a = NULL, *h_a = NULL, *d_b = NULL;
    h_a = (int*)malloc(mem_size);
    if (NULL == h_a)
    {
        printf("could not allocate memory\n");
        return 1;
    }
    check_cuda(cudaMalloc((void**)&d_a, mem_size), "cudaMalloc(d_a)");
    check_cuda(cudaMalloc((void**)&d_b, mem_size), "cudaMalloc(d_b)");

    // Baseline: the zero-filled device buffer.
    check_cuda(cudaMemset(d_a, 0, mem_size), "cudaMemset(d_a)");
    check_cuda(cudaMemcpy(h_a, d_a, mem_size, cudaMemcpyDeviceToHost), "cudaMemcpy(h_a <- d_a)");
    print_ints(h_a, dim, 2, 0);
    printf("\n");

    // Deliberately small blocks so that the last block is only partly used.
    int dim_block = 3;
    int dim_grid = (dim + dim_block -1)/dim_block;

    set_global_idx<<<dim_grid, dim_block>>>(dim, d_a);
    check_cuda(cudaGetLastError(), "set_global_idx launch");
    check_cuda(cudaMemcpy(h_a, d_a, mem_size, cudaMemcpyDeviceToHost), "cudaMemcpy(h_a <- d_a)");
    print_ints(h_a, dim, 2, 0);
    printf("\n");

    set_block_idx<<<dim_grid, dim_block>>>(dim, d_a);
    check_cuda(cudaGetLastError(), "set_block_idx launch");
    check_cuda(cudaMemcpy(h_a, d_a, mem_size, cudaMemcpyDeviceToHost), "cudaMemcpy(h_a <- d_a)");
    print_ints(h_a, dim, 2, 0);
    printf("\n");

    set_thread_idx<<<dim_grid, dim_block>>>(dim, d_a);
    check_cuda(cudaGetLastError(), "set_thread_idx launch");
    // Route the result through d_b to exercise a device-to-device copy too.
    check_cuda(cudaMemcpy(d_b, d_a, mem_size, cudaMemcpyDeviceToDevice), "cudaMemcpy(d_b <- d_a)");
    check_cuda(cudaMemcpy(h_a, d_b, mem_size, cudaMemcpyDeviceToHost), "cudaMemcpy(h_a <- d_b)");
    print_ints(h_a, dim, 2, 0);
    printf("\n");

    free(h_a);
    check_cuda(cudaFree(d_a), "cudaFree(d_a)");
    check_cuda(cudaFree(d_b), "cudaFree(d_b)");

    ///
    printf("\n");
    printf(" #######################\n");
    printf(" #stencil test ...... #\n");
    printf(" #######################\n");
    dim = 80;
    // The input carries RADIUS zero elements of padding on each side.
    int dim_pro = dim + RADIUS*2;
    int mem_size_pro = sizeof(int)*dim_pro;
    mem_size = sizeof(int)*dim;
    h_a = (int*)malloc(mem_size_pro);
    if (NULL == h_a)
    {
        printf("could not allocate memory\n");
        return 1;
    }
    memset(h_a, 0, mem_size_pro);
    int *h_b;
    check_cuda(cudaMalloc((void**)&d_a, mem_size_pro), "cudaMalloc(d_a)");
    check_cuda(cudaMalloc((void**)&d_b, mem_size), "cudaMalloc(d_b)");
    for (int i=0; i<dim; ++i) {
        h_a[i+RADIUS] = i;
    }
    // h_b views the unpadded part of h_a; it is not a separate allocation.
    h_b = &(h_a[RADIUS]);
    printf(" original vector ... \n");
    print_ints(h_b, dim, 5, 10);
    printf("\n\n\n");
    check_cuda(cudaMemcpy(d_a, h_a, mem_size_pro, cudaMemcpyHostToDevice), "cudaMemcpy(d_a <- h_a)");
    dim_grid = (dim + BLOCKDIM -1)/BLOCKDIM;
    // The shared-memory kernel needs exactly BLOCKDIM threads per block.
    stencil<<<dim_grid, BLOCKDIM>>>(dim, d_a, d_b);
    check_cuda(cudaGetLastError(), "stencil launch");
    check_cuda(cudaMemcpy(h_b, d_b, mem_size, cudaMemcpyDeviceToHost), "cudaMemcpy(h_b <- d_b)");
    printf(" after stencil ... \n");
    print_ints(h_b, dim, 5, 10);
    printf("\n");
    free(h_a);
    check_cuda(cudaFree(d_a), "cudaFree(d_a)");
    check_cuda(cudaFree(d_b), "cudaFree(d_b)");

    ///
    printf("\n");
    printf(" ##########################\n");
    printf(" #matrix multiply ...... #\n");
    printf(" ##########################\n");
    int row = N;
    int col = N;
    dim = row*col;
    mem_size = dim*sizeof(int);
    h_a = (int*)malloc(mem_size);
    if (NULL == h_a)
    {
        printf("could not allocate memory\n");
        return 1;
    }
    int *d_c = NULL;
    check_cuda(cudaMalloc((void**)&d_a, mem_size), "cudaMalloc(d_a)");
    check_cuda(cudaMalloc((void**)&d_b, mem_size), "cudaMalloc(d_b)");
    check_cuda(cudaMalloc((void**)&d_c, mem_size), "cudaMalloc(d_c)");
    printf(" matrix A & B : \n");
    // Both operands are the all-ones matrix, uploaded from the same host buffer.
    for (int i=0; i<dim; ++i) {
        h_a[i] = 1;
    }
    print_ints(h_a, dim, 5, row);
    check_cuda(cudaMemcpy(d_a, h_a, mem_size, cudaMemcpyHostToDevice), "cudaMemcpy(d_a <- h_a)");
    check_cuda(cudaMemcpy(d_b, h_a, mem_size, cudaMemcpyHostToDevice), "cudaMemcpy(d_b <- h_a)");
    dim_block = 2;
    dim3 block(dim_block, dim_block);
    dim3 grid((row+dim_block-1)/dim_block, (col+dim_block-1)/dim_block);
    check_cuda(cudaMemset(d_c, 0, mem_size), "cudaMemset(d_c)");
    matrix_multiply_naive<<<grid, block>>>(dim, d_a, d_b, d_c);
    check_cuda(cudaGetLastError(), "matrix_multiply_naive launch");
    check_cuda(cudaMemcpy(h_a, d_c, mem_size, cudaMemcpyDeviceToHost), "cudaMemcpy(h_a <- d_c)");
    printf("\n matrix C : \n");
    print_ints(h_a, dim, 5, row);

    // Release the buffers of the last section as well.
    free(h_a);
    check_cuda(cudaFree(d_a), "cudaFree(d_a)");
    check_cuda(cudaFree(d_b), "cudaFree(d_b)");
    check_cuda(cudaFree(d_c), "cudaFree(d_c)");
    return 0;
}
——————————————————————————————————————————————————————————————————
#include