一个不错的关于CPU和GPU(CUDA)的性能比较讨论话题

http://topic.csdn.net/u/20081027/23/67ff3857-3c71-4d5c-acf6-095f3497c7a9.html
这里是今天的一个论坛的一个帖子,大家可以讨论一下:)
1.哪些程序适合用cpu来做,哪些适合用gpu来做
2.如果用gpu来做,需要注意哪些东西
3.如果需要优化,需要哪些思路:)

在lz的代码的基础上做了一些变化,大家可以自己测试一下,就知道哪些工作适合用cpu做,哪些适合用gpu来做。
这里面的LOOP_ADD_TIME 从1->10->100->1000->10000....
大家可以做一个测试,看看最后的效果是怎么样的,可以画一个曲线图出来:)

过一段时间,还可以把这个代码再修改一下,添加更多的内容进去,再看看两者的效果怎么样:)
C/C++ code
    
    
    
    

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#include <cutil.h>

// Simple utility function to check for CUDA runtime errors
// void checkCUDAError(const char* msg)
#define LOOP_ADD_TIME 100

// Part 2 of 2: the kernel.
//
// Adds 1 + 2 + ... + LOOP_ADD_TIME to one element of d_a per thread.
// NOTE: despite its name, this kernel does not reverse anything.
//
// Launch layout: 1-D grid of 1-D blocks; the thread with index
// blockDim.x * blockIdx.x + threadIdx.x updates the element at that index.
// There is no bounds check, so the launch must not supply more threads
// than d_a has elements.
//
// d_a: array updated in place; indexed by the global thread index.
__global__ void reverseArrayBlock(int* d_a)
{
    const int idx = threadIdx.x + blockIdx.x * blockDim.x;

    // Work on a local copy and write the element back once at the end.
    int value = d_a[idx];
    int step = 1;
    while (step <= LOOP_ADD_TIME)
    {
        value += step;
        ++step;
    }
    d_a[idx] = value;
}

// Runs the add-loop benchmark on the GPU and checks the result on the host.
//
// Steps:
//   1. Fill a host array with 0 .. dimA-1.
//   2. Time (host->device copy + kernel + device->host copy) with clock()
//      and print the elapsed milliseconds.
//   3. Recompute the same sums on the host and compare element by element.
//
// Returns 0. A verification mismatch is reported on stdout but does not
// change the return value. Terminates the process if a host allocation
// fails.
int gpu_test()
{
    const int numThreadsPerBlock = 512;
    const int dimA = 512 * 21056;   // 10,780,672 ints (about 41 MiB)

    // Part 1 of 2: compute number of blocks needed based on array size and
    // desired block size. reverseArrayBlock has no bounds guard, so the grid
    // must cover the array exactly; make that precondition explicit.
    assert(dimA % numThreadsPerBlock == 0);
    const int numBlocks = dimA / numThreadsPerBlock;
    printf("blocks: %d\n", numBlocks);

    // allocate host and device memory
    const size_t memSize = (size_t)dimA * sizeof(int);
    int* h_a = (int*)malloc(memSize);
    if (h_a == NULL)
    {
        fprintf(stderr, "gpu_test: host allocation of %lu bytes failed\n",
                (unsigned long)memSize);
        exit(EXIT_FAILURE);
    }
    int* d_a = NULL;
    CUDA_SAFE_CALL(cudaMalloc((void**)&d_a, memSize));

    // Initialize input array on host
    for (int i = 0; i < dimA; ++i)
    {
        h_a[i] = i;
    }

    // The timed region covers both transfers as well as the kernel.
    // NOTE(review): clock() semantics differ between platforms (processor
    // time vs. wall-clock time) -- confirm it measures what you expect.
    const clock_t start = clock();

    // Copy host array to device array
    CUDA_SAFE_CALL(cudaMemcpy(d_a, h_a, memSize, cudaMemcpyHostToDevice));

    // launch kernel
    dim3 dimGrid(numBlocks);
    dim3 dimBlock(numThreadsPerBlock);
    reverseArrayBlock<<<dimGrid, dimBlock>>>(d_a);
    // A launch does not return an error code itself; ask the runtime.
    CUDA_SAFE_CALL(cudaGetLastError());

    // device to host copy; this blocking copy is what the host waits on
    // before the stop timestamp is taken.
    CUDA_SAFE_CALL(cudaMemcpy(h_a, d_a, memSize, cudaMemcpyDeviceToHost));

    const clock_t finish = clock();
    const double duration = (double)(finish - start) * 1000 / CLOCKS_PER_SEC;
    printf("gpu time is %f ms\n", duration);

    // Build the host reference result.
    int* h_a2 = (int*)malloc(memSize);
    if (h_a2 == NULL)
    {
        fprintf(stderr, "gpu_test: host allocation of %lu bytes failed\n",
                (unsigned long)memSize);
        exit(EXIT_FAILURE);
    }
    for (int i = 0; i < dimA; ++i)
    {
        h_a2[i] = i;
    }
    for (int j = 0; j < dimA; ++j)
    {
        for (int k = 1; k <= LOOP_ADD_TIME; k++)
        {
            h_a2[j] += k;
        }
    }

    // Compare GPU output against the reference. Report once instead of
    // printing a line for every mismatching element.
    int mismatches = 0;
    for (int j = 0; j < dimA; ++j)
    {
        if (h_a[j] != h_a2[j])
        {
            ++mismatches;
        }
    }
    if (mismatches != 0)
    {
        printf("error!\n");
    }

    // free host memory
    free(h_a2);
    free(h_a);

    // free device memory
    CUDA_SAFE_CALL(cudaFree(d_a));

    return 0;
}

////////////////////////////////////////////////////////////////////////////// //
// CPU benchmark
////////////////////////////////////////////////////////////////////////////// //
// Runs the same add-loop as the GPU benchmark on the host and prints the
// elapsed time.
//
// Fills an array with 0 .. dimA-1, then times (with clock()) a loop that adds
// 1 + 2 + ... + LOOP_ADD_TIME to every element, and prints the elapsed
// milliseconds.
//
// Returns 0. Terminates the process if the host allocation fails.
int cpu_test()
{
    const int dimA = 512 * 21056;   // 10,780,672 ints (about 41 MiB)
    const size_t memSize = (size_t)dimA * sizeof(int);

    // allocate host memory
    int* h_a = (int*)malloc(memSize);
    if (h_a == NULL)
    {
        fprintf(stderr, "cpu_test: host allocation of %lu bytes failed\n",
                (unsigned long)memSize);
        exit(EXIT_FAILURE);
    }

    // Initialize input array on host
    for (int i = 0; i < dimA; ++i)
    {
        h_a[i] = i;
    }
    printf("\n");

    const clock_t start = clock();
    for (int j = 0; j < dimA; ++j)
    {
        for (int k = 1; k <= LOOP_ADD_TIME; k++)
        {
            h_a[j] += k;
        }
    }
    const clock_t finish = clock();

    const double duration = (double)(finish - start) * 1000 / CLOCKS_PER_SEC;
    printf("cpu time is %f ms\n", duration);

    // Give the timed stores an observable use: without a later read an
    // optimizing compiler is free to discard the work being measured.
    // Unsigned arithmetic keeps the checksum free of signed overflow.
    unsigned int checksum = 0;
    for (int j = 0; j < dimA; ++j)
    {
        checksum += (unsigned int)h_a[j];
    }
    volatile unsigned int sink = checksum;
    (void)sink;

    // free host memory
    free(h_a);

    return 0;
}
////////////////////////////////////////////////////////////////////////////// //
// Program main
////////////////////////////////////////////////////////////////////////////// //
// Entry point: invokes cutil's device-init macro, runs the GPU benchmark and
// then the CPU benchmark, and finishes through cutil's exit macro.
// The status codes returned by gpu_test() and cpu_test() are not inspected.
int main(int argc, char** argv)
{
    CUT_DEVICE_INIT(argc, argv);

    gpu_test();
    cpu_test();

    CUT_EXIT(argc, argv);
}




你可能感兴趣的:(timer,优化,function,CUDA,测试,input)