GPU vs. CPU Compute Performance Comparison

Original post: http://blog.163.com/qimo601@126/blog/static/15822093201352193920821/

Hardware: GeForce GT 420

Description: The CPU program and the GPU program implement exactly the same functionality. By varying the array size and the computation formula, we compare the compute performance of the GPU against the CPU.


Test case 1
arraySize = 10000
Formula: c[i] = a[i] + b[i] + c[i];
addKernel<<<100, 100>>>(dev_c, dev_a, dev_b);
Elapsed time: CUDA 0.122 s, CPU 0.001 s

Test case 2

const int arraySize = 100000;
GPU formula (a sketch of the corresponding kernel follows below):
for (int m = 0; m < 2000; m++)
    c[i] = (a[i]*200 + b[i]/400 + c[i]*4)/56;

addKernel<<<200, 500>>>(dev_c, dev_a, dev_b);
Elapsed time: CUDA 0.669 s, CPU 2.43 s
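
The modified kernel used in tests 2-4 is not included in the source listing below, which only shows the test-1 version. A minimal sketch of what it would look like, assuming the loop is simply moved into the kernel body (the name addKernelHeavy and the iterations parameter are illustrative, not from the original post):

// Hypothetical kernel for tests 2-4: each thread still handles one element,
// but repeats the arithmetic `iterations` times to raise the work per element.
__global__ void addKernelHeavy(int *c, const int *a, const int *b, int iterations)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;   // one thread per element
    for (int m = 0; m < iterations; m++)
        c[i] = (a[i]*200 + b[i]/400 + c[i]*4)/56;
}

// Launched with 200 blocks x 500 threads = 100000 threads, one per element:
// addKernelHeavy<<<200, 500>>>(dev_c, dev_a, dev_b, 2000);   // 10000 for tests 3 and 4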


Test case 3
const int arraySize = 100000;
GPU formula:
for (int m = 0; m < 10000; m++)
    c[i] = (a[i]*200 + b[i]/400 + c[i]*4)/56;

addKernel<<<200, 500>>>(dev_c, dev_a, dev_b);
Elapsed time: CUDA 2.867 s, CPU 2.43 s
Test case 4, after installing RTX

const int arraySize = 100000;
GPU formula:
for (int m = 0; m < 10000; m++)
    c[i] = (a[i]*200 + b[i]/400 + c[i]*4)/56;

addKernel<<<200, 500>>>(dev_c, dev_a, dev_b);
Elapsed time: CUDA 2.867 s, CPU 2.43 s


CPU program:
#include <QtCore/QCoreApplication>
#include <stdio.h>
#include <stdlib.h>   // for system("pause")
#include <time.h>
int main(int argc, char *argv[])  
{  
    QCoreApplication app(argc, argv);  
  
  
    const int arraySize = 10000;  
    int* a = new int[arraySize];  
    int* b = new int[arraySize];  
    int* c = new int[arraySize];  
  
    for(int i = 0; i < arraySize; i++)  
    {  
        a[i] = i;  
        b[i] = i*10;  
        c[i] = 0;  
    }  
  
   
    clock_t start, finish;    
    double duration;    
    start = clock();  
  
    for(int i = 0; i < arraySize; i++)  
            c[i] = a[i] + b[i] + c[i];  
  
    for(int t = 0; t < arraySize; t++)  
    {  
        if (t < 800 && t >= 790)  
            printf("c[%d] = %d \n",t,c[t]);  
    }  
  
    finish = clock();    
    duration = (double)(finish - start) / CLOCKS_PER_SEC;    
    printf( "%f seconds\n", duration );   
  
  
    system("pause");  
    return app.exec();  
}  
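
For tests 2 and 3, the CPU side presumably differs from the program above only in arraySize (100000) and in the timed loop; a sketch of the heavier CPU loop, assuming it mirrors the GPU formula:

// Hypothetical CPU loop for tests 2 and 3: the same formula as the GPU kernel,
// repeated 2000 (test 2) or 10000 (test 3) times per element.
for (int i = 0; i < arraySize; i++)
    for (int m = 0; m < 2000; m++)
        c[i] = (a[i]*200 + b[i]/400 + c[i]*4)/56;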

GPU program:
#include "cuda_runtime.h"  
#include "device_launch_parameters.h"  
  
#include <stdio.h>  
#include <time.h>   
cudaError_t addWithCuda(int *c, const int *a, const int *b, size_t size);  
  
__global__ void addKernel(int *c, const int *a, const int *b)  
{  
    // Global thread index: blockDim.x (threads per block) is the correct
    // multiplier; the original used gridDim.x, which only gives the right
    // index here because both values are 100 in this launch configuration.
    int i = blockIdx.x * blockDim.x + threadIdx.x;  
    c[i] = a[i] + b[i] + c[i];  
}  
  
int main()  
{  
    const int arraySize = 10000;  
    int* a = new int[arraySize];      
    int* b = new int[arraySize];  
    int* c = new int[arraySize];  
  
    for(int i = 0; i < arraySize; i++)  
    {  
        a[i] = i;  
        b[i] = i*10;  
        c[i] = 0;  
    }  
          
   
    clock_t start, finish;    
    double duration;     
    start = clock();    
     
      
    // Add vectors in parallel.  
    cudaError_t cudaStatus = addWithCuda(c, a, b, arraySize);  
    if (cudaStatus != cudaSuccess) {  
        fprintf(stderr, "addWithCuda failed!");  
        return 1;  
    }  
  
    for(int t = 0; t < arraySize; t++)  
    {  
        if (t < 800 && t >= 790)  
            printf("c[%d] = %d \n",t,c[t]);  
    }  
  
    // cudaThreadExit must be called before exiting in order for profiling and  
    // tracing tools such as Nsight and Visual Profiler to show complete traces.  
    cudaStatus = cudaThreadExit();  
    if (cudaStatus != cudaSuccess) {  
        fprintf(stderr, "cudaThreadExit failed!");  
        return 1;  
    }  
  
  
    finish = clock();    
    duration = (double)(finish - start) / CLOCKS_PER_SEC;    
    printf( "%f seconds\n", duration );   
      
      
    getchar();  
    return 0;  
}  
  
// Helper function for using CUDA to add vectors in parallel.  
cudaError_t addWithCuda(int *c, const int *a, const int *b, size_t size)  
{  
    int *dev_a = 0;  
    int *dev_b = 0;  
    int *dev_c = 0;  
    cudaError_t cudaStatus;  
  
    // Choose which GPU to run on, change this on a multi-GPU system.  
    cudaStatus = cudaSetDevice(0);  
    if (cudaStatus != cudaSuccess) {  
        fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");  
        goto Error;  
    }  
  
    // Allocate GPU buffers for three vectors (two input, one output).  
    cudaStatus = cudaMalloc((void**)&dev_c, size * sizeof(int));  
    if (cudaStatus != cudaSuccess) {  
        fprintf(stderr, "cudaMalloc failed!");  
        goto Error;  
    }  
  
    cudaStatus = cudaMalloc((void**)&dev_a, size * sizeof(int));  
    if (cudaStatus != cudaSuccess) {  
        fprintf(stderr, "cudaMalloc failed!");  
        goto Error;  
    }  
  
    cudaStatus = cudaMalloc((void**)&dev_b, size * sizeof(int));  
    if (cudaStatus != cudaSuccess) {  
        fprintf(stderr, "cudaMalloc failed!");  
        goto Error;  
    }  
  
    // Copy the initial contents of c to the GPU (the kernel reads c as well).  
    cudaStatus = cudaMemcpy(dev_c, c, size * sizeof(int), cudaMemcpyHostToDevice);  
    if (cudaStatus != cudaSuccess) {  
        fprintf(stderr, "cudaMemcpy failed!");  
        goto Error;  
    }  
    // Copy input vectors from host memory to GPU buffers.  
    cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(int), cudaMemcpyHostToDevice);  
    if (cudaStatus != cudaSuccess) {  
        fprintf(stderr, "cudaMemcpy failed!");  
        goto Error;  
    }  
  
    cudaStatus = cudaMemcpy(dev_b, b, size * sizeof(int), cudaMemcpyHostToDevice);  
    if (cudaStatus != cudaSuccess) {  
        fprintf(stderr, "cudaMemcpy failed!");  
        goto Error;  
    }  
  
    // Launch a kernel on the GPU with one thread for each element.  
    addKernel<<<100, 100>>>(dev_c, dev_a, dev_b);  
  
    // cudaThreadSynchronize waits for the kernel to finish, and returns  
    // any errors encountered during the launch.  
    cudaStatus = cudaThreadSynchronize();  
    if (cudaStatus != cudaSuccess) {  
        fprintf(stderr, "cudaThreadSynchronize returned error code %d after launching addKernel!\n", cudaStatus);  
        goto Error;  
    }  
  
    // Copy output vector from GPU buffer to host memory.  
    cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(int), cudaMemcpyDeviceToHost);  
    if (cudaStatus != cudaSuccess) {  
        fprintf(stderr, "cudaMemcpy failed!");  
        goto Error;  
    }  
  
  
      
Error:  
    cudaFree(dev_c);  
    cudaFree(dev_a);  
    cudaFree(dev_b);  
      
    return cudaStatus;  
}  
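
Note that clock() around addWithCuda measures the whole helper call, so the GPU figures above include context creation, cudaMalloc, and host-device copies in both directions; this overhead largely explains why the GPU appears so much slower on the trivial test 1. For kernel-only timing, the standard CUDA event API could be used instead; a minimal sketch (not part of the original post) of what would replace the clock() calls around the launch inside addWithCuda:

// Hypothetical kernel-only timing with CUDA events: excludes allocation and
// host<->device copies, measuring only kernel execution on the device.
cudaEvent_t evStart, evStop;
cudaEventCreate(&evStart);
cudaEventCreate(&evStop);

cudaEventRecord(evStart, 0);
addKernel<<<100, 100>>>(dev_c, dev_a, dev_b);
cudaEventRecord(evStop, 0);
cudaEventSynchronize(evStop);                  // wait until the kernel has finished

float ms = 0.0f;
cudaEventElapsedTime(&ms, evStart, evStop);    // elapsed time in milliseconds
printf("kernel time: %f ms\n", ms);

cudaEventDestroy(evStart);
cudaEventDestroy(evStop);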

