CUDA编程入门示例1——两个向量对应元素相乘

#include <math.h>
#include <stdio.h>
#include <stdlib.h>

#include <cuda_runtime.h>
#include <device_launch_parameters.h>


// Evaluates to the smaller of a and b.
// The whole expansion is parenthesised so the macro can be embedded in a
// larger expression (e.g. `1 + MIN(x, y)`) without the surrounding operators
// binding to the condition of the ternary.
// Each argument may be evaluated twice; do not pass expressions with side effects.
#define MIN(a, b) (((a) < (b)) ? (a) : (b))


// Host-side buffers: two input vectors and the result vector.
// They are allocated with malloc() and released with free() in main().
float *h_A, *h_B, *h_C;

// Device-side counterparts handed to the DotMulVet kernel and released
// with cudaFree() in main().
float *d_A, *d_B, *d_C;


/**
 * Element-wise product of two vectors: C[i] = A[i] * B[i] for 0 <= i < N.
 *
 * Every thread starts at its global x index and then advances by the total
 * number of launched threads (gridDim.x * blockDim.x), so any 1-D launch
 * configuration covers all N elements. Only the x dimension of the grid and
 * block is used.
 *
 * @param A  first input vector, at least N floats
 * @param B  second input vector, at least N floats
 * @param C  output vector, at least N floats
 * @param N  number of elements to process; nothing is written when N <= 0
 */
__global__ void DotMulVet(const float* A,const float* B,float* C,int N)
{
    // Index arithmetic is done in 64 bits: with a 32-bit int, `index + stride`
    // can exceed INT_MAX for large N (signed overflow), and the unsigned
    // product blockIdx.x * blockDim.x can wrap for very large grids.
    const long long stride = (long long)gridDim.x * blockDim.x;

    for (long long i = (long long)blockIdx.x * blockDim.x + threadIdx.x;
         i < N;
         i += stride)
    {
        C[i] = A[i] * B[i];
    }
}


// Prints a diagnostic and terminates the program if a CUDA runtime call failed.
static void checkCuda(cudaError_t status, const char* what)
{
    if (status != cudaSuccess)
    {
        fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/**
 * Multiplies two N-element vectors element by element on the GPU, times the
 * kernel with CUDA events, and checks the result against a CPU computation.
 *
 * Returns 0 after reporting the comparison result; exits with EXIT_FAILURE if
 * a host allocation or any CUDA runtime call fails.
 */
int main()
{
    const int N = 1024;
    const size_t mem_size = sizeof(float) * N;
    const int threadsPerBlock = 256;
    // Ceil-division so a partially filled last block still covers the tail.
    const int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
    cudaEvent_t start, finish;
    float costTime = 0.0f;
    int i;

    checkCuda(cudaEventCreate(&start), "cudaEventCreate(start)");
    checkCuda(cudaEventCreate(&finish), "cudaEventCreate(finish)");

    printf("Start to malloc host memory...\n");
    h_A = (float*)malloc(mem_size);
    h_B = (float*)malloc(mem_size);
    h_C = (float*)malloc(mem_size);
    if (h_A == NULL || h_B == NULL || h_C == NULL)
    {
        fprintf(stderr, "Failed to allocate host memory\n");
        free(h_A);
        free(h_B);
        free(h_C);
        return EXIT_FAILURE;
    }

    // NOTE(review): the input values and the launch configuration below are
    // a reconstruction; confirm them against the intended example.
    for (i = 0; i < N; i++)
    {
        h_A[i] = (float)i;
        h_B[i] = 0.5f * (float)(N - i);
    }

    checkCuda(cudaMalloc((void**)&d_A, mem_size), "cudaMalloc(d_A)");
    checkCuda(cudaMalloc((void**)&d_B, mem_size), "cudaMalloc(d_B)");
    checkCuda(cudaMalloc((void**)&d_C, mem_size), "cudaMalloc(d_C)");

    // copy inputs from host to device
    checkCuda(cudaMemcpy(d_A, h_A, mem_size, cudaMemcpyHostToDevice), "cudaMemcpy(d_A)");
    checkCuda(cudaMemcpy(d_B, h_B, mem_size, cudaMemcpyHostToDevice), "cudaMemcpy(d_B)");

    // Time only the kernel: the events bracket the launch on the default stream.
    checkCuda(cudaEventRecord(start, 0), "cudaEventRecord(start)");
    DotMulVet<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, N);
    // A kernel launch returns no status; a bad configuration shows up here.
    checkCuda(cudaGetLastError(), "DotMulVet launch");
    checkCuda(cudaEventRecord(finish, 0), "cudaEventRecord(finish)");
    checkCuda(cudaEventSynchronize(finish), "cudaEventSynchronize(finish)");
    checkCuda(cudaEventElapsedTime(&costTime, start, finish), "cudaEventElapsedTime");

    // copy result from device to host
    checkCuda(cudaMemcpy(h_C, d_C, mem_size, cudaMemcpyDeviceToHost), "cudaMemcpy(h_C)");

    printf("Check result with CPU...\n");
    int errorCounts = 0;
    for (i = 0; i < N; i++)
    {
        // Compare with a tolerance rather than exact float equality.
        if (fabsf(h_C[i] - h_A[i] * h_B[i]) > 1e-6f)
        {
            errorCounts++;
        }
    }

    printf("Result: %s, errorCounts: %d\n",(0 == errorCounts) ? "Correct":"Wrong", errorCounts);
    printf("Cost Time : %f\n",costTime);

    free(h_A);
    free(h_B);
    free(h_C);

    checkCuda(cudaFree(d_A), "cudaFree(d_A)");
    checkCuda(cudaFree(d_B), "cudaFree(d_B)");
    checkCuda(cudaFree(d_C), "cudaFree(d_C)");

    // The events are runtime resources too; release them before exiting.
    checkCuda(cudaEventDestroy(start), "cudaEventDestroy(start)");
    checkCuda(cudaEventDestroy(finish), "cudaEventDestroy(finish)");

    return 0;
}

你可能感兴趣的:(CUDA编程常用代码示例)