cmake下c/c++与cuda混合编程测试

以下为混合编程测试例子一相对应的gpu计算能力和配置(本文测试gpuGTX1050),例子中计算的数组的平方和与数组元素求立方。(使用例子过程中遇到问题可以一起讨论)

英伟达部分gpu的计算能力:

Tesla V100
# ARCH= -gencode arch=compute_70,code=[sm_70,compute_70]

GTX 1080, GTX 1070, GTX 1060, GTX 1050, GT 1030, Titan Xp, Tesla P40, Tesla P4
# ARCH= -gencode arch=compute_61,code=sm_61 -gencode arch=compute_61,code=compute_61

GP100/Tesla P100 DGX-1
# ARCH= -gencode arch=compute_60,code=sm_60

For Jetson Tx1 uncomment:
# ARCH= -gencode arch=compute_53,code=[sm_53,compute_53]

For Jetson Tx2 or Drive-PX2 uncomment:
# ARCH= -gencode arch=compute_62,code=[sm_62,compute_62]

CMakeLists文件配置调用cuda库:

# Minimal CMake configuration for a mixed C++/CUDA executable (filter demo).
# cmake_minimum_required must be called before project() so policy settings
# apply to the whole configure run.
cmake_minimum_required(VERSION 2.8)
project(cudafilter2d)

set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 ")

#find_package(OpenCV 3.4 REQUIRED)
#INCLUDE_DIRECTORIES(${OpenCV_INCLUDE_DIRS})

# find_package(CUDA) already loads the FindCUDA module bundled with CMake;
# the original hard-coded INCLUDE(/usr/share/cmake-3.5/Modules/FindCUDA.cmake)
# breaks on any other CMake version/install prefix and was removed.
find_package(CUDA REQUIRED)

# Target GTX 1050 (compute capability 6.1); -G/-g add device/host debug info.
set(CUDA_NVCC_FLAGS -gencode arch=compute_61,code=sm_61;-G;-g)

CUDA_ADD_EXECUTABLE(main main.cpp filter.cu filter.h)
#target_link_libraries(main ${OpenCV_LIBS})

测试程序:

main.cpp:

// NOTE(review): the original post's #include lines lost their targets to
// HTML angle-bracket stripping; reconstructed from the identifiers used below
// (getTickCount/getTickFrequency come from OpenCV's core module).
#include <iostream>
#include "filter.h"
#include <opencv2/opencv.hpp>
#include <cstdio>
#include <cstdlib>
using namespace cv;
using namespace std;

// Demo driver: times one call of the GPU cube wrapper declared in filter.h.
int main()
{
    // data/data1 feed the commented-out CPU reference loop below; kept so the
    // CPU/GPU comparison experiment can be re-enabled.
    float data[96], data1[96];
    for (int i = 0; i < 96; i++)
        data[i] = float(i);

    double t1 = (double)getTickCount();
    CUDA_cube();
    // CPU reference implementation for timing comparison:
    // for (int i = 0; i < 96; i++)
    //     data1[i] = data[i] * data[i] * data[i];
    //CUDA_square();
    double t2 = ((double)getTickCount() - t1) / getTickFrequency();
    // Reconstructed: the original line was garbled ("//cout<        return 0;");
    // it was a commented-out print of the elapsed time in seconds.
    //cout << "time:" << t2 << endl;
    return 0;
}

filter.h:

#ifndef FILTER_H
#define FILTER_H
// Public interface of the CUDA filter demo.
// NOTE(review): the two bare #include lines in the original post lost their
// targets to HTML stripping; reconstructed as <cstdio> (printf in filter.cu)
// and OpenCV (the header carried "using namespace cv;").
#include <cstdio>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <opencv2/opencv.hpp>
using namespace cv;  // kept for compatibility with the original sample;
                     // avoid using-directives in headers in production code

// C-linkage wrappers around the kernels defined in filter.cu.
extern "C" void CUDA_square();  // sum of squares of 0..99, printed to stdout
extern "C" void CUDA_cube();    // element-wise cube of a 96-element array
#endif


filter.cu:

#include "filter.h"

// Sum of squares of data[0..99], accumulated into sum[0].
// Expected launch: square<<<1, 100>>> — one thread per element.
// Fix vs. the original: every thread ran the full 100-element loop with a
// plain non-atomic "sum[0] +=", which is a data race AND over-counts the
// total 100x. Each thread now squares exactly one element and combines the
// partials with atomicAdd; thread 0 zeroes the accumulator so the result is
// well-defined even if the host forgets to memset it.
__global__ void square (int *data,int *sum)
{
    if (threadIdx.x == 0)
        sum[0] = 0;
    __syncthreads();          // all threads wait for the accumulator reset

    int i = threadIdx.x;
    if (i < 100)              // bounds guard in case of a larger launch
        atomicAdd(&sum[0], data[i] * data[i]);
}
// Element-wise cube: d_out[i] = d_in[i]^3.
// Expects a single-block launch with blockDim.x == array length
// (invoked as cube<<<1, 96>>> from CUDA_cube).
__global__ void cube(float * d_out, float * d_in)
{
    const int tid = threadIdx.x;
    const float v = d_in[tid];
    d_out[tid] = v * v * v;
}
 void CUDA_cube()
 {
     const int ARRAY_SIZE = 96;
     const int ARRAY_BYTES = ARRAY_SIZE * sizeof(float);

     // generate the input array on the host
     float h_in[ARRAY_SIZE];
     for (int i = 0; i < ARRAY_SIZE; i++) {
         h_in[i] = float(i);
     }
     float h_out[ARRAY_SIZE];

     // declare GPU memory pointers
     float * d_in;
     float * d_out;

     // allocate GPU memory
     cudaMalloc((void**)&d_in, ARRAY_BYTES);
     cudaMalloc((void**)&d_out, ARRAY_BYTES);

     // transfer the array to the GPU
     cudaMemcpy(d_in, h_in, ARRAY_BYTES, cudaMemcpyHostToDevice);

     // launch the kernel
      double t1 = (double)getTickCount();
     cube << <1, ARRAY_SIZE >> >(d_out, d_in);
      double t2 = ((double)getTickCount()-t1)/getTickFrequency();
      printf("time:%f",t2);
//     cudaError_t cudaStatus = cudaGetLastError();
//     if (cudaStatus != cudaSuccess)
//     { fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));  }

     // copy back the result array to the CPU
     cudaMemcpy(h_out, d_out, ARRAY_BYTES, cudaMemcpyDeviceToHost);

     // print out the resulting array
//     for (int i = 0; i < ARRAY_SIZE; i++) {
//         printf("%f\n", h_out[i]);
//         //printf(((i % 4) != 3) ? "\t" : "\n");
//     }

     cudaFree(d_in);
     cudaFree(d_out);
 }
 void CUDA_square()
 {
     int data[100];
     int result[1];
     for(int i = 0; i<100;i++)
         data[i]=i;
     int *gpudata,*gpusum;
     cudaMalloc((void**) &gpudata,sizeof(int)*100);
     cudaMalloc((void**) &gpusum,sizeof(int));
     cudaMemcpy(gpudata,data,sizeof(int)*100,cudaMemcpyHostToDevice);
     square<<<1,100>>>(gpudata,gpusum);
     cudaMemcpy(result,gpusum,sizeof(int),cudaMemcpyDeviceToHost);
     printf("result:%d\n",result[0]);
 }

 

 

你可能感兴趣的:(linux,开源库学习,cuda,cmake)