CUDA C/C++ 与 GNU C/C++混编实例

背景说明:本文目的在于厘清CUDA C/C++与GNU C/C++的调用与编译链接关系。

代码参考:https://devblogs.nvidia.com/how-implement-performance-metrics-cuda-cc/

编译参考:https://stackoverflow.com/questions/9421108/how-can-i-compile-cuda-code-then-link-it-to-a-c-project

代码如下:

add.h(注意:这里头文件中的函数声明没有 __host__ 修饰符,因此普通的 C/C++ 编译器(如 g++)也能直接包含它)

#ifndef ADD_H
#define ADD_H

/* NOTE(review): the header name was lost when this snippet was extracted;
 * <stdio.h> is assumed because add.cu calls printf and includes only this
 * header - confirm against the original article. */
#include <stdio.h>

/* Runs the SAXPY demo on the GPU and prints the maximum error.
 * Declared without CUDA qualifiers so a plain C++ compiler can include it;
 * the definition lives in add.cu. */
void call(void);

/* Forwards to call(); the definition lives in add.cu. */
void wrapper(void);

#endif /* ADD_H */

add.cu

#include <math.h>
#include <stdio.h>
#include <stdlib.h>

#include "add.h"
// SAXPY kernel: y[i] = a * x[i] + y[i] for every i in [0, n).
//
// Parameters:
//   n - number of elements to process; nothing is written when n <= 0.
//   a - scalar multiplier.
//   x - input vector, read on the device.
//   y - input/output vector, updated in place on the device.
//
// Only the .x components of the launch configuration are used, so a 1D grid
// of 1D blocks is expected. Each thread starts at its global index and then
// advances by the total number of launched threads, so any grid size covers
// all n elements. The index is 64-bit so the product of block index and
// block size cannot wrap around on very large launches.
__global__ void saxpy(int n, float a, float *x, float *y)
{
  const long long stride = (long long)gridDim.x * blockDim.x;
  for (long long i = (long long)blockIdx.x * blockDim.x + threadIdx.x;
       i < n;
       i += stride) {
    y[i] = a * x[i] + y[i];
  }
}

// Prints a diagnostic to stderr when a CUDA runtime call fails.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaSucceeded(cudaError_t status, const char *what)
{
  if (status == cudaSuccess) return true;
  fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
  return false;
}

// Runs the SAXPY demo end to end:
//   1. allocates two host and two device buffers of N floats,
//   2. fills x with 1.0f and y with 2.0f and copies both to the device,
//   3. launches saxpy with a = 2.0f,
//   4. copies y back and prints the largest deviation from 4.0f.
//
// Every allocation, copy and the kernel launch is checked. After the first
// failure the remaining steps are skipped, a message is written to stderr,
// and the "Max error" line is not printed. All buffers are released on every
// path.
__host__ void call(void)
{
  const int N = 20 * (1 << 20);  // 20M elements
  const size_t bytes = (size_t)N * sizeof(float);
  const int threadsPerBlock = 512;
  const int numBlocks = (N + threadsPerBlock - 1) / threadsPerBlock;  // ceil-div

  float *x = (float*)malloc(bytes);
  float *y = (float*)malloc(bytes);
  float *d_x = NULL;
  float *d_y = NULL;

  bool ok = (x != NULL) && (y != NULL);
  if (!ok) {
    fprintf(stderr, "Host allocation of %zu bytes failed\n", bytes);
  }

  ok = ok && cudaSucceeded(cudaMalloc(&d_x, bytes), "cudaMalloc(d_x)");
  ok = ok && cudaSucceeded(cudaMalloc(&d_y, bytes), "cudaMalloc(d_y)");

  if (ok) {
    for (int i = 0; i < N; i++) {
      x[i] = 1.0f;
      y[i] = 2.0f;
    }
  }

  ok = ok && cudaSucceeded(cudaMemcpy(d_x, x, bytes, cudaMemcpyHostToDevice),
                           "cudaMemcpy(x -> d_x)");
  ok = ok && cudaSucceeded(cudaMemcpy(d_y, y, bytes, cudaMemcpyHostToDevice),
                           "cudaMemcpy(y -> d_y)");

  if (ok) {
    // Perform SAXPY (y = 2*x + y) on all N elements.
    saxpy<<<numBlocks, threadsPerBlock>>>(N, 2.0f, d_x, d_y);
    // A kernel launch returns nothing; launch errors are read back here.
    ok = cudaSucceeded(cudaGetLastError(), "saxpy launch");
  }

  // The blocking copy back also reports errors raised while the kernel ran.
  ok = ok && cudaSucceeded(cudaMemcpy(y, d_y, bytes, cudaMemcpyDeviceToHost),
                           "cudaMemcpy(d_y -> y)");

  if (ok) {
    float maxError = 0.0f;
    for (int i = 0; i < N; i++) {
      // Expected result per element: 2.0f * 1.0f + 2.0f = 4.0f.
      maxError = fmaxf(maxError, fabsf(y[i] - 4.0f));
    }
    printf("Max error: %f\n", maxError);
  }

  // cudaFree and free both accept null pointers, so cleanup is unconditional.
  cudaFree(d_x);
  cudaFree(d_y);
  free(x);
  free(y);
}

// Host-side pass-through: forwards to call() and does nothing else.
__host__ void wrapper(void) { call(); }

main.cpp

// NOTE(review): the header name was lost when this snippet was extracted;
// <stdio.h> is assumed - confirm against the original article.
#include <stdio.h>
#include "add.h"

// Host entry point built by the C++ compiler: invokes the two functions
// declared in add.h, first call() directly and then wrapper().
int main(void)
{
    call();
    wrapper();
    return 0;
}

编译

1、编译add.cu文件

nvcc -c -o add.o add.cu

2、混编cpp与add.o文件,这里有两种方法,一种用nvcc编译,一种是用g++编译。

A、使用nvcc混编:

nvcc -o main main.cpp add.o

B、使用g++混编(-L 后面应填写本机 CUDA 库所在目录,常见为 /usr/local/cuda/lib64;本例只用到运行时 API,链接 -lcudart 即可,-lcuda 可以省略):

g++ -o main main.cpp add.o -L/lib64 -lcuda -lcudart

Done!

你可能感兴趣的:(cuda,编程)