最近要处理大规模点云数据, 用CPU跑感觉有点慢,想通过GPU加速点云处理过程,于是想要学习CUDA编程。
很多教程提到,在安装CUDA之前,需要先打开cmd,输入nvidia-smi,查看显卡驱动支持的CUDA版本。这个步骤我在安装CUDA之前没有做,而是直接安装了,但还是建议先查一下。事后我补查了一下,nvidia-smi输出中的"CUDA Version"一栏显示的是11.8。
根据官方教程可以发现,VS2019和Win11是可以跟CUDA11.8配置的。
安装完毕之后,打开cmd,输入nvcc -V试一试看看能否查到CUDA版本,可以的话应该是没问题。
VS2019与CUDA配置
可以打开一个现有的VS项目,或者新建一个空项目。右键点击源文件,添加新建项,就可以创建一个CUDA文件,后缀是.cu;若是创建CUDA头文件,后缀则是.cuh。解决方案平台选择x64(例如 Debug | x64)。
然后,右键点击这个cu文件,选择属性,将项类型改成CUDA C++。
选中项目,点击右键–>生成依赖项–>生成自定义–>勾选CUDA 11.8
右键项目–>属性,找到CUDA C/C++–>Common,在"CUDA Toolkit Custom Dir"一栏中输入C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.8
当然,也还是要配置包含目录和库目录,这个和VS配置其他库一样
包含目录:C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.8\include
库目录:C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.8\lib\x64
链接器——>输入——>附加依赖项
cublas.lib
cublasLt.lib
cuda.lib
cudadevrt.lib
cudart.lib
cudart_static.lib
cufft.lib
cufftw.lib
cufilt.lib
curand.lib
cusolver.lib
cusolverMg.lib
cusparse.lib
nppc.lib
nppial.lib
nppicc.lib
nppidei.lib
nppif.lib
nppig.lib
nppim.lib
nppist.lib
nppisu.lib
nppitc.lib
npps.lib
nvblas.lib
nvjpeg.lib
nvml.lib
nvptxcompiler_static.lib
nvrtc.lib
nvrtc_static.lib
nvrtc-builtins_static.lib
OpenCL.lib
完成配置之后,通常要先运行一个例程试一试看看效果。但是报错
这篇博客解释说,这种情况下无论添加什么头文件都无法消除报错,我试了之后发现确实如此。不过代码仍然可以正常编译运行。
另一个博主也是这么说,传送门
error.cuh文件
#pragma once
#include <stdio.h>   // printf
#include <stdlib.h>  // exit

// CHECK(call) -- wrap a CUDA runtime API call that returns cudaError_t.
//
// The argument is evaluated exactly once. If the result is not cudaSuccess,
// the macro prints the source file, line number, numeric error code and the
// text from cudaGetErrorString(), then terminates the process with exit(1).
// On success it has no effect.
//
// The including file must make cudaError_t / cudaGetErrorString visible
// (for example via cuda_runtime.h, or by being compiled as a .cu file).
// The argument is parenthesised so that any expression can be passed safely,
// and the do { ... } while (0) wrapper makes the macro behave as a single
// statement inside unbraced if/else.
#define CHECK(call)                                       \
    do                                                    \
    {                                                     \
        const cudaError_t error_code = (call);            \
        if (error_code != cudaSuccess)                    \
        {                                                 \
            printf("CUDA Error:\n");                      \
            printf(" File: %s\n", __FILE__);              \
            printf(" Line: %d\n", __LINE__);              \
            printf(" Error code: %d\n", error_code);      \
            printf(" Error text: %s\n",                   \
                   cudaGetErrorString(error_code));       \
            exit(1);                                      \
        }                                                 \
    } while (0)
XXX.cu代码
#include <stdio.h>
#include <math.h>
#include "error.cuh"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "device_functions.h"
#include <stdlib.h>
// Edge length of the square shared-memory tiles used by ip_transpose; the
// kernel also uses it to turn block indices into matrix coordinates.
#define TILE_DIM 32
// Edge length of the square thread block used for the launch configuration.
#define BLOCK_SIZE 32
// ip_transpose indexes its TILE_DIM-sized tiles with threadIdx while main()
// launches BLOCK_SIZE x BLOCK_SIZE threads per block, so the two values must
// agree; fail the build instead of silently corrupting data if they diverge.
static_assert(TILE_DIM == BLOCK_SIZE,
              "ip_transpose requires BLOCK_SIZE == TILE_DIM");
// Edge length of the square matrix. 3001 is not a multiple of 32
// (3001 = 93 * 32 + 25), so the tiles on the last row/column are partial.
#define N 3001
// Input matrix in managed (unified) memory; the GPU transposes it in place,
// so after the kernel it holds the GPU result.
__managed__ int input_M[N * N];
// Host-side reference result produced by cpu_transpose.
int cpu_result[N * N];
// In-place transpose of the N x N row-major int matrix `data`.
//
// Launch layout: 2D grid of TILE_DIM x TILE_DIM thread blocks covering the
// whole matrix (ceil(N / TILE_DIM) blocks per dimension). Static shared
// memory: two TILE_DIM x (TILE_DIM + 1) int tiles; the extra column is the
// usual padding against bank conflicts on the transposed (column-wise) reads.
//
// Work split by block position:
//   * blockIdx.y <  blockIdx.x : nothing to do (handled by the mirror block)
//   * blockIdx.y == blockIdx.x : the tile is transposed onto itself
//   * blockIdx.y >  blockIdx.x : the tile and its mirror tile across the
//                                diagonal are swapped, each one transposed
// Every branch depends only on blockIdx, so all threads of a block reach the
// same __syncthreads().
__global__ void ip_transpose(int* data)
{
    __shared__ int tile_s[TILE_DIM][TILE_DIM + 1];
    __shared__ int tile_d[TILE_DIM][TILE_DIM + 1];

    const unsigned int tx = threadIdx.x;
    const unsigned int ty = threadIdx.y;
    const unsigned int bx = blockIdx.x;
    const unsigned int by = blockIdx.y;

    // Blocks strictly above the diagonal are idle.
    if (by < bx)
    {
        return;
    }

    // Element of this block's own tile handled by this thread.
    const int col = bx * TILE_DIM + tx;
    const int row = by * TILE_DIM + ty;
    const bool own_in_range = (col < N) && (row < N);

    if (by == bx)
    {
        // Diagonal tile: stage it, then write it back transposed.
        if (own_in_range)
        {
            tile_s[ty][tx] = data[row * N + col];
        }
        __syncthreads();
        if (own_in_range)
        {
            data[row * N + col] = tile_s[tx][ty];
        }
        return;
    }

    // Below the diagonal: element of the mirror tile handled by this thread.
    const int mirror_col = by * TILE_DIM + tx;
    const int mirror_row = bx * TILE_DIM + ty;
    const bool mirror_in_range = (mirror_col < N) && (mirror_row < N);

    // Stage both tiles before anything is overwritten.
    if (own_in_range)
    {
        tile_s[ty][tx] = data[row * N + col];
    }
    if (mirror_in_range)
    {
        tile_d[ty][tx] = data[mirror_row * N + mirror_col];
    }
    __syncthreads();

    // Cross-write: each tile receives the transposed content of the other.
    if (mirror_in_range)
    {
        data[mirror_row * N + mirror_col] = tile_s[tx][ty];
    }
    if (own_in_range)
    {
        data[row * N + col] = tile_d[tx][ty];
    }
}
// Host reference transpose: writes the transpose of the N x N row-major
// matrix A into B, i.e. B[col][row] = A[row][col] for every element.
// A is only read; B is only written.
void cpu_transpose(int* A, int* B)
{
    for (int row = 0; row < N; ++row)
    {
        const int* src_row = A + row * N;
        for (int col = 0; col < N; ++col)
        {
            B[col * N + row] = src_row[col];
        }
    }
}
// Driver for the in-place transpose demo.
//
// Fills input_M with pseudo-random values in [0, 999], computes the reference
// transpose on the CPU into cpu_result, runs ip_transpose on the GPU (timed
// with CUDA events), then compares the two results element by element and
// prints "Pass!!!" or "Error!!!". Any failing CUDA call aborts via CHECK.
// Command-line arguments are not used. Always returns 0 when it reaches the
// end.
int main(int argc, char const* argv[])
{
    (void)argc;
    (void)argv;

    cudaEvent_t start, stop_gpu;
    CHECK(cudaEventCreate(&start));
    CHECK(cudaEventCreate(&stop_gpu));

    for (int i = 0; i < N; ++i) {
        for (int j = 0; j < N; ++j) {
            input_M[i * N + j] = rand() % 1000;
        }
    }

    // The CPU reference must be computed before the kernel overwrites input_M.
    cpu_transpose(input_M, cpu_result);

    CHECK(cudaEventRecord(start));

    // Ceil-division so the partial tiles at the matrix edge are covered.
    unsigned int grid_rows = (N + BLOCK_SIZE - 1) / BLOCK_SIZE;
    unsigned int grid_cols = (N + BLOCK_SIZE - 1) / BLOCK_SIZE;
    dim3 dimGrid(grid_cols, grid_rows);
    dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);

    ip_transpose<<<dimGrid, dimBlock>>>(input_M);
    // A kernel launch returns no status; launch/configuration errors are only
    // visible through cudaGetLastError().
    CHECK(cudaGetLastError());
    // Surfaces errors raised while the kernel was executing and makes the
    // managed buffer safe to read on the host again.
    CHECK(cudaDeviceSynchronize());

    CHECK(cudaEventRecord(stop_gpu));
    CHECK(cudaEventSynchronize(stop_gpu));

    float elapsed_time_gpu;
    CHECK(cudaEventElapsedTime(&elapsed_time_gpu, start, stop_gpu));
    printf("Time_GPU = %g ms.\n", elapsed_time_gpu);

    CHECK(cudaEventDestroy(start));
    CHECK(cudaEventDestroy(stop_gpu));

    // Both buffers hold ints, so compare exactly and stop at the first
    // mismatch; a floating-point tolerance adds nothing here.
    int ok = 1;
    for (int idx = 0; idx < N * N; ++idx)
    {
        if (input_M[idx] != cpu_result[idx])
        {
            ok = 0;
            break;
        }
    }

    if (ok)
    {
        printf("Pass!!!\n");
    }
    else
    {
        printf("Error!!!\n");
    }
    return 0;
}
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>

// Prints the properties of every CUDA device; defined below.
void myDeviceInfo();

// Entry point of the device-query example: prints the device information,
// then synchronizes with the device and reports any error from that call.
// Returns 0 on success and 1 if cudaDeviceSynchronize() fails.
int main()
{
    // mykernel <<<1, 10 >>> ();
    myDeviceInfo();

    const cudaError_t sync_status = cudaDeviceSynchronize();
    if (sync_status != cudaSuccess)
    {
        fprintf(stderr, "cudaDeviceSynchronize failed: %s\n",
                cudaGetErrorString(sync_status));
        return 1;
    }
    return 0;
}
// Prints, for every CUDA device reported by cudaGetDeviceCount(), the SM
// count, the per-SM block/thread limits, the per-block thread limit and the
// maximum grid and block dimensions.
//
// If the device count cannot be obtained, an error is written to stderr and
// nothing else is printed. If the properties of one device cannot be read,
// an error is written to stderr and that device is skipped.
void myDeviceInfo()
{
    // Initialised so the loop bound is well-defined even if the query fails.
    int dev_count = 0;
    cudaError_t status = cudaGetDeviceCount(&dev_count);
    if (status != cudaSuccess) {
        fprintf(stderr, "cudaGetDeviceCount failed: %s\n",
                cudaGetErrorString(status));
        return;
    }

    for (int i = 0; i < dev_count; i++) {
        cudaDeviceProp dev_prop;
        status = cudaGetDeviceProperties(&dev_prop, i);
        if (status != cudaSuccess) {
            fprintf(stderr, "cudaGetDeviceProperties failed for device %d: %s\n",
                    i, cudaGetErrorString(status));
            continue;
        }

        printf("----------- Information of device %d -----------\n", i);
        printf("The streaming multiprocessor(SM) number is %d\n", dev_prop.multiProcessorCount);
        printf("The max thread block numberof per SM is %d\n", dev_prop.maxBlocksPerMultiProcessor);
        printf("The max threads number of per SM is %d\n", dev_prop.maxThreadsPerMultiProcessor);
        printf("The max threads number of per block is %d\n", dev_prop.maxThreadsPerBlock);
        printf("The max thread blocks number in (x, y, z) dim is (%d, %d, %d)\n", dev_prop.maxGridSize[0], dev_prop.maxGridSize[1], dev_prop.maxGridSize[2]);
        printf("The max threads number of (x, y, z) dim is (%d. %d, %d)\n", dev_prop.maxThreadsDim[0], dev_prop.maxThreadsDim[1], dev_prop.maxThreadsDim[2]);
        printf("----------- Information of device end -----------\n");
    }
}
教程1
教程2
教程3
教程4
教程5