Ubuntu22.04安装CUDA深度学习环境&&cuda principle


neofetch && uname -a|lolcat  # print system info, then pipe the `uname -a` line through lolcat

Ubuntu22.04安装CUDA深度学习环境&&cuda principle_第1张图片

install nvidia GPU driver:

sudo add-apt-repository ppa:graphics-drivers/ppa  # add the graphics-drivers PPA as a package source
sudo apt update  # refresh the package index
apt list --upgradable  # list the packages that can be upgraded
sudo apt upgrade  # upgrade every upgradable package
ubuntu-drivers devices    # list the driver packages Ubuntu detects for the NVIDIA card
sudo apt install nvidia-driver-510  # install the driver version offered for your own card

Ubuntu22.04安装CUDA深度学习环境&&cuda principle_第2张图片

ubuntu-drivers devices    # list the driver packages Ubuntu detects for the NVIDIA card
sudo apt install nvidia-driver-510  # install the driver version offered for your own card

disable the nouveau

Disable the nouveau driver by adding it to the module blacklist.


在 /etc/modprobe.d/blacklist.conf(或新建 /etc/modprobe.d/blacklist-nouveau.conf)的最后一行加上: blacklist nouveau

Ubuntu22.04安装CUDA深度学习环境&&cuda principle_第3张图片

and execute:

$ sudo update-initramfs -u  # rebuild the initramfs so the blacklist is applied at boot
$ reboot  # restart the machine

 reboot the system and execute the nvidia-smi:

The CUDA version shown in the nvidia-smi output does not mean the CUDA environment has already been installed; it only indicates the highest CUDA version that this driver supports.

sudo nvidia-settings  # open the NVIDIA driver settings tool
nvidia-smi  # show basic information about the GPU

Ubuntu22.04安装CUDA深度学习环境&&cuda principle_第4张图片

install cuda:

Ubuntu22.04安装CUDA深度学习环境&&cuda principle_第5张图片

# Download the cuda-keyring package for the Ubuntu 22.04 x86_64 CUDA repository.
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.0-1_all.deb
# Install the downloaded keyring package.
sudo dpkg -i cuda-keyring_1.0-1_all.deb
# Refresh the package index so the newly configured repository is seen.
sudo apt-get update
# Install the `cuda` package without prompting for confirmation.
sudo apt-get -y install cuda


Add the CUDA environment variables to the bash shell (for example, append them to ~/.bashrc):

# Make the CUDA 11.7 toolkit visible to the shell.
# CUDA_HOME names a single install directory, so it is assigned, not appended to
# (appending would yield ":/usr/local/cuda-11.7" when the variable was unset).
export CUDA_HOME=/usr/local/cuda-11.7
# ${VAR:+$VAR:} expands to nothing when VAR is unset or empty, which avoids a
# leading empty path entry (an empty entry means "current directory").
export LD_LIBRARY_PATH=${LD_LIBRARY_PATH:+$LD_LIBRARY_PATH:}$CUDA_HOME/lib64
export PATH=${PATH:+$PATH:}$CUDA_HOME/bin

test,printf on device:

/*
 * Device helper that prints its own name, the source line and the x index of
 * the block the calling thread belongs to.
 */
__device__ void add(void)
{
	printf("kernel %s line %d, i am in kernel thread in block %d.\n", __func__, __LINE__,blockIdx.x);
}

/*
 * Kernel: every thread prints the kernel name, the source line and the x index
 * of its block.
 */
__global__ void myfirstkernel(void)
{
	printf("kernel %s line %d, i am in kernel thread in block %d.\n", __func__, __LINE__,blockIdx.x);
}
/*
 * Launches myfirstkernel with 16 blocks of 1 thread each and waits for it to
 * finish.
 *
 * Returns 0 on success, 1 if the launch or the execution reported a CUDA error.
 */
int main(void)
{
	cudaError_t err;

	myfirstkernel <<<16,1>>>();

	/* A kernel launch returns nothing; launch errors surface through cudaGetLastError(). */
	err = cudaGetLastError();
	if (err != cudaSuccess) {
		printf("kernel launch failed: %s.\n", cudaGetErrorString(err));
		return 1;
	}

	/*
	 * Wait for the kernel before returning: without a synchronizing call the
	 * device-side printf output is not guaranteed to be flushed.
	 */
	err = cudaDeviceSynchronize();
	if (err != cudaSuccess) {
		printf("kernel execution failed: %s.\n", cudaGetErrorString(err));
		return 1;
	}

	return 0;
}

Ubuntu22.04安装CUDA深度学习环境&&cuda principle_第6张图片

algo cuda sample:

/*
 * Stores a + b through c, then prints the stored value together with the x
 * index of the calling thread's block.
 *
 * a, b: operands.
 * c:    destination of the sum; it is dereferenced in device code. If several
 *       threads pass the same pointer they all store to that one location.
 */
__device__ void add(int a, int b, int *c)
{
	*c = a + b;
	printf("kernel %s line %d, i am in kernel thread in block %d. *c = %d.\n", __func__, __LINE__,blockIdx.x, *c);
}

/*
 * Kernel: every thread prints the x index of its block, then calls add() to
 * store a + b through c.
 */
__global__ void myfirstkernel(int a, int b, int *c)
{
	printf("kernel %s line %d, i am in kernel thread in block %d.\n", __func__, __LINE__,blockIdx.x);
	add(a, b, c);
}
/*
 * Allocates one int on the device, launches myfirstkernel with 16 blocks of
 * 1 thread to compute 3 + 6 into it, copies the result back and prints it.
 *
 * Returns 0 on success, 1 if any CUDA call or the launch reported an error.
 */
int main(void)
{
	int c;
	int *gpu_c;
	cudaError_t err;

	err = cudaMalloc((void **)&gpu_c, sizeof(int));
	if (err != cudaSuccess) {
		printf("cudaMalloc failed: %s.\n", cudaGetErrorString(err));
		return 1;
	}

	myfirstkernel <<<16,1>>>(3, 6, gpu_c);

	/* A kernel launch returns nothing; launch errors surface through cudaGetLastError(). */
	err = cudaGetLastError();
	if (err == cudaSuccess) {
		/* The blocking copy waits for the kernel, so c holds the final value afterwards. */
		err = cudaMemcpy(&c, gpu_c, sizeof(int), cudaMemcpyDeviceToHost);
	}

	/* Release the device allocation on both the success and the error path. */
	cudaFree(gpu_c);

	if (err != cudaSuccess) {
		printf("CUDA error: %s.\n", cudaGetErrorString(err));
		return 1;
	}

	printf("exit.c = %d.\n", c);
	return 0;
}

Ubuntu22.04安装CUDA深度学习环境&&cuda principle_第7张图片

change thread and block

/*
 * Stores a + b through c, then prints the stored value together with the x
 * indices of the calling thread and of its block.
 *
 * a, b: operands.
 * c:    destination of the sum; it is dereferenced in device code. If several
 *       threads pass the same pointer they all store to that one location.
 */
__device__ void add(int a, int b, int *c)
{
	*c = a + b;
	printf("kernel %s line %d, i am in kernel thread %d in block %d. *c = %d.\n", __func__, __LINE__, threadIdx.x, blockIdx.x, *c);
}

/*
 * Kernel: every thread prints its thread and block x indices, then calls
 * add() to store a + b through c.
 */
__global__ void myfirstkernel(int a, int b, int *c)
{
	printf("kernel %s line %d, i am in kernel thread %d in block %d.\n", __func__, __LINE__,threadIdx.x, blockIdx.x);
	add(a, b, c);
}
/*
 * Allocates one int on the device, launches myfirstkernel with 1 block of
 * 16 threads to compute 3 + 6 into it, copies the result back and prints it.
 *
 * Returns 0 on success, 1 if any CUDA call or the launch reported an error.
 */
int main(void)
{
	int c;
	int *gpu_c;
	cudaError_t err;

	err = cudaMalloc((void **)&gpu_c, sizeof(int));
	if (err != cudaSuccess) {
		printf("cudaMalloc failed: %s.\n", cudaGetErrorString(err));
		return 1;
	}

	myfirstkernel <<<1,16>>>(3, 6, gpu_c);

	/* A kernel launch returns nothing; launch errors surface through cudaGetLastError(). */
	err = cudaGetLastError();
	if (err == cudaSuccess) {
		/* The blocking copy waits for the kernel, so c holds the final value afterwards. */
		err = cudaMemcpy(&c, gpu_c, sizeof(int), cudaMemcpyDeviceToHost);
	}

	/* Release the device allocation on both the success and the error path. */
	cudaFree(gpu_c);

	if (err != cudaSuccess) {
		printf("CUDA error: %s.\n", cudaGetErrorString(err));
		return 1;
	}

	printf("exit.c = %d.\n", c);
	return 0;
}

Ubuntu22.04安装CUDA深度学习环境&&cuda principle_第8张图片

/*
 * Stores a + b through c, then prints the stored value together with the x
 * indices of the calling thread and of its block.
 *
 * a, b: operands.
 * c:    destination of the sum; it is dereferenced in device code. If several
 *       threads pass the same pointer they all store to that one location.
 */
__device__ void add(int a, int b, int *c)
{
	*c = a + b;
	printf("kernel %s line %d, i am in kernel thread %d in block %d. *c = %d.\n", __func__, __LINE__, threadIdx.x, blockIdx.x, *c);
}

/*
 * Kernel: every thread prints its thread and block x indices, the block
 * dimensions (blockDim) and its full 3-component thread index (threadIdx),
 * then calls add() to store a + b through c.
 */
__global__ void myfirstkernel(int a, int b, int *c)
{
	printf("kernel %s line %d, i am in kernel thread %d in block %d.\n", __func__, __LINE__,threadIdx.x, blockIdx.x);
	printf("block.x = %d, block.y = %d,block.z = %d\n", blockDim.x, blockDim.y,blockDim.z);
	printf("thread.x = %d, thread.y = %d,thread.z = %d\n", threadIdx.x, threadIdx.y,threadIdx.z);
	add(a, b, c);
}
/*
 * Allocates one int on the device, launches myfirstkernel with 3 blocks of
 * 16 threads to compute 3 + 6 into it, copies the result back and prints it.
 *
 * Returns 0 on success, 1 if any CUDA call or the launch reported an error.
 */
int main(void)
{
	int c;
	int *gpu_c;
	cudaError_t err;

	err = cudaMalloc((void **)&gpu_c, sizeof(int));
	if (err != cudaSuccess) {
		printf("cudaMalloc failed: %s.\n", cudaGetErrorString(err));
		return 1;
	}

	myfirstkernel <<<3,16>>>(3, 6, gpu_c);

	/* A kernel launch returns nothing; launch errors surface through cudaGetLastError(). */
	err = cudaGetLastError();
	if (err == cudaSuccess) {
		/* The blocking copy waits for the kernel, so c holds the final value afterwards. */
		err = cudaMemcpy(&c, gpu_c, sizeof(int), cudaMemcpyDeviceToHost);
	}

	/* Release the device allocation on both the success and the error path. */
	cudaFree(gpu_c);

	if (err != cudaSuccess) {
		printf("CUDA error: %s.\n", cudaGetErrorString(err));
		return 1;
	}

	printf("exit.c = %d.\n", c);
	return 0;
}
czl@czl-RedmiBook-14:~/workspace/work$ ./a.out 
kernel myfirstkernel line 12, i am in kernel thread 0 in block 0.
kernel myfirstkernel line 12, i am in kernel thread 1 in block 0.
kernel myfirstkernel line 12, i am in kernel thread 2 in block 0.
kernel myfirstkernel line 12, i am in kernel thread 3 in block 0.
kernel myfirstkernel line 12, i am in kernel thread 4 in block 0.
kernel myfirstkernel line 12, i am in kernel thread 5 in block 0.
kernel myfirstkernel line 12, i am in kernel thread 6 in block 0.
kernel myfirstkernel line 12, i am in kernel thread 7 in block 0.
kernel myfirstkernel line 12, i am in kernel thread 8 in block 0.
kernel myfirstkernel line 12, i am in kernel thread 9 in block 0.
kernel myfirstkernel line 12, i am in kernel thread 10 in block 0.
kernel myfirstkernel line 12, i am in kernel thread 11 in block 0.
kernel myfirstkernel line 12, i am in kernel thread 12 in block 0.
kernel myfirstkernel line 12, i am in kernel thread 13 in block 0.
kernel myfirstkernel line 12, i am in kernel thread 14 in block 0.
kernel myfirstkernel line 12, i am in kernel thread 15 in block 0.
kernel myfirstkernel line 12, i am in kernel thread 0 in block 1.
kernel myfirstkernel line 12, i am in kernel thread 1 in block 1.
kernel myfirstkernel line 12, i am in kernel thread 2 in block 1.
kernel myfirstkernel line 12, i am in kernel thread 3 in block 1.
kernel myfirstkernel line 12, i am in kernel thread 4 in block 1.
kernel myfirstkernel line 12, i am in kernel thread 5 in block 1.
kernel myfirstkernel line 12, i am in kernel thread 6 in block 1.
kernel myfirstkernel line 12, i am in kernel thread 7 in block 1.
kernel myfirstkernel line 12, i am in kernel thread 8 in block 1.
kernel myfirstkernel line 12, i am in kernel thread 9 in block 1.
kernel myfirstkernel line 12, i am in kernel thread 10 in block 1.
kernel myfirstkernel line 12, i am in kernel thread 11 in block 1.
kernel myfirstkernel line 12, i am in kernel thread 12 in block 1.
kernel myfirstkernel line 12, i am in kernel thread 13 in block 1.
kernel myfirstkernel line 12, i am in kernel thread 14 in block 1.
kernel myfirstkernel line 12, i am in kernel thread 15 in block 1.
kernel myfirstkernel line 12, i am in kernel thread 0 in block 2.
kernel myfirstkernel line 12, i am in kernel thread 1 in block 2.
kernel myfirstkernel line 12, i am in kernel thread 2 in block 2.
kernel myfirstkernel line 12, i am in kernel thread 3 in block 2.
kernel myfirstkernel line 12, i am in kernel thread 4 in block 2.
kernel myfirstkernel line 12, i am in kernel thread 5 in block 2.
kernel myfirstkernel line 12, i am in kernel thread 6 in block 2.
kernel myfirstkernel line 12, i am in kernel thread 7 in block 2.
kernel myfirstkernel line 12, i am in kernel thread 8 in block 2.
kernel myfirstkernel line 12, i am in kernel thread 9 in block 2.
kernel myfirstkernel line 12, i am in kernel thread 10 in block 2.
kernel myfirstkernel line 12, i am in kernel thread 11 in block 2.
kernel myfirstkernel line 12, i am in kernel thread 12 in block 2.
kernel myfirstkernel line 12, i am in kernel thread 13 in block 2.
kernel myfirstkernel line 12, i am in kernel thread 14 in block 2.
kernel myfirstkernel line 12, i am in kernel thread 15 in block 2.
block.x = 16, block.y = 1,block.z = 1
block.x = 16, block.y = 1,block.z = 1
block.x = 16, block.y = 1,block.z = 1
block.x = 16, block.y = 1,block.z = 1
block.x = 16, block.y = 1,block.z = 1
block.x = 16, block.y = 1,block.z = 1
block.x = 16, block.y = 1,block.z = 1
block.x = 16, block.y = 1,block.z = 1
block.x = 16, block.y = 1,block.z = 1
block.x = 16, block.y = 1,block.z = 1
block.x = 16, block.y = 1,block.z = 1
block.x = 16, block.y = 1,block.z = 1
block.x = 16, block.y = 1,block.z = 1
block.x = 16, block.y = 1,block.z = 1
block.x = 16, block.y = 1,block.z = 1
block.x = 16, block.y = 1,block.z = 1
block.x = 16, block.y = 1,block.z = 1
block.x = 16, block.y = 1,block.z = 1
block.x = 16, block.y = 1,block.z = 1
block.x = 16, block.y = 1,block.z = 1
block.x = 16, block.y = 1,block.z = 1
block.x = 16, block.y = 1,block.z = 1
block.x = 16, block.y = 1,block.z = 1
block.x = 16, block.y = 1,block.z = 1
block.x = 16, block.y = 1,block.z = 1
block.x = 16, block.y = 1,block.z = 1
block.x = 16, block.y = 1,block.z = 1
block.x = 16, block.y = 1,block.z = 1
block.x = 16, block.y = 1,block.z = 1
block.x = 16, block.y = 1,block.z = 1
block.x = 16, block.y = 1,block.z = 1
block.x = 16, block.y = 1,block.z = 1
block.x = 16, block.y = 1,block.z = 1
block.x = 16, block.y = 1,block.z = 1
block.x = 16, block.y = 1,block.z = 1
block.x = 16, block.y = 1,block.z = 1
block.x = 16, block.y = 1,block.z = 1
block.x = 16, block.y = 1,block.z = 1
block.x = 16, block.y = 1,block.z = 1
block.x = 16, block.y = 1,block.z = 1
block.x = 16, block.y = 1,block.z = 1
block.x = 16, block.y = 1,block.z = 1
block.x = 16, block.y = 1,block.z = 1
block.x = 16, block.y = 1,block.z = 1
block.x = 16, block.y = 1,block.z = 1
block.x = 16, block.y = 1,block.z = 1
block.x = 16, block.y = 1,block.z = 1
block.x = 16, block.y = 1,block.z = 1
thread.x = 0, thread.y = 0,thread.z = 0
thread.x = 1, thread.y = 0,thread.z = 0
thread.x = 2, thread.y = 0,thread.z = 0
thread.x = 3, thread.y = 0,thread.z = 0
thread.x = 4, thread.y = 0,thread.z = 0
thread.x = 5, thread.y = 0,thread.z = 0
thread.x = 6, thread.y = 0,thread.z = 0
thread.x = 7, thread.y = 0,thread.z = 0
thread.x = 8, thread.y = 0,thread.z = 0
thread.x = 9, thread.y = 0,thread.z = 0
thread.x = 10, thread.y = 0,thread.z = 0
thread.x = 11, thread.y = 0,thread.z = 0
thread.x = 12, thread.y = 0,thread.z = 0
thread.x = 13, thread.y = 0,thread.z = 0
thread.x = 14, thread.y = 0,thread.z = 0
thread.x = 15, thread.y = 0,thread.z = 0
thread.x = 0, thread.y = 0,thread.z = 0
thread.x = 1, thread.y = 0,thread.z = 0
thread.x = 2, thread.y = 0,thread.z = 0
thread.x = 3, thread.y = 0,thread.z = 0
thread.x = 4, thread.y = 0,thread.z = 0
thread.x = 5, thread.y = 0,thread.z = 0
thread.x = 6, thread.y = 0,thread.z = 0
thread.x = 7, thread.y = 0,thread.z = 0
thread.x = 8, thread.y = 0,thread.z = 0
thread.x = 9, thread.y = 0,thread.z = 0
thread.x = 10, thread.y = 0,thread.z = 0
thread.x = 11, thread.y = 0,thread.z = 0
thread.x = 12, thread.y = 0,thread.z = 0
thread.x = 13, thread.y = 0,thread.z = 0
thread.x = 14, thread.y = 0,thread.z = 0
thread.x = 15, thread.y = 0,thread.z = 0
thread.x = 0, thread.y = 0,thread.z = 0
thread.x = 1, thread.y = 0,thread.z = 0
thread.x = 2, thread.y = 0,thread.z = 0
thread.x = 3, thread.y = 0,thread.z = 0
thread.x = 4, thread.y = 0,thread.z = 0
thread.x = 5, thread.y = 0,thread.z = 0
thread.x = 6, thread.y = 0,thread.z = 0
thread.x = 7, thread.y = 0,thread.z = 0
thread.x = 8, thread.y = 0,thread.z = 0
thread.x = 9, thread.y = 0,thread.z = 0
thread.x = 10, thread.y = 0,thread.z = 0
thread.x = 11, thread.y = 0,thread.z = 0
thread.x = 12, thread.y = 0,thread.z = 0
thread.x = 13, thread.y = 0,thread.z = 0
thread.x = 14, thread.y = 0,thread.z = 0
thread.x = 15, thread.y = 0,thread.z = 0
kernel add line 7, i am in kernel thread 0 in block 0. *c = 9.
kernel add line 7, i am in kernel thread 1 in block 0. *c = 9.
kernel add line 7, i am in kernel thread 2 in block 0. *c = 9.
kernel add line 7, i am in kernel thread 3 in block 0. *c = 9.
kernel add line 7, i am in kernel thread 4 in block 0. *c = 9.
kernel add line 7, i am in kernel thread 5 in block 0. *c = 9.
kernel add line 7, i am in kernel thread 6 in block 0. *c = 9.
kernel add line 7, i am in kernel thread 7 in block 0. *c = 9.
kernel add line 7, i am in kernel thread 8 in block 0. *c = 9.
kernel add line 7, i am in kernel thread 9 in block 0. *c = 9.
kernel add line 7, i am in kernel thread 10 in block 0. *c = 9.
kernel add line 7, i am in kernel thread 11 in block 0. *c = 9.
kernel add line 7, i am in kernel thread 12 in block 0. *c = 9.
kernel add line 7, i am in kernel thread 13 in block 0. *c = 9.
kernel add line 7, i am in kernel thread 14 in block 0. *c = 9.
kernel add line 7, i am in kernel thread 15 in block 0. *c = 9.
kernel add line 7, i am in kernel thread 0 in block 2. *c = 9.
kernel add line 7, i am in kernel thread 1 in block 2. *c = 9.
kernel add line 7, i am in kernel thread 2 in block 2. *c = 9.
kernel add line 7, i am in kernel thread 3 in block 2. *c = 9.
kernel add line 7, i am in kernel thread 4 in block 2. *c = 9.
kernel add line 7, i am in kernel thread 5 in block 2. *c = 9.
kernel add line 7, i am in kernel thread 6 in block 2. *c = 9.
kernel add line 7, i am in kernel thread 7 in block 2. *c = 9.
kernel add line 7, i am in kernel thread 8 in block 2. *c = 9.
kernel add line 7, i am in kernel thread 9 in block 2. *c = 9.
kernel add line 7, i am in kernel thread 10 in block 2. *c = 9.
kernel add line 7, i am in kernel thread 11 in block 2. *c = 9.
kernel add line 7, i am in kernel thread 12 in block 2. *c = 9.
kernel add line 7, i am in kernel thread 13 in block 2. *c = 9.
kernel add line 7, i am in kernel thread 14 in block 2. *c = 9.
kernel add line 7, i am in kernel thread 15 in block 2. *c = 9.
kernel add line 7, i am in kernel thread 0 in block 1. *c = 9.
kernel add line 7, i am in kernel thread 1 in block 1. *c = 9.
kernel add line 7, i am in kernel thread 2 in block 1. *c = 9.
kernel add line 7, i am in kernel thread 3 in block 1. *c = 9.
kernel add line 7, i am in kernel thread 4 in block 1. *c = 9.
kernel add line 7, i am in kernel thread 5 in block 1. *c = 9.
kernel add line 7, i am in kernel thread 6 in block 1. *c = 9.
kernel add line 7, i am in kernel thread 7 in block 1. *c = 9.
kernel add line 7, i am in kernel thread 8 in block 1. *c = 9.
kernel add line 7, i am in kernel thread 9 in block 1. *c = 9.
kernel add line 7, i am in kernel thread 10 in block 1. *c = 9.
kernel add line 7, i am in kernel thread 11 in block 1. *c = 9.
kernel add line 7, i am in kernel thread 12 in block 1. *c = 9.
kernel add line 7, i am in kernel thread 13 in block 1. *c = 9.
kernel add line 7, i am in kernel thread 14 in block 1. *c = 9.
kernel add line 7, i am in kernel thread 15 in block 1. *c = 9.
exit.c = 9.


/*
 * Stores a + b through c, then prints the stored value together with the
 * calling thread's x index and all three components of its block index.
 *
 * a, b: operands.
 * c:    destination of the sum; it is dereferenced in device code. If several
 *       threads pass the same pointer they all store to that one location.
 */
__device__ void add(int a, int b, int *c)
{
	*c = a + b;
	printf("kernel %s line %d, i am in kernel thread %d in blockidx.x %d. blokidx.y %d blockidx.z %d *c = %d.\n", __func__, __LINE__, threadIdx.x, blockIdx.x, blockIdx.y,blockIdx.z,*c);
}

/*
 * Kernel: every thread prints its thread and block x indices, the block
 * dimensions (blockDim), its 3-component thread index (threadIdx) and the
 * grid dimensions (gridDim), then calls add() to store a + b through c.
 */
__global__ void myfirstkernel(int a, int b, int *c)
{
	printf("kernel %s line %d, i am in kernel thread %d in block %d.\n", __func__, __LINE__,threadIdx.x, blockIdx.x);
	printf("block.x = %d, block.y = %d,block.z = %d\n", blockDim.x, blockDim.y,blockDim.z);
	printf("thread.x = %d, thread.y = %d,thread.z = %d\n", threadIdx.x, threadIdx.y,threadIdx.z);
	printf("gridDim.x = %d, gridDim.y = %d,gridDim.z = %d\n", gridDim.x, gridDim.y,gridDim.z);
	add(a, b, c);
}
/*
 * Allocates one int on the device, launches myfirstkernel with 2 blocks of
 * 3 threads to compute 3 + 6 into it, copies the result back and prints it.
 *
 * Returns 0 on success, 1 if any CUDA call or the launch reported an error.
 */
int main(void)
{
	int c;
	int *gpu_c;
	cudaError_t err;

	err = cudaMalloc((void **)&gpu_c, sizeof(int));
	if (err != cudaSuccess) {
		printf("cudaMalloc failed: %s.\n", cudaGetErrorString(err));
		return 1;
	}

	myfirstkernel <<<2,3>>>(3, 6, gpu_c);

	/* A kernel launch returns nothing; launch errors surface through cudaGetLastError(). */
	err = cudaGetLastError();
	if (err == cudaSuccess) {
		/* The blocking copy waits for the kernel, so c holds the final value afterwards. */
		err = cudaMemcpy(&c, gpu_c, sizeof(int), cudaMemcpyDeviceToHost);
	}

	/* Release the device allocation on both the success and the error path. */
	cudaFree(gpu_c);

	if (err != cudaSuccess) {
		printf("CUDA error: %s.\n", cudaGetErrorString(err));
		return 1;
	}

	printf("exit.c = %d.\n", c);
	return 0;
}

Ubuntu22.04安装CUDA深度学习环境&&cuda principle_第9张图片

kernel call convention:

The conventional form of a kernel invocation is:

// 3 x 2 x 1 = 6 elements; presumably the grid dimensions (in blocks) of a launch.
dim3 gridSize(3,2,1);

// 2 x 2 x 2 = 8 elements; presumably the block dimensions (in threads) of a launch.
dim3 blockSize(2,2,2);


Ubuntu22.04安装CUDA深度学习环境&&cuda principle_第10张图片



那么执行的线程是如何知道自己在GRID,BLOCK,THREAD中的位置的呢?一种更简单的方案是让每个线程把自己的X索引(也就是threadIdx.x)记录下来。线程索引是线程在每个BLOCK里的索引,由于BLOCK的尺寸是三维的,因此线程索引也是一个三维常向量threadIdx,访问方式为:threadIdx.x, threadIdx.y, threadIdx.z。对于一个BLOCK来说,它其中的每个线程的索引是唯一的,但是当一个GRID中有两个以上的BLOCK时,其中就会出现重复的线程索引。相应的,每个GRID里面的BLOCK也有唯一的BLOCK索引,用blockIdx表示,它同样是一个三维常向量:blockIdx.x, blockIdx.y, blockIdx.z。由于一次函数调用中只有一个GRID,因此不存在GRID索引。

对于BLOCK和GRID的尺寸,也用相应的三维常向量来表示,BLOCK的尺寸保存在常向量blockDim中,GRID的尺寸保存在gridDim中,他们都是CUDA C的内建变量,可以直接在设备代码中使用,在计算中,用户常常要给每个线程一个唯一的标识符,即线程号,以便给每个线程分配不同的任务。在多个BLOCK的情况下,线程号也不能重复。线程号在实际使用中很重要,它关系到被处理的数据在输入数组中的位置,也关系到线程的分配和存储器的使用问题。


1D grid && 1d block.

Ubuntu22.04安装CUDA深度学习环境&&cuda principle_第11张图片

/*
 * Flat global thread index for a 1D grid of 1D blocks: the threads in all
 * preceding blocks (blockIdx.x * blockDim.x) plus this thread's x offset.
 */
__device__ int get_globalidx_1d_1d(void)
{
	return blockIdx.x * blockDim.x + threadIdx.x;
}

1D grid && 2d block.

Ubuntu22.04安装CUDA深度学习环境&&cuda principle_第12张图片

/*
 * Flat global thread index for a 1D grid of 2D blocks: the threads in all
 * preceding blocks (blockIdx.x * blockDim.x * blockDim.y) plus the row-major
 * offset of this thread inside its block (y * blockDim.x + x).
 */
__device__ int get_globalidx_1d_2d(void)
{
	return blockIdx.x * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x;
}

1d grid && 3d block

Ubuntu22.04安装CUDA深度学习环境&&cuda principle_第13张图片

/*
 * Flat global thread index for a 1D grid of 3D blocks: the threads in all
 * preceding blocks (blockIdx.x * block volume) plus this thread's offset
 * inside its block, with x varying fastest, then y, then z.
 */
__device__ int get_globalidx_1d_3d(void)
{
	return blockIdx.x * blockDim.x * blockDim.y * blockDim.z + threadIdx.z * blockDim.y * blockDim.x + threadIdx.y * blockDim.x + threadIdx.x;
}

2d grid && 1d block

Ubuntu22.04安装CUDA深度学习环境&&cuda principle_第14张图片

/*
 * Flat global thread index for a 2D grid of 1D blocks.
 * Blocks are numbered row-major across the grid (y * gridDim.x + x); the
 * result is that block number times the block width plus threadIdx.x.
 */
__device__ int get_globalidx_2d_1d(void)
{
	int blockid = blockIdx.y * gridDim.x + blockIdx.x;
	int threadid = blockid * blockDim.x + threadIdx.x;

	return threadid;
}

2d grid && 2d block

Ubuntu22.04安装CUDA深度学习环境&&cuda principle_第15张图片

/*
 * Flat global thread index for a 2D grid of 2D blocks.
 * Blocks are numbered row-major across the grid (y * gridDim.x + x); the
 * result is that block number times the threads per block (x * y) plus the
 * row-major offset of this thread inside its block.
 */
__device__ int get_globalidx_2d_2d(void)
{
	int blockid = blockIdx.y * gridDim.x + blockIdx.x;
	int threadid = blockid * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x;

	return threadid;
}

2d grid && 3d block

Ubuntu22.04安装CUDA深度学习环境&&cuda principle_第16张图片

/*
 * Flat global thread index for a 2D grid of 3D blocks.
 * Blocks are numbered row-major across the grid (y * gridDim.x + x); the
 * result is that block number times the block volume plus this thread's
 * offset inside its block, with x varying fastest, then y, then z.
 */
__device__ int get_globalidx_2d_3d(void)
{
	int blockid = blockIdx.y * gridDim.x + blockIdx.x;
	int threadid = blockid * (blockDim.x * blockDim.y * blockDim.z) + (threadIdx.z * (blockDim.x * blockDim.y)) + (threadIdx.y * blockDim.x) + threadIdx.x;

	return threadid;
}

3d grid && 1d block

Ubuntu22.04安装CUDA深度学习环境&&cuda principle_第17张图片

/*
 * Flat global thread index for a 3D grid of 1D blocks.
 * Blocks are numbered with x varying fastest, then y, then z; the result is
 * that block number times the block width plus threadIdx.x.
 */
__device__ int get_globalidx_3d_1d(void)
{
	int blockid = blockIdx.x + blockIdx.y * gridDim.x + blockIdx.z * gridDim.x * gridDim.y;
	int threadid = blockid * blockDim.x + threadIdx.x;

	return threadid;
}

3d grid && 2d block

Ubuntu22.04安装CUDA深度学习环境&&cuda principle_第18张图片

/*
 * Flat global thread index for a 3D grid of 2D blocks.
 * Blocks are numbered with x varying fastest, then y, then z; the result is
 * that block number times the threads per block (x * y) plus the row-major
 * offset of this thread inside its block.
 */
__device__ int get_globalidx_3d_2d(void)
{
	int blockid = blockIdx.x + blockIdx.y * gridDim.x + blockIdx.z * gridDim.x * gridDim.y;
	int threadid = blockid * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x;

	return threadid;
}

3d grid && 3d block

Ubuntu22.04安装CUDA深度学习环境&&cuda principle_第19张图片

/*
 * Flat global thread index for a 3D grid of 3D blocks.
 * Blocks are numbered with x varying fastest, then y, then z; the result is
 * that block number times the block volume plus this thread's offset inside
 * its block, again with x varying fastest, then y, then z.
 */
__device__ int get_globalidx_3d_3d(void)
{
	int blockid = blockIdx.x + blockIdx.y * gridDim.x + blockIdx.z * gridDim.x * gridDim.y;
	int threadid = blockid * (blockDim.x * blockDim.y * blockDim.z) + (threadIdx.z * (blockDim.x * blockDim.y)) + (threadIdx.y * blockDim.x) + threadIdx.x;

	return threadid;
}


/*
 * Stores a + b through c, then prints the stored value together with the
 * calling thread's x index and all three components of its block index.
 *
 * a, b: operands.
 * c:    destination of the sum; it is dereferenced in device code. If several
 *       threads pass the same pointer they all store to that one location.
 */
__device__ void add(int a, int b, int *c)
{
	*c = a + b;
	printf("kernel %s line %d, i am in kernel thread %d in blockidx.x %d. blokidx.y %d blockidx.z %d *c = %d.\n", __func__, __LINE__, threadIdx.x, blockIdx.x, blockIdx.y,blockIdx.z,*c);
}

/*
 * Kernel: every thread prints its thread and block x indices, the block
 * dimensions (blockDim), its 3-component thread index (threadIdx) and the
 * grid dimensions (gridDim), then calls add() to store a + b through c.
 */
__global__ void myfirstkernel(int a, int b, int *c)
{
	printf("kernel %s line %d, i am in kernel thread %d in block %d.\n", __func__, __LINE__,threadIdx.x, blockIdx.x);
	printf("block.x = %d, block.y = %d,block.z = %d\n", blockDim.x, blockDim.y,blockDim.z);
	printf("thread.x = %d, thread.y = %d,thread.z = %d\n", threadIdx.x, threadIdx.y,threadIdx.z);
	printf("gridDim.x = %d, gridDim.y = %d,gridDim.z = %d\n", gridDim.x, gridDim.y,gridDim.z);
	add(a, b, c);
}

/*
 * Flat global thread index for a 1D grid of 1D blocks: the threads in all
 * preceding blocks (blockIdx.x * blockDim.x) plus this thread's x offset.
 */
__device__ int get_globalidx_1d_1d(void)
{
	return blockIdx.x * blockDim.x + threadIdx.x;
}

/*
 * Flat global thread index for a 1D grid of 2D blocks: the threads in all
 * preceding blocks (blockIdx.x * blockDim.x * blockDim.y) plus the row-major
 * offset of this thread inside its block (y * blockDim.x + x).
 */
__device__ int get_globalidx_1d_2d(void)
{
	return blockIdx.x * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x;
}

/*
 * Flat global thread index for a 1D grid of 3D blocks: the threads in all
 * preceding blocks (blockIdx.x * block volume) plus this thread's offset
 * inside its block, with x varying fastest, then y, then z.
 */
__device__ int get_globalidx_1d_3d(void)
{
	return blockIdx.x * blockDim.x * blockDim.y * blockDim.z + threadIdx.z * blockDim.y * blockDim.x + threadIdx.y * blockDim.x + threadIdx.x;
}

/*
 * Flat global thread index for a 2D grid of 1D blocks.
 * Blocks are numbered row-major across the grid (y * gridDim.x + x); the
 * result is that block number times the block width plus threadIdx.x.
 */
__device__ int get_globalidx_2d_1d(void)
{
	int blockid = blockIdx.y * gridDim.x + blockIdx.x;
	int threadid = blockid * blockDim.x + threadIdx.x;

	return threadid;
}

/*
 * Flat global thread index for a 2D grid of 2D blocks.
 * Blocks are numbered row-major across the grid (y * gridDim.x + x); the
 * result is that block number times the threads per block (x * y) plus the
 * row-major offset of this thread inside its block.
 */
__device__ int get_globalidx_2d_2d(void)
{
	int blockid = blockIdx.y * gridDim.x + blockIdx.x;
	int threadid = blockid * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x;

	return threadid;
}

/*
 * Flat global thread index for a 2D grid of 3D blocks.
 * Blocks are numbered row-major across the grid (y * gridDim.x + x); the
 * result is that block number times the block volume plus this thread's
 * offset inside its block, with x varying fastest, then y, then z.
 */
__device__ int get_globalidx_2d_3d(void)
{
	int blockid = blockIdx.y * gridDim.x + blockIdx.x;
	int threadid = blockid * (blockDim.x * blockDim.y * blockDim.z) + (threadIdx.z * (blockDim.x * blockDim.y)) + (threadIdx.y * blockDim.x) + threadIdx.x;

	return threadid;
}

/*
 * Flat global thread index for a 3D grid of 1D blocks.
 * Blocks are numbered with x varying fastest, then y, then z; the result is
 * that block number times the block width plus threadIdx.x.
 */
__device__ int get_globalidx_3d_1d(void)
{
	int blockid = blockIdx.x + blockIdx.y * gridDim.x + blockIdx.z * gridDim.x * gridDim.y;
	int threadid = blockid * blockDim.x + threadIdx.x;

	return threadid;
}

/*
 * Flat global thread index for a 3D grid of 2D blocks.
 * Blocks are numbered with x varying fastest, then y, then z; the result is
 * that block number times the threads per block (x * y) plus the row-major
 * offset of this thread inside its block.
 */
__device__ int get_globalidx_3d_2d(void)
{
	int blockid = blockIdx.x + blockIdx.y * gridDim.x + blockIdx.z * gridDim.x * gridDim.y;
	int threadid = blockid * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x;

	return threadid;
}

/*
 * Flat global thread index for a 3D grid of 3D blocks.
 * Blocks are numbered with x varying fastest, then y, then z; the result is
 * that block number times the block volume plus this thread's offset inside
 * its block, again with x varying fastest, then y, then z.
 */
__device__ int get_globalidx_3d_3d(void)
{
	int blockid = blockIdx.x + blockIdx.y * gridDim.x + blockIdx.z * gridDim.x * gridDim.y;
	int threadid = blockid * (blockDim.x * blockDim.y * blockDim.z) + (threadIdx.z * (blockDim.x * blockDim.y)) + (threadIdx.y * blockDim.x) + threadIdx.x;

	return threadid;
}
/*
 * Allocates one int on the device, launches myfirstkernel with 2 blocks of
 * 3 threads to compute 3 + 6 into it, copies the result back and prints it.
 *
 * Returns 0 on success, 1 if any CUDA call or the launch reported an error.
 */
__host__ int main(void)
{
	int c;
	int *gpu_c;
	cudaError_t err;

	err = cudaMalloc((void **)&gpu_c, sizeof(int));
	if (err != cudaSuccess) {
		printf("cudaMalloc failed: %s.\n", cudaGetErrorString(err));
		return 1;
	}

	myfirstkernel <<<2,3>>>(3, 6, gpu_c);

	/* A kernel launch returns nothing; launch errors surface through cudaGetLastError(). */
	err = cudaGetLastError();
	if (err == cudaSuccess) {
		/* The blocking copy waits for the kernel, so c holds the final value afterwards. */
		err = cudaMemcpy(&c, gpu_c, sizeof(int), cudaMemcpyDeviceToHost);
	}

	/* Release the device allocation on both the success and the error path. */
	cudaFree(gpu_c);

	if (err != cudaSuccess) {
		printf("CUDA error: %s.\n", cudaGetErrorString(err));
		return 1;
	}

	printf("exit.c = %d.\n", c);
	return 0;
}

Other blog posts (OpenCL):

OpenCL编程初探_papaofdoudou的博客-CSDN博客_opencl 源码



