在CUDA中,host和device是两个重要的概念,我们用host指代CPU及其内存,而用device指代GPU及其内存。
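一般的CUDA程序的执行流程如下:
1. 分配host内存,并进行数据初始化;
2. 分配device内存,并将数据从host拷贝到device;
3. 调用kernel函数(核函数)在device上完成指定的运算;
4. 将device上的运算结果拷贝回host;
5. 释放device和host上分配的内存。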
一般来说:
一个kernel对应一个grid
一个grid可以有多个block,一维~三维
一个block可以有多个thread,一维~三维
我们写的kernel function运行在block中的每个thread中。
https://cuda-programming.blogspot.com/2013/01/thread-and-block-heuristics-in-cuda.html
#include
#include
// Kernel: every thread prints its block index and its thread index,
// each as a (z, y, x) triple.
__global__ void print_idx(){
    const uint3 blk = blockIdx;   // index of this thread's block within the grid
    const uint3 thr = threadIdx;  // index of this thread within its block
    printf("block idx: (%3d, %3d, %3d), thread idx: (%3d, %3d, %3d)\n",
           blk.z, blk.y, blk.x,
           thr.z, thr.y, thr.x);
}
// Host-side demo: launches print_idx over a 1-D grid that covers `inputSize`
// elements with `threadsPerBlock` threads per block, then waits for the kernel
// to finish. Launch and execution errors are reported on stdout instead of
// being silently ignored.
void demo_print(){
    const int inputSize = 8;
    // Number of threads in each block (the block's x-dimension). Named so that it
    // does not shadow the CUDA built-in `blockDim`.
    const int threadsPerBlock = 4;
    // Ceiling division so a partial trailing block is not dropped when inputSize
    // is not a multiple of threadsPerBlock; 8 / 4 still yields 2 blocks.
    const int blocksPerGrid = (inputSize + threadsPerBlock - 1) / threadsPerBlock;

    dim3 block(threadsPerBlock);
    dim3 grid(blocksPerGrid);

    print_idx<<<grid, block>>>();

    // A kernel launch returns no status itself; an invalid launch configuration
    // is only visible through cudaGetLastError().
    cudaError_t status = cudaGetLastError();
    if (status != cudaSuccess) {
        printf("print_idx launch failed: %s\n", cudaGetErrorString(status));
        return;
    }

    // cudaDeviceSynchronize() blocks the host until the kernel has finished, so
    // the host only moves on to its next step after the kernel's work is done.
    // It also surfaces errors raised while the kernel was running.
    status = cudaDeviceSynchronize();
    if (status != cudaSuccess) {
        printf("print_idx execution failed: %s\n", cudaGetErrorString(status));
    }
}
// Program entry point: runs the index-printing demo once and reports success.
int main() {
    demo_print();

    return 0;
}
// Kernel: every thread prints the grid dimensions and the block dimensions,
// each as a (z, y, x) triple.
__global__ void print_dim(){
    const dim3 grid_extent  = gridDim;   // number of blocks along each axis
    const dim3 block_extent = blockDim;  // number of threads per block along each axis
    printf("grid dimension: (%3d, %3d, %3d), block dimension: (%3d, %3d, %3d)\n",
           grid_extent.z, grid_extent.y, grid_extent.x,
           block_extent.z, block_extent.y, block_extent.x);
}
// Kernel: every thread prints its block index as a (z, y, x) triple together
// with its own linear index inside that block.
__global__ void print_thread_idx_per_block(){
    // Flatten (z, y, x) with x varying fastest:
    //   z * (dimX * dimY) + y * dimX + x, written here in nested (Horner) form.
    const int linear_tid =
        (threadIdx.z * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x;

    printf("block idx: (%3d, %3d, %3d), thread idx: %3d\n",
           blockIdx.z, blockIdx.y, blockIdx.x,
           linear_tid);
}
// Kernel: every thread prints the linear index of its block, its linear index
// inside the block, and its linear index across the whole grid.
__global__ void print_thread_idx_per_grid(){
    // Total number of threads in one block.
    const int threads_per_block = blockDim.x * blockDim.y * blockDim.z;

    // Linear block index within the grid, x varying fastest (nested form of
    // z * gridX * gridY + y * gridX + x).
    const int linear_block =
        (blockIdx.z * gridDim.y + blockIdx.y) * gridDim.x + blockIdx.x;

    // Linear thread index within the block, same flattening order.
    const int linear_thread =
        (threadIdx.z * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x;

    // All threads of earlier blocks come first, then this thread's offset.
    const int global_thread = linear_block * threads_per_block + linear_thread;

    printf("block idx: %3d, thread idx in block: %3d, thread index in grid: %3d\n",
           linear_block, linear_thread, global_thread);
}
#include
#include
#include
// Kernel: prints the calling thread's block index and thread index,
// both formatted as (z, y, x).
__global__ void print_idx(){
    const unsigned int bz = blockIdx.z,  by = blockIdx.y,  bx = blockIdx.x;
    const unsigned int tz = threadIdx.z, ty = threadIdx.y, tx = threadIdx.x;
    printf("block idx: (%3d, %3d, %3d), thread idx: (%3d, %3d, %3d)\n",
           bz, by, bx,
           tz, ty, tx);
}
// Kernel: prints the dimensions of the grid and of the block,
// both formatted as (z, y, x).
__global__ void print_dim(){
    const unsigned int gz = gridDim.z,  gy = gridDim.y,  gx = gridDim.x;
    const unsigned int bz = blockDim.z, by = blockDim.y, bx = blockDim.x;
    printf("grid dimension: (%3d, %3d, %3d), block dimension: (%3d, %3d, %3d)\n",
           gz, gy, gx,
           bz, by, bx);
}
// Kernel: computes each thread's linear index inside its block and prints it
// next to the block index. The flattening order is z, then y, then x, i.e.
// x varies fastest.
__global__ void print_thread_idx_per_block(){
    const unsigned int plane_offset = threadIdx.z * blockDim.x * blockDim.y;  // whole z-slices before this thread
    const unsigned int row_offset   = threadIdx.y * blockDim.x;               // whole rows before it in its slice
    const int thread_in_block = plane_offset + row_offset + threadIdx.x;

    printf("block idx: (%3d, %3d, %3d), thread idx: %3d\n",
           blockIdx.z, blockIdx.y, blockIdx.x,
           thread_in_block);
}
// Kernel: computes each thread's linear index across the whole grid and prints
// it together with the linear block index and the in-block thread index.
// Both blocks and threads are flattened in z, y, x order (x varies fastest).
__global__ void print_thread_idx_per_grid(){
    // Number of threads contained in a single block.
    const int threads_in_block = blockDim.x * blockDim.y * blockDim.z;

    // Position of this block when the grid is flattened.
    const unsigned int block_plane = blockIdx.z * gridDim.x * gridDim.y;
    const unsigned int block_row   = blockIdx.y * gridDim.x;
    const int block_linear = block_plane + block_row + blockIdx.x;

    // Position of this thread when its block is flattened.
    const unsigned int thread_plane = threadIdx.z * blockDim.x * blockDim.y;
    const unsigned int thread_row   = threadIdx.y * blockDim.x;
    const int thread_linear = thread_plane + thread_row + threadIdx.x;

    // Skip every thread belonging to earlier blocks, then add the local offset.
    const int thread_in_grid = block_linear * threads_in_block + thread_linear;

    printf("block idx: %3d, thread idx in block: %3d, thread index in grid: %3d\n",
           block_linear, thread_linear, thread_in_grid);
}
// Checks the most recent kernel launch and waits for the device to finish.
// `kernelName` is only used in the error message. Returns true when both the
// launch and the execution succeeded; otherwise reports the error on stderr
// and returns false.
static bool demo_kernel_finished_ok(const char* kernelName){
    // A launch returns no status of its own; configuration errors show up here.
    cudaError_t status = cudaGetLastError();
    if (status == cudaSuccess) {
        // Block the host until the kernel has completed so that the host only
        // continues after the kernel's work is done; this also surfaces errors
        // raised during execution.
        status = cudaDeviceSynchronize();
    }
    if (status != cudaSuccess) {
        std::cerr << "CUDA error in " << kernelName << ": "
                  << cudaGetErrorString(status) << std::endl;
        return false;
    }
    return true;
}

// Host-side demo: runs the four printing kernels one after another on the same
// 1-D launch configuration, printing a separator line between them. Stops at
// the first kernel that fails to launch or execute.
void demo_print(){
    const int inputSize = 8;
    // Threads per block (the block's x-dimension). Named so that it does not
    // shadow the CUDA built-in `blockDim`.
    const int threadsPerBlock = 4;
    // Ceiling division so a partial trailing block is not dropped when inputSize
    // is not a multiple of threadsPerBlock; 8 / 4 still yields 2 blocks.
    const int blocksPerGrid = (inputSize + threadsPerBlock - 1) / threadsPerBlock;

    dim3 block(threadsPerBlock);
    dim3 grid(blocksPerGrid);

    print_idx<<<grid, block>>>();
    if (!demo_kernel_finished_ok("print_idx")) return;
    std::cout << "---------------分割线---------------------------" << std::endl;

    print_dim<<<grid, block>>>();
    if (!demo_kernel_finished_ok("print_dim")) return;
    std::cout << "---------------分割线---------------------------" << std::endl;

    print_thread_idx_per_block<<<grid, block>>>();
    if (!demo_kernel_finished_ok("print_thread_idx_per_block")) return;
    std::cout << "---------------分割线---------------------------" << std::endl;

    print_thread_idx_per_grid<<<grid, block>>>();
    demo_kernel_finished_ok("print_thread_idx_per_grid");
}
// Entry point: executes the full printing demo, then exits with status 0.
int main() {
    demo_print();

    return 0;
}
# Minimum CMake version required to configure this project.
# NOTE(review): CMAKE_CUDA_STANDARD 20 (set below) may not be accepted by CMake
# releases as old as 3.10 -- confirm, and raise this minimum if needed.
cmake_minimum_required(VERSION 3.10)
# Declare project "test" with CUDA as its enabled language.
project(test CUDA)
# Request the C++20 dialect for CUDA sources of targets created after this line.
set(CMAKE_CUDA_STANDARD 20)
# Build the executable "test1" from the single CUDA source file.
add_executable(test1 print_index_demo1.cu)
#include
#include
#include
#include
// Enumerates every CUDA device reported by the runtime and prints a summary of
// its architecture properties and launch limits to stdout.
//
// Returns 0 on success and 1 when the device count cannot be queried. A device
// whose properties cannot be read is reported on stderr and skipped.
int main(){
    // Initialised so a failed query can never leave the loop bound undefined.
    int count = 0;
    cudaError_t status = cudaGetDeviceCount(&count);
    if (status != cudaSuccess) {
        std::cerr << "cudaGetDeviceCount failed: "
                  << cudaGetErrorString(status) << std::endl;
        return 1;
    }

    for (int index = 0; index < count; ++index) {
        cudaDeviceProp prop;
        status = cudaSetDevice(index);
        if (status == cudaSuccess) {
            status = cudaGetDeviceProperties(&prop, index);
        }
        if (status != cudaSuccess) {
            std::cerr << "Failed to query device " << index << ": "
                      << cudaGetErrorString(status) << std::endl;
            continue;
        }

        std::cout<<"*********************Architecture related**********************"<<std::endl;
        std::cout<<"Device id: "<<index<<std::endl;
        std::cout<<"Device name: "<<prop.name<<std::endl;
        // Print major.minor directly; going through a float would show 8.0 as "8".
        std::cout<<"Device compute capability: "<<prop.major<<"."<<prop.minor<<std::endl;
        // Sizes are reported in bytes; scale to GB / MB / KB for readability.
        std::cout<<"GPU global memory size: "<<(float)prop.totalGlobalMem / (1<<30)<<"GB"<<std::endl;
        std::cout<<"L2 cache size: "<<(float)prop.l2CacheSize / (1<<20)<<"MB"<<std::endl;
        std::cout<<"Shared memory per block: "<<(float)prop.sharedMemPerBlock / (1<<10)<<"KB"<<std::endl;
        std::cout<<"Shared memory per SM: "<<(float)prop.sharedMemPerMultiprocessor / (1<<10)<<"KB"<<std::endl;
        // Clock rates are reported in kHz; 1E-6 converts them to GHz.
        std::cout<<"Device clock rate: "<<prop.clockRate*1E-6<<"GHz"<<std::endl;
        std::cout<<"Device memory clock rate: "<<prop.memoryClockRate*1E-6<<"GHz"<<std::endl;
        std::cout<<"Number of SM: "<<prop.multiProcessorCount<<std::endl;
        std::cout<<"Warp size: "<<prop.warpSize<<std::endl;

        std::cout<<"*********************Parameter related************************"<<std::endl;
        // This limit is per multiprocessor, not per device or per grid.
        std::cout<<"Max blocks per SM: "<<prop.maxBlocksPerMultiProcessor<<std::endl;
        std::cout<<"Max threads per block: "<<prop.maxThreadsPerBlock<<std::endl;
        std::cout<<"Max block dimension size:"<<prop.maxThreadsDim[0]<<" "<<prop.maxThreadsDim[1]<<" "<<prop.maxThreadsDim[2]<<std::endl;
        std::cout<<"Max grid dimension size: "<<prop.maxGridSize[0]<<" "<<prop.maxGridSize[1]<<" "<<prop.maxGridSize[2]<<std::endl;

        // Blank line between devices.
        std::cout<<std::endl;
    }
    return 0;
}