// Demonstrates warp divergence: the branch depends on data values, so lanes
// within one warp may take different paths and the warp executes both paths
// back to back with masking.
//
// Expects a 1D grid of 1D blocks, one thread per element.
// input/output: device pointers to at least N floats.
__global__ void conditionalKernel(float *input, float *output, int N) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < N) {
        // Data-dependent condition — the source of the divergence.
        if (input[idx] > 0) {
            // Heavy computation A — executed by "positive" lanes.
            for (int i = 0; i < 100; i++) {
                // Float overloads (sqrtf/sinf/cosf): the double versions
                // silently promote the whole expression and are far slower.
                output[idx] = sqrtf(input[idx]) * sinf(input[idx]) + cosf(input[idx]);
            }
        } else {
            // Heavy computation B — executed by the remaining lanes.
            for (int i = 0; i < 100; i++) {
                output[idx] = logf(fabsf(input[idx]) + 1.0f) * expf(input[idx] * 0.1f);
            }
        }
    }
}
// Host code: launches conditionalKernel over N elements.
int main() {
    // ... (declare N, allocate d_input/d_output, copy input to device)
    dim3 threadsPerBlock(256);
    // Ceil-divide so the grid covers N even when N % 256 != 0.
    dim3 numBlocks((N + threadsPerBlock.x - 1) / threadsPerBlock.x);
    // Fixed: the execution configuration was missing ("<<>>" does not compile).
    conditionalKernel<<<numBlocks, threadsPerBlock>>>(d_input, d_output, N);
    // NOTE(review): real code should call cudaGetLastError() here — kernel
    // launches do not return errors directly.
    // ...
}
这个内核的利用率低下主要有以下原因:
不规则内存访问模式:
// Gather through an index array: adjacent threads read input[indices[tid]],
// which is generally scattered, so the global loads cannot coalesce even
// though the writes to output are contiguous.
__global__ void irregularAccess(float *input, float *output, int *indices, int N) {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid >= N) return;
    int src = indices[tid];   // arbitrary source location per thread
    output[tid] = input[src]; // scattered read, coalesced write
}
负载不平衡
// Load-imbalance demo: each thread loops data[gid] times, so a warp whose
// lanes carry very different trip counts is held hostage by its slowest lane.
__global__ void imbalancedLoad(int *data, int *results, int N) {
    int gid = blockIdx.x * blockDim.x + threadIdx.x;
    if (gid >= N) return;
    int trips = data[gid]; // per-thread work amount varies by design
    int acc = 0;
    for (int k = 0; k < trips; ++k) {
        acc += k;
    }
    results[gid] = acc;
}
过多的同步点:
// Demonstrates the cost of a block-wide barrier on every loop iteration.
//
// Fixed defect: the original called __syncthreads() inside `if (idx < N)`.
// A barrier inside divergent control flow is undefined behavior — threads
// with idx >= N in a partial tail block would never reach it (possible
// hang). The loop is restructured so every thread in the block reaches
// the barrier, while the per-iteration sync overhead (the point of the
// example) is preserved.
__global__ void excessiveSynchronization(float *data, int N) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    for (int i = 0; i < 100; i++) {
        if (idx < N) {
            data[idx] += 1.0f;
        }
        // Every iteration stalls the whole block here — wasteful, but now
        // uniformly reached by all threads.
        __syncthreads();
    }
}
分离内核
// Processes only elements pre-flagged positive; every active thread runs the
// same code path, eliminating the intra-warp divergence of the fused kernel.
// flags: device array of N booleans produced by flagData.
__global__ void positiveKernel(float *input, float *output, bool *flags, int N) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < N && flags[idx]) {
        for (int i = 0; i < 100; i++) {
            // Float overloads — the double versions (sqrt/sin/cos) silently
            // promote and are much slower in a float kernel.
            output[idx] = sqrtf(input[idx]) * sinf(input[idx]) + cosf(input[idx]);
        }
    }
}
// Processes only elements whose flag is false (non-positive values); the
// single uniform path mirrors positiveKernel.
__global__ void negativeKernel(float *input, float *output, bool *flags, int N) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < N && !flags[idx]) {
        for (int i = 0; i < 100; i++) {
            // Float overloads and a 0.1f literal — a bare 0.1 is a double
            // and would promote the whole expression.
            output[idx] = logf(fabsf(input[idx]) + 1.0f) * expf(input[idx] * 0.1f);
        }
    }
}
// Classification pass: record, for each element, whether it is positive.
__global__ void flagData(float *input, bool *flags, int N) {
    int gid = blockIdx.x * blockDim.x + threadIdx.x;
    if (gid >= N) return;
    flags[gid] = input[gid] > 0;
}
// Host code: divergence fix via kernel splitting — classify once, then run
// two kernels that each execute a single uniform code path.
void optimizedApproach(float *d_input, float *d_output, int N) {
    bool *d_flags = nullptr;
    // Bail out rather than launching kernels with a null device pointer.
    if (cudaMalloc(&d_flags, N * sizeof(bool)) != cudaSuccess) {
        return;
    }
    dim3 threadsPerBlock(256);
    dim3 numBlocks((N + threadsPerBlock.x - 1) / threadsPerBlock.x);
    // Fixed: all three launches were missing their execution configuration
    // ("<<>>" does not compile).
    flagData<<<numBlocks, threadsPerBlock>>>(d_input, d_flags, N);
    // Process positive and non-positive elements with separate kernels.
    positiveKernel<<<numBlocks, threadsPerBlock>>>(d_input, d_output, d_flags, N);
    negativeKernel<<<numBlocks, threadsPerBlock>>>(d_input, d_output, d_flags, N);
    // Launches return errors only via the error state — surface them here.
    cudaGetLastError();
    cudaFree(d_flags);
}
线程协作
// In-block cooperation attempt: classify each element as positive/negative
// into shared index buckets, then process each bucket.
//
// Precondition: blockDim.x <= 256 — the shared index arrays are statically
// sized, and atomicAdd would write out of bounds for larger blocks.
// NOTE(review): the processing loops below activate only one thread per
// iteration, so this serializes the block rather than removing divergence;
// kept as written since it is the example being discussed.
__global__ void cooperativeKernel(float *input, float *output, int N) {
    __shared__ int positive_count;
    __shared__ int negative_count;
    __shared__ int positive_indices[256]; // assumes block size <= 256
    __shared__ int negative_indices[256];
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    int tid = threadIdx.x;
    // One thread zeroes the shared counters; the barrier publishes them.
    if (tid == 0) {
        positive_count = 0;
        negative_count = 0;
    }
    __syncthreads();
    // Classification: each in-range thread appends its lane id to the bucket
    // for its sign; atomicAdd hands out a unique slot per thread.
    bool is_positive = false;
    if (idx < N) {
        is_positive = (input[idx] > 0);
        if (is_positive) {
            int pos = atomicAdd(&positive_count, 1);
            positive_indices[pos] = tid;
        } else {
            int pos = atomicAdd(&negative_count, 1);
            negative_indices[pos] = tid;
        }
    }
    __syncthreads();
    // Process positive elements. Float overloads (sqrtf/sinf/cosf) — the
    // double versions silently promote and are much slower.
    for (int i = 0; i < positive_count; i++) {
        if (tid == positive_indices[i]) {
            for (int j = 0; j < 100; j++) {
                output[idx] = sqrtf(input[idx]) * sinf(input[idx]) + cosf(input[idx]);
            }
        }
    }
    // Process negative elements.
    for (int i = 0; i < negative_count; i++) {
        if (tid == negative_indices[i]) {
            for (int j = 0; j < 100; j++) {
                output[idx] = logf(fabsf(input[idx]) + 1.0f) * expf(input[idx] * 0.1f);
            }
        }
    }
}