__ballot(int predicate):返回一个32位整数;若当前线程所在Warp中第N个线程的predicate值不为零,则将该返回值(初始为零)的第N位置位(注:Volta及之后的架构应使用带掩码的__ballot_sync(mask, predicate))
__popc(__ballot(int predicate)):返回当前Warp中predicate不为零的线程数目
asm("mov.u32 %0, %laneid;" : "=r"(ret)):获得ret为当前线程在所在Warp中的ID
unsigned int ret; asm("mov.u32 %0, %lanemask_lt;" : "=r"(ret)):获得ret为一个32位掩码,其中所有编号小于当前线程Lane ID的位均被置位
__popc(ret & __ballot(int predicate)):其中ret为%lanemask_lt;返回当前Warp中Lane ID小于当前线程且predicate不为零的线程数目,即当前线程在满足条件的线程中的序号(从0开始计数)
Example:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
// NOTE(review): device_functions.h is reported as deprecated by recent CUDA
// toolkits; cuda_runtime.h should already provide the intrinsics used here.
// Kept for compatibility with older toolkits -- confirm before removing.
#include "device_functions.h"
#include <cstdio>
#include <cstdlib>
#include <iostream>
using namespace std;

// Evaluates a CUDA runtime call and terminates the program with the file,
// line and CUDA error string if the call did not return cudaSuccess.
#define CUDA_CHECK(call)                                                     \
    do {                                                                     \
        cudaError_t err_ = (call);                                           \
        if (err_ != cudaSuccess) {                                           \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,    \
                    cudaGetErrorString(err_));                               \
            exit(EXIT_FAILURE);                                              \
        }                                                                    \
    } while (0)

// Returns the calling thread's lane index inside its warp, read from the
// PTX special register %laneid.
__device__ __forceinline__ int laneId()
{
    unsigned int ret;
    asm("mov.u32 %0, %laneid;" : "=r"(ret));
    return ret;
}

// Returns the PTX special register %lanemask_lt: a bit mask with a bit set
// for every lane whose index is lower than the calling thread's lane.
__device__ __forceinline__ int laneMaskLt()
{
    unsigned int ret;
    asm("mov.u32 %0, %lanemask_lt;" : "=r"(ret));
    return ret;
}

// Demonstrates warp vote + population count. For each global thread index
// x < n the kernel writes:
//   a[x] - ballot mask of the lanes in this warp for which (x > 10) holds
//   b[x] - %lanemask_lt of the calling lane
//   c[x] - number of lanes in the warp for which the predicate holds
//   d[x] - number of lower-numbered lanes for which the predicate holds,
//          i.e. the 0-based rank of this lane among the voting lanes
//   e[x] - lane index of the calling thread
// Expects a 1D launch; all five arrays must hold at least n ints.
// Uses __ballot_sync, which requires CUDA 9 or newer.
__global__ void testKernel(int *a, int *b, int *c, int *d, int *e, int n)
{
    int x = threadIdx.x + blockIdx.x * blockDim.x;

    // Collect the set of in-range lanes while the whole warp is still
    // present; this becomes the participant mask for the vote below, so the
    // vote stays well-defined even when some lanes leave early.
    const unsigned int inRange = __ballot_sync(0xffffffffu, x < n);
    if (x >= n)
    {
        return;
    }

    // Keep the intermediate values in registers instead of re-reading the
    // freshly written global arrays.
    const unsigned int votes = __ballot_sync(inRange, x > 10);
    const unsigned int lower = static_cast<unsigned int>(laneMaskLt());

    a[x] = static_cast<int>(votes);
    b[x] = static_cast<int>(lower);
    d[x] = __popc(lower & votes);
    c[x] = __popc(votes);
    e[x] = laneId();
}

// Runs testKernel on 64 elements and prints, one line per thread:
// ballot mask, lanemask_lt, vote count, rank among voters, lane id.
int main()
{
    int *a, *b, *c, *d, *e, *dev_a, *dev_b, *dev_c, *dev_d, *dev_e;
    const int n = 64;
    const size_t size = n * sizeof(int);

    a = (int *)malloc(size);
    b = (int *)malloc(size);
    c = (int *)malloc(size);
    d = (int *)malloc(size);
    e = (int *)malloc(size);
    if (!a || !b || !c || !d || !e)
    {
        fprintf(stderr, "host allocation of %zu bytes failed\n", size);
        free(a);
        free(b);
        free(c);
        free(d);
        free(e);
        return EXIT_FAILURE;
    }

    CUDA_CHECK(cudaMalloc(&dev_a, size));
    CUDA_CHECK(cudaMalloc(&dev_b, size));
    CUDA_CHECK(cudaMalloc(&dev_c, size));
    CUDA_CHECK(cudaMalloc(&dev_d, size));
    CUDA_CHECK(cudaMalloc(&dev_e, size));

    // Ceil-div so the launch stays valid if n grows past one block.
    const int threadsPerBlock = 64;
    const int blocks = (n + threadsPerBlock - 1) / threadsPerBlock;
    testKernel<<<blocks, threadsPerBlock>>>(dev_a, dev_b, dev_c, dev_d, dev_e, n);
    CUDA_CHECK(cudaGetLastError());  // reports an invalid launch configuration

    // Blocking copies on the default stream: they wait for the kernel and
    // also surface any error raised while it was executing.
    CUDA_CHECK(cudaMemcpy(a, dev_a, size, cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(b, dev_b, size, cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(c, dev_c, size, cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(d, dev_d, size, cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(e, dev_e, size, cudaMemcpyDeviceToHost));

    for (int i = 0; i < n; ++i)
    {
        printf("%d %d %d %d %d\n", a[i], b[i], c[i], d[i], e[i]);
    }

    CUDA_CHECK(cudaFree(dev_a));
    CUDA_CHECK(cudaFree(dev_b));
    CUDA_CHECK(cudaFree(dev_c));
    CUDA_CHECK(cudaFree(dev_d));
    CUDA_CHECK(cudaFree(dev_e));
    free(a);
    free(b);
    free(c);
    free(d);
    free(e);
    return 0;
}