书指[GPU高性能编程CUDA实战].(桑德斯).聂雪军等.扫描版.pdf
设一维数组大小为N。这种情况下每个线程块只计算一个元素(以blockIdx.x作为数组下标),因此需要启动N个线程块。
#include "cuda_runtime.h"
#include "cublas_v2.h"
#include
#include
#define N 1024
using namespace std;
// Fills the input arrays: a[k] = k and b[k] = k * k, where k is the block index.
// Each block handles exactly one element (threads inside a block are not
// distinguished), so blocks with an index of N or more do nothing.
// The parameter c is accepted but not touched by this kernel.
__global__ void init(int *a, int *b, int *c)
{
    const int idx = blockIdx.x;
    if (idx >= N)
        return;  // outside the array: nothing to initialise

    a[idx] = idx;
    b[idx] = idx * idx;
}
// Element-wise sum: c[k] = a[k] + b[k], where k is the block index.
// One element per block; block indices of N or more are ignored.
__global__ void add(int *a, int *b, int *c)
{
    const int idx = blockIdx.x;
    if (idx >= N)
        return;  // outside the array

    c[idx] = a[idx] + b[idx];
}
// Reports a failed CUDA runtime call on stderr.
// Returns true when status is cudaSuccess, false otherwise.
static bool checkCuda(cudaError_t status, const char *what)
{
    if (status == cudaSuccess)
        return true;
    fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Allocates three device arrays of N ints, fills a and b on the GPU, adds them
// into c, copies all three back to the host and prints element 100.
// Returns 0 on success and 1 if any CUDA call or kernel launch failed.
int main()
{
    int a[N], b[N], c[N];
    // Start as NULL so the cudaFree calls below are harmless after a failed allocation.
    int *dev_a = NULL, *dev_b = NULL, *dev_c = NULL;
    const size_t bytes = N * sizeof(int);

    bool ok = checkCuda(cudaMalloc((void **)&dev_a, bytes), "cudaMalloc(dev_a)") &&
              checkCuda(cudaMalloc((void **)&dev_b, bytes), "cudaMalloc(dev_b)") &&
              checkCuda(cudaMalloc((void **)&dev_c, bytes), "cudaMalloc(dev_c)");

    // Both kernels index by blockIdx.x only, so they need N blocks of 1 thread each.
    if (ok)
    {
        init<<<N, 1>>>(dev_a, dev_b, dev_c);
        // A launch returns no status; configuration errors are reported here.
        ok = checkCuda(cudaGetLastError(), "init launch");
    }
    if (ok)
    {
        add<<<N, 1>>>(dev_a, dev_b, dev_c);
        ok = checkCuda(cudaGetLastError(), "add launch");
    }

    // The blocking copies wait for the kernels and also surface execution errors.
    ok = ok &&
         checkCuda(cudaMemcpy(a, dev_a, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(a)") &&
         checkCuda(cudaMemcpy(b, dev_b, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(b)") &&
         checkCuda(cudaMemcpy(c, dev_c, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(c)");

    cudaFree(dev_a);
    cudaFree(dev_b);
    cudaFree(dev_c);

    // Only print when the host arrays were actually filled.
    if (ok)
        printf("%d+%d=%d\n", a[100], b[100], c[100]);
    getchar();  // keep the console window open until a key is pressed
    return ok ? 0 : 1;
}
Nvidia 1050Ti 4G的每个线程块最多只能有1024个线程。设一维数组大小为N,且N不超过1024。
这种情况下只需要使用一个线程块中的N个线程计算(以threadIdx.x作为数组下标)。详见书P44。
#include "cuda_runtime.h"
#include "cublas_v2.h"
#include
#include
#define N 1024
using namespace std;
// Fills the input arrays: a[k] = k and b[k] = k * k, where k is the thread's
// index within its block. Only threadIdx.x is used, so every block would write
// the same elements; threads with an index of N or more do nothing.
// The parameter c is accepted but not touched by this kernel.
__global__ void init(int *a, int *b, int *c)
{
    const int t = threadIdx.x;
    if (t >= N)
        return;  // outside the array: nothing to initialise

    a[t] = t;
    b[t] = t * t;
}
// Element-wise sum: c[k] = a[k] + b[k], where k is the thread's index within
// its block. Thread indices of N or more are ignored.
__global__ void add(int *a, int *b, int *c)
{
    const int t = threadIdx.x;
    if (t >= N)
        return;  // outside the array

    c[t] = a[t] + b[t];
}
// Reports a failed CUDA runtime call on stderr.
// Returns true when status is cudaSuccess, false otherwise.
static bool checkCuda(cudaError_t status, const char *what)
{
    if (status == cudaSuccess)
        return true;
    fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Allocates three device arrays of N ints, fills a and b on the GPU, adds them
// into c, copies all three back to the host and prints element 100.
// Returns 0 on success and 1 if any CUDA call or kernel launch failed.
int main()
{
    int a[N], b[N], c[N];
    // Start as NULL so the cudaFree calls below are harmless after a failed allocation.
    int *dev_a = NULL, *dev_b = NULL, *dev_c = NULL;
    const size_t bytes = N * sizeof(int);

    bool ok = checkCuda(cudaMalloc((void **)&dev_a, bytes), "cudaMalloc(dev_a)") &&
              checkCuda(cudaMalloc((void **)&dev_b, bytes), "cudaMalloc(dev_b)") &&
              checkCuda(cudaMalloc((void **)&dev_c, bytes), "cudaMalloc(dev_c)");

    // Both kernels index by threadIdx.x only: a single block of N threads.
    // If N exceeds the device's per-block thread limit the launch is rejected,
    // which the cudaGetLastError() check below reports instead of printing garbage.
    if (ok)
    {
        init<<<1, N>>>(dev_a, dev_b, dev_c);
        ok = checkCuda(cudaGetLastError(), "init launch");
    }
    if (ok)
    {
        add<<<1, N>>>(dev_a, dev_b, dev_c);
        ok = checkCuda(cudaGetLastError(), "add launch");
    }

    // The blocking copies wait for the kernels and also surface execution errors.
    ok = ok &&
         checkCuda(cudaMemcpy(a, dev_a, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(a)") &&
         checkCuda(cudaMemcpy(b, dev_b, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(b)") &&
         checkCuda(cudaMemcpy(c, dev_c, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(c)");

    cudaFree(dev_a);
    cudaFree(dev_b);
    cudaFree(dev_c);

    // Only print when the host arrays were actually filled.
    if (ok)
        printf("%d + %d = %d\n", a[100], b[100], c[100]);
    getchar();  // keep the console window open until a key is pressed
    return ok ? 0 : 1;
}
设每个线程块只使用512个线程,则这种情况下需要使用(N+511)/512个线程块(整数除法,即N/512向上取整)。详见书P45。
#include "cuda_runtime.h"
#include "cublas_v2.h"
#include
#include
#define N 2000
using namespace std;
// Fills the input arrays: a[k] = k and b[k] = k * k, where k is the thread's
// flat global index (block index * block size + thread index within the block).
// Threads whose global index is N or more do nothing, so the grid may be
// larger than the array.
// The parameter c is accepted but not touched by this kernel.
__global__ void init(int *a, int *b, int *c)
{
    const int gid = blockIdx.x * blockDim.x + threadIdx.x;
    if (gid >= N)
        return;  // tail thread past the end of the array

    a[gid] = gid;
    b[gid] = gid * gid;
}
// Element-wise sum: c[k] = a[k] + b[k], where k is the thread's flat global
// index (block index * block size + thread index within the block).
// Threads whose global index is N or more are ignored.
__global__ void add(int *a, int *b, int *c)
{
    const int gid = blockIdx.x * blockDim.x + threadIdx.x;
    if (gid >= N)
        return;  // tail thread past the end of the array

    c[gid] = a[gid] + b[gid];
}
// Reports a failed CUDA runtime call on stderr.
// Returns true when status is cudaSuccess, false otherwise.
static bool checkCuda(cudaError_t status, const char *what)
{
    if (status == cudaSuccess)
        return true;
    fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Allocates three device arrays of N ints, fills a and b on the GPU, adds them
// into c, copies all three back to the host and prints element 100.
// Returns 0 on success and 1 if any CUDA call or kernel launch failed.
int main()
{
    int a[N], b[N], c[N];
    // Start as NULL so the cudaFree calls below are harmless after a failed allocation.
    int *dev_a = NULL, *dev_b = NULL, *dev_c = NULL;
    const size_t bytes = N * sizeof(int);

    // 512 threads per block; round the block count up so the last, partially
    // used block still covers the tail of the array (the kernels bounds-check).
    const int threadsPerBlock = 512;
    const int blocks = (N + threadsPerBlock - 1) / threadsPerBlock;

    bool ok = checkCuda(cudaMalloc((void **)&dev_a, bytes), "cudaMalloc(dev_a)") &&
              checkCuda(cudaMalloc((void **)&dev_b, bytes), "cudaMalloc(dev_b)") &&
              checkCuda(cudaMalloc((void **)&dev_c, bytes), "cudaMalloc(dev_c)");

    if (ok)
    {
        init<<<blocks, threadsPerBlock>>>(dev_a, dev_b, dev_c);
        // A launch returns no status; configuration errors are reported here.
        ok = checkCuda(cudaGetLastError(), "init launch");
    }
    if (ok)
    {
        add<<<blocks, threadsPerBlock>>>(dev_a, dev_b, dev_c);
        ok = checkCuda(cudaGetLastError(), "add launch");
    }

    // The blocking copies wait for the kernels and also surface execution errors.
    ok = ok &&
         checkCuda(cudaMemcpy(a, dev_a, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(a)") &&
         checkCuda(cudaMemcpy(b, dev_b, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(b)") &&
         checkCuda(cudaMemcpy(c, dev_c, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(c)");

    cudaFree(dev_a);
    cudaFree(dev_b);
    cudaFree(dev_c);

    // Only print when the host arrays were actually filled.
    if (ok)
        printf("%d + %d = %d\n", a[100], b[100], c[100]);
    getchar();  // keep the console window open until a key is pressed
    return ok ? 0 : 1;
}