为了避免每次都要键入nvcc的命令,要准备一个makefile。makefile如下:
# Build rules for the cuda_test example (single translation unit: main.cu).
# CUDA_DIR and CUDA_SDK_DIR are not defined in this file; they are expected
# to come from the environment or the make command line.

# nvcc options: host debug info, verbose host compiler (-Xcompiler -v),
# code generation for compute capability 2.0 (both sm_20 machine code and
# compute_20 PTX), and -O2 optimisation.
CUFLAG = -g -Xcompiler -v \
         -gencode=arch=compute_20,code=sm_20 \
         -gencode=arch=compute_20,code=compute_20 \
         -O2

# Header search paths: CUDA toolkit, SDK common headers, local include dir.
IFLAGS = -I$(CUDA_DIR)/include -I$(CUDA_SDK_DIR)/C/common/inc -I../include

# Library search paths: CUDA toolkit and SDK libraries.
LFLAGS = -L$(CUDA_DIR)/lib64 -L$(CUDA_SDK_DIR)/C/lib

# Name of the executable to produce.
PRG = cuda_test

# $< expands to main.cu and $@ to $(PRG).
$(PRG) : main.cu
	nvcc $< -o $@ $(CUFLAG) $(IFLAGS) $(LFLAGS)
#include<stdio.h>
#include<stdlib.h>
#include<cuda.h>
#include<cutil.h>

/* Empty kernel: it exists only to demonstrate the <<<grid, block>>> launch. */
__global__ void mykernel(void)
{
}

/*
 * Launches mykernel with one block of one thread and then prints the
 * greeting from the host.
 *
 * Returns 0 on success, EXIT_FAILURE if the launch or the kernel execution
 * reports a CUDA error.
 */
int main(void)
{
    mykernel<<<1,1>>>();

    /* A launch has no return value; launch errors are read back here. */
    cudaError_t err = cudaGetLastError();
    if (err == cudaSuccess) {
        /* Wait for the kernel so execution errors surface before we exit. */
        err = cudaDeviceSynchronize();
    }
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
        return EXIT_FAILURE;
    }

    printf("Hello World!\n");
    return 0;
}
上述代码编译后运行生成可执行文件cuda_test,运行cuda_test后将输出:
Hello World!
注意:
例2: 整数相加
#include<stdio.h>
#include<stdlib.h>
#include<cuda.h>
#include<cutil.h>

/* Abort with a readable message if a CUDA runtime call fails. */
#define CUDA_CHECK(call)                                              \
    do {                                                              \
        cudaError_t err_ = (call);                                    \
        if (err_ != cudaSuccess) {                                    \
            fprintf(stderr, "CUDA error %s:%d: %s\n",                 \
                    __FILE__, __LINE__, cudaGetErrorString(err_));    \
            exit(EXIT_FAILURE);                                       \
        }                                                             \
    } while (0)

/*
 * Adds the two integers pointed to by a and b and stores the sum in *c.
 * All three pointers are dereferenced on the device. Every launched thread
 * performs the same store, so the kernel is launched as <<<1,1>>>.
 */
__global__ void integer_add(int * a, int * b, int * c)
{
    *c = *a + *b;
}

/*
 * Reads two integers from stdin, adds them on the GPU and prints the sum.
 * Returns 0 on success, EXIT_FAILURE on malformed input; a failing CUDA
 * call terminates the program through CUDA_CHECK.
 */
int main(void)
{
    int a, b, c;
    int * d_a, * d_b, * d_c;
    int size = sizeof(int);

    printf("Enter two integers with a space to separate them:\n");
    /* Without this check a and b would be copied to the GPU uninitialised. */
    if (scanf("%d %d", &a, &b) != 2) {
        fprintf(stderr, "Expected two integers\n");
        return EXIT_FAILURE;
    }

    CUDA_CHECK(cudaMalloc((void**)&d_a, size));
    CUDA_CHECK(cudaMalloc((void**)&d_b, size));
    CUDA_CHECK(cudaMalloc((void**)&d_c, size));

    CUDA_CHECK(cudaMemcpy(d_a, &a, size, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_b, &b, size, cudaMemcpyHostToDevice));

    integer_add<<<1,1>>>(d_a, d_b, d_c);
    /* A launch has no return value; launch errors are read back here. */
    CUDA_CHECK(cudaGetLastError());

    /* Blocking copy: waits for the kernel and reports execution errors. */
    CUDA_CHECK(cudaMemcpy(&c, d_c, size, cudaMemcpyDeviceToHost));

    CUDA_CHECK(cudaFree(d_a));
    CUDA_CHECK(cudaFree(d_b));
    CUDA_CHECK(cudaFree(d_c));

    printf("Sum is %d\n", c);
    return 0;
}
#include<stdio.h>
#include<stdlib.h>
#include<cuda.h>
#include<cutil.h>
#include<time.h>

#define N 512

/* Abort with a readable message if a CUDA runtime call fails. */
#define CUDA_CHECK(call)                                              \
    do {                                                              \
        cudaError_t err_ = (call);                                    \
        if (err_ != cudaSuccess) {                                    \
            fprintf(stderr, "CUDA error %s:%d: %s\n",                 \
                    __FILE__, __LINE__, cudaGetErrorString(err_));    \
            exit(EXIT_FAILURE);                                       \
        }                                                             \
    } while (0)

/*
 * Element-wise vector addition, one element per block: c[i] = a[i] + b[i]
 * with i = blockIdx.x.
 *
 * Launch layout: 1-D grid with one block per element. There is no bounds
 * check, so the grid must not contain more blocks than the arrays have
 * elements.
 */
__global__ void vec_block_add(int * a, int * b, int * c)
{
    c[blockIdx.x] = a[blockIdx.x] + b[blockIdx.x];
}

/*
 * Fills arr[0 .. count-1] with pseudo-random values in the range 0..99.
 *
 * The generator is deliberately not seeded here. Seeding with time(NULL)
 * on every call gives two back-to-back calls the same seed, and therefore
 * identical arrays; main() seeds once instead.
 */
void rand_ints(int * arr, int count)
{
    for (int i = 0; i < count; i++) {
        arr[i] = rand() % 100;
    }
}

/*
 * Adds two random N-element vectors on the GPU using N blocks of one
 * thread each and prints every element of the inputs and the result.
 * Returns 0 on success, EXIT_FAILURE if a host allocation fails; a failing
 * CUDA call terminates the program through CUDA_CHECK.
 */
int main(void)
{
    int * a, * b, * c;
    int * d_a, * d_b, * d_c;
    int size = N * sizeof(int);

    /* Seed exactly once so that a and b receive different sequences. */
    srand(time(NULL));

    a = (int *) malloc(size);
    b = (int *) malloc(size);
    c = (int *) malloc(size);
    if (a == NULL || b == NULL || c == NULL) {
        fprintf(stderr, "Host allocation failed\n");
        free(a);
        free(b);
        free(c);
        return EXIT_FAILURE;
    }
    rand_ints(a, N);
    rand_ints(b, N);

    CUDA_CHECK(cudaMalloc((void**)&d_a, size));
    CUDA_CHECK(cudaMalloc((void**)&d_b, size));
    CUDA_CHECK(cudaMalloc((void**)&d_c, size));

    CUDA_CHECK(cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_b, b, size, cudaMemcpyHostToDevice));

    vec_block_add<<<N,1>>>(d_a, d_b, d_c);
    /* A launch has no return value; launch errors are read back here. */
    CUDA_CHECK(cudaGetLastError());

    /* Blocking copy: waits for the kernel and reports execution errors. */
    CUDA_CHECK(cudaMemcpy(c, d_c, size, cudaMemcpyDeviceToHost));

#if 1
    for (int i = 0; i < N; i++) {
        printf("%-5d: a:%-5d b:%-5d c:%-5d\n", i, a[i], b[i], c[i]);
    }
#endif

    CUDA_CHECK(cudaFree(d_a));
    CUDA_CHECK(cudaFree(d_b));
    CUDA_CHECK(cudaFree(d_c));

    free(a);
    free(b);
    free(c);
    return 0;
}
__global__ void vec_block_add(int * a, int * b, int * c) { c[blockIdx.x] = a[blockIdx.x] + b[blockIdx.x]; }由于函数是并行执行的,和传统的串行程序在integer_add函数中使用 循环来完成加法相比,相当于由GPU这个加速器使用硬件的方式进行了 循环展开,展开后便可以并行执行了。所以在编写这段代码时,需要使用blockIdx.x来定位当前执行的是 循环的哪个部分。
#include<stdio.h>
#include<stdlib.h>
#include<cuda.h>
#include<cutil.h>
#include<time.h>

#define N 512

/* Abort with a readable message if a CUDA runtime call fails. */
#define CUDA_CHECK(call)                                              \
    do {                                                              \
        cudaError_t err_ = (call);                                    \
        if (err_ != cudaSuccess) {                                    \
            fprintf(stderr, "CUDA error %s:%d: %s\n",                 \
                    __FILE__, __LINE__, cudaGetErrorString(err_));    \
            exit(EXIT_FAILURE);                                       \
        }                                                             \
    } while (0)

/*
 * Element-wise vector addition, one element per thread: c[i] = a[i] + b[i]
 * with i = threadIdx.x.
 *
 * Launch layout: a single 1-D block with one thread per element. Only
 * threadIdx.x is used as the index and there is no bounds check, so the
 * block must not contain more threads than the arrays have elements.
 */
__global__ void vec_thread_add(int * a, int * b, int * c)
{
    c[threadIdx.x] = a[threadIdx.x] + b[threadIdx.x];
}

/*
 * Fills arr[0 .. count-1] with pseudo-random values in the range 0..99.
 *
 * The generator is deliberately not seeded here. Seeding with time(NULL)
 * on every call gives two back-to-back calls the same seed, and therefore
 * identical arrays; main() seeds once instead.
 */
void rand_ints(int * arr, int count)
{
    for (int i = 0; i < count; i++) {
        arr[i] = rand() % 100;
    }
}

/*
 * Adds two random N-element vectors on the GPU using one block of N
 * threads and prints every element of the inputs and the result.
 * Returns 0 on success, EXIT_FAILURE if a host allocation fails; a failing
 * CUDA call terminates the program through CUDA_CHECK.
 */
int main(void)
{
    int * a, * b, * c;
    int * d_a, * d_b, * d_c;
    int size = N * sizeof(int);

    /* Seed exactly once so that a and b receive different sequences. */
    srand(time(NULL));

    a = (int *) malloc(size);
    b = (int *) malloc(size);
    c = (int *) malloc(size);
    if (a == NULL || b == NULL || c == NULL) {
        fprintf(stderr, "Host allocation failed\n");
        free(a);
        free(b);
        free(c);
        return EXIT_FAILURE;
    }
    rand_ints(a, N);
    rand_ints(b, N);

    CUDA_CHECK(cudaMalloc((void**)&d_a, size));
    CUDA_CHECK(cudaMalloc((void**)&d_b, size));
    CUDA_CHECK(cudaMalloc((void**)&d_c, size));

    CUDA_CHECK(cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_b, b, size, cudaMemcpyHostToDevice));

    vec_thread_add<<<1,N>>>(d_a, d_b, d_c);
    /* An oversized block is rejected at launch; that error shows up here. */
    CUDA_CHECK(cudaGetLastError());

    /* Blocking copy: waits for the kernel and reports execution errors. */
    CUDA_CHECK(cudaMemcpy(c, d_c, size, cudaMemcpyDeviceToHost));

#if 1
    for (int i = 0; i < N; i++) {
        printf("%-5d: a:%-5d b:%-5d c:%-5d\n", i, a[i], b[i], c[i]);
    }
#endif

    CUDA_CHECK(cudaFree(d_a));
    CUDA_CHECK(cudaFree(d_b));
    CUDA_CHECK(cudaFree(d_c));

    free(a);
    free(b);
    free(c);
    return 0;
}
上述代码vec_thread_add<<<1,N>>>指定向量相加操作仅有1个块完成,但是这个块可以分割成512个线程来完成这个计算。和块类似的,由于函数vec_thread_add是被多个线程并行展开循环的,所以需要根据线程编号来确定当前循环应该完成的计算部分。每个线程使用threadIdx.x来标识当前线程。
#include<stdio.h>
#include<stdlib.h>
#include<cuda.h>
#include<cutil.h>
#include<time.h>

#define N (16*16)
#define THREAD_PER_BLOCK 32

/*
 * This version launches N/THREAD_PER_BLOCK blocks and has no bounds check
 * in the kernel, so it is only correct when the division is exact. Refuse
 * to build otherwise instead of silently skipping the tail elements.
 */
#if (N) % (THREAD_PER_BLOCK) != 0
#error "N must be a multiple of THREAD_PER_BLOCK for this version"
#endif

/* Abort with a readable message if a CUDA runtime call fails. */
#define CUDA_CHECK(call)                                              \
    do {                                                              \
        cudaError_t err_ = (call);                                    \
        if (err_ != cudaSuccess) {                                    \
            fprintf(stderr, "CUDA error %s:%d: %s\n",                 \
                    __FILE__, __LINE__, cudaGetErrorString(err_));    \
            exit(EXIT_FAILURE);                                       \
        }                                                             \
    } while (0)

/*
 * Element-wise vector addition using both blocks and threads:
 * c[i] = a[i] + b[i] with i = threadIdx.x + blockIdx.x * blockDim.x.
 *
 * Launch layout: 1-D grid of 1-D blocks. There is no bounds check, so
 * gridDim.x * blockDim.x must not exceed the number of elements.
 */
__global__ void vec_block_thread_add(int * a, int * b, int * c)
{
    int index = threadIdx.x + blockIdx.x * blockDim.x;
    c[index] = a[index] + b[index];
}

/*
 * Fills arr[0 .. count-1] with pseudo-random values in the range 0..99.
 *
 * The generator is deliberately not seeded here. Seeding with time(NULL)
 * on every call gives two back-to-back calls the same seed, and therefore
 * identical arrays; main() seeds once instead.
 */
void rand_ints(int * arr, int count)
{
    for (int i = 0; i < count; i++) {
        arr[i] = rand() % 100;
    }
}

/*
 * Adds two random N-element vectors on the GPU using N/THREAD_PER_BLOCK
 * blocks of THREAD_PER_BLOCK threads and prints the inputs and the result.
 * Returns 0 on success, EXIT_FAILURE if a host allocation fails; a failing
 * CUDA call terminates the program through CUDA_CHECK.
 */
int main(void)
{
    int * a, * b, * c;
    int * d_a, * d_b, * d_c;
    int size = N * sizeof(int);

    /* Seed exactly once so that a and b receive different sequences. */
    srand(time(NULL));

    a = (int *) malloc(size);
    b = (int *) malloc(size);
    c = (int *) malloc(size);
    if (a == NULL || b == NULL || c == NULL) {
        fprintf(stderr, "Host allocation failed\n");
        free(a);
        free(b);
        free(c);
        return EXIT_FAILURE;
    }
    rand_ints(a, N);
    rand_ints(b, N);

    CUDA_CHECK(cudaMalloc((void**)&d_a, size));
    CUDA_CHECK(cudaMalloc((void**)&d_b, size));
    CUDA_CHECK(cudaMalloc((void**)&d_c, size));

    CUDA_CHECK(cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_b, b, size, cudaMemcpyHostToDevice));

    vec_block_thread_add<<<N/THREAD_PER_BLOCK,THREAD_PER_BLOCK>>>(d_a, d_b, d_c);
    /* A launch has no return value; launch errors are read back here. */
    CUDA_CHECK(cudaGetLastError());

    /* Blocking copy: waits for the kernel and reports execution errors. */
    CUDA_CHECK(cudaMemcpy(c, d_c, size, cudaMemcpyDeviceToHost));

#if 1
    for (int i = 0; i < N; i++) {
        printf("%-5d: a:%-5d b:%-5d c:%-5d\n", i, a[i], b[i], c[i]);
    }
#endif

    CUDA_CHECK(cudaFree(d_a));
    CUDA_CHECK(cudaFree(d_b));
    CUDA_CHECK(cudaFree(d_c));

    free(a);
    free(b);
    free(c);
    return 0;
}
以上代码的缺陷是需要元素的数量能够整除每块中线程数量,稍作修改便可以实现任意元素数量的向量相加了。
#include<stdio.h>
#include<stdlib.h>
#include<cuda.h>
#include<cutil.h>
#include<time.h>

#define N 100
#define M 32

/* Abort with a readable message if a CUDA runtime call fails. */
#define CUDA_CHECK(call)                                              \
    do {                                                              \
        cudaError_t err_ = (call);                                    \
        if (err_ != cudaSuccess) {                                    \
            fprintf(stderr, "CUDA error %s:%d: %s\n",                 \
                    __FILE__, __LINE__, cudaGetErrorString(err_));    \
            exit(EXIT_FAILURE);                                       \
        }                                                             \
    } while (0)

/*
 * Element-wise vector addition for an arbitrary element count:
 * c[i] = a[i] + b[i] for every i < n, with
 * i = threadIdx.x + blockIdx.x * blockDim.x.
 *
 * Launch layout: 1-D grid of 1-D blocks covering at least n threads.
 * Threads whose index is n or larger do nothing, so the grid may be
 * rounded up to a whole number of blocks.
 */
__global__ void vec_block_thread_add(int * a, int * b, int * c, int n )
{
    int index = threadIdx.x + blockIdx.x * blockDim.x;
    if (index < n) {
        c[index] = a[index] + b[index];
    }
}

/*
 * Fills arr[0 .. count-1] with pseudo-random values in the range 0..99.
 *
 * The generator is deliberately not seeded here. Seeding with time(NULL)
 * on every call gives two back-to-back calls the same seed, and therefore
 * identical arrays; main() seeds once instead.
 */
void rand_ints(int * arr, int count)
{
    for (int i = 0; i < count; i++) {
        arr[i] = rand() % 100;
    }
}

/*
 * Adds two random N-element vectors on the GPU using ceil(N / M) blocks of
 * M threads and prints every element of the inputs and the result.
 * Returns 0 on success, EXIT_FAILURE if a host allocation fails; a failing
 * CUDA call terminates the program through CUDA_CHECK.
 */
int main(void)
{
    int * a, * b, * c;
    int * d_a, * d_b, * d_c;
    int size = N * sizeof(int);

    /* Seed exactly once so that a and b receive different sequences. */
    srand(time(NULL));

    a = (int *) malloc(size);
    b = (int *) malloc(size);
    c = (int *) malloc(size);
    if (a == NULL || b == NULL || c == NULL) {
        fprintf(stderr, "Host allocation failed\n");
        free(a);
        free(b);
        free(c);
        return EXIT_FAILURE;
    }
    rand_ints(a, N);
    rand_ints(b, N);

    CUDA_CHECK(cudaMalloc((void**)&d_a, size));
    CUDA_CHECK(cudaMalloc((void**)&d_b, size));
    CUDA_CHECK(cudaMalloc((void**)&d_c, size));

    CUDA_CHECK(cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_b, b, size, cudaMemcpyHostToDevice));

    /* (N+M-1)/M rounds the block count up so the last partial block exists. */
    vec_block_thread_add<<<(N+M-1)/M,M>>>(d_a, d_b, d_c, N);
    /* A launch has no return value; launch errors are read back here. */
    CUDA_CHECK(cudaGetLastError());

    /* Blocking copy: waits for the kernel and reports execution errors. */
    CUDA_CHECK(cudaMemcpy(c, d_c, size, cudaMemcpyDeviceToHost));

#if 1
    for (int i = 0; i < N; i++) {
        printf("%-5d: a:%-5d b:%-5d c:%-5d\n", i, a[i], b[i], c[i]);
    }
#endif

    CUDA_CHECK(cudaFree(d_a));
    CUDA_CHECK(cudaFree(d_b));
    CUDA_CHECK(cudaFree(d_c));

    free(a);
    free(b);
    free(c);
    return 0;
}
可能有一个很显然的问题就是既然使用块能够达到并行的函数调用为什么还需要用线程机制呢?
#include<stdio.h>
#include<stdlib.h>
#include<cuda.h>
#include<cutil.h>
#include<time.h>

/* Number of output elements. */
#define N 256
/* Number of neighbours summed on each side of an element. */
#define RADIUS 2
/* Threads per block; also the number of outputs produced per block. */
#define BLOCK_SIZE 32

/*
 * 1-D stencil: out[i] is the sum of the 2*RADIUS+1 consecutive inputs
 * in[i] .. in[i + 2*RADIUS] (the input is offset by RADIUS halo cells).
 *
 * Each block stages BLOCK_SIZE + 2*RADIUS inputs in the __shared__ array
 * temp and then every thread sums its window from temp.
 *
 * NOTE(review): there is no barrier between the writes to temp and the
 * reads in the summation loop, so a thread can read a temp[] entry that
 * another thread has not written yet. The listing that follows this one in
 * the article is identical except for a __syncthreads() call at that point,
 * so this appears to be the deliberately unsynchronised "before" version.
 *
 * Launch layout: 1-D grid of 1-D blocks. The halo offsets use BLOCK_SIZE,
 * not blockDim.x, and there is no bounds check on g_index.
 */
__global__ void stencil_1d(int * in, int *out)
{
    __shared__ int temp[BLOCK_SIZE + 2 * RADIUS];
    /* Index into in[], shifted past the leading halo. */
    int g_index = threadIdx.x + blockIdx.x * blockDim.x + RADIUS;
    /* Index into temp[], shifted past the leading halo. */
    int s_index = threadIdx.x + RADIUS;
    temp[s_index] = in[g_index];
    /* The first RADIUS threads additionally load the left and right halos. */
    if(threadIdx.x < RADIUS)
    {
        temp[s_index - RADIUS] = in[g_index - RADIUS];
        temp[s_index + BLOCK_SIZE] = in[g_index + BLOCK_SIZE];
    }
    /* No barrier here: the reads below race with the writes above. */
    int result = 0;
    for(int offset = -RADIUS; offset <= RADIUS; offset++)
    {
        result = result + temp[s_index+offset];
    }
    out[g_index-RADIUS] = result;
}

/*
 * Reseeds the C generator from the current time and fills
 * arr[0 .. count-1] with values in the range 0..99.
 */
void rand_ints(int * arr, int count)
{
    srand(time(NULL));
    for(int i=0;i<count;i++)
    {
        arr[i] = rand() % 100;
    }
}

/*
 * Builds an input of N random values with RADIUS extra cells on each side,
 * runs stencil_1d on the GPU and prints the input row and the output row.
 *
 * NOTE(review): only in[RADIUS .. RADIUS+N-1] is filled; the 2*RADIUS halo
 * cells come straight from malloc and are copied to the device and printed
 * without ever being written. No allocation or CUDA return value is
 * checked.
 */
int main(void)
{
    int * in, * out;
    int size_in = sizeof(int) * ( N + 2 * RADIUS );
    in = (int *) malloc(size_in);
    rand_ints(in+RADIUS,N);
    int size_out = sizeof(int) * N;
    out = (int *) malloc(size_out);
    int * d_in, * d_out;
    cudaMalloc((void**)&d_in,size_in);
    cudaMalloc((void**)&d_out,size_out);
    cudaMemcpy(d_in,in,size_in,cudaMemcpyHostToDevice);
    /* Block count is rounded up to cover all N outputs. */
    stencil_1d<<<(N+BLOCK_SIZE-1)/BLOCK_SIZE,BLOCK_SIZE>>>(d_in,d_out);
    cudaMemcpy(out,d_out,size_out,cudaMemcpyDeviceToHost);
#if 1
    for(int i=0;i<N+2*RADIUS;i++)
    {
        printf("%-5d ",in[i]);
    }
    printf("\n");
    for(int i=0;i<N;i++)
    {
        printf("%-5d ",out[i]);
    }
    printf("\n");
#endif
    cudaFree(d_in);
    cudaFree(d_out);
    free(in);
    free(out);
    return 0;
}
#include<stdio.h>
#include<stdlib.h>
#include<cuda.h>
#include<cutil.h>
#include<time.h>

/* Number of output elements. */
#define N 256
/* Number of neighbours summed on each side of an element. */
#define RADIUS 2
/* Threads per block; also the number of outputs produced per block. */
#define BLOCK_SIZE 32

/*
 * stencil_1d has no bounds check, so a partially filled last block would
 * read and write past the ends of the buffers. Refuse to build in that case.
 */
#if (N) % (BLOCK_SIZE) != 0
#error "N must be a multiple of BLOCK_SIZE for stencil_1d"
#endif

/* Abort with a readable message if a CUDA runtime call fails. */
#define CUDA_CHECK(call)                                              \
    do {                                                              \
        cudaError_t err_ = (call);                                    \
        if (err_ != cudaSuccess) {                                    \
            fprintf(stderr, "CUDA error %s:%d: %s\n",                 \
                    __FILE__, __LINE__, cudaGetErrorString(err_));    \
            exit(EXIT_FAILURE);                                       \
        }                                                             \
    } while (0)

/*
 * 1-D stencil: out[i] is the sum of the 2*RADIUS+1 consecutive inputs
 * in[i] .. in[i + 2*RADIUS] (the input is offset by RADIUS halo cells).
 *
 * Each block stages BLOCK_SIZE + 2*RADIUS inputs in the __shared__ array
 * temp, waits at a barrier, and then every thread sums its window.
 *
 * Launch layout: 1-D grid of 1-D blocks with blockDim.x == BLOCK_SIZE (the
 * halo offsets use BLOCK_SIZE) and gridDim.x * BLOCK_SIZE == number of
 * outputs (there is no bounds check). in must hold 2*RADIUS more elements
 * than out.
 */
__global__ void stencil_1d(int * in, int *out)
{
    __shared__ int temp[BLOCK_SIZE + 2 * RADIUS];

    /* Index into in[], shifted past the leading halo. */
    int g_index = threadIdx.x + blockIdx.x * blockDim.x + RADIUS;
    /* Index into temp[], shifted past the leading halo. */
    int s_index = threadIdx.x + RADIUS;

    temp[s_index] = in[g_index];
    /* The first RADIUS threads additionally load the left and right halos. */
    if (threadIdx.x < RADIUS) {
        temp[s_index - RADIUS] = in[g_index - RADIUS];
        temp[s_index + BLOCK_SIZE] = in[g_index + BLOCK_SIZE];
    }

    /* Every thread reads cells written by other threads; wait for them all. */
    __syncthreads();

    int result = 0;
    for (int offset = -RADIUS; offset <= RADIUS; offset++) {
        result = result + temp[s_index+offset];
    }
    out[g_index-RADIUS] = result;
}

/*
 * Fills arr[0 .. count-1] with pseudo-random values in the range 0..99.
 * The generator is not seeded here; main() seeds it once so that repeated
 * calls would not restart the same sequence.
 */
void rand_ints(int * arr, int count)
{
    for (int i = 0; i < count; i++) {
        arr[i] = rand() % 100;
    }
}

/*
 * Builds an input of N random values padded with RADIUS zero cells on each
 * side, runs stencil_1d on the GPU and prints the input row followed by
 * the output row.
 * Returns 0 on success, EXIT_FAILURE if a host allocation fails; a failing
 * CUDA call terminates the program through CUDA_CHECK.
 */
int main(void)
{
    int * in, * out;
    int * d_in, * d_out;
    int size_in = sizeof(int) * ( N + 2 * RADIUS );
    int size_out = sizeof(int) * N;

    srand(time(NULL));

    /* calloc zeroes the halo cells, which rand_ints never touches. */
    in = (int *) calloc(N + 2 * RADIUS, sizeof(int));
    out = (int *) malloc(size_out);
    if (in == NULL || out == NULL) {
        fprintf(stderr, "Host allocation failed\n");
        free(in);
        free(out);
        return EXIT_FAILURE;
    }
    rand_ints(in+RADIUS, N);

    CUDA_CHECK(cudaMalloc((void**)&d_in, size_in));
    CUDA_CHECK(cudaMalloc((void**)&d_out, size_out));

    CUDA_CHECK(cudaMemcpy(d_in, in, size_in, cudaMemcpyHostToDevice));

    stencil_1d<<<(N+BLOCK_SIZE-1)/BLOCK_SIZE,BLOCK_SIZE>>>(d_in, d_out);
    /* A launch has no return value; launch errors are read back here. */
    CUDA_CHECK(cudaGetLastError());

    /* Blocking copy: waits for the kernel and reports execution errors. */
    CUDA_CHECK(cudaMemcpy(out, d_out, size_out, cudaMemcpyDeviceToHost));

#if 1
    for (int i = 0; i < N+2*RADIUS; i++) {
        printf("%-5d ", in[i]);
    }
    printf("\n");
    for (int i = 0; i < N; i++) {
        printf("%-5d ", out[i]);
    }
    printf("\n");
#endif

    CUDA_CHECK(cudaFree(d_in));
    CUDA_CHECK(cudaFree(d_out));

    free(in);
    free(out);
    return 0;
}