// 记录学习 CUDA 过程中的源代码,附有注释,以便回顾学习。
#include <cstdio>
#include <cstring>
#include <iostream>
using namespace std;
// Kernel: stores the sum of the two scalar arguments into the int that `c`
// points to. Every launched thread performs the same single store.
__global__ void add(int a, int b, int *c) {
    const int sum = a + b;
    c[0] = sum;
}
// Demo: allocate one int on the device, let the scalar `add` kernel write
// 1 + 3 into it, copy the result back and print it (once with printf, once
// with cout).
//
// Returns 0 on success and 1 if any CUDA call or the kernel launch fails; in
// the failure case the error is reported on stderr and nothing is printed to
// stdout.
int test1(void) {
    int c = 0;
    int *dev_c = NULL;  // device address: must never be dereferenced on the host

    cudaError_t err = cudaMalloc((void**)&dev_c, sizeof(int));
    if (err == cudaSuccess) {
        add<<<1, 1>>>(1, 3, dev_c);
        // A launch does not return a status; launch errors are read back here.
        err = cudaGetLastError();
    }
    if (err == cudaSuccess) {
        // Copy the device result into the host variable before reading it.
        err = cudaMemcpy(&c, dev_c, sizeof(int), cudaMemcpyDeviceToHost);
    }

    // Release on every path; dev_c is still NULL if the allocation failed.
    cudaFree(dev_c);

    if (err != cudaSuccess) {
        cerr << "test1: CUDA error: " << cudaGetErrorString(err) << endl;
        return 1;
    }

    printf("%d\n", c);
    cout << c << endl;
    return 0;
}
// Demo: how host code may (and may not) use a device pointer.
//
// `dev` holds a device address, so writing through it directly on the host
// (`*dev = 3;`) compiles but makes the program exit abnormally at run time.
// The value is therefore written with cudaMemcpy instead, and the allocation
// is released afterwards.
//
// Returns 0 on success (after printing "hello") and 1 if a CUDA call fails.
int test2() {
    const int value = 3;
    int *dev = NULL;

    cudaError_t err = cudaMalloc((void**)&dev, sizeof(int));
    if (err == cudaSuccess) {
        // Host -> device transfer replaces the invalid host-side dereference.
        err = cudaMemcpy(dev, &value, sizeof(int), cudaMemcpyHostToDevice);
    }

    // Release on every path; dev is still NULL if the allocation failed.
    cudaFree(dev);

    if (err != cudaSuccess) {
        cerr << "test2: CUDA error: " << cudaGetErrorString(err) << endl;
        return 1;
    }

    cout << "hello" << endl;
    return 0;
}
// Demo: query and print basic properties of every CUDA device.
//
// Prints the device count, then for each device its name, compute capability,
// clock rate, total global memory and total constant memory.
//
// Returns 0 on success and 1 if the device count or a device's properties
// cannot be queried (the error is reported on stderr).
int test3() {
    int count = 0;
    cudaError_t err = cudaGetDeviceCount(&count);  // number of GPUs
    if (err != cudaSuccess) {
        cerr << "test3: cudaGetDeviceCount failed: " << cudaGetErrorString(err) << endl;
        return 1;
    }
    cout << "有" << count << "个GPU设备" << endl;

    cudaDeviceProp prop;
    for (int i = 0; i < count; ++i) {
        err = cudaGetDeviceProperties(&prop, i);
        if (err != cudaSuccess) {
            // Do not print fields of a struct that was never filled in.
            cerr << "test3: cudaGetDeviceProperties(" << i << ") failed: "
                 << cudaGetErrorString(err) << endl;
            return 1;
        }
        cout << " --- Genenal Information of Device -- " << endl;
        cout << "name: " << prop.name << endl;
        cout << "Compute capability: " << prop.major << "." << prop.minor << endl;
        cout << "Clock Rate: " << prop.clockRate << endl;
        cout << "\n __ Memory Information for device __ " << endl;
        cout << "Total global memery: " << prop.totalGlobalMem << endl;
        cout << "Total constant memery: " << prop.totalConstMem << endl;  // total constant memory
    }
    return 0;
}
// Demo: pick a device by property, requesting compute capability 1.3.
//
// Prints the id of the current device, asks the runtime to choose a device
// for a property struct with major = 1 and minor = 3, prints the chosen id
// and makes that device current. If any CUDA call fails, the error is
// reported on stderr and the function returns without going further.
void test4() {
    int dev = 0;
    cudaError_t err = cudaGetDevice(&dev);
    if (err != cudaSuccess) {
        cerr << "test4: cudaGetDevice failed: " << cudaGetErrorString(err) << endl;
        return;
    }
    cout << "ID of current device is " << dev << endl;

    // Zero the whole struct so only the two requested fields are set.
    cudaDeviceProp prop;
    memset(&prop, 0, sizeof(cudaDeviceProp));
    prop.major = 1;
    prop.minor = 3;

    err = cudaChooseDevice(&dev, &prop);
    if (err != cudaSuccess) {
        cerr << "test4: cudaChooseDevice failed: " << cudaGetErrorString(err) << endl;
        return;
    }
    cout << "the id of choose device is " << dev << endl;

    err = cudaSetDevice(dev);
    if (err != cudaSuccess) {
        cerr << "test4: cudaSetDevice failed: " << cudaGetErrorString(err) << endl;
    }
}
// Number of elements in the demo vectors; used by the vector-add kernels and
// by the host code that drives them.
static const int N = 10;
// Kernel: element-wise vector add that selects the element by block index
// (blockIdx.x), ignoring the thread index. Blocks whose index is N or larger
// do nothing.
__global__ void add(int *a, int *b, int *c) {
    const int idx = blockIdx.x;
    if (idx >= N) {
        return;
    }
    c[idx] = a[idx] + b[idx];
}
// Demo: add two N-element vectors on the GPU using parallel thread blocks
// (N blocks of one thread each), then print every "a + b = c" line.
//
// If any CUDA call or launch fails, the error is reported on stderr and no
// results are printed. Device buffers are released on every path.
void test5() {
    int a[N], b[N], c[N];
    for (int i = 0; i < N; ++i) {  // initialise the host vectors
        a[i] = -i - 3;
        b[i] = i * i;
    }

    // Device buffers start as NULL so the cleanup below is safe after any failure.
    int *dev_a = NULL, *dev_b = NULL, *dev_c = NULL;
    cudaError_t err = cudaMalloc((void**)&dev_a, N * sizeof(int));
    if (err == cudaSuccess) err = cudaMalloc((void**)&dev_b, N * sizeof(int));
    if (err == cudaSuccess) err = cudaMalloc((void**)&dev_c, N * sizeof(int));
    if (err == cudaSuccess) err = cudaMemcpy(dev_a, a, sizeof(a), cudaMemcpyHostToDevice);
    if (err == cudaSuccess) err = cudaMemcpy(dev_b, b, sizeof(b), cudaMemcpyHostToDevice);

    // The launch is repeated; each run writes the same values into dev_c, so
    // the printed result does not depend on the repeat count.
    // NOTE(review): presumably repeated for rough timing against test6 - confirm.
    const int kLaunches = 100000;
    for (int i = 0; i < kLaunches && err == cudaSuccess; ++i) {
        // First launch argument: number of blocks; second: threads per block.
        add<<<N, 1>>>(dev_a, dev_b, dev_c);
        err = cudaGetLastError();  // launches report errors only through this call
    }

    if (err == cudaSuccess) err = cudaMemcpy(c, dev_c, sizeof(c), cudaMemcpyDeviceToHost);

    if (err == cudaSuccess) {
        for (int i = 0; i < N; ++i) {
            cout << a[i] << " + " << b[i] << " = " << c[i] << endl;
        }
    } else {
        cerr << "test5: CUDA error: " << cudaGetErrorString(err) << endl;
    }

    cudaFree(dev_a);
    cudaFree(dev_b);
    cudaFree(dev_c);
}
// Kernel: element-wise vector add that selects the element by thread index
// (threadIdx.x), ignoring the block index. Threads whose index is N or larger
// do nothing.
__global__ void add_2(int *a, int *b, int *c) {
    const int idx = threadIdx.x;
    if (idx >= N) {
        return;
    }
    c[idx] = a[idx] + b[idx];
}
// Demo: add two N-element vectors on the GPU using parallel threads (one
// block of N threads) instead of parallel blocks, then print every
// "a + b = c" line.
//
// If any CUDA call or launch fails, the error is reported on stderr and no
// results are printed. Device buffers are released on every path.
void test6() {
    int a[N], b[N], c[N];
    for (int i = 0; i < N; ++i) {  // initialise the host vectors
        a[i] = -i - 3;
        b[i] = i * i;
    }

    // Device buffers start as NULL so the cleanup below is safe after any failure.
    int *dev_a = NULL, *dev_b = NULL, *dev_c = NULL;
    cudaError_t err = cudaMalloc((void**)&dev_a, N * sizeof(int));
    if (err == cudaSuccess) err = cudaMalloc((void**)&dev_b, N * sizeof(int));
    if (err == cudaSuccess) err = cudaMalloc((void**)&dev_c, N * sizeof(int));
    if (err == cudaSuccess) err = cudaMemcpy(dev_a, a, sizeof(a), cudaMemcpyHostToDevice);
    if (err == cudaSuccess) err = cudaMemcpy(dev_b, b, sizeof(b), cudaMemcpyHostToDevice);

    // The launch is repeated; each run writes the same values into dev_c, so
    // the printed result does not depend on the repeat count.
    // NOTE(review): presumably repeated for rough timing against test5 - confirm.
    const int kLaunches = 100000;
    for (int i = 0; i < kLaunches && err == cudaSuccess; ++i) {
        // First launch argument: number of blocks; second: threads per block.
        add_2<<<1, N>>>(dev_a, dev_b, dev_c);
        err = cudaGetLastError();  // launches report errors only through this call
    }

    if (err == cudaSuccess) err = cudaMemcpy(c, dev_c, sizeof(c), cudaMemcpyDeviceToHost);

    if (err == cudaSuccess) {
        for (int i = 0; i < N; ++i) {
            cout << a[i] << " + " << b[i] << " = " << c[i] << endl;
        }
    } else {
        cerr << "test6: CUDA error: " << cudaGetErrorString(err) << endl;
    }

    cudaFree(dev_a);
    cudaFree(dev_b);
    cudaFree(dev_c);
}
// Kernel: each thread prints its block index, thread index, block size and
// grid size (x dimension only), to show how many blocks/threads a launch
// creates.
//
// Output goes through device-side printf; the original author noted that
// cout did not appear to be usable inside a kernel.
__global__ void add_3() {
    const int blockId = blockIdx.x;
    const int threadId = threadIdx.x;
    const int blockSize = blockDim.x;
    const int gridSize = gridDim.x;
    printf("bx = %d tx = %d bd = %d gd = %d\n\n", blockId, threadId, blockSize, gridSize);
}
// Demo: launch `add_3` with 2 blocks of 5 threads so each of the 10 threads
// prints its indices.
//
// The launch is asynchronous, so the host waits for the kernel to finish
// before returning; otherwise the program can exit before the device-side
// printf output has been flushed. Launch or execution errors are reported on
// stderr.
void test7() {
    add_3<<<2, 5>>>();

    cudaError_t err = cudaGetLastError();  // launch-configuration errors
    if (err == cudaSuccess) {
        err = cudaDeviceSynchronize();     // wait for the kernel and its output
    }
    if (err != cudaSuccess) {
        cerr << "test7: CUDA error: " << cudaGetErrorString(err) << endl;
    }
}
// Program entry point: runs the thread/block index printing demo.
int main() {
    test7();
    return 0;
}