文件目录:
cudaTest
|--utils.cu
|--utils.h
|--squaresum.cu
|--squaresum.h
|--test.cpp
|--CMakeLists.txt
编译命令:
$cd /root/cudaTest
$mkdir build
$cd build
$cmake ..
$make
调用关系:
utils:提供常用工具,这里提供查询设备信息功能;
squaresum:计算平方和功能,为cuda运行的核心函数实现
test:调用平方和函数
CMakeLists.txt:组织所有文件编译生成可执行文件
注意:从 .cpp 文件调用 .cu 文件中定义的函数时,需要在头文件中用 extern "C" 声明这些函数
文件内容:
CMakeLists.txt
# CMakeLists.txt: builds the `squaresum` executable from test.cpp,
# squaresum.cu and utils.cu.
cmake_minimum_required(VERSION 2.8)

# A top-level CMakeLists.txt is expected to contain a direct project() call.
project(cudaTest)

# The CUDA package supplies cuda_add_executable(); REQUIRED stops the
# configure step when no CUDA toolkit can be located.
find_package(CUDA QUIET REQUIRED)

# Specify binary name and source files to build it from
#add_library(utils utils.cpp)
cuda_add_executable(
    squaresum
    test.cpp
    squaresum.cu
    utils.cu)
#target_link_libraries(squaresum utils)
test.cpp
#include#include "squaresum.h" //extern "C" int squaresum(); int main(){ squaresum(); return 0; }
squaresum.h
// squaresum.h -- declaration of the square-sum demo entry point.
#ifndef CUDATEST_SQUARESUM_H
#define CUDATEST_SQUARESUM_H

#include "utils.h"
// NOTE(review): the header name after the second `#include` was lost in the
// original text; <cuda_runtime.h> is a guess -- confirm against the real project.
#include <cuda_runtime.h>

// Declared with C linkage so the symbol name is the same whether the
// translation unit is built by the host C++ compiler or by nvcc.
extern "C" {
    // Runs the demo that sums squares on the GPU and on the CPU and prints both.
    int squaresum();
}

#endif // CUDATEST_SQUARESUM_H
squaresum.cu
// squaresum.cu -- sums the squares of DATA_SIZE pseudo-random integers once on
// the GPU (single-thread kernel) and once on the CPU, printing both results.

// NOTE(review): the angle-bracket header names in the original were lost (only
// bare `#include` tokens remained). They are restored here as the headers this
// file visibly needs -- confirm against the real project.
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
//#include "utils.h"
#include <cuda_runtime.h>
#include "squaresum.h"

// ======== define area ========
#define DATA_SIZE 1048576 // 1M

// ======== global area ========
// Host-side input buffer shared by the GPU and CPU passes.
int data[DATA_SIZE];

/*
 * Kernel: serially accumulates input[i] * input[i] for i in [0, size).
 *
 * Launch layout: written for a <<<1, 1>>> launch. There is no per-thread
 * indexing, so every launched thread would run the whole loop and store to
 * the same *sum / *time locations.
 *
 * input : device pointer to `size` ints (read only)
 * sum   : device pointer receiving the accumulated sum
 * time  : device pointer receiving the elapsed device clock cycles
 * size  : number of elements to process
 */
__global__ static void squaresSum(const int *input, int *sum, clock_t *time, int size)
{
    int sum_t = 0;
    // clock64() is the 64-bit device cycle counter, used to time the loop.
    const long long start = clock64();
    for (int i = 0; i < size; ++i) {
        sum_t += input[i] * input[i];
    }
    *sum = sum_t;
    *time = (clock_t)(clock64() - start);
}

// ======== used to generate rand datas ========
// Fills data[0 .. size) with rand() % 10. rand() is not seeded here.
void generateData(int *data, int size)
{
    for (int i = 0; i < size; ++i) {
        data[i] = rand() % 10;
    }
}

// Returns true when `err` is cudaSuccess; otherwise reports the failing step
// and the CUDA error string on stderr and returns false.
static bool cudaOk(cudaError_t err, const char *what)
{
    if (err == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(err));
    return false;
}

// Copies hostData to the device, runs the squaresSum kernel with one thread,
// and copies the sum and the elapsed cycle count back into *result / *elapsed.
// All device buffers are released on every path. Returns false on any CUDA error.
static bool squaresSumOnGpu(const int *hostData, int size, int *result, clock_t *elapsed)
{
    int *gpuData = NULL;
    int *gpuSum = NULL;
    clock_t *gpuTime = NULL;
    const size_t bytes = sizeof(int) * (size_t) size;

    // malloc space for datas in GPU, then upload the input
    bool ok = cudaOk(cudaMalloc((void**) &gpuData, bytes), "cudaMalloc(data)")
           && cudaOk(cudaMalloc((void**) &gpuSum, sizeof(int)), "cudaMalloc(sum)")
           && cudaOk(cudaMalloc((void**) &gpuTime, sizeof(clock_t)), "cudaMalloc(time)")
           && cudaOk(cudaMemcpy(gpuData, hostData, bytes, cudaMemcpyHostToDevice),
                     "cudaMemcpy(host to device)");

    if (ok) {
        // calculate the squares's sum
        squaresSum<<<1, 1, 0>>>(gpuData, gpuSum, gpuTime, size);

        // A launch reports configuration errors only through cudaGetLastError();
        // the copies below are issued after the kernel and bring back its output.
        ok = cudaOk(cudaGetLastError(), "squaresSum launch")
          && cudaOk(cudaMemcpy(result, gpuSum, sizeof(int), cudaMemcpyDeviceToHost),
                    "cudaMemcpy(sum)")
          && cudaOk(cudaMemcpy(elapsed, gpuTime, sizeof(clock_t), cudaMemcpyDeviceToHost),
                    "cudaMemcpy(time)");
    }

    // free GPU spaces (pointers that were never allocated are still NULL)
    cudaFree(gpuData);
    cudaFree(gpuSum);
    cudaFree(gpuTime);
    return ok;
}

/*
 * Runs the demo: initializes CUDA, generates the input, computes the sum of
 * squares on the GPU and on the CPU, and prints each sum with its timing.
 *
 * The GPU "time" is a device cycle count while the CPU "time" is a host
 * clock() difference, so the two numbers are not in the same unit.
 *
 * Returns 0 on success, 1 when CUDA initialization or any CUDA call fails.
 */
int squaresum()
{
    // init CUDA device
    if (!InitCUDA()) {
        return 1;
    }
    printf("CUDA initialized.\n");

    // generate rand datas
    generateData(data, DATA_SIZE);

    // GPU calculate
    int result = 0;
    clock_t time_used = 0;
    if (!squaresSumOnGpu(data, DATA_SIZE, &result, &time_used)) {
        return 1;
    }

    // print result (clock_t is cast so the argument matches %ld)
    printf("(GPU) sum:%d time:%ld\n", result, (long) time_used);

    // CPU calculate
    result = 0;
    clock_t start = clock();
    for (int i = 0; i < DATA_SIZE; ++i) {
        result += data[i] * data[i];
    }
    time_used = clock() - start;
    printf("(CPU) sum:%d time:%ld\n", result, (long) time_used);

    return 0;
}
utils.h
#include#include extern "C" { bool InitCUDA(); }
utils.cu
// utils.cu -- CUDA device query and selection helpers.

#include "utils.h"
// NOTE(review): the header names after the remaining `#include` tokens were
// lost in the original text; the headers below are the ones this file visibly
// needs (printf/fprintf, std::cout, CUDA runtime) -- confirm against the real
// project.
#include <cstdio>
#include <iostream>
#include <cuda_runtime.h>

// Prints selected fields of a cudaDeviceProp to stdout, one per line.
// Fields of type size_t are printed with %zu; %d would read them as int.
void printDeviceProp(const cudaDeviceProp &prop)
{
    printf("Device Name : %s.\n", prop.name);
    printf("totalGlobalMem : %zu.\n", prop.totalGlobalMem);
    printf("sharedMemPerBlock : %zu.\n", prop.sharedMemPerBlock);
    printf("regsPerBlock : %d.\n", prop.regsPerBlock);
    printf("warpSize : %d.\n", prop.warpSize);
    printf("memPitch : %zu.\n", prop.memPitch);
    printf("maxThreadsPerBlock : %d.\n", prop.maxThreadsPerBlock);
    printf("maxThreadsDim[0 - 2] : %d %d %d.\n",
           prop.maxThreadsDim[0], prop.maxThreadsDim[1], prop.maxThreadsDim[2]);
    printf("maxGridSize[0 - 2] : %d %d %d.\n",
           prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2]);
    printf("totalConstMem : %zu.\n", prop.totalConstMem);
    printf("major.minor : %d.%d.\n", prop.major, prop.minor);
    printf("clockRate : %d.\n", prop.clockRate);
    printf("textureAlignment : %zu.\n", prop.textureAlignment);
    printf("deviceOverlap : %d.\n", prop.deviceOverlap);
    printf("multiProcessorCount : %d.\n", prop.multiProcessorCount);
}

/*
 * Picks the first device whose compute capability major version is >= 1,
 * prints its properties, and makes it the current device.
 *
 * Returns true when a device was selected; false when the device count could
 * not be queried, no device qualifies, or the device could not be selected.
 * Each failure is reported on stderr.
 */
bool InitCUDA()
{
    // used to count the device numbers; stays 0 if the query fails
    int count = 0;

    // get the cuda device count
    cudaError_t err = cudaGetDeviceCount(&count);
    if (err != cudaSuccess) {
        fprintf(stderr, "cudaGetDeviceCount failed: %s\n", cudaGetErrorString(err));
        return false;
    }
    // print("%d\n", count);
    std::cout << count << std::endl;
    if (count == 0) {
        fprintf(stderr, "There is no device.\n");
        return false;
    }

    // find the device >= 1.X
    int i;
    for (i = 0; i < count; ++i) {
        cudaDeviceProp prop;
        if (cudaGetDeviceProperties(&prop, i) == cudaSuccess) {
            if (prop.major >= 1) {
                printDeviceProp(prop);
                break;
            }
        }
    }

    // if can't find the device
    if (i == count) {
        fprintf(stderr, "There is no device supporting CUDA 1.x.\n");
        return false;
    }

    // set cuda device
    err = cudaSetDevice(i);
    if (err != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice(%d) failed: %s\n", i, cudaGetErrorString(err));
        return false;
    }
    return true;
}

//int main(){
//    InitCUDA();
//}