调用CUDA的核函数时需要指定block的数量和每个block内thread的数量,这两个参数可以是dim3类型的(三维),只用一维时也可以是unsigned int型的。
以下程序验证了当block或thread大小超出硬件允许值时会产生异常!!!GPU根本不会执行运算!!!
所以验证结果的正确性很重要!!!
在VS中创建CUDA项目会有一个模板,里面有更详细的状态验证。
以下程序在K5000GPU上跑的。
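奇怪的是cuda samples中的deviceQuery程序给出的grid大小(即block数量)的第一维可以到2^31-1,但实测还是只有65535!!!(可能的原因:编译时的目标架构低于compute capability 3.0,此时grid第一维的上限就是65535;待确认,可尝试用 -arch=sm_30 重新编译后再测。)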
#include <cuda.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
#include <random>
#include <sstream>
#include <string>
#include <vector>
/// Selects CUDA device `deviceIdx` as the current device and returns its properties.
///
/// @param deviceIdx  Index of the device to select and query (defaults to device 0).
/// @return The structure filled in by cudaGetDeviceProperties. If the query fails,
///         a diagnostic is written to std::cerr and a zero-initialized structure
///         is returned instead of one with indeterminate contents.
cudaDeviceProp getCudaDeviceProperties(int deviceIdx = 0) {
    // Value-initialize so that callers never read indeterminate data on failure.
    cudaDeviceProp deviceProp = cudaDeviceProp();

    cudaError_t status = cudaSetDevice(deviceIdx);
    if (status != cudaSuccess) {
        // Report but keep going: the property query below is still attempted,
        // exactly as the unchecked version did.
        std::cerr << "cudaSetDevice(" << deviceIdx << ") failed: "
                  << cudaGetErrorString(status) << std::endl;
    }

    status = cudaGetDeviceProperties(&deviceProp, deviceIdx);
    if (status != cudaSuccess) {
        std::cerr << "cudaGetDeviceProperties(" << deviceIdx << ") failed: "
                  << cudaGetErrorString(status) << std::endl;
        deviceProp = cudaDeviceProp();  // discard anything partially written
    }
    return deviceProp;
}
/// Kernel function (the code that runs on the GPU): element-wise product,
/// pResult[i] = pa[i] * pb[i].
/// Each thread derives one flat index from its 1D block/thread coordinates and
/// handles at most that one element; threads whose index is >= N do nothing.
__global__ void productArray_bt_kernel(float *pa, float *pb, float *pResult, int N) {
    const int gid = threadIdx.x + blockIdx.x * blockDim.x;
    if (gid >= N)
        return;  // surplus thread in the last block
    pResult[gid] = pa[gid] * pb[gid];
}
/// Formats the "\n\tblockNum=...;\tthreadNum=...;\t" suffix that is appended to
/// every error message thrown by productArray_bt.
static std::string productArray_bt_configText(int blockNum, int threadNum) {
    std::stringstream ss;
    ss << "\n\tblockNum=" << blockNum << ";\tthreadNum=" << threadNum << ";\t";
    return ss.str();
}

/// Host-side helper for productArray_bt_kernel: picks the block count, launches
/// the kernel, and checks both the launch and the execution for errors.
/// Memory allocation and host<->device copies are deliberately left to the
/// caller, because the same buffers are reused for many launches.
///
/// @param pa, pb     Input arrays, passed straight to the kernel.
/// @param pResult    Output array, passed straight to the kernel.
/// @param N          Number of elements; 0 is a no-op.
/// @param threadNum  Threads per block; the block count is ceil(N / threadNum).
/// @throws std::string  if N is negative, threadNum is not positive, the launch
///                      is rejected (cudaGetLastError), or cudaDeviceSynchronize
///                      reports an error. The message includes blockNum/threadNum.
///
/// Note: the former `throw(std::string)` dynamic exception specification was
/// dropped because that syntax was removed in C++17; the function still throws
/// only std::string.
void productArray_bt(float *pa, float *pb, float *pResult, int N, int threadNum = 32) {
    if (N == 0)
        return;  // nothing to compute; avoids a launch with zero blocks
    if (N < 0 || threadNum <= 0) {
        // Also protects the division below from a zero divisor.
        std::stringstream ss;
        ss << "productArray_bt: invalid arguments: N=" << N << ";\tthreadNum=" << threadNum << ";\t";
        throw ss.str();
    }

    const int blockNum = (N - 1) / threadNum + 1;  // ceil(N / threadNum)
    productArray_bt_kernel<<<blockNum, threadNum>>>(pa, pb, pResult, N);

    // Check for any errors launching the kernel (e.g. an invalid configuration).
    cudaError_t cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        throw std::string("productArray_bt_kernel launch failed: ")
            + cudaGetErrorString(cudaStatus)
            + productArray_bt_configText(blockNum, threadNum);
    }

    // cudaDeviceSynchronize waits for the kernel to finish and returns any
    // error encountered while it was running.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        std::stringstream ss;
        ss << "cudaDeviceSynchronize returned error code " << cudaStatus
           << " (" << cudaGetErrorString(cudaStatus) << ")"
           << " after launching productArray_bt_kernel!"
           << productArray_bt_configText(blockNum, threadNum);
        throw ss.str();
    }
}
/// Throws "<what> failed: <CUDA error text>" as a std::string (the error
/// convention this file already uses) when `status` is not cudaSuccess.
static void TestProductSpeed_check(cudaError_t status, const char *what) {
    if (status == cudaSuccess)
        return;
    std::stringstream ss;
    ss << what << " failed: " << cudaGetErrorString(status);
    throw ss.str();
}

/// Times productArray_bt for array lengths 1, 2, 4, ..., 2^20 and, for each
/// length, thread counts 1, 4, 16, ..., 4096 per block.
///
/// For every (length, thread count) pair it prints the elapsed GPU time measured
/// with CUDA events, copies the result back, and compares a sample of elements
/// (index 0 and every power-of-two index) against the host-computed product.
/// Launches rejected by the runtime surface as std::string exceptions from
/// productArray_bt and are printed instead of aborting the sweep.
void TestProductSpeed() {
    // One pair of timing events is reused for the whole sweep and destroyed at
    // the end, instead of creating (and leaking) a new pair per launch.
    cudaEvent_t start, stop;
    cudaError_t eventStatus = cudaEventCreate(&start);
    if (eventStatus != cudaSuccess) {
        std::cout << "异常:" << "cudaEventCreate(start) failed: " << cudaGetErrorString(eventStatus) << std::endl;
        return;
    }
    eventStatus = cudaEventCreate(&stop);
    if (eventStatus != cudaSuccess) {
        std::cout << "异常:" << "cudaEventCreate(stop) failed: " << cudaGetErrorString(eventStatus) << std::endl;
        cudaEventDestroy(start);
        return;
    }

    for (int N = 1; N <= (1 << 20); N *= 2) {
        std::cout << "\n\n**************数组长度为 " << N << " 的测试\n";
        const size_t size = sizeof(float) * N;

        // Host copies; std::vector releases them on every exit path.
        std::vector<float> pa(N), pb(N), pResult(N);
        for (int i = 0; i < N; ++i) {
            pa[i] = static_cast<float>(i);
            pb[i] = static_cast<float>(i * 10);
        }

        // Device copies; kept NULL until allocated so the cleanup below is
        // safe after a partial failure.
        float *d_a = NULL, *d_b = NULL, *d_c = NULL;
        try {
            TestProductSpeed_check(cudaMalloc((void **)&d_a, size), "cudaMalloc(d_a)");
            TestProductSpeed_check(cudaMalloc((void **)&d_b, size), "cudaMalloc(d_b)");
            TestProductSpeed_check(cudaMalloc((void **)&d_c, size), "cudaMalloc(d_c)");

            // Copy inputs to device
            TestProductSpeed_check(cudaMemcpy(d_a, &pa[0], size, cudaMemcpyHostToDevice), "cudaMemcpy(d_a)");
            TestProductSpeed_check(cudaMemcpy(d_b, &pb[0], size, cudaMemcpyHostToDevice), "cudaMemcpy(d_b)");

            for (int threadNum = 1; threadNum <= 4096; threadNum *= 4) {
                try {
                    // Overwrite the output with an all-ones bit pattern (a NaN
                    // for float) so results left over from the previous launch
                    // can never pass verification.
                    TestProductSpeed_check(cudaMemset(d_c, 0xFF, size), "cudaMemset(d_c)");

                    TestProductSpeed_check(cudaEventRecord(start, 0), "cudaEventRecord(start)");
                    {
                        // Code section whose GPU time is measured
                        productArray_bt(d_a, d_b, d_c, N, threadNum);
                    }
                    TestProductSpeed_check(cudaEventRecord(stop, 0), "cudaEventRecord(stop)");
                    TestProductSpeed_check(cudaEventSynchronize(stop), "cudaEventSynchronize(stop)");

                    float costtime = 0.0f;  // milliseconds
                    TestProductSpeed_check(cudaEventElapsedTime(&costtime, start, stop), "cudaEventElapsedTime");
                    std::cout << "数组长度=" << N << ";\t" << "treadNum=" << threadNum << ";\t" << "点积用时:" << costtime / 1000 << "s" << std::endl;

                    // Copy result back to host
                    TestProductSpeed_check(cudaMemcpy(&pResult[0], d_c, size, cudaMemcpyDeviceToHost), "cudaMemcpy(pResult)");

                    // Verify the result at index 0 and at every power-of-two
                    // index (index 0 is included so that N == 1 is checked too).
                    for (int i = 0; i < N; i = (i == 0) ? 1 : i * 2) {
                        if (pResult[i] != pa[i] * pb[i])
                            std::cout << "错误: " << "i=" << i << ";\ti*10i=" << pResult[i] << std::endl;
                    }
                }
                catch (std::string s){
                    std::cout << "异常:" << s << std::endl;
                    std::cout << "\t" << "数组长度=" << N << ";\t" << "treadNum=" << threadNum << ";\t" << std::endl;
                }
                catch (...){
                    std::cout << "未知的异常类型" << std::endl;
                    std::cout << "\t\t" << "数组长度=" << N << ";\t" << "treadNum=" << threadNum << ";\t" << std::endl;
                }
                std::cout << std::endl;
            }
        }
        catch (const std::string &s) {
            // Allocation or input upload failed: skip this array length.
            std::cout << "异常:" << s << std::endl;
            std::cout << "\t" << "数组长度=" << N << ";\t" << std::endl;
        }

        // Cleanup (pointers that were never allocated are still NULL here)
        cudaFree(d_a);
        cudaFree(d_b);
        cudaFree(d_c);
    }

    cudaEventDestroy(start);
    cudaEventDestroy(stop);
}
/// Debug helper: prints the text of an expression, " = ", and its value on one
/// line, e.g. printExp(a + b) prints "a + b = 3".
/// The argument is parenthesized in the expansion so that expressions using
/// operators that bind more loosely than << (such as &, |, <, ?:) are evaluated
/// as a whole instead of being split by operator precedence.
/// The expansion ends with a semicolon, so `printExp(x)` is already a complete
/// statement (a following `;` just adds an empty statement).
#define printExp(x) std::cout<< #x <<" = "<< (x) <<std::endl;
/// Program entry point: prints the path of this source file as seen by the
/// compiler, runs the block/thread-count sweep, and exits with status 0.
int main(void) {
    const char *sourcePath = __FILE__;
    std::cout << sourcePath << std::endl;

    TestProductSpeed();

    return 0;
}
程序输出如下:
***/testThreadSpeed.cu
**************数组长度为 1 的测试
数组长度=1; treadNum=1; 点积用时:3.5904e-005s
数组长度=1; treadNum=4; 点积用时:0.000135648s
数组长度=1; treadNum=16; 点积用时:3.4592e-005s
数组长度=1; treadNum=64; 点积用时:3.2384e-005s
数组长度=1; treadNum=256; 点积用时:3.2928e-005s
数组长度=1; treadNum=1024; 点积用时:3.1296e-005s
异常:productArray_bt_kernel launch failed: invalid configuration argument
blockNum=1; threadNum=4096;
数组长度=1; treadNum=4096;
**************数组长度为 2 的测试
数组长度=2; treadNum=1; 点积用时:3.6448e-005s
数组长度=2; treadNum=4; 点积用时:3.3696e-005s
数组长度=2; treadNum=16; 点积用时:3.4368e-005s
数组长度=2; treadNum=64; 点积用时:2.9024e-005s
数组长度=2; treadNum=256; 点积用时:3.232e-005s
数组长度=2; treadNum=1024; 点积用时:3.2352e-005s
异常:productArray_bt_kernel launch failed: invalid configuration argument
blockNum=1; threadNum=4096;
数组长度=2; treadNum=4096;
**************数组长度为 4 的测试
数组长度=4; treadNum=1; 点积用时:3.6448e-005s
数组长度=4; treadNum=4; 点积用时:4.3488e-005s
数组长度=4; treadNum=16; 点积用时:3.248e-005s
数组长度=4; treadNum=64; 点积用时:3.3984e-005s
数组长度=4; treadNum=256; 点积用时:4.1952e-005s
数组长度=4; treadNum=1024; 点积用时:3.3632e-005s
异常:productArray_bt_kernel launch failed: invalid configuration argument
blockNum=1; threadNum=4096;
数组长度=4; treadNum=4096;
**************数组长度为 8 的测试
数组长度=8; treadNum=1; 点积用时:3.328e-005s
数组长度=8; treadNum=4; 点积用时:3.36e-005s
数组长度=8; treadNum=16; 点积用时:3.2032e-005s
数组长度=8; treadNum=64; 点积用时:3.2736e-005s
数组长度=8; treadNum=256; 点积用时:3.2416e-005s
数组长度=8; treadNum=1024; 点积用时:3.3376e-005s
异常:productArray_bt_kernel launch failed: invalid configuration argument
blockNum=1; threadNum=4096;
数组长度=8; treadNum=4096;
**************数组长度为 16 的测试
数组长度=16; treadNum=1; 点积用时:3.472e-005s
数组长度=16; treadNum=4; 点积用时:3.2224e-005s
数组长度=16; treadNum=16; 点积用时:3.2352e-005s
数组长度=16; treadNum=64; 点积用时:3.28e-005s
数组长度=16; treadNum=256; 点积用时:3.1648e-005s
数组长度=16; treadNum=1024; 点积用时:3.2672e-005s
异常:productArray_bt_kernel launch failed: invalid configuration argument
blockNum=1; threadNum=4096;
数组长度=16; treadNum=4096;
**************数组长度为 32 的测试
数组长度=32; treadNum=1; 点积用时:3.44e-005s
数组长度=32; treadNum=4; 点积用时:3.3472e-005s
数组长度=32; treadNum=16; 点积用时:3.1968e-005s
数组长度=32; treadNum=64; 点积用时:4.8256e-005s
数组长度=32; treadNum=256; 点积用时:3.4592e-005s
数组长度=32; treadNum=1024; 点积用时:3.3152e-005s
异常:productArray_bt_kernel launch failed: invalid configuration argument
blockNum=1; threadNum=4096;
数组长度=32; treadNum=4096;
**************数组长度为 64 的测试
数组长度=64; treadNum=1; 点积用时:3.5712e-005s
数组长度=64; treadNum=4; 点积用时:3.12e-005s
数组长度=64; treadNum=16; 点积用时:3.2352e-005s
数组长度=64; treadNum=64; 点积用时:3.1168e-005s
数组长度=64; treadNum=256; 点积用时:3.2064e-005s
数组长度=64; treadNum=1024; 点积用时:3.3376e-005s
异常:productArray_bt_kernel launch failed: invalid configuration argument
blockNum=1; threadNum=4096;
数组长度=64; treadNum=4096;
**************数组长度为 128 的测试
数组长度=128; treadNum=1; 点积用时:3.504e-005s
数组长度=128; treadNum=4; 点积用时:3.2544e-005s
数组长度=128; treadNum=16; 点积用时:3.1776e-005s
数组长度=128; treadNum=64; 点积用时:3.1744e-005s
数组长度=128; treadNum=256; 点积用时:3.1776e-005s
数组长度=128; treadNum=1024; 点积用时:3.1872e-005s
异常:productArray_bt_kernel launch failed: invalid configuration argument
blockNum=1; threadNum=4096;
数组长度=128; treadNum=4096;
**************数组长度为 256 的测试
数组长度=256; treadNum=1; 点积用时:3.5328e-005s
数组长度=256; treadNum=4; 点积用时:3.3408e-005s
数组长度=256; treadNum=16; 点积用时:3.456e-005s
数组长度=256; treadNum=64; 点积用时:3.3952e-005s
数组长度=256; treadNum=256; 点积用时:4.6336e-005s
数组长度=256; treadNum=1024; 点积用时:3.1776e-005s
异常:productArray_bt_kernel launch failed: invalid configuration argument
blockNum=1; threadNum=4096;
数组长度=256; treadNum=4096;
**************数组长度为 512 的测试
数组长度=512; treadNum=1; 点积用时:3.68e-005s
数组长度=512; treadNum=4; 点积用时:3.2064e-005s
数组长度=512; treadNum=16; 点积用时:3.2512e-005s
数组长度=512; treadNum=64; 点积用时:3.2736e-005s
数组长度=512; treadNum=256; 点积用时:3.136e-005s
数组长度=512; treadNum=1024; 点积用时:3.2128e-005s
异常:productArray_bt_kernel launch failed: invalid configuration argument
blockNum=1; threadNum=4096;
数组长度=512; treadNum=4096;
**************数组长度为 1024 的测试
数组长度=1024; treadNum=1; 点积用时:3.9552e-005s
数组长度=1024; treadNum=4; 点积用时:3.3568e-005s
数组长度=1024; treadNum=16; 点积用时:3.1712e-005s
数组长度=1024; treadNum=64; 点积用时:3.184e-005s
数组长度=1024; treadNum=256; 点积用时:3.5264e-005s
数组长度=1024; treadNum=1024; 点积用时:3.2544e-005s
异常:productArray_bt_kernel launch failed: invalid configuration argument
blockNum=1; threadNum=4096;
数组长度=1024; treadNum=4096;
**************数组长度为 2048 的测试
数组长度=2048; treadNum=1; 点积用时:4.7872e-005s
数组长度=2048; treadNum=4; 点积用时:3.4368e-005s
数组长度=2048; treadNum=16; 点积用时:3.2544e-005s
数组长度=2048; treadNum=64; 点积用时:5.648e-005s
数组长度=2048; treadNum=256; 点积用时:3.4336e-005s
数组长度=2048; treadNum=1024; 点积用时:3.5296e-005s
异常:productArray_bt_kernel launch failed: invalid configuration argument
blockNum=1; threadNum=4096;
数组长度=2048; treadNum=4096;
**************数组长度为 4096 的测试
数组长度=4096; treadNum=1; 点积用时:6.0384e-005s
数组长度=4096; treadNum=4; 点积用时:3.872e-005s
数组长度=4096; treadNum=16; 点积用时:3.328e-005s
数组长度=4096; treadNum=64; 点积用时:3.3344e-005s
数组长度=4096; treadNum=256; 点积用时:3.2832e-005s
数组长度=4096; treadNum=1024; 点积用时:3.3088e-005s
异常:productArray_bt_kernel launch failed: invalid configuration argument
blockNum=1; threadNum=4096;
数组长度=4096; treadNum=4096;
**************数组长度为 8192 的测试
数组长度=8192; treadNum=1; 点积用时:8.7168e-005s
数组长度=8192; treadNum=4; 点积用时:4.5664e-005s
数组长度=8192; treadNum=16; 点积用时:3.504e-005s
数组长度=8192; treadNum=64; 点积用时:3.2928e-005s
数组长度=8192; treadNum=256; 点积用时:3.2096e-005s
数组长度=8192; treadNum=1024; 点积用时:3.2128e-005s
异常:productArray_bt_kernel launch failed: invalid configuration argument
blockNum=2; threadNum=4096;
数组长度=8192; treadNum=4096;
**************数组长度为 16384 的测试
数组长度=16384; treadNum=1; 点积用时:0.000143616s
数组长度=16384; treadNum=4; 点积用时:6.08e-005s
数组长度=16384; treadNum=16; 点积用时:4.4512e-005s
数组长度=16384; treadNum=64; 点积用时:3.7184e-005s
数组长度=16384; treadNum=256; 点积用时:3.3088e-005s
数组长度=16384; treadNum=1024; 点积用时:3.2736e-005s
异常:productArray_bt_kernel launch failed: invalid configuration argument
blockNum=4; threadNum=4096;
数组长度=16384; treadNum=4096;
**************数组长度为 32768 的测试
数组长度=32768; treadNum=1; 点积用时:0.000344064s
数组长度=32768; treadNum=4; 点积用时:8.608e-005s
数组长度=32768; treadNum=16; 点积用时:4.4608e-005s
数组长度=32768; treadNum=64; 点积用时:3.568e-005s
数组长度=32768; treadNum=256; 点积用时:3.248e-005s
数组长度=32768; treadNum=1024; 点积用时:3.5552e-005s
异常:productArray_bt_kernel launch failed: invalid configuration argument
blockNum=8; threadNum=4096;
数组长度=32768; treadNum=4096;
**************数组长度为 65536 的测试
异常:productArray_bt_kernel launch failed: invalid argument
blockNum=65536; threadNum=1;
数组长度=65536; treadNum=1;
数组长度=65536; treadNum=4; 点积用时:0.000200448s
数组长度=65536; treadNum=16; 点积用时:0.000135808s
数组长度=65536; treadNum=64; 点积用时:0.00033584s
数组长度=65536; treadNum=256; 点积用时:0.0003568s
数组长度=65536; treadNum=1024; 点积用时:0.000324992s
异常:productArray_bt_kernel launch failed: invalid configuration argument
blockNum=16; threadNum=4096;
数组长度=65536; treadNum=4096;
**************数组长度为 131072 的测试
异常:productArray_bt_kernel launch failed: invalid argument
blockNum=131072; threadNum=1;
数组长度=131072; treadNum=1;
数组长度=131072; treadNum=4; 点积用时:0.00029168s
数组长度=131072; treadNum=16; 点积用时:0.00015808s
数组长度=131072; treadNum=64; 点积用时:0.000143424s
数组长度=131072; treadNum=256; 点积用时:4.2816e-005s
数组长度=131072; treadNum=1024; 点积用时:0.000152416s
异常:productArray_bt_kernel launch failed: invalid configuration argument
blockNum=32; threadNum=4096;
数组长度=131072; treadNum=4096;
**************数组长度为 262144 的测试
异常:productArray_bt_kernel launch failed: invalid argument
blockNum=262144; threadNum=1;
数组长度=262144; treadNum=1;
异常:productArray_bt_kernel launch failed: invalid argument
blockNum=65536; threadNum=4;
数组长度=262144; treadNum=4;
数组长度=262144; treadNum=16; 点积用时:0.000238848s
数组长度=262144; treadNum=64; 点积用时:0.000137728s
数组长度=262144; treadNum=256; 点积用时:0.000148288s
数组长度=262144; treadNum=1024; 点积用时:0.000140192s
异常:productArray_bt_kernel launch failed: invalid configuration argument
blockNum=64; threadNum=4096;
数组长度=262144; treadNum=4096;
**************数组长度为 524288 的测试
异常:productArray_bt_kernel launch failed: invalid argument
blockNum=524288; threadNum=1;
数组长度=524288; treadNum=1;
异常:productArray_bt_kernel launch failed: invalid argument
blockNum=131072; threadNum=4;
数组长度=524288; treadNum=4;
数组长度=524288; treadNum=16; 点积用时:0.000356736s
数组长度=524288; treadNum=64; 点积用时:0.00019056s
数组长度=524288; treadNum=256; 点积用时:0.000161248s
数组长度=524288; treadNum=1024; 点积用时:0.000157632s
异常:productArray_bt_kernel launch failed: invalid configuration argument
blockNum=128; threadNum=4096;
数组长度=524288; treadNum=4096;
**************数组长度为 1048576 的测试
异常:productArray_bt_kernel launch failed: invalid argument
blockNum=1048576; threadNum=1;
数组长度=1048576; treadNum=1;
异常:productArray_bt_kernel launch failed: invalid argument
blockNum=262144; threadNum=4;
数组长度=1048576; treadNum=4;
异常:productArray_bt_kernel launch failed: invalid argument
blockNum=65536; threadNum=16;
数组长度=1048576; treadNum=16;
数组长度=1048576; treadNum=64; 点积用时:0.000241312s
数组长度=1048576; treadNum=256; 点积用时:0.000206912s
数组长度=1048576; treadNum=1024; 点积用时:0.000214688s
异常:productArray_bt_kernel launch failed: invalid configuration argument
blockNum=256; threadNum=4096;
数组长度=1048576; treadNum=4096;