CUDA Learning Diary 10

1. __noinline__

Explanation: By default, the compiler inlines __device__ functions whenever it judges this to be beneficial. The __noinline__ qualifier can be used as a hint to the compiler not to inline a function if possible.
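
As a minimal sketch (the function names below are illustrative, not from any SDK sample), the qualifier is placed in front of a __device__ function definition:

__noinline__ __device__ float scale(float x, float s)
{
	// Hint to the compiler not to inline this helper into its callers.
	return x * s;
}

__global__ void scale_kernel(float *data, float s)
{
	int idx = blockIdx.x * blockDim.x + threadIdx.x;
	data[idx] = scale(data[idx], s);
}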


2. #pragma unroll 

Explanation: By default, the compiler unrolls small loops with a known trip count. #pragma unroll, placed immediately before a loop, specifies how many times that loop should be unrolled.
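
A brief illustrative sketch (the unroll factor of 4 is arbitrary); #pragma unroll 1 would instead prevent unrolling:

__global__ void sum4(float *out, const float *in)
{
	int idx = blockIdx.x * blockDim.x + threadIdx.x;
	float sum = 0.0f;
	// Ask the compiler to replicate the loop body 4 times.
#pragma unroll 4
	for (int i = 0; i < 4; i++)
	{
		sum += in[idx * 4 + i];
	}
	out[idx] = sum;
}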


3. CUDA built-in vector types

Explanation: char1, uchar1, char2, uchar2, char3, uchar3, char4, uchar4, short1, ushort1, short2, ushort2, short3, ushort3, short4, ushort4, int1, uint1, int2, uint2, int3, uint3, int4, uint4, long1, ulong1, long2, ulong2, long3, ulong3, long4, ulong4, float1, float2, float3, float4.
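
Each of these types exposes its components as the fields x, y, z, w (up to its width) and has a make_<type>() constructor function; a small hypothetical example:

__global__ void vec_demo(float4 *out, int n)
{
	int idx = blockIdx.x * blockDim.x + threadIdx.x;
	if (idx < n)
	{
		float4 v = make_float4(1.0f, 2.0f, 3.0f, 4.0f); // construct a 4-component vector
		v.w += (float)idx;                              // components are accessed as .x/.y/.z/.w
		out[idx] = v;
	}
}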


4. CUDA type conversion functions

Explanation:

(1) int __float2int_[rn,rz,ru,rd](float); converts a floating-point argument to a signed integer using the specified rounding mode.

(2) unsigned int __float2uint_[rn,rz,ru,rd](float); converts a floating-point argument to an unsigned integer using the specified rounding mode.

(3) float __int2float_[rn,rz,ru,rd](int); converts a signed integer argument to a floating-point value using the specified rounding mode.

(4) float __uint2float_[rn,rz,ru,rd](unsigned int); converts an unsigned integer argument to a floating-point value using the specified rounding mode.

Note: rn rounds to the nearest even value, rz rounds toward zero, ru rounds up (toward positive infinity), and rd rounds down (toward negative infinity).
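
A minimal device-side sketch (a hypothetical kernel, just to contrast the four modes on the same input):

__global__ void rounding_demo(int *out)
{
	float x = 2.5f;
	out[0] = __float2int_rn(x); // round to nearest even -> 2
	out[1] = __float2int_rz(x); // round toward zero     -> 2
	out[2] = __float2int_ru(x); // round up (+infinity)  -> 3
	out[3] = __float2int_rd(x); // round down (-infinity) -> 2
}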


5. asyncAPI.cu code walkthrough

#include <stdio.h>
#include <cuda_runtime.h>
#include <helper_cuda.h>
#include <helper_functions.h>

__global__ void increment_kernel(int *g_data, int inc_value)
{
	int idx = blockIdx.x * blockDim.x + threadIdx.x;
	g_data[idx] = g_data[idx] + inc_value;
}


int main(int argc, char *argv[])
{   
	int n = 16 * 1024 * 1024;
	int nbytes = n * sizeof(int);
	int value = 26;

	// allocate host memory
	int *a = 0;
	cudaMallocHost((void **)&a, nbytes);
	memset(a, 0, nbytes);

	// allocate device memory
	int *d_a = 0;
	cudaMalloc((void **)&d_a, nbytes);
	cudaMemset(d_a, 255, nbytes);

	// set kernel launch configuration
	dim3 threads = dim3(512, 1);
	dim3 blocks = dim3(n / threads.x, 1);

	// create cuda event handles
	cudaEvent_t start, stop;
	cudaEventCreate(&start);
	cudaEventCreate(&stop);

	// define time
	StopWatchInterface *timer = NULL;
	sdkCreateTimer(&timer);
	sdkResetTimer(&timer);

	cudaDeviceSynchronize();
	float gpu_time = 0.0f;

	// asynchronously issue work to the GPU (all to stream 0)
	sdkStartTimer(&timer);
	cudaEventRecord(start, 0);
	cudaMemcpyAsync(d_a, a, nbytes, cudaMemcpyHostToDevice, 0);
	increment_kernel<<<blocks, threads, 0, 0>>>(d_a, value);
	cudaMemcpyAsync(a, d_a, nbytes, cudaMemcpyDeviceToHost, 0);
	cudaEventRecord(stop, 0);
	sdkStopTimer(&timer);

	// have CPU do some work while waiting for stage 1 to finish
	unsigned long int counter = 0;
	while (cudaEventQuery(stop) == cudaErrorNotReady)
	{
		counter++;
	}

	cudaEventElapsedTime(&gpu_time, start, stop);

	// print the cpu and gpu times
	printf("time spent executing by the GPU: %.2f\n", gpu_time);
	printf("time spent by CPU in CUDA calls: %.2f\n", sdkGetTimerValue(&timer));
	printf("CPU executed %lu iterations while waiting for GPU to finish\n", counter);

	// release resources
	cudaEventDestroy(start);
	cudaEventDestroy(stop);
	cudaFreeHost(a);
	cudaFree(d_a);

	cudaDeviceReset();
}
Explanation:

(1) The event management API is used here mainly to record GPU progress, so that the CPU can determine whether the GPU has finished by querying the CUDA event.

(2) Functions that commonly execute asynchronously with respect to the host include kernel launches; memory copy functions with the Async suffix; device-to-device memory copy functions; and memory set functions such as cudaMemset(), cudaMemset2D(), and cudaMemset3D().
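
As a minimal sketch of (2), reusing the variables from the asyncAPI.cu listing above: each of the following calls returns control to the host immediately, and they execute in issue order within stream 0.

cudaMemset(d_a, 0, nbytes);                                 // memory set: asynchronous with respect to the host
increment_kernel<<<blocks, threads, 0, 0>>>(d_a, value);    // kernel launch: always asynchronous
cudaMemcpyAsync(a, d_a, nbytes, cudaMemcpyDeviceToHost, 0); // Async-suffixed copy into the pinned host buffer

// ... the CPU is free to do unrelated work here ...

cudaStreamSynchronize(0);                                   // block until stream 0 has drained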


6. Stream creation and initialization

Explanation:

cudaStream_t *streams = (cudaStream_t *)malloc(nstreams * sizeof(cudaStream_t));
for (int i = 0; i < nstreams; i++)
{
	checkCudaErrors(cudaStreamCreate(&(streams[i])));
}


7. simpleStreams.cu code walkthrough

#include <stdio.h>
#include <cuda_runtime.h>


__global__ void init_array(int *g_data, int *factor, int num_iterations)
{
	int idx = blockIdx.x * blockDim.x + threadIdx.x;
	for (int i = 0; i < num_iterations; i++)
	{
		g_data[idx] += *factor;   // repeatedly accumulate the factor
	}
}

int main(int argc, char *argv[])
{
	// ... (argument parsing, host/device allocation, stream and event
	//      creation, and the memory-copy timing are omitted in this excerpt) ...

	// time kernel execution alone for reference
	threads = dim3(512, 1);
	blocks = dim3(n / threads.x, 1);
	cudaEventRecord(start_event, 0);
	init_array<<<blocks, threads>>>(d_a, d_c, niterations);
	cudaEventRecord(stop_event, 0);
	cudaEventSynchronize(stop_event);
	cudaEventElapsedTime(&time_kernel, start_event, stop_event);
	printf("kernel:\t\t%.2f\n", time_kernel);
    
	// time non-streamed execution for reference
	threads = dim3(512, 1);
	blocks = dim3(n / threads.x, 1);
	cudaEventRecord(start_event, 0);
	for (int k = 0; k < nreps; k++)
	{
		init_array<<<blocks, threads>>>(d_a, d_c, niterations);
		cudaMemcpy(h_a, d_a, nbytes, cudaMemcpyDeviceToHost);
	}
	cudaEventRecord(stop_event, 0);
	cudaEventSynchronize(stop_event);
	cudaEventElapsedTime(&elapsed_time, start_event, stop_event);
	printf("non-streamed:\t%.2f\n", elapsed_time / nreps);

	// time execution with nstreams streams
	threads = dim3(512, 1);
	blocks = dim3(n / (nstreams*threads.x), 1);
	memset(h_a, 255, nbytes);     
	cudaMemset(d_a, 0, nbytes); 
	cudaEventRecord(start_event, 0);
	for (int k = 0; k < nreps; k++)
	{   // asynchronously launch nstreams kernels, one per stream
		for (int i = 0; i < nstreams; i++)
		{
			init_array<<<blocks, threads, 0, streams[i]>>>(d_a + i * n / nstreams, d_c, niterations);
		}
	    // asynchronously issue nstreams memcopies, one per stream
		for (int i = 0; i < nstreams; i++)
		{
			cudaMemcpyAsync(h_a + i * n / nstreams, d_a + i * n / nstreams, nbytes / nstreams, cudaMemcpyDeviceToHost, streams[i]);
		}
	}
	cudaEventRecord(stop_event, 0);
	cudaEventSynchronize(stop_event);
	cudaEventElapsedTime(&elapsed_time, start_event, stop_event);
	printf("%d streams:\t%.2f\n", nstreams, elapsed_time / nreps);
    
	// release resources
	for (int i = 0; i < nstreams; i++) { cudaStreamDestroy(streams[i]); }
	cudaEventDestroy(start_event);
	cudaEventDestroy(stop_event);
	cudaFreeHost(h_a);
	cudaFree(d_a);
	cudaFree(d_c);

	cudaDeviceReset();
}

Explanation: simpleStreams.cu creates the streams and events, then separately times the memory copy, the kernel execution alone, the non-streamed kernel-plus-copy execution, and the overall execution using nstreams streams. To make the timing more accurate, the measurements are repeated nreps times and the average is reported.


8. Streams in CUDA

Explanation: Within a given stream, operations execute in order; operations in different streams may execute out of order with respect to each other and may even run concurrently. A stream is used by creating a cudaStream_t object and passing it as an argument when launching kernels and performing memory copies: calls that pass the same stream object belong to the same stream, and calls that pass different objects belong to different streams.
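
A minimal sketch of this rule (the names h_a, h_b, d_a, d_b, the kernel k, and the launch configuration are illustrative; the host buffers are assumed to be pinned): the three operations within each stream run in issue order, while the two streams are independent of each other and may overlap.

cudaStream_t s0, s1;
cudaStreamCreate(&s0);
cudaStreamCreate(&s1);

// Stream s0: copy -> kernel -> copy, strictly in this order.
cudaMemcpyAsync(d_a, h_a, nbytes, cudaMemcpyHostToDevice, s0);
k<<<blocks, threads, 0, s0>>>(d_a);
cudaMemcpyAsync(h_a, d_a, nbytes, cudaMemcpyDeviceToHost, s0);

// Stream s1: the same sequence, independent of s0, may run concurrently with it.
cudaMemcpyAsync(d_b, h_b, nbytes, cudaMemcpyHostToDevice, s1);
k<<<blocks, threads, 0, s1>>>(d_b);
cudaMemcpyAsync(h_b, d_b, nbytes, cudaMemcpyDeviceToHost, s1);

cudaStreamSynchronize(s0);
cudaStreamSynchronize(s1);
cudaStreamDestroy(s0);
cudaStreamDestroy(s1);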


9. Tegra

Explanation: Tegra is NVIDIA's brand of general-purpose processors based on the ARM architecture (a CPU that NVIDIA calls a "computer on a chip"), designed to deliver a high-performance, low-power experience for portable devices.


