CUDA(20)之优化之去冗余

摘要

本文主要讲述如何在CUDA代码中去除常见的冗余操作。

 

1. 未作优化的代码

 

#include 
#include 
#include 
#include 
#include 

/*
 * Baseline kernel for the "unoptimized" listing of this article.
 *
 * Each thread derives a flat index from the x components of blockIdx,
 * blockDim and threadIdx, then stores (index % 32) into its own slot of
 * d_ptr, repeating the identical store 100000 times.
 *
 * Parameters:
 *   d_ptr  - output array; slot i ends up holding i % 32.
 *   length - element count; threads whose index is >= length store nothing.
 *
 * The bounds check and the modulo are both re-evaluated on every iteration
 * in the source; that repeated work is what the later listing removes.
 */
__global__ void gpu(int *d_ptr, int length)
{
    const int kRepeats = 100000;
    const int gid = blockIdx.x * blockDim.x + threadIdx.x;

    int pass = 0;
    while (pass < kRepeats) {
        // Guard the tail: the grid may contain more threads than elements.
        if (gid < length) {
            d_ptr[gid] = gid % 32;
        }
        ++pass;
    }
}

void valid(int *h_ptr, int length){
    for (int elemID=0; elemID>>(d_ptr, N);
    checkCudaErrors(cudaGetLastError());
    checkCudaErrors(cudaDeviceSynchronize());

	// Finish timing
	cudaEventRecord(stop,0);    
	cudaEventSynchronize(start);    
	cudaEventSynchronize(stop);   
	cudaEventElapsedTime(&time_elapsed,start,stop);

	// Print
	printf("Time Used on GPU:%f(ms)\n",time_elapsed);

	// CPU (results for validate)
    valid(h_ptr, N);

    int *h_d_ptr;
    checkCudaErrors(cudaMallocHost(&h_d_ptr, N *sizeof(int)));
    checkCudaErrors(cudaMemcpy(h_d_ptr, d_ptr, N *sizeof(int), cudaMemcpyDeviceToHost));

    bool bValid = true;

    for (int i=0; i

 

 

2. 去冗余

本例中去除对 elemID % 32 的重复计算:将其提到循环之外,只计算一次并保存在局部变量 elemR 中,循环内直接使用该变量。

#include 
#include 
#include 
#include 
#include 


/*
 * Variant of the baseline kernel with the modulo hoisted out of the loop.
 *
 * Each thread derives a flat index from the x components of blockIdx,
 * blockDim and threadIdx, computes (index % 32) exactly once, and then
 * stores that saved value into its own slot of d_ptr 100000 times.
 *
 * Parameters:
 *   d_ptr  - output array; slot i ends up holding i % 32.
 *   length - element count; threads whose index is >= length store nothing.
 *
 * The bounds check is still evaluated on every iteration; only the modulo
 * is computed ahead of the loop.
 */
__global__ void gpu_register(int *d_ptr, int length)
{
    const int kRepeats = 100000;
    const int gid = blockIdx.x * blockDim.x + threadIdx.x;
    const int rem = gid % 32;  // computed once, reused by every iteration

    for (int pass = 0; pass != kRepeats; ++pass) {
        // Guard the tail: the grid may contain more threads than elements.
        if (gid < length) {
            d_ptr[gid] = rem;
        }
    }
}

void valid(int *h_ptr, int length){
    for (int elemID=0; elemID>>(d_ptr, N);
    checkCudaErrors(cudaGetLastError());
    checkCudaErrors(cudaDeviceSynchronize());

	// Finish timing
	cudaEventRecord(stop,0);    
	cudaEventSynchronize(start);    
	cudaEventSynchronize(stop);   
	cudaEventElapsedTime(&time_elapsed,start,stop);

	// Print
	printf("Time Used on GPU:%f(ms)\n",time_elapsed);

	// CPU (results for validate)
    valid(h_ptr, N);

    int *h_d_ptr;
    checkCudaErrors(cudaMallocHost(&h_d_ptr, N *sizeof(int)));
    checkCudaErrors(cudaMemcpy(h_d_ptr, d_ptr, N *sizeof(int), cudaMemcpyDeviceToHost));

    bool bValid = true;

    for (int i=0; i

 

 

3. 小结
去除不必要的冗余计算,本例中带来的优化是很可观的。

你可能感兴趣的:(GPU编程)