CUDA学习日记5

1. cudaDeviceReset

解析:销毁并清理当前进程中与当前设备相关联的所有资源,并重置该设备的状态。


2. CUDART_VERSION

解析:CUDA 7.5版本的CUDART_VERSION为7050,该宏定义在头文件cuda_runtime_api.h中,即#include <cuda_runtime_api.h>。


3. thrust::count

解析:thrust::count函数原型,如下所示:

template<typename InputIterator, typename EqualityComparable>
thrust::iterator_traits<InputIterator>::difference_type thrust::count (	
InputIterator first,
InputIterator last,
const EqualityComparable & value 
)

说明:count returns the number of iterators i in [first, last) such that *i == value.


4. transform_reduce

解析:transform_reduce函数原型,如下所示:

template<typename InputIterator, typename UnaryFunction, typename OutputType, typename BinaryFunction>
OutputType thrust::transform_reduce	(	InputIterator 	first,
InputIterator 	last,
UnaryFunction 	unary_op,
OutputType 	init,
BinaryFunction 	binary_op 
)		
举个例子,如下所示:
#include 
#include 
#include 
#include 
#include 
using namespace std;
using namespace thrust;

template 
struct square
{
	__host__ __device__
		T operator()(const T& x) const {
		return x*x;
	}
};

int main(void)
{
	float x[4] = { 1.0, 2.0, 3.0, 4.0 };
	device_vector d_x(x, x + 4);

	square unary_op;
	thrust::plus binary_op;
	float init = 10;

	float norm = thrust::transform_reduce(d_x.begin(), d_x.end(), unary_op, init, binary_op);

	cout << norm << endl;

	return 0;
}


5. Prefix-Sums:inclusive_scan和exclusive_scan 

解析:

#include 
#include 
#include 
using namespace std;
using namespace thrust;

int main(void)
{
	int data[6] = { 1, 0, 2, 2, 1, 3 };

	// data is now {1, 1, 3, 5, 6, 9}
	// data[2] = data[0] + data[1] + data[2]
	// thrust::inclusive_scan(data, data + 6, data);

	// data is now {0, 1, 1, 3, 5, 6}
	// data[2] = data[0] + data[1]
	thrust::exclusive_scan(data, data + 6, data);

	for (int i = 0; i < 6; i++)
	{
		cout << data[i] << endl;
	}
	return 0;
}


6. thrust::sort和thrust::stable_sort

解析:thrust::stable_sort函数原型,如下所示:

template<typename DerivedPolicy, typename RandomAccessIterator, typename StrictWeakOrdering>
__host__ __device__ void thrust::stable_sort (	
const thrust::detail::execution_policy_base< DerivedPolicy > & 	exec,
RandomAccessIterator 	first,
RandomAccessIterator 	last,
StrictWeakOrdering 	comp 
)	

(1)exec:The execution policy to use for parallelization.

(2)first:The beginning of the sequence.

(3)last:The end of the sequence.

(4)comp:Comparison operator.

举个例子,如下所示:

#include 
using namespace std;
using namespace thrust;

int main(void)
{
	const int N = 6;
	int A[N] = { 1, 4, 2, 8, 5, 7 };
	// A is now {1, 2, 4, 5, 7, 8}
	// thrust::sort(A, A + N);

	// A is now {1, 2, 4, 5, 7, 8}
	thrust::stable_sort(A, A + N);

	for (int i = 0; i < 6; i++)
	{
		cout << A[i] << endl;
	}
	return 0;
}

(1)#include <thrust/functional.h>:Function objects and tools for manipulating them.

(2)#include <thrust/execution_policy.h>:Thrust execution policies.


7. thrust::sort_by_key和thrust::stable_sort_by_key

解析:

#include 
using namespace std;
using namespace thrust;

int main(void)
{
	const int N = 6;
	int keys[N] = { 1, 4, 2, 8, 5, 7 };
	char values[N] = { 'a', 'b', 'c', 'd', 'e', 'f' };
	// keys is now { 1, 2, 4, 5, 7, 8} 
	// values is now {'a', 'c', 'b', 'e', 'f', 'd'}
	// thrust::sort_by_key(keys, keys + N, values);
	 
	// keys is now { 1, 2, 4, 5, 7, 8} 
	// values is now {'a', 'c', 'b', 'e', 'f', 'd'}
	thrust::stable_sort_by_key(keys, keys + N, values);

	for (int i = 0; i < 6; i++)
	{
		cout << values[i] << endl;
	}
	return 0;
}


8. Thrust中的Iterator

解析:

(1)constant_iterator

(2)counting_iterator

#include 
#include 
#include 
#include 
using namespace std;
using namespace thrust;

int main(void)
{
	thrust::constant_iterator first(10);
	thrust::constant_iterator last = first + 3;

	// returns 30 (i.e. 3 * 10)
	// thrust::reduce(first, last);

	// returns 33 (i.e. 10 + 11 + 12)  
	thrust::reduce(first, last); 

	cout << thrust::reduce(first, last) << endl;
	return 0;
}
(3)transform_iterator
#include 
#include 
#include 
using namespace std;
using namespace thrust;

int main(void)
{
	thrust::device_vector vec(3);
	vec[0] = 10;
	vec[1] = 20;
	vec[2] = 30;

	// returns -60 (i.e. -10 + -20 + -30)
	cout << thrust::reduce(thrust::make_transform_iterator(vec.begin(), thrust::negate()),
		thrust::make_transform_iterator(vec.end(), thrust::negate())) << endl;
	return 0;
}

(4)permutation_iterator

#include 
#include 
#include 
using namespace std;
using namespace thrust;

int main(void)
{
	thrust::device_vector map(4);
	map[0] = 3;
	map[1] = 1;
	map[2] = 0;
	map[3] = 5;

	thrust::device_vector source(6);
	source[0] = 10;
	source[1] = 20;
	source[2] = 30;
	source[3] = 40;
	source[4] = 50;
	source[5] = 60;
    
	// sum = source[map[0]] + source[map[1]] + ...
	int sum = thrust::reduce(thrust::make_permutation_iterator(source.begin(), map.begin()),
		thrust::make_permutation_iterator(source.begin(), map.end()));

	cout << sum << endl;
	return 0;
}
(5)zip_iterator
#include 
#include 
#include 
using namespace std;
using namespace thrust;

int main(void)
{
	thrust::device_vector A(3);
	thrust::device_vector B(3);
	A[0] = 10; A[1] = 20; A[2] = 30; 
	B[0] = 'x'; B[1] = 'y'; B[2] = 'z';
    
	thrust::maximum< thrust::tuple > binary_op;
	thrust::tuple init = thrust::make_zip_iterator(thrust::make_tuple(A.begin(), B.begin()))[0];
	thrust::tuple result = thrust::reduce(thrust::make_zip_iterator(thrust::make_tuple(A.begin(), B.begin())), thrust::make_zip_iterator(thrust::make_tuple(A.end(), B.end())), init, binary_op);

	cout << thrust::get<0>(result) << endl;
	cout << thrust::get<1>(result) << endl;
	return 0;
}


8. #include <stdlib.h>

解析:

(1)#define EXIT_SUCCESS 0

(2)#define EXIT_FAILURE 1


9. cuBLAS与CUBLASXT 

解析:在CUDA 6的开发包中,提供了一个新的API——CUBLASXT,它是在cuBLAS API的上层封装了一个矩阵分块算法,解决了当数据量大时显存不足的问题。


10. cuRAND库

解析:cuRAND库提供了通过GPU生成随机数的接口,使用时需包含头文件#include <curand.h>。


11. CUDA同步方式 

解析:在CUDA中,有两种方式实现同步,如下所示:

(1)System-level:等待所有host和device的工作完成。

(2)Block-level:等待device中block的所有thread执行到某个点。


参考文献:

[1] Thrust:http://docs.nvidia.com/cuda/thrust/index.html#axzz4aFPI7CYb 

你可能感兴趣的:(高性能计算)