主要摘录《结构化并行程序设计》(Structured Parallel Programming) 中关于 TBB(现为oneTBB) 的一些知识和代码,并添加了一些其他东西。
安装
Intel TBB 安装可直接在 github 搜索 tbb,clone 下载后 cmake 安装即可,本文编写时(2021.7.12)tbb 已位于 oneapi 下:oneTBB,安装可参考:Intel TBB库+CMake+Ubuntu配置流程。
Debian/Ubuntu 系列可直接安装
sudo apt install libtbb-dev libtbb-doc libtbb2
oneTBB 中 example 目录里有不少参考代码,整个 oneapi 的参考 demo 可以参见 oneAPI-samples,其中包含使用 GPU+CPU 的 TBB 代码。此外,目标大一统的 oneapi 也支持 FPGA 等,值得关注。
测试代码如下:
#include <iostream>
#include <tbb/tbb.h>
// Smoke test for the TBB installation: invokes the body for every index in
// [0, 100) with step 1. The two prints are separate statements, so lines
// coming from different indices may interleave in the output.
int main() {
    const auto print_index = [](int idx) {
        std::cout << idx << std::endl;
        std::cout << "Hello World!" << std::endl;
    };
    tbb::parallel_for(0, 100, 1, print_index);
    return 0;
}
节选结果如下,可见乱序输出
Hello World!
28
Hello World!
66
Hello World!
98
Hello World!
Hello World!
Hello World!
Hello World!
Hello World!
60
Hello World!
SAXPY
SAXPY 的数学定义为
$$ \bm{y} \leftarrow a\bm{x}+\bm{y} $$
串行版本:
#include <cstddef>
// Serial SAXPY: updates y in place so that y[i] = a * x[i] + y[i]
// for every i in [0, n).
//   n - number of elements to process
//   a - scalar multiplier
//   x - input vector (read only)
//   y - input/output vector
void saxpy_serial(
    size_t n,
    float a,
    const float x[],
    float y[])
{
    const float* src = x;
    float* dst = y;
    for (float* const stop = y + n; dst != stop; ++dst, ++src) {
        *dst = a * *src + *dst;
    }
}
TBB 版本,分块得到的每个子区间 r 由独立任务处理。
#include <cstddef>
#include <tbb/tbb.h>
void saxpy_tbb(
size_t n,
float a,
const float x[],
float y[])
{
tbb::parallel_for(
tbb::blocked_range(0, n),
[&](tbb::blocked_range r) {
for (size_t i = r.begin(); i != r.end(); ++i)
y[i] = a * x[i] + y[i];
});
}
点积
数学定义:
$$ \bm{a} \cdot \bm{b} = \sum_{i=0}^{n-1}{a_i b_i} $$
串行版本
#include <cstddef>
// Serial single-precision dot product: returns the sum of a[i] * b[i]
// over i in [0, n), accumulated left to right starting from 0.
//   n - number of elements in each vector
//   a - first input vector
//   b - second input vector
float sprod(
    size_t n,
    const float a[],
    const float b[])
{
    float acc = 0.0f;
    const float* lhs = a;
    const float* rhs = b;
    for (size_t remaining = n; remaining != 0; --remaining, ++lhs, ++rhs)
        acc += *lhs * *rhs;
    return acc;
}
TBB 单精度版本
#include <cstddef>
#include <functional> // std::plus, std::divides
#include <numeric> // std::inner_product
#include <tbb/tbb.h>
float sprod_tbb(
size_t n,
const float a[],
const float b[])
{
return tbb::parallel_reduce(
tbb::blocked_range(0, n),
float(0),
[=](
tbb::blocked_range &r,
float in) {
return std::inner_product(a + r.begin(), a + r.end(), b + r.begin(), in);
},
std::plus());
}
快速排序
串行版本默认采用如下方法,避免最坏情况下递归深度达到 N 层而导致栈溢出:在较小的子问题上递归,在较大的子问题上迭代。
TBB 版本:使用 task_group 实现。由于 TBB 采用窃取子任务(child stealing)的语义,最坏情况下的空间复杂度为 O(n)。
// Parallel quicksort of [first, last) built on tbb::task_group.
// While the range is longer than QUICKSORT_CUTOFF it is split with divide();
// the smaller half is handed to the task group and the larger half is
// processed by the next loop iteration. Ranges at or below the cutoff are
// sorted with std::sort. The function returns only after every task it
// started has finished.
void parallel_quicksort( T* first, T* last ) {
    tbb::task_group spawned;
    while( QUICKSORT_CUTOFF < last-first ) {
        T* const pivot = divide(first,last);
        if( !pivot ) {
            // divide() yielded no split point: just wait for outstanding tasks.
            spawned.wait();
            return;
        }
        // Two subproblems remain: [first..pivot) and [pivot+1..last).
        const bool left_is_smaller = pivot-first < last-(pivot+1);
        if( left_is_smaller ) {
            // Run the left part as a task, keep the right part for the loop.
            spawned.run([=]{parallel_quicksort( first, pivot );});
            first = pivot+1;
        } else {
            // Run the right part as a task, keep the left part for the loop.
            spawned.run([=]{parallel_quicksort( pivot+1, last );});
            last = pivot;
        }
    }
    // Base case: the remaining range is short enough for a serial sort.
    std::sort(first,last);
    spawned.wait();
}
解决方案是在真正需要之前不生成新的子任务,通过模拟“窃取后续任务”(continuation stealing)的语义来实现,即采用后续传递风格(continuation-passing style)编码,并使用低级接口 tbb::task,其设计目的就是高效实现分治算法。
class quicksort_task: public tbb::task {
/*override*/tbb::task* execute();
T *first, *last;
bool has_local_join;
void prepare_self_as_stealable_continuation();
public:
quicksort_task( T* first_, T* last_ ) : first(first_), last(last_), has_local_join(false) {}
};
// Prepares this task object to run again after execute() returns.
// On the first call a tbb::empty_task is allocated as a continuation and
// installed as this task's parent; later calls skip that step. Every call
// ends by recycling this task for re-execution.
void quicksort_task::prepare_self_as_stealable_continuation() {
if( !has_local_join ) {
// Create the join task only once per quicksort_task object.
task* local_join = new( allocate_continuation() ) tbb::empty_task();
// Reference count of 1 — presumably accounts for this task as the join's
// only child so far; verify against the tbb::task reference-count rules.
local_join->set_ref_count(1);
// From now on this task (and children allocated via parent()) hang off local_join.
set_parent(local_join);
has_local_join = true;
}
// Ask the scheduler to run this same object again instead of destroying it.
recycle_to_reexecute();
}
// Task body for sorting [first, last).
// Ranges of at most QUICKSORT_CUTOFF elements are sorted with std::sort.
// Larger ranges are split with divide(); this task object is recycled to
// handle the larger part, and a new quicksort_task for the smaller part is
// returned as the next task to run.
// Returns NULL when there is no specific task to run next.
tbb::task* quicksort_task::execute() {
if( last-first<=QUICKSORT_CUTOFF ) {
// Base case: short range, sort serially.
std::sort(first,last);
// Return NULL continuation
return NULL;
} else {
// Divide
T* middle = divide(first,last);
// divide() produced no split point, so nothing more is scheduled here.
// NOTE(review): confirm against divide() what a null result means.
if( !middle ) return NULL;
// Now have two subproblems: [first..middle) and [middle+1..last)
// Set up current task object as continuation of itself.
// This must happen before allocating the child below, because the child
// is allocated as an additional child of *parent(), which this call may change.
prepare_self_as_stealable_continuation();
// Now recurse on smaller subproblem.
tbb::task* smaller;
if( middle-first < last-(middle+1) ) {
// Left problem (first..middle) is smaller.
smaller = new( allocate_additional_child_of(*parent()) ) quicksort_task( first, middle );
// Continuation will do larger subproblem
first = middle+1;
} else {
// Right problem (middle..last) is smaller.
smaller = new( allocate_additional_child_of(*parent()) ) quicksort_task( middle+1, last );
// Continuation will do larger subproblem
last = middle;
}
// Dive into smaller subproblem
return smaller;
}
}
// Entry point of the tbb::task-based quicksort: sorts [first, last) by
// allocating a root quicksort_task for the whole range, spawning it and
// blocking until it has completed.
void parallel_quicksort( T* first, T* last ) {
    tbb::task* const root = new( tbb::task::allocate_root() ) quicksort_task( first, last );
    tbb::task::spawn_root_and_wait(*root);
}
K-means
基础函数
// 2-D point with single-precision coordinates.
struct point {
    float x, y;
    // Component-wise in-place addition of `right`.
    void operator+=( const point& right ) {
        x = x + right.x;
        y = y + right.y;
    }
    // Returns a copy of this point with both coordinates divided by `count`.
    point operator/( size_t count ) const {
        point scaled;
        scaled.x = x / count;
        scaled.y = y / count;
        return scaled;
    }
};
// Squared Euclidean distance between points a and b.
inline float distance2( const point& a, const point& b ) {
    const float delta_x = a.x-b.x;
    const float delta_y = a.y-b.y;
    return delta_x*delta_x+delta_y*delta_y;
}
// Running coordinate sum and number of points accumulated into it.
struct sum_and_count {
    point sum;      // Sum of all tallied points
    size_t count;   // Number of tallied points

    // Starts with a value-initialized sum and a count of zero.
    sum_and_count() : sum(), count(0) {}

    // Resets the accumulator to its freshly constructed state.
    void clear() {
        count = 0;
        sum = point();
    }
    // Adds one point to the accumulator.
    void tally( const point& p ) {
        count += 1;
        sum += p;
    }
    // Returns sum divided by count.
    point mean() const {
        return sum/count;
    }
    // Merges another accumulator into this one.
    void operator+=( const sum_and_count& other ) {
        count += other.count;
        sum += other.sum;
    }
};
void repair_empty_clusters( size_t n, const point points[], cluster_id id[], size_t k, point centroid[], sum_and_count sum[] ) {
for( size_t j=0; jmaxd ) {
maxd = d;
farthest = i;
}
}
#endif
id[farthest] = j;
sum[j].count = 1;
sum[j].sum = points[farthest];
}
}
}
TBB 配套函数
// Per-thread (or global) working state for k-means: an owned array of k
// sum_and_count accumulators plus a counter of changed assignments.
class view {
public:
    sum_and_count* array;   // k accumulators, allocated in the constructor
    size_t change;          // number of points whose cluster id changed

    // Allocates k default-constructed accumulators and zeroes the counter.
    view( size_t k ) : array(new sum_and_count[k]), change(0) {}
    // Releases the accumulator array.
    ~view() {delete[] array;}
private:
    // Declared but not defined: copying a view is not permitted.
    view( const view& v );
    void operator=( const view& v );
};
// Thread-local storage holding one `view` per worker thread.
typedef tbb::enumerable_thread_specific<view> tls_type;
// Sets global.change to the total of the `change` counters of all
// thread-local views in tls, resetting each local counter to zero.
void reduce_local_counts_to_global_count( tls_type& tls, view& global ) {
    global.change = 0;
    for( view& local : tls ) {
        global.change += local.change;
        local.change = 0;
    }
}
void reduce_local_sums_to_global_sum( size_t k, tls_type& tls, view& global ) {
for( auto i=tls.begin(); i!=tls.end(); ++i ) {
view& v = *i;
for( size_t j=0; j
TBB 实际函数
void compute_k_means( size_t n, const point points[], size_t k, cluster_id id[], point centroid[] ) {
tls_type tls([&]{return k;});
view global(k);
// Create initial clusters and compute their sums.
tbb::parallel_for(
tbb::blocked_range(0,n),
[=,&tls,&global]( tbb::blocked_range r ) {
view& v = tls.local();
for( size_t i=r.begin(); i!=r.end(); ++i ) {
id[i] = i % k;
// Peeled "Sum step"
v.array[id[i]].tally(points[i]);
}
}
);
// Loop until ids do not change
size_t change;
do {
// Reduce local sums to global sum
reduce_local_sums_to_global_sum( k, tls, global );
// Repair any empty clusters
repair_empty_clusters( n, points, id, k, centroid, global.array );
// "Divide step": Compute centroids from global sums
for( size_t j=0; j(0,n),
[=,&tls,&global]( tbb::blocked_range r ) {
view& v = tls.local();
for( size_t i=r.begin(); i!=r.end(); ++i ) {
// "Reassign step": Find index of centroid closest to points[i]
cluster_id j = reduce_min_ind(centroid, k , points[i]);
if( j!=id[i] ) {
id[i] = j;
++v.change;
}
// "Sum step"
v.array[j].tally(points[i]);
}
}
);
// Reduce local counts to global count
reduce_local_counts_to_global_count( tls, global );
} while( global.change!=0 );
}
注:来自书中的代码版权满足以下版权声明
Copyright (c) 2012 Michael McCool, Arch Robison, and James Reinders.
All rights reserved.