TBB 学习

主要摘录《结构化并行程序设计》（Structured Parallel Programming）中关于 TBB（现为 oneTBB）的一些知识和代码，并补充了一些其他内容。

安装

Intel TBB 安装可直接在 github 搜索 tbb，clone 下载后 cmake 安装即可，本文编写时（2021.7.12）tbb 已位于 oneapi 下：oneTBB，安装可参考：Intel TBB库+CMake+Ubuntu配置流程。

Debian/Ubuntu 系列可直接安装

sudo apt install libtbb-dev libtbb-doc libtbb2

oneTBB 中 example 目录里有不少参考代码，整个 oneapi 的参考 demo 可以参见 oneAPI-samples，其中包含使用 GPU+CPU 的 TBB 代码。此外，目标大一统的 oneapi 也支持 FPGA 等，值得关注。

测试代码如下：

#include <iostream>
#include <tbb/tbb.h>

// Smoke test for the TBB installation: runs the loop body for every index
// in [0, 100) through tbb::parallel_for and prints from each invocation.
int main() {
  // Arguments are (first, last, step, body).
  tbb::parallel_for(0, 100, 1, [](int i) {
    // These are two separate, unsynchronized writes to std::cout, so the
    // lines produced by different invocations may interleave.
    std::cout << i << std::endl;
    std::cout << "Hello World!" << std::endl;
  });

  return 0;
}

节选结果如下，可见输出是乱序的，并且不同任务的输出会相互交错：

Hello World!
28
Hello World!
66
Hello World!
98
Hello World!
Hello World!
Hello World!

Hello World!
Hello World!
60
Hello World!

SAXPY

SAXPY 的数学定义为

$$ \bm{y} \leftarrow a\bm{x}+\bm{y} $$

串行版本：

#include <cstddef>

// Serial SAXPY: updates y in place so that y[i] = a * x[i] + y[i]
// for every i in [0, n).
//
// n  number of elements read from x and updated in y
// a  scalar multiplier
// x  input vector (read only)
// y  input/output vector
void saxpy_serial(
    std::size_t n,
    float a,
    const float x[],
    float y[])
{
    for (std::size_t i = 0; i < n; ++i)
        y[i] = a * x[i] + y[i];
}

TBB 版本，分块得到的每个子区间 r 由独立任务处理。

#include 
#include 
void saxpy_tbb(
    size_t n,
    float a,
    const float x[],
    float y[])
{
    tbb::parallel_for(
        tbb::blocked_range(0, n),
        [&](tbb::blocked_range r) {
            for (size_t i = r.begin(); i != r.end(); ++i)
                y[i] = a * x[i] + y[i];
        });
}

点积

数学定义：

$$ \bm{a} \cdot \bm{b} = \sum_{i=0}^{n-1}{a_i b_i} $$

串行版本

#include <cstddef>

// Serial dot product of two float vectors.
//
// n     number of elements read from a and b
// a, b  input vectors (read only)
//
// Returns the sum of a[i] * b[i] for i in [0, n), accumulated in single
// precision from left to right; returns 0 when n is 0.
float sprod(
    std::size_t n,
    const float a[],
    const float b[])
{
    float res = 0.0f;
    for (std::size_t i = 0; i < n; ++i)
    {
        res += a[i] * b[i];
    }
    return res;
}

TBB 单精度版本

#include 
#include  // std::plus, std::divides
#include     // std::inner_product
#include 
float sprod_tbb(
    size_t n,
    const float a[],
    const float b[])
{
    return tbb::parallel_reduce(
        tbb::blocked_range(0, n),
        float(0),
        [=](
            tbb::blocked_range &r,
            float in) {
            return std::inner_product(a + r.begin(), a + r.end(), b + r.begin(), in);
        },
        std::plus());
}

快速排序

串行版本通常采用如下方法，来避免最坏情况下递归深度达到 N 层而导致栈溢出：在较小的子问题上递归，在较大的子问题上迭代。

TBB 版本：使用 task_group 实现。由于采用的是“窃取子任务”（child stealing）的语义，最坏情况下的空间开销为 O(n)。

// Parallel quicksort of [first, last) built on tbb::task_group.
//
// As long as the range is longer than QUICKSORT_CUTOFF it is split with
// divide(); the smaller part is handed to the task group and the larger part
// is handled by the next loop iteration. The short remainder is sorted with
// std::sort. The function returns only after waiting for every task it
// started.
void parallel_quicksort( T* first, T* last ) {
    tbb::task_group tasks;
    for(;;) {
        if( last-first<=QUICKSORT_CUTOFF )
            break;

        // Partition the current range.
        T* const pivot = divide(first,last);
        if( !pivot ) {
            // divide() returned a null pointer: nothing further is done with
            // this range. NOTE(review): presumably the range needs no more
            // sorting in that case - confirm against divide().
            tasks.wait();
            return;
        }

        // The two subproblems are [first..pivot) and [pivot+1..last).
        const bool left_is_smaller = pivot-first < last-(pivot+1);
        if( left_is_smaller ) {
            // Spawn the left part, keep the right part for the next iteration.
            tasks.run([first,pivot]{parallel_quicksort( first, pivot );});
            first = pivot+1;
        } else {
            // Spawn the right part, keep the left part for the next iteration.
            tasks.run([pivot,last]{parallel_quicksort( pivot+1, last );});
            last = pivot;
        }
    }

    // Base case: the remaining range is short enough for std::sort.
    std::sort(first,last);
    tasks.wait();
}

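解决方案是：在真正需要之前不生成新的子任务，即模拟“窃取后续任务”（continuation stealing）的语义。具体做法是采用后续传递风格（continuation-passing style）来编码，并使用低级接口 tbb::task——该接口的设计目的正是高效实现分治算法。注意：tbb::task 这一低级接口在 oneTBB 中已被移除，以下代码只适用于旧版 TBB。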

class quicksort_task: public tbb::task {
    /*override*/tbb::task* execute();
    T *first, *last;
    bool has_local_join;
    void prepare_self_as_stealable_continuation();
public:
    quicksort_task( T* first_, T* last_ ) : first(first_), last(last_), has_local_join(false) {}
};

// Arranges for this task object to be executed again, so that it acts as the
// continuation of its own current execute() call.
//
// On the first call only, it also allocates a tbb::empty_task through
// allocate_continuation(), sets that task's reference count to 1 and makes
// it this task's parent. has_local_join records that this was done so later
// calls skip the allocation.
void quicksort_task::prepare_self_as_stealable_continuation() {
    if( !has_local_join ) {
        task* local_join  = new( allocate_continuation() ) tbb::empty_task();
        // NOTE(review): the count of 1 presumably stands for this task
        // itself; children are added later in execute() with
        // allocate_additional_child_of() - confirm against the TBB task docs.
        local_join->set_ref_count(1);
        set_parent(local_join);
        has_local_join = true;
    }
    // Requested on every call, not just the first one.
    recycle_to_reexecute();
}

// One step of the continuation-passing quicksort on [first, last).
//
// Short ranges (at most QUICKSORT_CUTOFF elements) are sorted directly with
// std::sort. Longer ranges are split with divide(): a new quicksort_task is
// created for the smaller part and returned, while this task narrows its own
// range to the larger part after asking to be executed again.
//
// Returns the task for the smaller part, or NULL when this task is done.
tbb::task* quicksort_task::execute() {
    // Base case.
    if( last-first<=QUICKSORT_CUTOFF ) {
        std::sort(first,last);
        return NULL;
    }

    // Partition the current range.
    T* const pivot = divide(first,last);
    if( !pivot )
        return NULL;

    // The two subproblems are [first..pivot) and [pivot+1..last).
    // This task object becomes its own continuation before any child is
    // allocated, because the children are attached to *parent() below.
    prepare_self_as_stealable_continuation();

    tbb::task* child;
    if( pivot-first < last-(pivot+1) ) {
        // Left part is smaller: it goes to the child, the continuation keeps
        // the right part.
        child = new( allocate_additional_child_of(*parent()) ) quicksort_task( first, pivot );
        first = pivot+1;
    } else {
        // Right part is smaller: it goes to the child, the continuation
        // keeps the left part.
        child = new( allocate_additional_child_of(*parent()) ) quicksort_task( pivot+1, last );
        last = pivot;
    }

    // Hand the smaller subproblem back as the task to run next.
    return child;
}

// Entry point of the tbb::task based quicksort: sorts [first, last).
// Allocates a root quicksort_task for the whole range, spawns it and waits
// until it has completed.
void parallel_quicksort( T* first, T* last ) {
    tbb::task* const root = new( tbb::task::allocate_root() ) quicksort_task( first, last );
    tbb::task::spawn_root_and_wait(*root);
}

K-means

基础函数

// Two-dimensional point with single-precision coordinates.
// Kept as a plain aggregate (no constructors).
struct point {
    float x, y;

    // Adds the coordinates of `right` to this point, component by component.
    void operator+=( const point& right ) {
        x = x + right.x;
        y = y + right.y;
    }

    // Returns a new point whose coordinates are this point's coordinates
    // divided by `count` (converted to float); this point is not modified.
    point operator/( size_t count ) const {
        const float divisor = static_cast<float>(count);
        point scaled = { x / divisor, y / divisor };
        return scaled;
    }
};

// Squared Euclidean distance between a and b (no square root is taken).
inline float distance2( const point& a, const point& b ) {
    const float delta_x = a.x-b.x;
    const float delta_y = a.y-b.y;
    return delta_x*delta_x + delta_y*delta_y;
}

// Accumulator holding the coordinate-wise sum of a set of points together
// with the number of points that went into it.
struct sum_and_count {
    point sum;
    size_t count;

    // Starts with a value-initialized sum and a count of zero.
    sum_and_count() : sum(), count(0) {}

    // Resets the accumulator to its initial state.
    void clear() {
        count = 0;
        sum = point();
    }

    // Adds one point to the accumulator.
    void tally( const point& p ) {
        ++count;
        sum += p;
    }

    // Returns sum divided by count. No check is made for count == 0.
    point mean() const {
        return sum/count;
    }

    // Merges another accumulator into this one.
    void operator+=( const sum_and_count& other ) {
        count += other.count;
        sum += other.sum;
    }
};

void repair_empty_clusters( size_t n, const point points[], cluster_id id[], size_t k, point centroid[], sum_and_count sum[] ) {
    for( size_t j=0; jmaxd  ) {
                    maxd = d;
                    farthest = i;
                }
            }
#endif
            id[farthest] = j;
            sum[j].count = 1;
            sum[j].sum = points[farthest];
        } 
    }
}

TBB 配套函数

// Accumulation state used by the TBB k-means code: an array of k
// sum_and_count entries plus a counter named `change`.
// The array is allocated in the constructor and released in the destructor,
// so copying is disabled to keep a single owner of the allocation.
class view {
public:
    sum_and_count* array;   // k entries, owned by this object
    size_t change;

    // Allocates k default-constructed entries and sets change to 0.
    view( size_t k ) : array(new sum_and_count[k]), change(0) {}
    ~view() {delete[] array;}

    // Copying is not allowed; deleted functions turn any attempt into a
    // compile-time error.
    view( const view& v ) = delete;
    view& operator=( const view& v ) = delete;
};

// Container of per-thread view objects; local() on it yields a view&.
typedef tbb::enumerable_thread_specific<view> tls_type;

// Sets global.change to the total of the change counters of all views in
// tls, and resets each of those counters to 0.
void reduce_local_counts_to_global_count( tls_type& tls, view& global ) {
    global.change = 0;
    for( view& local : tls ) {
        global.change += local.change;
        local.change = 0;
    }
}

void reduce_local_sums_to_global_sum( size_t k, tls_type& tls, view& global ) {
    for( auto i=tls.begin(); i!=tls.end(); ++i ) {
        view& v = *i;
        for( size_t j=0; j

 
 TBB 实际函数 
 void compute_k_means( size_t n, const point points[], size_t k, cluster_id id[], point centroid[] ) {

    tls_type tls([&]{return k;}); 
    view global(k);

    // Create initial clusters and compute their sums.
    tbb::parallel_for(
        tbb::blocked_range(0,n),
        [=,&tls,&global]( tbb::blocked_range r ) {
            view& v = tls.local();
            for( size_t i=r.begin(); i!=r.end(); ++i ) {
                id[i] = i % k;  
                // Peeled "Sum step"
                v.array[id[i]].tally(points[i]);
            }
        }
    );

    // Loop until ids do not change
    size_t change;
    do {
        // Reduce local sums to global sum
        reduce_local_sums_to_global_sum( k, tls, global );

        // Repair any empty clusters
        repair_empty_clusters( n, points, id, k, centroid, global.array );

        // "Divide step": Compute centroids from global sums
        for( size_t j=0; j(0,n),
            [=,&tls,&global]( tbb::blocked_range r ) {
                view& v = tls.local();
                for( size_t i=r.begin(); i!=r.end(); ++i ) {
                    // "Reassign step": Find index of centroid closest to points[i]
                    cluster_id j = reduce_min_ind(centroid, k , points[i]); 
                    if( j!=id[i] ) {
                        id[i] = j;
                        ++v.change;
                    }
                    // "Sum step" 
                    v.array[j].tally(points[i]);
                }
            }
        );

        // Reduce local counts to global count
        reduce_local_counts_to_global_count( tls, global );
    } while( global.change!=0 );
} 
 注：来自书中的代码版权满足以下版权声明 
 
   Copyright (c) 2012 Michael McCool, Arch Robison, and James Reinders. 
  
All rights reserved.