Table of Contents
Introduction
Runtime Environment
Algorithm Design and Optimization Ideas
1. Converting the RGB Color Space to the LAB Color Space
Optimization Ideas
Implementation
2. Computing Gradients and Finding Cluster Centers
Optimization Ideas
Implementation
3. Clustering Pixels in the Neighborhood
Optimization Ideas
Implementation
4. Updating Cluster Centers and Connecting Isolated Pixels
Optimization Ideas
Implementation
In this article, I will go further into the concrete steps of optimizing the SLIC superpixel algorithm with OpenMP. I hope you read to the end and come away with something useful.
Operating system: Windows 10, Ubuntu 20.04
Parallel programming model: OpenMP, AVX2
Compiler: g++ 8.1.0+
CPU (model, clock, cores): Intel Core i5, 1.6 GHz, 4 cores / 8 threads
Memory (type, size, clock): DDR4, 16 GB, 2400 MHz
Overall, the code can be divided into four parts: converting the RGB color space to the LAB color space, computing gradients and finding cluster centers, clustering pixels in the neighborhood, and updating cluster centers while connecting isolated pixels.
Before RGB can be converted to LAB, it must first pass through an RGB-to-XYZ conversion, and that step evaluates the same function on RGB channel values over and over. We therefore store the results in a lookup table, avoiding the repeated computation.
We also found that some values in the computation can be worked out by hand ahead of time, which further reduces the floating-point work (mainly multiplications and divisions).
In addition, we put OpenMP pragmas before the loops for multithreaded parallelization and vectorized processing of the data.
Building the lookup table:
// Build the sRGB-to-linear lookup table (one entry per 8-bit channel value)
#pragma omp simd
for (int i = 0; i < 11; ++i) {
tableRGB[i] = i * (1.0 / 3294.6);
}
#pragma omp simd
for (int i = 11; i < 256; ++i) {
tableRGB[i] = pow((i * (1.0 / 269.025) + 0.0521327014218009), 2.4);
}
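The split at index 11 and the two constants come straight from the standard sRGB transfer function: for a channel value c in [0, 255], the linearized value is (c/255)/12.92 when c/255 <= 0.04045 (i.e. c <= 10.31, hence indices 0 through 10), and ((c/255 + 0.055)/1.055)^2.4 otherwise. Folding the constants gives 255 * 12.92 = 3294.6, 255 * 1.055 = 269.025, and 0.055/1.055 ≈ 0.0521327014218009, which is exactly what the two loops use.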
Optimizing the floating-point arithmetic:
#pragma omp parallel for
....
double r0, g0, b0;
b0 = tableRGB[sRGB[0]];
g0 = tableRGB[sRGB[1]];
r0 = tableRGB[sRGB[2]];
double X, Y, Z;
X = r0 * 0.4124564 + g0 * 0.3575761 + b0 * 0.1804375;
Y = r0 * 0.2126729 + g0 * 0.7151522 + b0 * 0.0721750;
Z = r0 * 0.0193339 + g0 * 0.1191920 + b0 * 0.9503041;
// Slightly reworked here to optimize away the floating-point divisions (constants folded by hand)
double fx, fy, fz;
if (X > 0.008417238336) fx = cbrt(X * (1.0 / 0.950456));
else fx = (8.192982069151272 * X + 0.1379310344827586);
if (Y > 0.008856) fy = cbrt(Y);
else fy = (7.787068965517241 * Y + 0.1379310344827586);
if (Z > 0.009642005424) fz = cbrt(Z * (1.0 / 1.088754));
else fz = (7.152275872710678 * Z + 0.1379310344827586);
lab_val[2] = 116.0 * fy - 16.0;
lab_val[1] = 500.0 * (fx - fy);
lab_val[0] = 200.0 * (fy - fz);
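These constants are the standard CIELAB helper function with the white-point divisions folded in by hand. CIELAB uses f(t) = t^(1/3) for t > 0.008856 and f(t) = 7.787t + 16/116 otherwise, applied to X/Xn, Y and Z/Zn with the D65 white point Xn = 0.950456, Zn = 1.088754. Pre-multiplying through the comparison and the linear branch gives 0.008856 * 0.950456 = 0.008417238336 as the X threshold and 7.787068965517241 / 0.950456 ≈ 8.192982069151272 as the X slope; likewise 0.008856 * 1.088754 = 0.009642005424 and 7.787068965517241 / 1.088754 ≈ 7.152275872710678 for Z, with 16/116 = 0.1379310344827586. The divisions by Xn and Zn that remain inside cbrt are expressed as multiplications by precomputed reciprocals.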
Looking at how the initial seeds are generated, we found that the full edge information is not needed, so the global edges computation can be removed in favor of on-demand computation (only the gradients at the eight points around each initial cluster center are evaluated).
// Only compute gradients at the eight points around each initial cluster center
const int dx8[8] = {-1, -1, 0, 1, 1, 1, 0, -1};
const int dy8[8] = {0, -1, -1, -1, 0, 1, 1, 1};
int oind = oy * m_width + ox; // original cluster center
for (int j = 0; j < 8; j++) {
int nx = ox + dx8[j]; // new x
int ny = oy + dy8[j]; // new y
if (nx >= 0 && nx < m_width && ny >= 0 && ny < m_height) {
int nind = ny * m_width + nx;
double x0 = square(lab[oind * 3 - 3] - lab[oind * 3 + 3]) +
square(lab[oind * 3 - 2] - lab[oind * 3 + 4]) +
square(lab[oind * 3 - 1] - lab[oind * 3 + 5]);
double y0 = square(lab[oind * 3 - 3 * m_width] - lab[oind * 3 + 3 * m_width]) +
square(lab[oind * 3 - 3 * m_width + 1] - lab[oind * 3 + 3 * m_width + 1]) +
square(lab[oind * 3 - 3 * m_width + 2] - lab[oind * 3 + 3 * m_width + 2]);
double x1 = square(lab[nind * 3 - 3] - lab[nind * 3 + 3]) +
square(lab[nind * 3 - 2] - lab[nind * 3 + 4]) +
square(lab[nind * 3 - 1] - lab[nind * 3 + 5]);
double y1 = square(lab[nind * 3 - 3 * m_width] - lab[nind * 3 + 3 * m_width]) +
square(lab[nind * 3 - 3 * m_width + 1] - lab[nind * 3 + 3 * m_width + 1]) +
square(lab[nind * 3 - 3 * m_width + 2] - lab[nind * 3 + 3 * m_width + 2]);
if ((x1 + y1) < (x0 + y0)) {
oind = nind;
}
}
}
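The square() helper used above (and throughout the rest of the code) is not shown in this excerpt; assuming it is the obvious one, a minimal definition would be:
// Assumed definition of the square() helper used throughout.
static inline double square(double x) { return x * x; }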
Add an OpenMP parallel for before the y loop for multithreading, and interchange the y and n loop levels so the outer loop runs over image rows rather than seeds, which increases the amount of parallel work.
Remove the unnecessary storage of the disxy array, and simplify its computation by evaluating the spatial distance on the fly inside the loop.
Merge the loop that accumulates each pixel's l, a, b, x, y values into the loop above, improving locality and parallelism.
Use a merge (reduction) step so that code that previously could not run in parallel now does: each thread accumulates into its own buffers, which are combined afterwards.
Pay particular attention to the loops that execute size times: divisions performed size times are converted into divisions performed only N times.
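To put the last point in numbers: for a 2-megapixel image, dividing per pixel would cost about two million floating-point divisions per iteration, whereas dividing once per cluster costs only N of them (one reciprocal per seed, applied afterwards as multiplications), which is exactly what the inv and max_lab_div variables below do.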
void calculate_super_pixel() {
const int TIME = 10; // number of iterations
const int STEP = (int) (sqrt((double) (m_size) / (double) (K)) + 2.0);
int offset = STEP;
if (STEP < 10) offset = (int) (STEP * 1.5);
double inv_xy = 1.0 / (STEP * STEP);
int width = m_width;
int height = m_height;
double max_lab[N] __attribute__((aligned(32)));
double max_lab_div[N] __attribute__((aligned(32)));
double sigma[THREAD][N][5] __attribute__((aligned(32)));
double max_lab_t[THREAD][N] __attribute__((aligned(32)));
int cluster_size[THREAD][N] __attribute__((aligned(32)));
#pragma omp simd
for (int i = 0; i < N; ++i) {
max_lab[i] = 100;
max_lab_div[i] = 0.01;
}
// iterate TIME times
for (int t = 0; t < TIME; ++t) {
// zero out the per-thread accumulators
memset(cluster_size, 0, N * THREAD * sizeof(int));
memset(max_lab_t, 0, N * THREAD * sizeof(double));
memset(sigma, 0, 5 * N * THREAD * sizeof(double));
// visit every seed and, for each pixel in its window, keep the nearest one
#pragma omp parallel for default(none) shared(height, width, N, offset, t, seeds, lab, belong, inv_xy, max_lab_t, max_lab_div, sigma, cluster_size) schedule(guided, 10)
for (int y = 0; y < height; ++y) {
double dis_vec_y[width] __attribute__((aligned(32)));
double dis_lab_y[width] __attribute__((aligned(32)));
memset(dis_vec_y, MAX_CHAR_DOUBLE, width * sizeof(double));
const int y_index = y * width;
for (int n = 0; n < N; n++) {
double *seed = seeds + 5 * n;
if ((int) (seed[4] - offset) <= y && y < (int) (seed[4] + offset)) {
const int x1 = max(0, (int) (seed[3] - offset));
const int x2 = min(width, (int) (seed[3] + offset));
const double div_lab = max_lab_div[n];
for (int x = x1; x < x2; ++x) {
int i = y_index + x;
dis_lab_y[x] = square(lab[i * 3] - seed[0]) +
square(lab[i * 3 + 1] - seed[1]) +
square(lab[i * 3 + 2] - seed[2]);
}
for (int x = x1; x < x2; ++x) {
int i = y_index + x;
double temp_dis_xy = square(x - seed[3]) + square(y - seed[4]);
double dist = dis_lab_y[x] * div_lab + temp_dis_xy * inv_xy;
if (dist < dis_vec_y[x]) {
dis_vec_y[x] = dist;
belong[i] = n;
}
}
}
}
const int thread_num = omp_get_thread_num();
for (int x = 0; x < width; ++x) {
int i = width * y + x;
int k = belong[i];
if (max_lab_t[thread_num][k] < dis_lab_y[x]) {
max_lab_t[thread_num][k] = dis_lab_y[x];
}
#pragma omp simd
for (int j = 0; j < 3; ++j) {
sigma[thread_num][k][j] += lab[i * 3 + j];
}
sigma[thread_num][k][3] += x;
sigma[thread_num][k][4] += y;
cluster_size[thread_num][k]++;
}
}
// recompute the seed points
for (int k = 0; k < N; k++) {
int seed_size = 0;
double sigma_t[5] __attribute__((aligned(32))) = {0};
for (int i = 0; i < THREAD; ++i) {
#pragma omp simd
for (int j = 0; j < 5; ++j) {
sigma_t[j] += sigma[i][k][j];
}
if (max_lab[k] < max_lab_t[i][k]) {
max_lab[k] = max_lab_t[i][k];
}
seed_size += cluster_size[i][k];
}
if (seed_size == 0) seed_size = 1;
double inv = 1.0 / seed_size;
max_lab_div[k] = 1.0 / max_lab[k];
#pragma omp simd
for (int i = 0; i < 5; ++i) {
seeds[5 * k + i] = sigma_t[i] * inv;
}
}
}
}
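Two details of the implementation above are worth spelling out. First, sigma, max_lab_t and cluster_size form a manual reduction: each thread accumulates only into its own slice, and the sequential seed-recomputation loop merges the slices, so the hot loop needs no atomics. Second, the memset of dis_vec_y is a byte-fill trick: MAX_CHAR_DOUBLE is not defined in this excerpt, but filling every byte of a double with a large constant such as 0x7f yields a value around 1.38e306, which serves as "infinity" for the distance comparisons far more cheaply than assigning a sentinel element by element. A sketch of the assumed definition:
// Hypothetical definition: memset()-ing all 8 bytes of a double to 0x7f
// produces roughly 1.38e306, effectively +infinity for these comparisons.
#define MAX_CHAR_DOUBLE 0x7f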
Label the connected clusters directly with a parallel BFS, merging the per-thread BFS results as we go.
Then traverse sequentially, checking the size of each merged component and performing the merge, while obtaining global sequential labels that are then mapped back onto the new labels.
Labeling connected clusters with a parallel BFS and merging the results:
// Label with a parallel BFS directly and merge the results
#pragma omp parallel for num_threads(thread1) default(none) shared(thread_num, K0, N, Q, tempLabel, width, dx4, dy4, height, belong, P)
for (int id = 0; id < N; id++) {
int nowLabel = id;
int size = 0;
for (int tid = 0; tid < thread_num; tid++)
size += Q[tid][id].length;
int isOK = 0;
int *arr = (int *) malloc(size * sizeof(int));
while (isOK < size) {
int now = -1;
for (int tid = 0; tid < thread_num; tid++) {
for (int i = 0; i < Q[tid][id].length; i++) {
if (tempLabel[get(Q[tid][id], i)] == -1) {
now = get(Q[tid][id], i);
break;
}
}
}
int start = 0, finish = 0;
arr[finish++] = now;
tempLabel[now] = nowLabel;
const int oindex = now; // the component's seed pixel; kept thread-local (sharing oindex across threads was a data race)
while (start < finish) {
int k = arr[start++];
int x = k % width;
int y = k / width;
for (int i = 0; i < 4; i++) {
int xx = x + dx4[i];
int yy = y + dy4[i];
if ((xx >= 0 && xx < width) && (yy >= 0 && yy < height)) {
int nindex = yy * width + xx;
if (0 > tempLabel[nindex] && belong[oindex] == belong[nindex]) {
arr[finish++] = nindex;
tempLabel[nindex] = nowLabel;
}
}
}
}
cut(&P[nowLabel], finish);
for (int i = 0; i < finish; i++)
set(P[nowLabel], i, arr[i]);
isOK += finish;
nowLabel += K0;
}
free(arr);
}
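Q, P and the get/set/cut helpers belong to a small hand-rolled dynamic int-array type that is not shown in this excerpt. A minimal sketch consistent with how they are called (the struct layout and any names beyond those used above are my assumptions):
#include <stdlib.h>
// Assumed dynamic int-array type behind Q and P.
typedef struct {
    int *data;
    int length;
} IntVec;
static inline int get(IntVec v, int i) { return v.data[i]; }
static inline void set(IntVec v, int i, int val) { v.data[i] = val; }
// cut() resizes the array to hold exactly len elements.
static inline void cut(IntVec *v, int len) {
    v->data = (int *) realloc(v->data, len * sizeof(int));
    v->length = len;
}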
Sequentially traversing to obtain and map the new global labels:
// Sequentially traverse and assign the new global labels
int label = 0, adjlabel = 0; // assumed initialized before this excerpt; labels[] is assumed pre-filled with -1
oindex = 0; // reset the linear pixel index
for (int j = 0; j < height; j++) {
for (int k = 0; k < width; k++) {
if (0 > labels[oindex]) {
labels[oindex] = label;
int bel = tempLabel[oindex];
int count2 = P[bel].length;
if (count2 <= THRESHOLD) {
for (int n = 3; n >= 0; n--) {
int x = k + dx4[n];
int y = j + dy4[n];
if ((x >= 0 && x < width) && (y >= 0 && y < height)) {
int nindex = y * width + x;
if (labels[nindex] >= 0) {
adjlabel = labels[nindex];
break;
}
}
}
#pragma omp parallel for num_threads(thread0) default(none) shared(count2, labels, P, bel, adjlabel)
for (int c = 0; c < count2; c++) {
labels[get(P[bel], c)] = adjlabel;
}
label--;
} else {
#pragma omp parallel for num_threads(thread0) default(none) shared(count2, labels, P, bel, label)
for (int c = 0; c < count2; c++) {
labels[get(P[bel], c)] = label;
}
}
label++;
}
oindex++;
}
}
free(tempLabel);
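For context, THRESHOLD plays the same role as the small-segment cutoff in the reference SLIC implementation, which absorbs any connected component smaller than roughly a quarter of the average superpixel size into a neighboring label. Note how the label-- / label++ pair cancels out when a component is merged away, so the surviving labels remain consecutive.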