之前分析了 二维中值滤波器的并行加速
由于二维中值滤波器是控制密集型的滤波器(排序操作),所以SSE加速不太明显
这次选用了计算密集型的双边滤波器
针对双边滤波器在5*5的滤波核下的运算速度做优化和分析
以下会有主区域、全图、主循环、完整(初始化+主循环)的概念
1. 由于双边滤波的滤波半径为2+1,所以不能忽略图像四周边界的区域了。
所以,以下会对主区域和全图滤波做一个预算时间的对比。
2. 在快速算法中还做了查找表优化,所以滤波函数是个有状态滤波器,算法需要初始化。
以下会对主循环和包含初始化的运算时间做个对比
总结先写:
发现同样的版本 使用浮点比整形还要快。浮点的SSE并行并没有提速4倍。
目前分析是因为编译器自动做了优化,将浮点运算的速度提升了2-3倍。以至于比整形的版本还略快一点。
|
运算时间 |
旧版整型 主区域 主循环 |
8.766ms |
新版整型 全图 主循环 |
6.324ms |
Opencv 浮点 全图 主循环 |
5.713ms |
新版 浮点 全图 主循环 |
5.301ms |
SSE 浮点 全图 主循环 |
4.778ms |
omp浮点 全图 主循环 |
1.527ms |
SSE+omp浮点 全图 主循环 |
1.355ms |
并行算法优化分析
1. 整型双边滤波
|
运算时间 |
旧版整型 主区域 主循环 |
8.766ms |
新版整型 全图 主循环 |
6.324ms |
2. 整型、浮点型双边滤波
|
运算时间 |
新版 整型 全图 主循环 |
6.324ms |
新版 浮点 全图 主循环 |
5.301ms |
opencv浮点 全图 主循环 |
5.713ms |
3. 浮点型SSE双边滤波主区域、全图耗时
|
|
主循环 |
完整 |
浮点型 |
主区域 |
5.002ms |
5.078ms |
全图 |
5.198ms |
5.301ms |
|
浮点型SSE |
主区域 |
4.658ms |
4.764ms |
全图 |
4.778ms |
5.076ms |
4. 浮点型SSE omp双边滤波
|
|
主循环 |
浮点型 |
主区域 |
5.002ms |
全图 |
5.198ms |
|
浮点型SSE |
主区域 |
4.658ms |
全图 |
4.778ms |
|
浮点型omp |
主区域 |
1.434ms |
全图 |
1.527ms |
|
浮点型SSE+omp |
主区域 |
1.285ms |
全图 |
1.355ms |
一.重构了关于整型优化的双边滤波器
原先版本是对矩形区域做滤波的,现在改成了圆形区域。减少了近一半的计算量。
旧版的整型双边滤波主区域主循环耗时为8.766ms
新版的整型双边滤波全图主循环耗时为6.324ms
二.设计了浮点型优化的双边滤波器
浮点型双边滤波主区域主循环耗时为5.002ms
浮点型双边滤波主区域完整耗时为5.078ms
浮点型双边滤波全图主循环耗时为5.198ms
浮点型双边滤波全图完整耗时为5.301ms
Opencv浮点型双边滤波全图完整耗时为5.713ms
这里可以看出全图运算大概比主区域多耗时0.2ms
算法初始化耗时0.1ms
三.增加了SSE加速优化的双边滤波器
浮点型SSE加速双边滤波主区域主循环耗时为4.658ms
浮点型SSE加速双边滤波主区域完整耗时为4.764ms
浮点型SSE加速双边滤波全图主循环耗时为4.778ms
浮点型SSE加速双边滤波全图完整耗时为5.076ms
这里可以看出全图运算大概比主区域多耗时0.1ms
算法初始化耗时0.2ms
四.增加了omp加速优化的双边滤波
浮点型omp加速双边滤波主区域主循环耗时为1.434ms
浮点型omp加速双边滤波全图主循环耗时为1.527ms
这里可以看出全图运算大概比主区域多耗时0.1ms
五.增加了SSE+omp加速优化的双边滤波
浮点型SSE+omp加速双边滤波主区域主循环耗时为1.285ms
浮点型SSE+omp加速双边滤波全图主循环耗时为1.355ms
这里可以看出全图运算大概比主区域多耗时0.1ms
以下是具体的运算优化耗时
l opencv full 5.713ms
l mainBody mainLoop 5.002ms
l mainBody 5.078ms
l Full mainLoop5.198ms
l Full 5.301ms
l mainBody sse mainLoop 4.658ms
l mainBody sse 4.764ms
l Full sse mainLoop 4.778ms
l Full sse 5.076ms
l mainBody omp mainLoop 1.434ms
l Full omp mainLoop 1.527ms
l mainBody sse_omp mainLoop 1.285ms
l Full sse_omp mainLoop 1.355ms
l Int Full mainLoop 6.324ms
l INT old version mainBody mainLoop 8.766ms
具体代码如下
宏定义
因为整型的双边会有求和溢出风险,所以这里限制了滤波直径为11/半径5
#define MALLOC malloc
#define FREE(p) if (p != NULL) { free(p); p = NULL;}
#define ALIGN16 __declspec(align(16))
#define ALIGN_MALLOC16(n) _aligned_malloc(n, 16)
#define ALIGN_MALLOC32(n) _aligned_malloc(n, 32)
#define ALIGN_MALLOC64(n) _aligned_malloc(n, 64)
#define ALIGN_MALLOC128(n) _aligned_malloc(n, 128)
#define ALIGN_FREE(p) if (p != NULL) { _aligned_free(p); p = NULL;}
#define BF_INT_BITS (10)
#define BF_INT_SCALE (1 << BF_INT_BITS)
#define BF_INT_SHIFT ((S32)(((BF_INT_BITS + 1) << 1) - 31 + 16) + 7) //more 7 bits[>11*11]
#define BF_INT_BITS2 ((S32)((BF_INT_BITS << 1) - BF_INT_SHIFT))
#define BF_INT_SCALE2 (1 << BF_INT_BITS2)
#define BF_BUF_LEN (1024)
#define EDGEPRES_R_MAX (5)
类的定义
typedef class edgePresFiltMain
{
public:
edgePresFiltMain(S32 width, S32 height, S32 radius, F32 sigmaVal, F32 sigmaSp);
~edgePresFiltMain();
void edgePreserve_mainBody(U16 *src, U16 *dst);
void edgePreserve_mainBody_omp(U16 *src, U16 *dst);
void edgePreserve_mainBody_sse(U16 *src, U16 *dst);
void edgePreserve_mainBody_sse_omp(U16 *src, U16 *dst);
void *hdl;
}edgePresFiltMain_;
typedef class edgePresFilt
{
public:
edgePresFilt(S32 width, S32 height, S32 radius, F32 sigmaVal, F32 sigmaSp);
~edgePresFilt();
void edgePreserve(U16 *src, U16 *dst);
void edgePreserve_omp(U16 *src, U16 *dst);
void edgePreserve_sse(U16 *src, U16 *dst);
void edgePreserve_sse_omp(U16 *src, U16 *dst);
void *hdl;
}edgePresFilt_;
typedef class edgePresFiltInt
{
public:
edgePresFiltInt(S32 width, S32 height, S32 radius, F32 sigmaVal, F32 sigmaSp);
~edgePresFiltInt();
void edgePreserve(U16 *src, U16 *dst);
void edgePreserve_omp(U16 *src, U16 *dst);
void *hdl;
}edgePresFiltInt_;
纯C函数声明
// no smooth on the border
void edgePreserve_mainBody(U16 *src, U16 *dst, S32 width, S32 height,
S32 radius, F32 sigmaVal, F32 sigmaSp);
// smooth on the border
void edgePreserve(U16 *src, U16 *dst, S32 width, S32 height,
S32 radius, F32 sigmaVal, F32 sigmaSp);
// no smooth on the border
void edgePreserve_mainBody_sse(U16 *src, U16 *dst, S32 width, S32 height,
S32 radius, F32 sigmaVal, F32 sigmaSp);
// smooth on the border
void edgePreserve_sse(U16 *src, U16 *dst, S32 width, S32 height,
S32 radius, F32 sigmaVal, F32 sigmaSp);
// smooth on the border
void edgePreserveInt(U16 *src, U16 *dst, S32 width, S32 height,
S32 radius, F32 sigmaVal, F32 sigmaSp);
边界处理
static void borderReflect(U16 *src, S32 width, S32 height, U16 *dst, S32 radius)
{
S32 i = 0;
S32 j = 0;
S32 itmp1 = radius - 1;
S32 itmp2 = -1 - radius;
S32 width2 = width + (radius << 1);
U16 *psrc = src;
U16 *pdst = dst + width2 * radius;
for (i = 0; i < height; i++)
{
for (j = 0; j < radius; j++)
{
pdst[j] = psrc[itmp1 - j];
}
memcpy(pdst + radius, psrc, sizeof(U16) * width);
psrc += width;
pdst += width2;
for (j = -1; j >= -radius; j--)
{
pdst[j] = psrc[itmp2 - j];
}
}
psrc = pdst - width2;
for (i = 0; i < radius; i++)
{
memcpy(pdst, psrc, sizeof(U16) * width2);
psrc -= width2;
pdst += width2;
}
psrc = dst + width2 * radius;
pdst = dst + width2 * (radius - 1);
for (i = 0; i < radius; i++)
{
memcpy(pdst, psrc, sizeof(U16) * width2);
psrc += width2;
pdst -= width2;
}
}
static void borderCopy(U16 *src, U16 *dst, S32 width, S32 height, S32 radius)
{
S32 i = 0;
S32 j = 0;
S32 xend = height - (radius << 1);
U16 *psrc = src;
U16 *pdst = dst;
memcpy(pdst, psrc, sizeof(U16) * (width * radius - radius));
psrc += radius * width;
pdst += radius * width;
for (i = 0; i < xend; i++)
{
memcpy(pdst - radius, psrc - radius, sizeof(U16) * (radius << 1));
psrc += width;
pdst += width;
}
memcpy(pdst - radius, psrc - radius, sizeof(U16) * (width * radius + radius));
}
static void edgePreserve_LUT(S32 radius, S32 width, F32 sigmaVal, F32 sigmaSp,
S32 *spOfs, F32 *spWt, F32 *valWt)
{
S32 i = 0;
S32 j = 0;
S32 k = 0;
F32 sigmaValCoeff = -0.5f / (sigmaVal * sigmaVal);
F32 sigmaSpCoeff = -0.5f / (sigmaSp * sigmaSp);
for (i = -radius; i <= radius; i++)
{
for (j = -radius; j <= radius; j++)
{
if (sqrtf((i * i + j * j) * 1.f) <= radius)
{
if ((i == 0) && (j == 0))
{
continue;
}
spOfs[k] = i * width + j;
spWt[k] = expf((i * i + j * j) * sigmaSpCoeff);
k++;
}
}
}
for (i = 0; i < BF_BUF_LEN - 1; i++)
{
valWt[i] = expf(i * i * sigmaValCoeff);
}
valWt[i] = 0.f;
}
// Smooth main body of src to the dst img
// src img is bigger than dst img
static void edgePreserve_mainBody_process(U16 *src, S32 srcWidth, S32 srcHeight, S32 srcStep,
U16 *dst, S32 dstStep, S32 radius, S32 *spOfs, F32 *spWt, F32 *valWt, S32 maxk)
{
const S32 xEnd = srcWidth - radius;
const S32 dstHeight = srcHeight - (radius << 1);
S32 i = 0;
S32 j = 0;
S32 k = 0;
S32 n = 0;
U16 val0 = 0;
U16 val = 0;
S32 tmp = 0;
F32 w = 0.f;
F32 sum = 0.f;
F32 wsum = 0.f;
U16 *psrc = src + radius * srcStep;
U16 *pdst = dst;
for (i = 0; i < dstHeight; i++)
{
for (j = radius; j < xEnd; j++)
{
val0 = psrc[j];
sum = 0.f;
wsum = 0.f;
for (k = 0; k < maxk; k++)
{
val = psrc[j + spOfs[k]];
tmp = val - val0;
tmp = ABS(tmp);
tmp = MIN(tmp, BF_BUF_LEN - 1);
w = spWt[k] * valWt[tmp];
sum += val * w;
wsum += w;
}
pdst[j - radius] = (U16)((sum + val0) / (wsum + 1.f));
}
psrc += srcStep;
pdst += dstStep;
}
}
// Smooth main body of src to the dst img
// src img is bigger than dst img
static void edgePreserve_mainBody_omp_process(U16 *src, S32 srcWidth, S32 srcHeight, S32 srcStep,
U16 *dst, S32 dstStep, S32 radius, S32 *spOfs, F32 *spWt, F32 *valWt, S32 maxk)
{
const S32 xEnd = srcWidth - radius;
const S32 dstHeight = srcHeight - (radius << 1);
S32 i = 0;
#pragma omp parallel for
for (i = 0; i < dstHeight; i++)
{
S32 j = 0;
S32 k = 0;
U16 val0 = 0;
U16 val = 0;
S32 tmp = 0;
F32 w = 0.f;
F32 sum = 0.f;
F32 wsum = 0.f;
U16 *psrc = src + (radius + i) * srcStep;
U16 *pdst = dst + i * dstStep;
for (j = radius; j < xEnd; j++)
{
val0 = psrc[j];
sum = 0.f;
wsum = 0.f;
for (k = 0; k < maxk; k++)
{
val = psrc[j + spOfs[k]];
tmp = val - val0;
tmp = ABS(tmp);
tmp = MIN(tmp, BF_BUF_LEN - 1);
w = spWt[k] * valWt[tmp];
sum += val * w;
wsum += w;
}
pdst[j - radius] = (U16)((sum + val0) / (wsum + 1.f));
}
}
}
// Smooth main body of src to the dst img
// src img is bigger than dst img
static void edgePreserve_mainBody_sse_process(U16 *src, S32 srcWidth, S32 srcHeight, S32 srcStep,
U16 *dst, S32 dstStep, S32 radius, S32 *spOfs, F32 *spWt, F32 *valWt, S32 maxk)
{
const S32 xEnd = srcWidth - radius;
const S32 dstHeight = srcHeight - (radius << 1);
const U32 ALIGN16 bufSignMask[] = { 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff };
const F32 ALIGN16 bufLutLen[] = { BF_BUF_LEN - 1, BF_BUF_LEN - 1, BF_BUF_LEN - 1, BF_BUF_LEN - 1 };
S32 ALIGN16 buf[4];
F32 ALIGN16 bufSum[4];
S32 i = 0;
S32 j = 0;
S32 k = 0;
F32 val0 = 0.f;
F32 sum = 0.f;
F32 wsum = 0.f;
U16 *psrc = src + radius * srcStep;
U16 *pdst = dst;
__m128 _val;
__m128 _val0;
__m128 _idx;
__m128 _psum;
__m128 _sw;
__m128 _cw;
__m128 _w;
const __m128 _signMask = _mm_load_ps((const float*)bufSignMask);
const __m128 _lutLen = _mm_load_ps((const float*)bufLutLen);
for (i = 0; i < dstHeight; i++)
{
for (j = radius; j < xEnd; j++)
{
val0 = psrc[j] * 1.0f;
_psum = _mm_setzero_ps();
_val0 = _mm_set1_ps(val0);
for (k = 0; k <= maxk - 4; k += 4)
{
_val = _mm_set_ps(psrc[j + spOfs[k + 3]], psrc[j + spOfs[k + 2]],
psrc[j + spOfs[k + 1]], psrc[j + spOfs[k]]);
// _sw = _mm_loadu_ps(spWt + k);
_sw = _mm_load_ps(spWt + k);
_idx = _mm_and_ps(_signMask, _mm_sub_ps(_val, _val0));
_mm_store_si128((__m128i*)buf, _mm_cvtps_epi32(_mm_min_ps(_idx, _lutLen)));
_cw = _mm_set_ps(valWt[buf[3]], valWt[buf[2]], valWt[buf[1]], valWt[buf[0]]);
_w = _mm_mul_ps(_cw, _sw);
_val = _mm_mul_ps(_w, _val);
_sw = _mm_hadd_ps(_w, _val);
_sw = _mm_hadd_ps(_sw, _sw);
_psum = _mm_add_ps(_sw, _psum);
}
_mm_storel_pi((__m64*)bufSum, _psum);
sum = bufSum[1] + val0;
wsum = bufSum[0] + 1.f;
pdst[j - radius] = (U16)(sum / wsum);
}
psrc += srcStep;
pdst += dstStep;
}
}
// Smooth main body of src to the dst img
// src img is bigger than dst img
static void edgePreserve_mainBody_sse_omp_process(U16 *src, S32 srcWidth, S32 srcHeight, S32 srcStep,
U16 *dst, S32 dstStep, S32 radius, S32 *spOfs, F32 *spWt, F32 *valWt, S32 maxk)
{
const S32 xEnd = srcWidth - radius;
const S32 dstHeight = srcHeight - (radius << 1);
const U32 ALIGN16 bufSignMask[] = { 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff };
const F32 ALIGN16 bufLutLen[] = { BF_BUF_LEN - 1, BF_BUF_LEN - 1, BF_BUF_LEN - 1, BF_BUF_LEN - 1 };
const __m128 _signMask = _mm_load_ps((const float*)bufSignMask);
const __m128 _lutLen = _mm_load_ps((const float*)bufLutLen);
S32 i = 0;
#pragma omp parallel for
for (i = 0; i < dstHeight; i++)
{
S32 ALIGN16 buf[4];
F32 ALIGN16 bufSum[4];
S32 j = 0;
S32 k = 0;
F32 val0 = 0.f;
F32 sum = 0.f;
F32 wsum = 0.f;
U16 *psrc = src + (radius + i) * srcStep;
U16 *pdst = dst + i * dstStep;
__m128 _val;
__m128 _val0;
__m128 _idx;
__m128 _psum;
__m128 _sw;
__m128 _cw;
__m128 _w;
for (j = radius; j < xEnd; j++)
{
val0 = psrc[j] * 1.0f;
_psum = _mm_setzero_ps();
_val0 = _mm_set1_ps(val0);
for (k = 0; k <= maxk - 4; k += 4)
{
_val = _mm_set_ps(psrc[j + spOfs[k + 3]], psrc[j + spOfs[k + 2]],
psrc[j + spOfs[k + 1]], psrc[j + spOfs[k]]);
// _sw = _mm_loadu_ps(spWt + k);
_sw = _mm_load_ps(spWt + k);
_idx = _mm_and_ps(_signMask, _mm_sub_ps(_val, _val0));
_mm_store_si128((__m128i*)buf, _mm_cvtps_epi32(_mm_min_ps(_idx, _lutLen)));
_cw = _mm_set_ps(valWt[buf[3]], valWt[buf[2]], valWt[buf[1]], valWt[buf[0]]);
_w = _mm_mul_ps(_cw, _sw);
_val = _mm_mul_ps(_w, _val);
_sw = _mm_hadd_ps(_w, _val);
_sw = _mm_hadd_ps(_sw, _sw);
_psum = _mm_add_ps(_sw, _psum);
}
_mm_storel_pi((__m64*)bufSum, _psum);
sum = bufSum[1] + val0;
wsum = bufSum[0] + 1.f;
pdst[j - radius] = (U16)(sum / wsum);
}
}
}
// no smooth on the border
void edgePreserve_mainBody(U16 *src, U16 *dst, S32 width, S32 height,
S32 radius, F32 sigmaVal, F32 sigmaSp)
{
S32 i = 0;
S32 j = 0;
S32 maxk = -1; //exclude the center
S32 *spOfs = NULL;
F32 *spWt = NULL;
F32 *valWt = NULL;
for (i = -radius; i <= radius; i++)
{
for (j = -radius; j <= radius; j++)
{
if (sqrtf((i * i + j * j) * 1.f) <= radius)
{
maxk++;
}
}
}
spWt = (F32 *)ALIGN_MALLOC16(sizeof(F32) * maxk);
spOfs = (S32 *)MALLOC(sizeof(S32) * maxk);
valWt = (F32 *)MALLOC(sizeof(F32) * BF_BUF_LEN);
edgePreserve_LUT(radius, width, sigmaVal, sigmaSp, spOfs, spWt, valWt);
borderCopy(src, dst, width, height, radius);
edgePreserve_mainBody_process(src, width, height, width,
dst + width * radius + radius, width,
radius, spOfs, spWt, valWt, maxk);
ALIGN_FREE(spWt);
FREE(spOfs);
FREE(valWt);
}
// smooth on the border
void edgePreserve(U16 *src, U16 *dst, S32 width, S32 height,
S32 radius, F32 sigmaVal, F32 sigmaSp)
{
S32 i = 0;
S32 j = 0;
S32 maxk = -1; //exclude the center
S32 width2 = width + (radius << 1);
S32 height2 = height + (radius << 1);
S32 *spOfs = NULL;
F32 *spWt = NULL;
F32 *valWt = NULL;
U16 *buf = (U16 *)MALLOC(sizeof(U16) * width2 * height2);
borderReflect(src, width, height, buf, radius);
for (i = -radius; i <= radius; i++)
{
for (j = -radius; j <= radius; j++)
{
if (sqrtf((i * i + j * j) * 1.f) <= radius)
{
maxk++;
}
}
}
spWt = (F32 *)ALIGN_MALLOC16(sizeof(F32) * maxk);
spOfs = (S32 *)MALLOC(sizeof(S32) * maxk);
valWt = (F32 *)MALLOC(sizeof(F32) * BF_BUF_LEN);
edgePreserve_LUT(radius, width2, sigmaVal, sigmaSp, spOfs, spWt, valWt);
edgePreserve_mainBody_process(buf, width2, height2, width2,
dst, width, radius, spOfs, spWt, valWt, maxk);
ALIGN_FREE(spWt);
FREE(spOfs);
FREE(valWt);
FREE(buf);
}
// no smooth on the border
void edgePreserve_mainBody_sse(U16 *src, U16 *dst, S32 width, S32 height,
S32 radius, F32 sigmaVal, F32 sigmaSp)
{
S32 i = 0;
S32 j = 0;
S32 maxk = -1; //exclude the center
S32 *spOfs = NULL;
F32 *spWt = NULL;
F32 *valWt = NULL;
for (i = -radius; i <= radius; i++)
{
for (j = -radius; j <= radius; j++)
{
if (sqrtf((i * i + j * j) * 1.f) <= radius)
{
maxk++;
}
}
}
spWt = (F32 *)ALIGN_MALLOC16(sizeof(F32) * maxk);
spOfs = (S32 *)MALLOC(sizeof(S32) * maxk);
valWt = (F32 *)MALLOC(sizeof(F32) * BF_BUF_LEN);
edgePreserve_LUT(radius, width, sigmaVal, sigmaSp, spOfs, spWt, valWt);
borderCopy(src, dst, width, height, radius);
// edgePreserve_noBorder_sse_mainloop(src, dst, width, height, radius, spOfs, spWt, valWt, maxk);
edgePreserve_mainBody_sse_process(src, width, height, width,
dst + width * radius + radius, width,
radius, spOfs, spWt, valWt, maxk);
ALIGN_FREE(spWt);
FREE(spOfs);
FREE(valWt);
}
// smooth on the border
void edgePreserve_sse(U16 *src, U16 *dst, S32 width, S32 height,
S32 radius, F32 sigmaVal, F32 sigmaSp)
{
S32 i = 0;
S32 j = 0;
S32 maxk = -1; //exclude the center
S32 width2 = width + (radius << 1);
S32 height2 = height + (radius << 1);
S32 *spOfs = NULL;
F32 *spWt = NULL;
F32 *valWt = NULL;
U16 *buf = (U16 *)MALLOC(sizeof(U16) * width2 * height2);
borderReflect(src, width, height, buf, radius);
for (i = -radius; i <= radius; i++)
{
for (j = -radius; j <= radius; j++)
{
if (sqrtf((i * i + j * j) * 1.f) <= radius)
{
maxk++;
}
}
}
spWt = (F32 *)ALIGN_MALLOC16(sizeof(F32) * maxk);
spOfs = (S32 *)MALLOC(sizeof(S32) * maxk);
valWt = (F32 *)MALLOC(sizeof(F32) * BF_BUF_LEN);
edgePreserve_LUT(radius, width2, sigmaVal, sigmaSp, spOfs, spWt, valWt);
// edgePreserve_sse_mainloop(buf, dst, width, height, radius, spOfs, spWt, valWt, maxk);
edgePreserve_mainBody_sse_process(buf, width2, height2, width2,
dst, width, radius, spOfs, spWt, valWt, maxk);
ALIGN_FREE(spWt);
FREE(spOfs);
FREE(valWt);
FREE(buf);
}
typedef struct EDGE_PRES_FILT_HDL
{
S32 maxk;
S32 width;
S32 height;
S32 radius;
S32 *spOfs;
F32 *spWt;
F32 *valWt;
U16 *buf;
}EDGE_PRES_FILT_HDL_;
edgePresFiltMain::edgePresFiltMain(S32 width, S32 height, S32 radius, F32 sigmaVal, F32 sigmaSp)
{
S32 i = 0;
S32 j = 0;
S32 maxk = -1; //exclude the center
EDGE_PRES_FILT_HDL *pHdl = NULL;
pHdl = new EDGE_PRES_FILT_HDL;
hdl = pHdl;
for (i = -radius; i <= radius; i++)
{
for (j = -radius; j <= radius; j++)
{
if (sqrtf((i * i + j * j) * 1.f) <= radius)
{
maxk++;
}
}
}
pHdl->maxk = maxk;
pHdl->width = width;
pHdl->height = height;
pHdl->radius = radius;
pHdl->spWt = (F32 *)ALIGN_MALLOC16(sizeof(F32) * maxk);
pHdl->spOfs = (S32 *)MALLOC(sizeof(S32) * maxk);
pHdl->valWt = (F32 *)MALLOC(sizeof(F32) * BF_BUF_LEN);
edgePreserve_LUT(radius, width, sigmaVal, sigmaSp, pHdl->spOfs, pHdl->spWt, pHdl->valWt);
}
edgePresFiltMain::~edgePresFiltMain()
{
EDGE_PRES_FILT_HDL *pHdl = (EDGE_PRES_FILT_HDL *)hdl;
ALIGN_FREE(pHdl->spWt);
FREE(pHdl->spOfs);
FREE(pHdl->valWt);
delete hdl;
}
void edgePresFiltMain::edgePreserve_mainBody(U16 *src, U16 *dst)
{
EDGE_PRES_FILT_HDL *pHdl = (EDGE_PRES_FILT_HDL *)hdl;
S32 width = pHdl->width;
S32 height = pHdl->height;
S32 radius = pHdl->radius;
borderCopy(src, dst, width, height, radius);
edgePreserve_mainBody_process(src, width, height, width,
dst + width * radius + radius, width,
radius, pHdl->spOfs, pHdl->spWt, pHdl->valWt, pHdl->maxk);
}
void edgePresFiltMain::edgePreserve_mainBody_omp(U16 *src, U16 *dst)
{
EDGE_PRES_FILT_HDL *pHdl = (EDGE_PRES_FILT_HDL *)hdl;
S32 width = pHdl->width;
S32 height = pHdl->height;
S32 radius = pHdl->radius;
borderCopy(src, dst, width, height, radius);
edgePreserve_mainBody_omp_process(src, width, height, width,
dst + width * radius + radius, width,
radius, pHdl->spOfs, pHdl->spWt, pHdl->valWt, pHdl->maxk);
}
void edgePresFiltMain::edgePreserve_mainBody_sse(U16 *src, U16 *dst)
{
EDGE_PRES_FILT_HDL *pHdl = (EDGE_PRES_FILT_HDL *)hdl;
S32 width = pHdl->width;
S32 height = pHdl->height;
S32 radius = pHdl->radius;
borderCopy(src, dst, width, height, radius);
edgePreserve_mainBody_sse_process(src, width, height, width,
dst + width * radius + radius, width,
radius, pHdl->spOfs, pHdl->spWt, pHdl->valWt, pHdl->maxk);
}
void edgePresFiltMain::edgePreserve_mainBody_sse_omp(U16 *src, U16 *dst)
{
EDGE_PRES_FILT_HDL *pHdl = (EDGE_PRES_FILT_HDL *)hdl;
S32 width = pHdl->width;
S32 height = pHdl->height;
S32 radius = pHdl->radius;
borderCopy(src, dst, width, height, radius);
edgePreserve_mainBody_sse_omp_process(src, width, height, width,
dst + width * radius + radius, width,
radius, pHdl->spOfs, pHdl->spWt, pHdl->valWt, pHdl->maxk);
}
edgePresFilt::edgePresFilt(S32 width, S32 height, S32 radius, F32 sigmaVal, F32 sigmaSp)
{
S32 i = 0;
S32 j = 0;
S32 maxk = -1; //exclude the center
S32 width2 = width + (radius << 1);
S32 height2 = height + (radius << 1);
EDGE_PRES_FILT_HDL *pHdl = NULL;
pHdl = new EDGE_PRES_FILT_HDL;
hdl = pHdl;
for (i = -radius; i <= radius; i++)
{
for (j = -radius; j <= radius; j++)
{
if (sqrtf((i * i + j * j) * 1.f) <= radius)
{
maxk++;
}
}
}
pHdl->maxk = maxk;
pHdl->width = width;
pHdl->height = height;
pHdl->radius = radius;
pHdl->spWt = (F32 *)ALIGN_MALLOC16(sizeof(F32) * maxk);
pHdl->spOfs = (S32 *)MALLOC(sizeof(S32) * maxk);
pHdl->valWt = (F32 *)MALLOC(sizeof(F32) * BF_BUF_LEN);
pHdl->buf = (U16 *)MALLOC(sizeof(U16) * width2 * height2);
edgePreserve_LUT(radius, width2, sigmaVal, sigmaSp, pHdl->spOfs, pHdl->spWt, pHdl->valWt);
}
edgePresFilt::~edgePresFilt()
{
EDGE_PRES_FILT_HDL *pHdl = (EDGE_PRES_FILT_HDL *)hdl;
ALIGN_FREE(pHdl->spWt);
FREE(pHdl->spOfs);
FREE(pHdl->valWt);
FREE(pHdl->buf);
delete hdl;
}
void edgePresFilt::edgePreserve(U16 *src, U16 *dst)
{
EDGE_PRES_FILT_HDL *pHdl = (EDGE_PRES_FILT_HDL *)hdl;
S32 width = pHdl->width;
S32 height = pHdl->height;
S32 radius = pHdl->radius;
S32 width2 = width + (radius << 1);
S32 height2 = height + (radius << 1);
borderReflect(src, width, height, pHdl->buf, radius);
edgePreserve_mainBody_process(pHdl->buf, width2, height2, width2,
dst, width, radius, pHdl->spOfs, pHdl->spWt, pHdl->valWt, pHdl->maxk);
}
void edgePresFilt::edgePreserve_omp(U16 *src, U16 *dst)
{
EDGE_PRES_FILT_HDL *pHdl = (EDGE_PRES_FILT_HDL *)hdl;
S32 width = pHdl->width;
S32 height = pHdl->height;
S32 radius = pHdl->radius;
S32 width2 = width + (radius << 1);
S32 height2 = height + (radius << 1);
borderReflect(src, width, height, pHdl->buf, radius);
edgePreserve_mainBody_omp_process(pHdl->buf, width2, height2, width2,
dst, width, radius, pHdl->spOfs, pHdl->spWt, pHdl->valWt, pHdl->maxk);
}
void edgePresFilt::edgePreserve_sse(U16 *src, U16 *dst)
{
EDGE_PRES_FILT_HDL *pHdl = (EDGE_PRES_FILT_HDL *)hdl;
S32 width = pHdl->width;
S32 height = pHdl->height;
S32 radius = pHdl->radius;
S32 width2 = width + (radius << 1);
S32 height2 = height + (radius << 1);
borderReflect(src, width, height, pHdl->buf, radius);
edgePreserve_mainBody_sse_process(pHdl->buf, width2, height2, width2,
dst, width, radius, pHdl->spOfs, pHdl->spWt, pHdl->valWt, pHdl->maxk);
}
void edgePresFilt::edgePreserve_sse_omp(U16 *src, U16 *dst)
{
EDGE_PRES_FILT_HDL *pHdl = (EDGE_PRES_FILT_HDL *)hdl;
S32 width = pHdl->width;
S32 height = pHdl->height;
S32 radius = pHdl->radius;
S32 width2 = width + (radius << 1);
S32 height2 = height + (radius << 1);
borderReflect(src, width, height, pHdl->buf, radius);
edgePreserve_mainBody_sse_omp_process(pHdl->buf, width2, height2, width2,
dst, width, radius, pHdl->spOfs, pHdl->spWt, pHdl->valWt, pHdl->maxk);
}
//////////////////////////////////////////////////////////////////////////
static void edgePreserveInt_LUT(S32 radius, S32 width, F32 sigmaVal, F32 sigmaSp,
S32 *spOfs, S32 *spWt, S32 *valWt)
{
S32 i = 0;
S32 j = 0;
S32 k = 0;
F32 sigmaValCoeff = -0.5f / (sigmaVal * sigmaVal);
F32 sigmaSpCoeff = -0.5f / (sigmaSp * sigmaSp);
for (i = -radius; i <= radius; i++)
{
for (j = -radius; j <= radius; j++)
{
if (sqrtf((i * i + j * j) * 1.f) <= radius)
{
if ((i == 0) && (j == 0))
{
continue;
}
spOfs[k] = i * width + j;
spWt[k] = (S32)(expf((i * i + j * j) * sigmaSpCoeff) * BF_INT_SCALE);
k++;
}
}
}
for (i = 0; i < BF_BUF_LEN - 1; i++)
{
valWt[i] = (S32)(expf(i * i * sigmaValCoeff) * BF_INT_SCALE);
}
valWt[i] = 0;
}
// Smooth main body of src to the dst img
// src img is bigger than dst img
static void edgePreserveInt_mainBody_process(U16 *src, S32 srcWidth, S32 srcHeight, S32 srcStep,
U16 *dst, S32 dstStep, S32 radius, S32 *spOfs, S32 *spWt, S32 *valWt, S32 maxk)
{
const S32 xEnd = srcWidth - radius;
const S32 dstHeight = srcHeight - (radius << 1);
S32 i = 0;
S32 j = 0;
S32 k = 0;
S32 n = 0;
U16 val0 = 0;
U16 val = 0;
S32 tmp = 0;
S32 w = 0;
S32 sum = 0;
S32 wsum = 0;
U16 *psrc = src + radius * srcStep;
U16 *pdst = dst;
for (i = 0; i < dstHeight; i++)
{
for (j = radius; j < xEnd; j++)
{
val0 = psrc[j];
sum = 0;
wsum = 0;
for (k = 0; k < maxk; k++)
{
val = psrc[j + spOfs[k]];
tmp = val - val0;
tmp = ABS(tmp);
tmp = MIN(tmp, BF_BUF_LEN - 1);
w = spWt[k] * valWt[tmp];
w >>= BF_INT_SHIFT;
wsum += w;
sum += (val * w);
}
pdst[j - radius] = (U16)((sum + (val0 << BF_INT_BITS2)) / (wsum + BF_INT_SCALE2));
}
psrc += srcStep;
pdst += dstStep;
}
}
// smooth on the border
void edgePreserveInt(U16 *src, U16 *dst, S32 width, S32 height,
S32 radius_, F32 sigmaVal, F32 sigmaSp)
{
S32 i = 0;
S32 j = 0;
S32 maxk = -1; //exclude the center
S32 radius = GP_MIN(radius_, GP_EDGEPRES_R_MAX);
S32 width2 = width + (radius << 1);
S32 height2 = height + (radius << 1);
S32 *spOfs = NULL;
S32 *spWt = NULL;
S32 *valWt = NULL;
U16 *buf = (U16 *)MALLOC(sizeof(U16) * width2 * height2);
borderReflect(src, width, height, buf, radius);
for (i = -radius; i <= radius; i++)
{
for (j = -radius; j <= radius; j++)
{
if (sqrtf((i * i + j * j) * 1.f) <= radius)
{
maxk++;
}
}
}
spWt = (S32 *)ALIGN_MALLOC16(sizeof(S32) * maxk);
spOfs = (S32 *)MALLOC(sizeof(S32) * maxk);
valWt = (S32 *)MALLOC(sizeof(S32) * BF_BUF_LEN);
edgePreserveInt_LUT(radius, width2, sigmaVal, sigmaSp, spOfs, spWt, valWt);
edgePreserveInt_mainBody_process(buf, width2, height2, width2,
dst, width, radius, spOfs, spWt, valWt, maxk);
ALIGN_FREE(spWt);
FREE(spOfs);
FREE(valWt);
FREE(buf);
}
typedef struct EDGE_PRES_FILT_INT_HDL
{
S32 maxk;
S32 width;
S32 height;
S32 radius;
S32 *spOfs;
S32 *spWt;
S32 *valWt;
U16 *buf;
}EDGE_PRES_FILT_INT_HDL_;
edgePresFiltInt::edgePresFiltInt(S32 width, S32 height, S32 radius_, F32 sigmaVal, F32 sigmaSp)
{
S32 i = 0;
S32 j = 0;
S32 maxk = -1; //exclude the center
S32 radius = GP_MIN(radius_, GP_EDGEPRES_R_MAX);
S32 width2 = width + (radius << 1);
S32 height2 = height + (radius << 1);
EDGE_PRES_FILT_INT_HDL *pHdl = NULL;
pHdl = new EDGE_PRES_FILT_INT_HDL;
hdl = pHdl;
for (i = -radius; i <= radius; i++)
{
for (j = -radius; j <= radius; j++)
{
if (sqrtf((i * i + j * j) * 1.f) <= radius)
{
maxk++;
}
}
}
pHdl->maxk = maxk;
pHdl->width = width;
pHdl->height = height;
pHdl->radius = radius;
pHdl->spWt = (S32 *)ALIGN_MALLOC16(sizeof(S32) * maxk);
pHdl->spOfs = (S32 *)MALLOC(sizeof(S32) * maxk);
pHdl->valWt = (S32 *)MALLOC(sizeof(S32) * BF_BUF_LEN);
pHdl->buf = (U16 *)MALLOC(sizeof(U16) * width2 * height2);
edgePreserveInt_LUT(radius, width2, sigmaVal, sigmaSp, pHdl->spOfs, pHdl->spWt, pHdl->valWt);
}
edgePresFiltInt::~edgePresFiltInt()
{
EDGE_PRES_FILT_INT_HDL *pHdl = (EDGE_PRES_FILT_INT_HDL *)hdl;
ALIGN_FREE(pHdl->spWt);
FREE(pHdl->spOfs);
FREE(pHdl->valWt);
FREE(pHdl->buf);
delete hdl;
}
void edgePresFiltInt::edgePreserve(U16 *src, U16 *dst)
{
EDGE_PRES_FILT_INT_HDL *pHdl = (EDGE_PRES_FILT_INT_HDL *)hdl;
S32 width = pHdl->width;
S32 height = pHdl->height;
S32 radius = pHdl->radius;
S32 width2 = width + (radius << 1);
S32 height2 = height + (radius << 1);
borderReflect(src, width, height, pHdl->buf, radius);
edgePreserveInt_mainBody_process(pHdl->buf, width2, height2, width2,
dst, width, radius, pHdl->spOfs, pHdl->spWt, pHdl->valWt, pHdl->maxk);
}