之前分析了 二维中值滤波器的并行加速
1. 由于双边滤波的滤波半径为2+1,所以不能忽略图像四周边界的区域了。
2. 在快速算法中还做了查找表优化,所以滤波函数是个有状态滤波器,算法需要初始化。
发现同样的版本,使用浮点比整型还要快。浮点的SSE并行并没有提速4倍。
运算时间 |
旧版整型 主区域 主循环 |
8.766ms |
新版整型 全图 主循环 |
6.324ms |
Opencv 浮点 全图 主循环 |
5.713ms |
新版 浮点 全图 主循环 |
5.301ms |
SSE 浮点 全图 主循环 |
4.778ms |
omp浮点 全图 主循环 |
1.527ms |
SSE+omp浮点 全图 主循环 |
1.355ms |
1. 整型双边滤波
运算时间 |
旧版整型 主区域 主循环 |
8.766ms |
新版整型 全图 主循环 |
6.324ms |
2. 整型、浮点型双边滤波
运算时间 |
新版 整型 全图 主循环 |
6.324ms |
新版 浮点 全图 主循环 |
5.301ms |
opencv浮点 全图 主循环 |
5.713ms |
3. 浮点型SSE双边滤波主区域、全图耗时
主循环 |
完整 |
浮点型 |
主区域 |
5.002ms |
5.078ms |
全图 |
5.198ms |
5.301ms |
浮点型SSE |
主区域 |
4.658ms |
4.764ms |
全图 |
4.778ms |
5.076ms |
4. 浮点型SSE omp双边滤波
主循环 |
浮点型 |
主区域 |
5.002ms |
全图 |
5.198ms |
浮点型SSE |
主区域 |
4.658ms |
全图 |
4.778ms |
浮点型omp |
主区域 |
1.434ms |
全图 |
1.527ms |
浮点型SSE+omp |
主区域 |
1.285ms |
全图 |
1.355ms |
- opencv full 5.713ms
- mainBody mainLoop 5.002ms
- mainBody 5.078ms
- Full mainLoop 5.198ms
- Full 5.301ms
- mainBody sse mainLoop 4.658ms
- mainBody sse 4.764ms
- Full sse mainLoop 4.778ms
- Full sse 5.076ms
- mainBody omp mainLoop 1.434ms
- Full omp mainLoop 1.527ms
- mainBody sse_omp mainLoop 1.285ms
- Full sse_omp mainLoop 1.355ms
- Int Full mainLoop 6.324ms
- INT old version mainBody mainLoop 8.766ms
#define MALLOC malloc
#define FREE(p) if (p != NULL) { free(p); p = NULL;}
#define ALIGN16 __declspec(align(16))
#define ALIGN_MALLOC16(n) _aligned_malloc(n, 16)
#define ALIGN_MALLOC32(n) _aligned_malloc(n, 32)
#define ALIGN_MALLOC64(n) _aligned_malloc(n, 64)
#define ALIGN_MALLOC128(n) _aligned_malloc(n, 128)
#define ALIGN_FREE(p) if (p != NULL) { _aligned_free(p); p = NULL;}
#define BF_INT_BITS (10)
#define BF_INT_SCALE (1 << BF_INT_BITS)
#define BF_INT_SHIFT ((S32)(((BF_INT_BITS + 1) << 1) - 31 + 16) + 7) //more 7 bits[>11*11]
#define BF_INT_BITS2 ((S32)((BF_INT_BITS << 1) - BF_INT_SHIFT))
#define BF_INT_SCALE2 (1 << BF_INT_BITS2)
#define BF_BUF_LEN (1024)
#define EDGEPRES_R_MAX (5)
// Stateful float filter whose member functions copy the image border through
// unfiltered (each one calls borderCopy before filtering the interior).
// NOTE(review): the class braces, access specifiers, any destructor
// declaration and the closing typedef name are not visible in this listing;
// confirm against the original source.
typedef class edgePresFiltMain
// Builds the lookup tables for a width x height image.
edgePresFiltMain(S32 width, S32 height, S32 radius, F32 sigmaVal, F32 sigmaSp);
// Scalar, OpenMP, SSE and SSE+OpenMP variants of the same filter.
void edgePreserve_mainBody(U16 *src, U16 *dst);
void edgePreserve_mainBody_omp(U16 *src, U16 *dst);
void edgePreserve_mainBody_sse(U16 *src, U16 *dst);
void edgePreserve_mainBody_sse_omp(U16 *src, U16 *dst);
// Opaque handle to the filter state set up by the constructor.
void *hdl;
// Stateful float filter whose member functions filter the whole image: each
// one mirrors the source into a padded buffer (borderReflect) first.
// NOTE(review): the class braces, access specifiers, any destructor
// declaration and the closing typedef name are not visible in this listing;
// confirm against the original source.
typedef class edgePresFilt
// Builds the lookup tables and the padded work buffer for a width x height image.
edgePresFilt(S32 width, S32 height, S32 radius, F32 sigmaVal, F32 sigmaSp);
// Scalar, OpenMP, SSE and SSE+OpenMP variants of the same filter.
void edgePreserve(U16 *src, U16 *dst);
void edgePreserve_omp(U16 *src, U16 *dst);
void edgePreserve_sse(U16 *src, U16 *dst);
void edgePreserve_sse_omp(U16 *src, U16 *dst);
// Opaque handle to the filter state set up by the constructor.
void *hdl;
// Stateful fixed-point (integer weight) variant of the whole-image filter.
// NOTE(review): the class braces, access specifiers, any destructor
// declaration and the closing typedef name are not visible in this listing;
// confirm against the original source.
typedef class edgePresFiltInt
// Builds the integer lookup tables and the padded work buffer.
edgePresFiltInt(S32 width, S32 height, S32 radius, F32 sigmaVal, F32 sigmaSp);
void edgePreserve(U16 *src, U16 *dst);
// NOTE(review): no definition of edgePreserve_omp for this class is visible
// in this listing.
void edgePreserve_omp(U16 *src, U16 *dst);
// Opaque handle to the filter state set up by the constructor.
void *hdl;
// One-shot (stateless) entry points. Each call builds its lookup tables from
// radius / sigmaVal / sigmaSp and filters the width x height image `src`
// into `dst`.
//   sigmaVal - sigma of the intensity-difference weight
//   sigmaSp  - sigma of the spatial-distance weight

// Scalar float filter; the `radius`-wide border band is copied unfiltered.
void edgePreserve_mainBody(
    U16 *src, U16 *dst,
    S32 width, S32 height,
    S32 radius, F32 sigmaVal, F32 sigmaSp);

// Scalar float filter; the border is filtered too, using a mirrored copy.
void edgePreserve(
    U16 *src, U16 *dst,
    S32 width, S32 height,
    S32 radius, F32 sigmaVal, F32 sigmaSp);

// SSE float filter; the `radius`-wide border band is copied unfiltered.
void edgePreserve_mainBody_sse(
    U16 *src, U16 *dst,
    S32 width, S32 height,
    S32 radius, F32 sigmaVal, F32 sigmaSp);

// SSE float filter; the border is filtered too, using a mirrored copy.
void edgePreserve_sse(
    U16 *src, U16 *dst,
    S32 width, S32 height,
    S32 radius, F32 sigmaVal, F32 sigmaSp);

// Fixed-point filter; the border is filtered too, using a mirrored copy.
void edgePreserveInt(
    U16 *src, U16 *dst,
    S32 width, S32 height,
    S32 radius, F32 sigmaVal, F32 sigmaSp);
// Copies `src` (width x height) into the middle of `dst`, which must hold
// (width + 2*radius) x (height + 2*radius) elements, and fills the added
// margin by mirroring the image about each edge (the edge sample itself is
// repeated: the pad next to column 0 is column 0).
// The mirroring reads `radius` samples from each edge, so radius must not
// exceed width or height.
static void borderReflect(U16 *src, S32 width, S32 height, U16 *dst, S32 radius)
{
    S32 i = 0;
    S32 j = 0;
    S32 itmp1 = radius - 1;
    S32 itmp2 = -1 - radius;
    S32 width2 = width + (radius << 1);
    U16 *psrc = src;
    U16 *pdst = dst + width2 * radius; // first padded row that holds image data

    for (i = 0; i < height; i++)
    {
        // Left margin: the first `radius` pixels of the row, reversed.
        for (j = 0; j < radius; j++)
        {
            pdst[j] = psrc[itmp1 - j];
        }
        memcpy(pdst + radius, psrc, sizeof(U16) * width);
        psrc += width;
        pdst += width2;
        // Right margin of the row just written, addressed backwards from the
        // start of the next row: the last `radius` pixels, reversed.
        for (j = -1; j >= -radius; j--)
        {
            pdst[j] = psrc[itmp2 - j];
        }
    }

    // Bottom margin: walk upwards from the last image row.
    psrc = pdst - width2;
    for (i = 0; i < radius; i++)
    {
        memcpy(pdst, psrc, sizeof(U16) * width2);
        psrc -= width2;
        pdst += width2;
    }

    // Top margin: walk downwards from the first image row.
    psrc = dst + width2 * radius;
    pdst = dst + width2 * (radius - 1);
    for (i = 0; i < radius; i++)
    {
        memcpy(pdst, psrc, sizeof(U16) * width2);
        psrc += width2;
        pdst -= width2;
    }
}
// Copies only the `radius`-wide frame of `src` into `dst` (both width x
// height, same row stride); the interior is left untouched.
// The frame is copied in contiguous runs: the top rows, then for each interior
// row the span "right edge of the previous row + left edge of this row", then
// the last right edge together with the bottom rows.
static void borderCopy(U16 *src, U16 *dst, S32 width, S32 height, S32 radius)
{
    S32 i = 0;
    S32 xend = height - (radius << 1); // number of interior rows
    U16 *psrc = src;
    U16 *pdst = dst;

    // Top rows, minus the final `radius` samples (picked up by the loop below).
    memcpy(pdst, psrc, sizeof(U16) * (width * radius - radius));
    psrc += radius * width;
    pdst += radius * width;

    for (i = 0; i < xend; i++)
    {
        memcpy(pdst - radius, psrc - radius, sizeof(U16) * (radius << 1));
        psrc += width;
        pdst += width;
    }

    // Right edge of the last interior row plus all bottom rows.
    memcpy(pdst - radius, psrc - radius, sizeof(U16) * (width * radius + radius));
}
// Fills the float lookup tables used by the filter kernels.
//   spOfs[k] - element offset (row * width + col) of the k-th neighbour
//   spWt[k]  - spatial Gaussian weight of that neighbour (sigmaSp)
//   valWt[d] - intensity Gaussian weight for absolute difference d (sigmaVal)
// Neighbours are the lattice points within `radius` of the center; the center
// itself is skipped. spOfs/spWt must hold one entry per such neighbour and
// valWt must hold BF_BUF_LEN entries. `width` is the row stride of the image
// the offsets will be applied to.
static void edgePreserve_LUT(S32 radius, S32 width, F32 sigmaVal, F32 sigmaSp,
    S32 *spOfs, F32 *spWt, F32 *valWt)
{
    S32 i = 0;
    S32 j = 0;
    S32 k = 0;
    F32 sigmaValCoeff = -0.5f / (sigmaVal * sigmaVal);
    F32 sigmaSpCoeff = -0.5f / (sigmaSp * sigmaSp);

    for (i = -radius; i <= radius; i++)
    {
        for (j = -radius; j <= radius; j++)
        {
            if (sqrtf((i * i + j * j) * 1.f) <= radius)
            {
                if ((i == 0) && (j == 0))
                {
                    continue; // the kernels add the center tap themselves
                }
                spOfs[k] = i * width + j;
                spWt[k] = expf((i * i + j * j) * sigmaSpCoeff);
                k++;
            }
        }
    }

    for (i = 0; i < BF_BUF_LEN - 1; i++)
    {
        valWt[i] = expf(i * i * sigmaValCoeff);
    }
    // Last entry: differences clamped to BF_BUF_LEN - 1 get zero weight.
    valWt[i] = 0.f;
}
// Scalar float kernel. Filters the interior of `src` (everything at least
// `radius` away from each edge) and writes it to `dst`, whose first element
// receives the pixel at src row `radius`, column `radius`. The output is
// therefore (srcWidth - 2*radius) x (srcHeight - 2*radius).
//   srcStep / dstStep - row strides in elements
//   spOfs / spWt      - neighbour offsets and spatial weights, maxk entries
//   valWt             - intensity weights, BF_BUF_LEN entries
// The center pixel is added with weight 1 on top of the maxk neighbours.
static void edgePreserve_mainBody_process(U16 *src, S32 srcWidth, S32 srcHeight, S32 srcStep,
    U16 *dst, S32 dstStep, S32 radius, S32 *spOfs, F32 *spWt, F32 *valWt, S32 maxk)
{
    const S32 xEnd = srcWidth - radius;
    const S32 dstHeight = srcHeight - (radius << 1);
    S32 i = 0;
    S32 j = 0;
    S32 k = 0;
    U16 val0 = 0;
    U16 val = 0;
    S32 tmp = 0;
    F32 w = 0.f;
    F32 sum = 0.f;
    F32 wsum = 0.f;
    U16 *psrc = src + radius * srcStep;
    U16 *pdst = dst;

    for (i = 0; i < dstHeight; i++)
    {
        for (j = radius; j < xEnd; j++)
        {
            val0 = psrc[j];
            sum = 0.f;
            wsum = 0.f;
            for (k = 0; k < maxk; k++)
            {
                val = psrc[j + spOfs[k]];
                tmp = val - val0;
                tmp = ABS(tmp);
                tmp = MIN(tmp, BF_BUF_LEN - 1); // clamp to the table size
                w = spWt[k] * valWt[tmp];
                sum += val * w;
                wsum += w;
            }
            pdst[j - radius] = (U16)((sum + val0) / (wsum + 1.f));
        }
        psrc += srcStep;
        pdst += dstStep;
    }
}
// OpenMP version of the scalar float kernel: output rows are distributed over
// threads with `#pragma omp parallel for`. Every working variable is declared
// inside the loop body so each iteration has its own copy.
// Filters the interior of `src` (everything at least `radius` away from each
// edge) into `dst`; dst's first element receives src row `radius`, column
// `radius`. srcStep / dstStep are row strides in elements; spOfs / spWt hold
// maxk neighbour offsets and spatial weights; valWt holds BF_BUF_LEN
// intensity weights.
static void edgePreserve_mainBody_omp_process(U16 *src, S32 srcWidth, S32 srcHeight, S32 srcStep,
    U16 *dst, S32 dstStep, S32 radius, S32 *spOfs, F32 *spWt, F32 *valWt, S32 maxk)
{
    const S32 xEnd = srcWidth - radius;
    const S32 dstHeight = srcHeight - (radius << 1);
    S32 i = 0;

#pragma omp parallel for
    for (i = 0; i < dstHeight; i++)
    {
        S32 j = 0;
        S32 k = 0;
        U16 val0 = 0;
        U16 val = 0;
        S32 tmp = 0;
        F32 w = 0.f;
        F32 sum = 0.f;
        F32 wsum = 0.f;
        // Row pointers are computed from i, not carried across iterations.
        U16 *psrc = src + (radius + i) * srcStep;
        U16 *pdst = dst + i * dstStep;

        for (j = radius; j < xEnd; j++)
        {
            val0 = psrc[j];
            sum = 0.f;
            wsum = 0.f;
            for (k = 0; k < maxk; k++)
            {
                val = psrc[j + spOfs[k]];
                tmp = val - val0;
                tmp = ABS(tmp);
                tmp = MIN(tmp, BF_BUF_LEN - 1); // clamp to the table size
                w = spWt[k] * valWt[tmp];
                sum += val * w;
                wsum += w;
            }
            pdst[j - radius] = (U16)((sum + val0) / (wsum + 1.f));
        }
    }
}
// SSE float kernel: processes the neighbour list four taps at a time.
// Filters the interior of `src` (everything at least `radius` away from each
// edge) into `dst`; dst's first element receives src row `radius`, column
// `radius`. srcStep / dstStep are row strides in elements; spOfs / spWt hold
// maxk neighbour offsets and spatial weights; valWt holds BF_BUF_LEN
// intensity weights.
// spWt is read with aligned loads, so it must be 16-byte aligned.
// Taps left over when maxk is not a multiple of 4 are handled by a scalar
// tail loop (previously they were silently skipped).
static void edgePreserve_mainBody_sse_process(U16 *src, S32 srcWidth, S32 srcHeight, S32 srcStep,
    U16 *dst, S32 dstStep, S32 radius, S32 *spOfs, F32 *spWt, F32 *valWt, S32 maxk)
{
    const S32 xEnd = srcWidth - radius;
    const S32 dstHeight = srcHeight - (radius << 1);
    // Clearing the sign bit of a float yields its absolute value.
    const U32 ALIGN16 bufSignMask[] = { 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff };
    const F32 ALIGN16 bufLutLen[] = { BF_BUF_LEN - 1, BF_BUF_LEN - 1, BF_BUF_LEN - 1, BF_BUF_LEN - 1 };
    S32 ALIGN16 buf[4];
    F32 ALIGN16 bufSum[4];
    S32 i = 0;
    S32 j = 0;
    S32 k = 0;
    F32 val0 = 0.f;
    F32 sum = 0.f;
    F32 wsum = 0.f;
    U16 *psrc = src + radius * srcStep;
    U16 *pdst = dst;
    __m128 _val;
    __m128 _val0;
    __m128 _idx;
    __m128 _psum;
    __m128 _sw;
    __m128 _cw;
    __m128 _w;
    const __m128 _signMask = _mm_load_ps((const float*)bufSignMask);
    const __m128 _lutLen = _mm_load_ps((const float*)bufLutLen);

    for (i = 0; i < dstHeight; i++)
    {
        for (j = radius; j < xEnd; j++)
        {
            val0 = psrc[j] * 1.0f;
            _psum = _mm_setzero_ps();
            _val0 = _mm_set1_ps(val0);
            for (k = 0; k <= maxk - 4; k += 4)
            {
                _val = _mm_set_ps(psrc[j + spOfs[k + 3]], psrc[j + spOfs[k + 2]],
                    psrc[j + spOfs[k + 1]], psrc[j + spOfs[k]]);
                _sw = _mm_load_ps(spWt + k);
                // |val - val0|, clamped to the table size, then table lookup.
                _idx = _mm_and_ps(_signMask, _mm_sub_ps(_val, _val0));
                _mm_store_si128((__m128i*)buf, _mm_cvtps_epi32(_mm_min_ps(_idx, _lutLen)));
                _cw = _mm_set_ps(valWt[buf[3]], valWt[buf[2]], valWt[buf[1]], valWt[buf[0]]);
                _w = _mm_mul_ps(_cw, _sw);
                _val = _mm_mul_ps(_w, _val);
                // Two horizontal adds leave sum(w) in lane 0 and sum(w*val) in lane 1.
                _sw = _mm_hadd_ps(_w, _val);
                _sw = _mm_hadd_ps(_sw, _sw);
                _psum = _mm_add_ps(_sw, _psum);
            }
            _mm_storel_pi((__m64*)bufSum, _psum);
            sum = bufSum[1] + val0;  // center tap has weight 1
            wsum = bufSum[0] + 1.f;

            // Scalar tail for the taps beyond the last full group of four.
            for (; k < maxk; k++)
            {
                const F32 nval = psrc[j + spOfs[k]] * 1.0f;
                F32 diff = nval - val0;
                F32 w = 0.f;
                if (diff < 0.f)
                {
                    diff = -diff;
                }
                if (diff > (F32)(BF_BUF_LEN - 1))
                {
                    diff = (F32)(BF_BUF_LEN - 1);
                }
                w = spWt[k] * valWt[(S32)diff];
                sum += nval * w;
                wsum += w;
            }

            pdst[j - radius] = (U16)(sum / wsum);
        }
        psrc += srcStep;
        pdst += dstStep;
    }
}
// SSE + OpenMP float kernel: output rows are distributed over threads with
// `#pragma omp parallel for`, and each pixel processes its neighbour list
// four taps at a time. All scratch buffers and vector temporaries are
// declared inside the loop body so each iteration has its own copy.
// Filters the interior of `src` (everything at least `radius` away from each
// edge) into `dst`; dst's first element receives src row `radius`, column
// `radius`. srcStep / dstStep are row strides in elements; spOfs / spWt hold
// maxk neighbour offsets and spatial weights; valWt holds BF_BUF_LEN
// intensity weights.
// spWt is read with aligned loads, so it must be 16-byte aligned.
// Taps left over when maxk is not a multiple of 4 are handled by a scalar
// tail loop (previously they were silently skipped).
static void edgePreserve_mainBody_sse_omp_process(U16 *src, S32 srcWidth, S32 srcHeight, S32 srcStep,
    U16 *dst, S32 dstStep, S32 radius, S32 *spOfs, F32 *spWt, F32 *valWt, S32 maxk)
{
    const S32 xEnd = srcWidth - radius;
    const S32 dstHeight = srcHeight - (radius << 1);
    // Clearing the sign bit of a float yields its absolute value.
    const U32 ALIGN16 bufSignMask[] = { 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff };
    const F32 ALIGN16 bufLutLen[] = { BF_BUF_LEN - 1, BF_BUF_LEN - 1, BF_BUF_LEN - 1, BF_BUF_LEN - 1 };
    const __m128 _signMask = _mm_load_ps((const float*)bufSignMask);
    const __m128 _lutLen = _mm_load_ps((const float*)bufLutLen);
    S32 i = 0;

#pragma omp parallel for
    for (i = 0; i < dstHeight; i++)
    {
        S32 ALIGN16 buf[4];
        F32 ALIGN16 bufSum[4];
        S32 j = 0;
        S32 k = 0;
        F32 val0 = 0.f;
        F32 sum = 0.f;
        F32 wsum = 0.f;
        // Row pointers are computed from i, not carried across iterations.
        U16 *psrc = src + (radius + i) * srcStep;
        U16 *pdst = dst + i * dstStep;
        __m128 _val;
        __m128 _val0;
        __m128 _idx;
        __m128 _psum;
        __m128 _sw;
        __m128 _cw;
        __m128 _w;

        for (j = radius; j < xEnd; j++)
        {
            val0 = psrc[j] * 1.0f;
            _psum = _mm_setzero_ps();
            _val0 = _mm_set1_ps(val0);
            for (k = 0; k <= maxk - 4; k += 4)
            {
                _val = _mm_set_ps(psrc[j + spOfs[k + 3]], psrc[j + spOfs[k + 2]],
                    psrc[j + spOfs[k + 1]], psrc[j + spOfs[k]]);
                _sw = _mm_load_ps(spWt + k);
                // |val - val0|, clamped to the table size, then table lookup.
                _idx = _mm_and_ps(_signMask, _mm_sub_ps(_val, _val0));
                _mm_store_si128((__m128i*)buf, _mm_cvtps_epi32(_mm_min_ps(_idx, _lutLen)));
                _cw = _mm_set_ps(valWt[buf[3]], valWt[buf[2]], valWt[buf[1]], valWt[buf[0]]);
                _w = _mm_mul_ps(_cw, _sw);
                _val = _mm_mul_ps(_w, _val);
                // Two horizontal adds leave sum(w) in lane 0 and sum(w*val) in lane 1.
                _sw = _mm_hadd_ps(_w, _val);
                _sw = _mm_hadd_ps(_sw, _sw);
                _psum = _mm_add_ps(_sw, _psum);
            }
            _mm_storel_pi((__m64*)bufSum, _psum);
            sum = bufSum[1] + val0;  // center tap has weight 1
            wsum = bufSum[0] + 1.f;

            // Scalar tail for the taps beyond the last full group of four.
            for (; k < maxk; k++)
            {
                const F32 nval = psrc[j + spOfs[k]] * 1.0f;
                F32 diff = nval - val0;
                F32 w = 0.f;
                if (diff < 0.f)
                {
                    diff = -diff;
                }
                if (diff > (F32)(BF_BUF_LEN - 1))
                {
                    diff = (F32)(BF_BUF_LEN - 1);
                }
                w = spWt[k] * valWt[(S32)diff];
                sum += nval * w;
                wsum += w;
            }

            pdst[j - radius] = (U16)(sum / wsum);
        }
    }
}
// One-shot scalar float filter without border smoothing: the `radius`-wide
// frame of `src` is copied to `dst` unchanged and only the interior is
// filtered. `src` and `dst` are width x height.
//   sigmaVal - sigma of the intensity-difference weight
//   sigmaSp  - sigma of the spatial-distance weight
// Returns without touching `dst` if radius is negative or an allocation fails.
// All temporary tables are released before returning.
void edgePreserve_mainBody(U16 *src, U16 *dst, S32 width, S32 height,
    S32 radius, F32 sigmaVal, F32 sigmaSp)
{
    S32 i = 0;
    S32 j = 0;
    S32 maxk = -1; // starts at -1 so the center tap is not counted
    S32 taps = 0;
    S32 *spOfs = NULL;
    F32 *spWt = NULL;
    F32 *valWt = NULL;

    if (radius < 0)
    {
        return;
    }

    // Count the lattice points inside the disc of the given radius.
    for (i = -radius; i <= radius; i++)
    {
        for (j = -radius; j <= radius; j++)
        {
            if (sqrtf((i * i + j * j) * 1.f) <= radius)
            {
                maxk++;
            }
        }
    }

    taps = (maxk > 0) ? maxk : 1; // never request a zero-byte block
    spWt = (F32 *)ALIGN_MALLOC16(sizeof(F32) * taps);
    spOfs = (S32 *)MALLOC(sizeof(S32) * taps);
    valWt = (F32 *)MALLOC(sizeof(F32) * BF_BUF_LEN);

    if ((spWt != NULL) && (spOfs != NULL) && (valWt != NULL))
    {
        edgePreserve_LUT(radius, width, sigmaVal, sigmaSp, spOfs, spWt, valWt);
        borderCopy(src, dst, width, height, radius);
        edgePreserve_mainBody_process(src, width, height, width,
            dst + width * radius + radius, width,
            radius, spOfs, spWt, valWt, maxk);
    }

    ALIGN_FREE(spWt);
    FREE(spOfs);
    FREE(valWt);
}
// One-shot scalar float filter with border smoothing: `src` is first mirrored
// into a padded (width + 2*radius) x (height + 2*radius) buffer, then the
// interior of that buffer is filtered into `dst`, so every pixel of the
// width x height output is filtered.
//   sigmaVal - sigma of the intensity-difference weight
//   sigmaSp  - sigma of the spatial-distance weight
// Returns without touching `dst` if radius is negative or an allocation fails.
// All temporary buffers are released before returning.
void edgePreserve(U16 *src, U16 *dst, S32 width, S32 height,
    S32 radius, F32 sigmaVal, F32 sigmaSp)
{
    S32 i = 0;
    S32 j = 0;
    S32 maxk = -1; // starts at -1 so the center tap is not counted
    S32 taps = 0;
    S32 width2 = width + (radius << 1);
    S32 height2 = height + (radius << 1);
    S32 *spOfs = NULL;
    F32 *spWt = NULL;
    F32 *valWt = NULL;
    U16 *buf = NULL;

    if (radius < 0)
    {
        return;
    }

    // Count the lattice points inside the disc of the given radius.
    for (i = -radius; i <= radius; i++)
    {
        for (j = -radius; j <= radius; j++)
        {
            if (sqrtf((i * i + j * j) * 1.f) <= radius)
            {
                maxk++;
            }
        }
    }

    taps = (maxk > 0) ? maxk : 1; // never request a zero-byte block
    buf = (U16 *)MALLOC(sizeof(U16) * width2 * height2);
    spWt = (F32 *)ALIGN_MALLOC16(sizeof(F32) * taps);
    spOfs = (S32 *)MALLOC(sizeof(S32) * taps);
    valWt = (F32 *)MALLOC(sizeof(F32) * BF_BUF_LEN);

    if ((buf != NULL) && (spWt != NULL) && (spOfs != NULL) && (valWt != NULL))
    {
        borderReflect(src, width, height, buf, radius);
        // Offsets are built for the padded row stride.
        edgePreserve_LUT(radius, width2, sigmaVal, sigmaSp, spOfs, spWt, valWt);
        edgePreserve_mainBody_process(buf, width2, height2, width2,
            dst, width, radius, spOfs, spWt, valWt, maxk);
    }

    ALIGN_FREE(spWt);
    FREE(spOfs);
    FREE(valWt);
    FREE(buf);
}
// One-shot SSE float filter without border smoothing: the `radius`-wide frame
// of `src` is copied to `dst` unchanged and only the interior is filtered.
// `src` and `dst` are width x height.
//   sigmaVal - sigma of the intensity-difference weight
//   sigmaSp  - sigma of the spatial-distance weight
// Returns without touching `dst` if radius is negative or an allocation fails.
// All temporary tables are released before returning.
void edgePreserve_mainBody_sse(U16 *src, U16 *dst, S32 width, S32 height,
    S32 radius, F32 sigmaVal, F32 sigmaSp)
{
    S32 i = 0;
    S32 j = 0;
    S32 maxk = -1; // starts at -1 so the center tap is not counted
    S32 taps = 0;
    S32 *spOfs = NULL;
    F32 *spWt = NULL;
    F32 *valWt = NULL;

    if (radius < 0)
    {
        return;
    }

    // Count the lattice points inside the disc of the given radius.
    for (i = -radius; i <= radius; i++)
    {
        for (j = -radius; j <= radius; j++)
        {
            if (sqrtf((i * i + j * j) * 1.f) <= radius)
            {
                maxk++;
            }
        }
    }

    taps = (maxk > 0) ? maxk : 1; // never request a zero-byte block
    // The SSE kernel reads spWt with aligned loads, hence the aligned block.
    spWt = (F32 *)ALIGN_MALLOC16(sizeof(F32) * taps);
    spOfs = (S32 *)MALLOC(sizeof(S32) * taps);
    valWt = (F32 *)MALLOC(sizeof(F32) * BF_BUF_LEN);

    if ((spWt != NULL) && (spOfs != NULL) && (valWt != NULL))
    {
        edgePreserve_LUT(radius, width, sigmaVal, sigmaSp, spOfs, spWt, valWt);
        borderCopy(src, dst, width, height, radius);
        edgePreserve_mainBody_sse_process(src, width, height, width,
            dst + width * radius + radius, width,
            radius, spOfs, spWt, valWt, maxk);
    }

    ALIGN_FREE(spWt);
    FREE(spOfs);
    FREE(valWt);
}
// One-shot SSE float filter with border smoothing: `src` is first mirrored
// into a padded (width + 2*radius) x (height + 2*radius) buffer, then the
// interior of that buffer is filtered into `dst`, so every pixel of the
// width x height output is filtered.
//   sigmaVal - sigma of the intensity-difference weight
//   sigmaSp  - sigma of the spatial-distance weight
// Returns without touching `dst` if radius is negative or an allocation fails.
// All temporary buffers are released before returning.
void edgePreserve_sse(U16 *src, U16 *dst, S32 width, S32 height,
    S32 radius, F32 sigmaVal, F32 sigmaSp)
{
    S32 i = 0;
    S32 j = 0;
    S32 maxk = -1; // starts at -1 so the center tap is not counted
    S32 taps = 0;
    S32 width2 = width + (radius << 1);
    S32 height2 = height + (radius << 1);
    S32 *spOfs = NULL;
    F32 *spWt = NULL;
    F32 *valWt = NULL;
    U16 *buf = NULL;

    if (radius < 0)
    {
        return;
    }

    // Count the lattice points inside the disc of the given radius.
    for (i = -radius; i <= radius; i++)
    {
        for (j = -radius; j <= radius; j++)
        {
            if (sqrtf((i * i + j * j) * 1.f) <= radius)
            {
                maxk++;
            }
        }
    }

    taps = (maxk > 0) ? maxk : 1; // never request a zero-byte block
    buf = (U16 *)MALLOC(sizeof(U16) * width2 * height2);
    // The SSE kernel reads spWt with aligned loads, hence the aligned block.
    spWt = (F32 *)ALIGN_MALLOC16(sizeof(F32) * taps);
    spOfs = (S32 *)MALLOC(sizeof(S32) * taps);
    valWt = (F32 *)MALLOC(sizeof(F32) * BF_BUF_LEN);

    if ((buf != NULL) && (spWt != NULL) && (spOfs != NULL) && (valWt != NULL))
    {
        borderReflect(src, width, height, buf, radius);
        // Offsets are built for the padded row stride.
        edgePreserve_LUT(radius, width2, sigmaVal, sigmaSp, spOfs, spWt, valWt);
        edgePreserve_mainBody_sse_process(buf, width2, height2, width2,
            dst, width, radius, spOfs, spWt, valWt, maxk);
    }

    ALIGN_FREE(spWt);
    FREE(spOfs);
    FREE(valWt);
    FREE(buf);
}
// State shared by the float filter classes; stored behind their `void *hdl`.
typedef struct EDGE_PRES_FILT_HDL
{
    S32 maxk;    // number of neighbour taps (center excluded)
    S32 width;   // image width in pixels
    S32 height;  // image height in pixels
    S32 radius;  // filter radius in pixels
    S32 *spOfs;  // per-tap element offsets
    F32 *spWt;   // per-tap spatial weights (16-byte aligned allocation)
    F32 *valWt;  // intensity-difference weights, BF_BUF_LEN entries
    U16 *buf;    // padded work image; only set by the border-smoothing class
} EDGE_PRES_FILT_HDL;
// Constructor: stores the image geometry and builds the float lookup tables
// for an image whose row stride is `width` (no padded buffer is allocated).
// NOTE(review): this listing is incomplete - the braces, the declaration /
// allocation of `pHdl`, and the increment inside the counting loop are not
// visible. The trailing `delete hdl;` presumably belongs to a destructor whose
// header is also missing; as written it would delete the handle right after
// building it. Confirm against the original source before relying on this.
edgePresFiltMain::edgePresFiltMain(S32 width, S32 height, S32 radius, F32 sigmaVal, F32 sigmaSp)
S32 i = 0;
S32 j = 0;
S32 maxk = -1; // exclude the center
hdl = pHdl;
// Visits the lattice points inside the disc of the given radius
// (presumably counting them into maxk - the increment is not visible).
for (i = -radius; i <= radius; i++)
for (j = -radius; j <= radius; j++)
if (sqrtf((i * i + j * j) * 1.f) <= radius)
pHdl->maxk = maxk;
pHdl->width = width;
pHdl->height = height;
pHdl->radius = radius;
// spWt comes from the 16-byte aligned allocator; the others from MALLOC.
pHdl->spWt = (F32 *)ALIGN_MALLOC16(sizeof(F32) * maxk);
pHdl->spOfs = (S32 *)MALLOC(sizeof(S32) * maxk);
pHdl->valWt = (F32 *)MALLOC(sizeof(F32) * BF_BUF_LEN);
edgePreserve_LUT(radius, width, sigmaVal, sigmaSp, pHdl->spOfs, pHdl->spWt, pHdl->valWt);
// NOTE(review): `hdl` is a void*; deleting through void* is undefined - verify.
delete hdl;
// Scalar float filter using the tables prepared by the constructor. The
// `radius`-wide frame of `src` is copied to `dst` unchanged; only the interior
// is filtered. Both images use the width x height given at construction.
void edgePresFiltMain::edgePreserve_mainBody(U16 *src, U16 *dst)
{
    // `hdl` is stored as void*; recover the typed state before use.
    EDGE_PRES_FILT_HDL *pHdl = static_cast<EDGE_PRES_FILT_HDL *>(hdl);
    const S32 width = pHdl->width;
    const S32 height = pHdl->height;
    const S32 radius = pHdl->radius;

    borderCopy(src, dst, width, height, radius);
    // Output starts at the first interior pixel of dst.
    edgePreserve_mainBody_process(src, width, height, width,
        dst + width * radius + radius, width,
        radius, pHdl->spOfs, pHdl->spWt, pHdl->valWt, pHdl->maxk);
}
// OpenMP float filter using the tables prepared by the constructor. The
// `radius`-wide frame of `src` is copied to `dst` unchanged; only the interior
// is filtered. Both images use the width x height given at construction.
void edgePresFiltMain::edgePreserve_mainBody_omp(U16 *src, U16 *dst)
{
    // `hdl` is stored as void*; recover the typed state before use.
    EDGE_PRES_FILT_HDL *pHdl = static_cast<EDGE_PRES_FILT_HDL *>(hdl);
    const S32 width = pHdl->width;
    const S32 height = pHdl->height;
    const S32 radius = pHdl->radius;

    borderCopy(src, dst, width, height, radius);
    // Output starts at the first interior pixel of dst.
    edgePreserve_mainBody_omp_process(src, width, height, width,
        dst + width * radius + radius, width,
        radius, pHdl->spOfs, pHdl->spWt, pHdl->valWt, pHdl->maxk);
}
// SSE float filter using the tables prepared by the constructor. The
// `radius`-wide frame of `src` is copied to `dst` unchanged; only the interior
// is filtered. Both images use the width x height given at construction.
void edgePresFiltMain::edgePreserve_mainBody_sse(U16 *src, U16 *dst)
{
    // `hdl` is stored as void*; recover the typed state before use.
    EDGE_PRES_FILT_HDL *pHdl = static_cast<EDGE_PRES_FILT_HDL *>(hdl);
    const S32 width = pHdl->width;
    const S32 height = pHdl->height;
    const S32 radius = pHdl->radius;

    borderCopy(src, dst, width, height, radius);
    // Output starts at the first interior pixel of dst.
    edgePreserve_mainBody_sse_process(src, width, height, width,
        dst + width * radius + radius, width,
        radius, pHdl->spOfs, pHdl->spWt, pHdl->valWt, pHdl->maxk);
}
// SSE + OpenMP float filter using the tables prepared by the constructor. The
// `radius`-wide frame of `src` is copied to `dst` unchanged; only the interior
// is filtered. Both images use the width x height given at construction.
void edgePresFiltMain::edgePreserve_mainBody_sse_omp(U16 *src, U16 *dst)
{
    // `hdl` is stored as void*; recover the typed state before use.
    EDGE_PRES_FILT_HDL *pHdl = static_cast<EDGE_PRES_FILT_HDL *>(hdl);
    const S32 width = pHdl->width;
    const S32 height = pHdl->height;
    const S32 radius = pHdl->radius;

    borderCopy(src, dst, width, height, radius);
    // Output starts at the first interior pixel of dst.
    edgePreserve_mainBody_sse_omp_process(src, width, height, width,
        dst + width * radius + radius, width,
        radius, pHdl->spOfs, pHdl->spWt, pHdl->valWt, pHdl->maxk);
}
// Constructor: stores the image geometry, allocates the padded work buffer
// and builds the float lookup tables for the padded row stride (width2).
// NOTE(review): this listing is incomplete - the braces, the declaration /
// allocation of `pHdl`, and the increment inside the counting loop are not
// visible. The trailing `delete hdl;` presumably belongs to a destructor whose
// header is also missing; as written it would delete the handle right after
// building it. Confirm against the original source before relying on this.
edgePresFilt::edgePresFilt(S32 width, S32 height, S32 radius, F32 sigmaVal, F32 sigmaSp)
S32 i = 0;
S32 j = 0;
S32 maxk = -1; // exclude the center
// Dimensions of the image after adding `radius` pixels on every side.
S32 width2 = width + (radius << 1);
S32 height2 = height + (radius << 1);
hdl = pHdl;
// Visits the lattice points inside the disc of the given radius
// (presumably counting them into maxk - the increment is not visible).
for (i = -radius; i <= radius; i++)
for (j = -radius; j <= radius; j++)
if (sqrtf((i * i + j * j) * 1.f) <= radius)
pHdl->maxk = maxk;
pHdl->width = width;
pHdl->height = height;
pHdl->radius = radius;
// spWt comes from the 16-byte aligned allocator; the others from MALLOC.
pHdl->spWt = (F32 *)ALIGN_MALLOC16(sizeof(F32) * maxk);
pHdl->spOfs = (S32 *)MALLOC(sizeof(S32) * maxk);
pHdl->valWt = (F32 *)MALLOC(sizeof(F32) * BF_BUF_LEN);
pHdl->buf = (U16 *)MALLOC(sizeof(U16) * width2 * height2);
edgePreserve_LUT(radius, width2, sigmaVal, sigmaSp, pHdl->spOfs, pHdl->spWt, pHdl->valWt);
// NOTE(review): `hdl` is a void*; deleting through void* is undefined - verify.
delete hdl;
// Scalar float filter over the whole image: `src` is mirrored into the padded
// work buffer held in the handle, then the buffer's interior is filtered into
// `dst`. Both images use the width x height given at construction.
void edgePresFilt::edgePreserve(U16 *src, U16 *dst)
{
    // `hdl` is stored as void*; recover the typed state before use.
    EDGE_PRES_FILT_HDL *pHdl = static_cast<EDGE_PRES_FILT_HDL *>(hdl);
    const S32 width = pHdl->width;
    const S32 height = pHdl->height;
    const S32 radius = pHdl->radius;
    const S32 width2 = width + (radius << 1);
    const S32 height2 = height + (radius << 1);

    borderReflect(src, width, height, pHdl->buf, radius);
    edgePreserve_mainBody_process(pHdl->buf, width2, height2, width2,
        dst, width, radius, pHdl->spOfs, pHdl->spWt, pHdl->valWt, pHdl->maxk);
}
// OpenMP float filter over the whole image: `src` is mirrored into the padded
// work buffer held in the handle, then the buffer's interior is filtered into
// `dst`. Both images use the width x height given at construction.
void edgePresFilt::edgePreserve_omp(U16 *src, U16 *dst)
{
    // `hdl` is stored as void*; recover the typed state before use.
    EDGE_PRES_FILT_HDL *pHdl = static_cast<EDGE_PRES_FILT_HDL *>(hdl);
    const S32 width = pHdl->width;
    const S32 height = pHdl->height;
    const S32 radius = pHdl->radius;
    const S32 width2 = width + (radius << 1);
    const S32 height2 = height + (radius << 1);

    borderReflect(src, width, height, pHdl->buf, radius);
    edgePreserve_mainBody_omp_process(pHdl->buf, width2, height2, width2,
        dst, width, radius, pHdl->spOfs, pHdl->spWt, pHdl->valWt, pHdl->maxk);
}
// SSE float filter over the whole image: `src` is mirrored into the padded
// work buffer held in the handle, then the buffer's interior is filtered into
// `dst`. Both images use the width x height given at construction.
void edgePresFilt::edgePreserve_sse(U16 *src, U16 *dst)
{
    // `hdl` is stored as void*; recover the typed state before use.
    EDGE_PRES_FILT_HDL *pHdl = static_cast<EDGE_PRES_FILT_HDL *>(hdl);
    const S32 width = pHdl->width;
    const S32 height = pHdl->height;
    const S32 radius = pHdl->radius;
    const S32 width2 = width + (radius << 1);
    const S32 height2 = height + (radius << 1);

    borderReflect(src, width, height, pHdl->buf, radius);
    edgePreserve_mainBody_sse_process(pHdl->buf, width2, height2, width2,
        dst, width, radius, pHdl->spOfs, pHdl->spWt, pHdl->valWt, pHdl->maxk);
}
// SSE + OpenMP float filter over the whole image: `src` is mirrored into the
// padded work buffer held in the handle, then the buffer's interior is
// filtered into `dst`. Both images use the width x height given at
// construction.
void edgePresFilt::edgePreserve_sse_omp(U16 *src, U16 *dst)
{
    // `hdl` is stored as void*; recover the typed state before use.
    EDGE_PRES_FILT_HDL *pHdl = static_cast<EDGE_PRES_FILT_HDL *>(hdl);
    const S32 width = pHdl->width;
    const S32 height = pHdl->height;
    const S32 radius = pHdl->radius;
    const S32 width2 = width + (radius << 1);
    const S32 height2 = height + (radius << 1);

    borderReflect(src, width, height, pHdl->buf, radius);
    edgePreserve_mainBody_sse_omp_process(pHdl->buf, width2, height2, width2,
        dst, width, radius, pHdl->spOfs, pHdl->spWt, pHdl->valWt, pHdl->maxk);
}
// Fills the fixed-point lookup tables used by the integer kernel. Every
// weight is a Gaussian value multiplied by BF_INT_SCALE and truncated to S32.
//   spOfs[k] - element offset (row * width + col) of the k-th neighbour
//   spWt[k]  - spatial weight of that neighbour (sigmaSp)
//   valWt[d] - intensity weight for absolute difference d (sigmaVal)
// Neighbours are the lattice points within `radius` of the center; the center
// itself is skipped. spOfs/spWt must hold one entry per such neighbour and
// valWt must hold BF_BUF_LEN entries. `width` is the row stride of the image
// the offsets will be applied to.
static void edgePreserveInt_LUT(S32 radius, S32 width, F32 sigmaVal, F32 sigmaSp,
    S32 *spOfs, S32 *spWt, S32 *valWt)
{
    S32 i = 0;
    S32 j = 0;
    S32 k = 0;
    F32 sigmaValCoeff = -0.5f / (sigmaVal * sigmaVal);
    F32 sigmaSpCoeff = -0.5f / (sigmaSp * sigmaSp);

    for (i = -radius; i <= radius; i++)
    {
        for (j = -radius; j <= radius; j++)
        {
            if (sqrtf((i * i + j * j) * 1.f) <= radius)
            {
                if ((i == 0) && (j == 0))
                {
                    continue; // the kernel adds the center tap itself
                }
                spOfs[k] = i * width + j;
                spWt[k] = (S32)(expf((i * i + j * j) * sigmaSpCoeff) * BF_INT_SCALE);
                k++;
            }
        }
    }

    for (i = 0; i < BF_BUF_LEN - 1; i++)
    {
        valWt[i] = (S32)(expf(i * i * sigmaValCoeff) * BF_INT_SCALE);
    }
    // Last entry: differences clamped to BF_BUF_LEN - 1 get zero weight.
    valWt[i] = 0;
}
// Fixed-point kernel. Filters the interior of `src` (everything at least
// `radius` away from each edge) into `dst`; dst's first element receives src
// row `radius`, column `radius`.
//   srcStep / dstStep - row strides in elements
//   spOfs / spWt      - neighbour offsets and spatial weights, maxk entries
//   valWt             - intensity weights, BF_BUF_LEN entries
// spWt and valWt are each scaled by BF_INT_SCALE, so their product carries
// 2 * BF_INT_BITS fractional bits. It is shifted right by BF_INT_SHIFT to
// bring it to BF_INT_BITS2 bits - the same scale as the center tap
// (BF_INT_SCALE2) - and to keep the S32 accumulators from overflowing.
// Without the shift, val * w alone can exceed 32 bits.
static void edgePreserveInt_mainBody_process(U16 *src, S32 srcWidth, S32 srcHeight, S32 srcStep,
    U16 *dst, S32 dstStep, S32 radius, S32 *spOfs, S32 *spWt, S32 *valWt, S32 maxk)
{
    const S32 xEnd = srcWidth - radius;
    const S32 dstHeight = srcHeight - (radius << 1);
    S32 i = 0;
    S32 j = 0;
    S32 k = 0;
    U16 val0 = 0;
    U16 val = 0;
    S32 tmp = 0;
    S32 w = 0;
    S32 sum = 0;
    S32 wsum = 0;
    U16 *psrc = src + radius * srcStep;
    U16 *pdst = dst;

    for (i = 0; i < dstHeight; i++)
    {
        for (j = radius; j < xEnd; j++)
        {
            val0 = psrc[j];
            sum = 0;
            wsum = 0;
            for (k = 0; k < maxk; k++)
            {
                val = psrc[j + spOfs[k]];
                tmp = val - val0;
                tmp = ABS(tmp);
                tmp = MIN(tmp, BF_BUF_LEN - 1); // clamp to the table size
                w = (spWt[k] * valWt[tmp]) >> BF_INT_SHIFT;
                wsum += w;
                sum += (val * w);
            }
            // Center tap has weight 1.0, i.e. BF_INT_SCALE2 in fixed point.
            pdst[j - radius] = (U16)((sum + (val0 << BF_INT_BITS2)) / (wsum + BF_INT_SCALE2));
        }
        psrc += srcStep;
        pdst += dstStep;
    }
}
// One-shot fixed-point filter with border smoothing: `src` is first mirrored
// into a padded buffer, then the interior of that buffer is filtered into
// `dst`, so every pixel of the width x height output is filtered.
//   radius_  - requested radius; clamped to EDGEPRES_R_MAX
//   sigmaVal - sigma of the intensity-difference weight
//   sigmaSp  - sigma of the spatial-distance weight
// Returns without touching `dst` if the radius is negative or an allocation
// fails. All temporary buffers are released before returning.
void edgePreserveInt(U16 *src, U16 *dst, S32 width, S32 height,
    S32 radius_, F32 sigmaVal, F32 sigmaSp)
{
    S32 i = 0;
    S32 j = 0;
    S32 maxk = -1; // starts at -1 so the center tap is not counted
    S32 taps = 0;
    // Uses the limit and MIN helper this file defines/uses elsewhere.
    S32 radius = MIN(radius_, EDGEPRES_R_MAX);
    S32 width2 = width + (radius << 1);
    S32 height2 = height + (radius << 1);
    S32 *spOfs = NULL;
    S32 *spWt = NULL;
    S32 *valWt = NULL;
    U16 *buf = NULL;

    if (radius < 0)
    {
        return;
    }

    // Count the lattice points inside the disc of the given radius.
    for (i = -radius; i <= radius; i++)
    {
        for (j = -radius; j <= radius; j++)
        {
            if (sqrtf((i * i + j * j) * 1.f) <= radius)
            {
                maxk++;
            }
        }
    }

    taps = (maxk > 0) ? maxk : 1; // never request a zero-byte block
    buf = (U16 *)MALLOC(sizeof(U16) * width2 * height2);
    spWt = (S32 *)ALIGN_MALLOC16(sizeof(S32) * taps);
    spOfs = (S32 *)MALLOC(sizeof(S32) * taps);
    valWt = (S32 *)MALLOC(sizeof(S32) * BF_BUF_LEN);

    if ((buf != NULL) && (spWt != NULL) && (spOfs != NULL) && (valWt != NULL))
    {
        borderReflect(src, width, height, buf, radius);
        // Offsets are built for the padded row stride.
        edgePreserveInt_LUT(radius, width2, sigmaVal, sigmaSp, spOfs, spWt, valWt);
        edgePreserveInt_mainBody_process(buf, width2, height2, width2,
            dst, width, radius, spOfs, spWt, valWt, maxk);
    }

    ALIGN_FREE(spWt);
    FREE(spOfs);
    FREE(valWt);
    FREE(buf);
}
// State of the fixed-point filter class; stored behind its `void *hdl`.
typedef struct EDGE_PRES_FILT_INT_HDL
{
    S32 maxk;    // number of neighbour taps (center excluded)
    S32 width;   // image width in pixels
    S32 height;  // image height in pixels
    S32 radius;  // filter radius in pixels (after clamping)
    S32 *spOfs;  // per-tap element offsets
    S32 *spWt;   // per-tap spatial weights, scaled by BF_INT_SCALE
    S32 *valWt;  // intensity-difference weights, BF_BUF_LEN entries
    U16 *buf;    // padded work image
} EDGE_PRES_FILT_INT_HDL;
// Constructor: clamps the radius, stores the image geometry, allocates the
// padded work buffer and builds the fixed-point lookup tables for the padded
// row stride (width2).
// NOTE(review): this listing is incomplete - the braces, the declaration /
// allocation of `pHdl`, and the increment inside the counting loop are not
// visible. The trailing `delete hdl;` presumably belongs to a destructor whose
// header is also missing; as written it would delete the handle right after
// building it. Confirm against the original source before relying on this.
edgePresFiltInt::edgePresFiltInt(S32 width, S32 height, S32 radius_, F32 sigmaVal, F32 sigmaSp)
S32 i = 0;
S32 j = 0;
S32 maxk = -1; // exclude the center
// NOTE(review): GP_MIN / GP_EDGEPRES_R_MAX are not defined in the visible
// file, which defines EDGEPRES_R_MAX and uses MIN elsewhere - verify naming.
S32 radius = GP_MIN(radius_, GP_EDGEPRES_R_MAX);
// Dimensions of the image after adding `radius` pixels on every side.
S32 width2 = width + (radius << 1);
S32 height2 = height + (radius << 1);
hdl = pHdl;
// Visits the lattice points inside the disc of the given radius
// (presumably counting them into maxk - the increment is not visible).
for (i = -radius; i <= radius; i++)
for (j = -radius; j <= radius; j++)
if (sqrtf((i * i + j * j) * 1.f) <= radius)
pHdl->maxk = maxk;
pHdl->width = width;
pHdl->height = height;
pHdl->radius = radius;
// spWt comes from the 16-byte aligned allocator; the others from MALLOC.
pHdl->spWt = (S32 *)ALIGN_MALLOC16(sizeof(S32) * maxk);
pHdl->spOfs = (S32 *)MALLOC(sizeof(S32) * maxk);
pHdl->valWt = (S32 *)MALLOC(sizeof(S32) * BF_BUF_LEN);
pHdl->buf = (U16 *)MALLOC(sizeof(U16) * width2 * height2);
edgePreserveInt_LUT(radius, width2, sigmaVal, sigmaSp, pHdl->spOfs, pHdl->spWt, pHdl->valWt);
// NOTE(review): `hdl` is a void*; deleting through void* is undefined - verify.
delete hdl;
// Fixed-point filter over the whole image: `src` is mirrored into the padded
// work buffer held in the handle, then the buffer's interior is filtered into
// `dst`. Both images use the width x height given at construction.
void edgePresFiltInt::edgePreserve(U16 *src, U16 *dst)
{
    // `hdl` is stored as void*; recover the typed state before use.
    EDGE_PRES_FILT_INT_HDL *pHdl = static_cast<EDGE_PRES_FILT_INT_HDL *>(hdl);
    const S32 width = pHdl->width;
    const S32 height = pHdl->height;
    const S32 radius = pHdl->radius;
    const S32 width2 = width + (radius << 1);
    const S32 height2 = height + (radius << 1);

    borderReflect(src, width, height, pHdl->buf, radius);
    edgePreserveInt_mainBody_process(pHdl->buf, width2, height2, width2,
        dst, width, radius, pHdl->spOfs, pHdl->spWt, pHdl->valWt, pHdl->maxk);
}