快速3x3中值滤波器的多种加速实现(数据类型16U)
摘要
我们以下将使用
1. SIMD: SSE, AVX
2. multiThread: openmp, std::thread
3. SIMD + multiThread: AVX + openmp
4. data: 分行并行加速,分块儿并行加速
这里先把文末的总结写出来
总结:
1.1 快速3X3中值滤波本身的计算是元素比较和排序,
属于控制密集型,而非运算密集型
所以带宽的耗时比计算的耗时明显
1.2 由于算法需要,数据IO使用的是loadu和storeu,而不是load和store。读写访问速度要慢一些。
2. 单核版本和SSE加速版本耗时几乎一样,这里是因为VS编译器的优化使得快速3X3中值滤波速度很快
计算速度瓶颈主要在内存访问
3. omp加速版本相比单核的快了3.65倍,接近4倍,和四核加速比较符合
4. omp分行加速和多行分块加速是几乎一样的,可见这里的额外开销不大
5. avx加速是sse加速的1.62倍,接近2倍,符合128位数据和256位数据特性
6. SIMD + multiThread : 这里测试了avx + omp
6.1 在avx的基础上增加omp多线程确实能增加3.65倍速度,接近4倍,和四核加速比较符合
6.2 avx + omp分行加速和多行分块加速是几乎一样的,可见这里的额外开销不大
7. std::thread的额外开销好像很大,分块并行反而更慢了,近20倍
先给出宏定义的操作
/* Compare-exchange primitives: after each op, a holds max(a,b) and b holds
 * min(a,b); t is caller-supplied scratch.  Wrapped in do/while(0) so each
 * macro behaves as a single statement even in an unbraced if/else (the
 * original bare-brace form broke there).  MAX/MIN are project macros. */
#define med_op(a,b,t) do { (t) = (a); (a) = MAX((a), (b)); (b) = MIN((t), (b)); } while (0)
/* Same exchange on 8 u16 lanes (_mm_max_epu16/_mm_min_epu16 need SSE4.1). */
#define med_op_sse(a,b,t) do { (t) = (a); (a) = _mm_max_epu16((a), (b)); (b) = _mm_min_epu16((t), (b)); } while (0)
/* Same exchange on 8 u16 lanes with ARM NEON. */
#define med_op_neon(a,b,t) do { (t) = (a); (a) = vmaxq_u16((a), (b)); (b) = vminq_u16((t), (b)); } while (0)
/* Same exchange on 16 u16 lanes with AVX2. */
#define med_op_avx(a,b,t) do { (t) = (a); (a) = _mm256_max_epu16((a), (b)); (b) = _mm256_min_epu16((t), (b)); } while (0)
// 3x3 median filter, scalar reference version (16-bit unsigned pixels).
// Border policy: the first/last rows and first/last columns are copied from
// src unchanged; every interior pixel becomes the median of its 3x3
// neighbourhood, computed with a branch-free 19-exchange sorting network.
// Fixes vs. original: stray double semicolon removed; degenerate sizes
// (height < 2) no longer read/write out of bounds in the final memcpy.
void medianFilt::median3(U16 *src, U16 *dst, S32 width, S32 height)
{
U16 p0 = 0;
U16 p1 = 0;
U16 p2 = 0;
U16 p3 = 0;
U16 p4 = 0;
U16 p5 = 0;
U16 p6 = 0;
U16 p7 = 0;
U16 p8 = 0;
U16 tmp = 0;
S32 i0 = 0;
S32 i = 0;
S32 i2 = 0;
S32 j = 0;
U16 *psrc1 = NULL;
U16 *psrc2 = NULL;
U16 *psrc3 = NULL;
U16 *pdst = NULL;
if ((width < 1) || (height < 1))
{
return; // empty image: nothing to do
}
memcpy(dst, src, width * sizeof(U16)); // top border row
if (height < 2)
{
return; // single-row image: the border copy above is the whole result
}
psrc1 = src; // row j-1
psrc2 = psrc1 + width; // row j
psrc3 = psrc2 + width; // row j+1
pdst = dst + width;
for (j = 1; j < height - 1; j++)
{
pdst[0] = psrc2[0]; // left border column
for (i = 1; i < width - 1; i++)
{
i0 = i - 1;
i2 = i + 1;
p0 = psrc1[i0]; // load the 3x3 neighbourhood
p1 = psrc1[i];
p2 = psrc1[i2];
p3 = psrc2[i0];
p4 = psrc2[i];
p5 = psrc2[i2];
p6 = psrc3[i0];
p7 = psrc3[i];
p8 = psrc3[i2];
// 19-exchange median network: each med_op leaves the max in its first
// argument and the min in its second; p4 ends up holding the median.
med_op(p1, p2, tmp);
med_op(p4, p5, tmp);
med_op(p7, p8, tmp);
med_op(p0, p1, tmp);
med_op(p3, p4, tmp);
med_op(p6, p7, tmp);
med_op(p1, p2, tmp);
med_op(p4, p5, tmp);
med_op(p7, p8, tmp);
med_op(p0, p3, tmp);
med_op(p5, p8, tmp);
med_op(p4, p7, tmp);
med_op(p3, p6, tmp);
med_op(p1, p4, tmp);
med_op(p2, p5, tmp);
med_op(p4, p7, tmp);
med_op(p4, p2, tmp);
med_op(p6, p4, tmp);
med_op(p4, p2, tmp);
pdst[i] = p4;
}
pdst[i] = psrc2[i]; // right border column
psrc1 += width;
psrc2 += width;
psrc3 += width;
pdst += width;
}
memcpy(pdst, psrc2, width * sizeof(U16)); // bottom border row
}
// median omp: 3x3 median filter (scalar network) with the image rows
// distributed across OpenMP threads.  Output matches median3(); the
// first/last rows and first/last columns are copied from src unchanged.
void medianFilt::median3_omp(U16 *src, U16 *dst, S32 width, S32 height)
{
S32 j = 0;
memcpy(dst, src, width * sizeof(U16)); // top border row
memcpy(dst + width * height - width, src + width * height - width, width * sizeof(U16)); // bottom border row
// Each row is independent of the others, so iterations parallelize freely;
// 'j' is the loop control variable and is implicitly private to each thread.
#pragma omp parallel for
for (j = 1; j < height - 1; j++)
{
U16 *psrc2 = src + width * j; // current row
U16 *pdst = dst + width * j;
U16 *psrc1 = psrc2 - width; // row above
U16 *psrc3 = psrc2 + width; // row below
U16 p0 = 0;
U16 p1 = 0;
U16 p2 = 0;
U16 p3 = 0;
U16 p4 = 0;
U16 p5 = 0;
U16 p6 = 0;
U16 p7 = 0;
U16 p8 = 0;
U16 tmp = 0;
S32 i0 = 0;
S32 i = 0;
S32 i2 = 0;
pdst[0] = psrc2[0]; // left border column
for (i = 1; i < width - 1; i++)
{
i0 = i - 1;
i2 = i + 1;
p0 = psrc1[i0]; // load the 3x3 neighbourhood
p1 = psrc1[i];
p2 = psrc1[i2];
p3 = psrc2[i0];
p4 = psrc2[i];
p5 = psrc2[i2];
p6 = psrc3[i0];
p7 = psrc3[i];
p8 = psrc3[i2];
// 19-exchange median network: each med_op leaves the max in its first
// argument and the min in its second; p4 ends up holding the median.
med_op(p1, p2, tmp);
med_op(p4, p5, tmp);
med_op(p7, p8, tmp);
med_op(p0, p1, tmp);
med_op(p3, p4, tmp);
med_op(p6, p7, tmp);
med_op(p1, p2, tmp);
med_op(p4, p5, tmp);
med_op(p7, p8, tmp);
med_op(p0, p3, tmp);
med_op(p5, p8, tmp);
med_op(p4, p7, tmp);
med_op(p3, p6, tmp);
med_op(p1, p4, tmp);
med_op(p2, p5, tmp);
med_op(p4, p7, tmp);
med_op(p4, p2, tmp);
med_op(p6, p4, tmp);
med_op(p4, p2, tmp);
pdst[i] = p4;;
}
pdst[i] = psrc2[i]; // right border column
}
}
// Filters one horizontal section of the image: rows 1..height-2 of the given
// section are median-filtered, the section's own first and last rows are left
// untouched (callers overlap adjacent sections by two rows so those rows are
// produced by the neighbouring section or by the caller's border copy).
// Used by the block-parallel variants (median3_omp4 / median3_4thd).
static void median3_section(U16 *src, U16 *dst, S32 width, S32 height)
{
U16 p0 = 0;
U16 p1 = 0;
U16 p2 = 0;
U16 p3 = 0;
U16 p4 = 0;
U16 p5 = 0;
U16 p6 = 0;
U16 p7 = 0;
U16 p8 = 0;
U16 tmp = 0;
S32 i0 = 0;
S32 i = 0;
S32 i2 = 0;
S32 j = 0;
U16 *psrc1 = NULL;
U16 *psrc2 = NULL;
U16 *psrc3 = NULL;
U16 *pdst = NULL;
psrc1 = src; // row j-1
psrc2 = psrc1 + width; // row j
psrc3 = psrc2 + width; // row j+1
pdst = dst + width;
for (j = 1; j < height - 1; j++)
{
pdst[0] = psrc2[0]; // left border column
for (i = 1; i < width - 1; i++)
{
i0 = i - 1;
i2 = i + 1;
p0 = psrc1[i0]; // load the 3x3 neighbourhood
p1 = psrc1[i];
p2 = psrc1[i2];
p3 = psrc2[i0];
p4 = psrc2[i];
p5 = psrc2[i2];
p6 = psrc3[i0];
p7 = psrc3[i];
p8 = psrc3[i2];
// 19-exchange median network; p4 ends up holding the median.
med_op(p1, p2, tmp);
med_op(p4, p5, tmp);
med_op(p7, p8, tmp);
med_op(p0, p1, tmp);
med_op(p3, p4, tmp);
med_op(p6, p7, tmp);
med_op(p1, p2, tmp);
med_op(p4, p5, tmp);
med_op(p7, p8, tmp);
med_op(p0, p3, tmp);
med_op(p5, p8, tmp);
med_op(p4, p7, tmp);
med_op(p3, p6, tmp);
med_op(p1, p4, tmp);
med_op(p2, p5, tmp);
med_op(p4, p7, tmp);
med_op(p4, p2, tmp);
med_op(p6, p4, tmp);
med_op(p4, p2, tmp);
pdst[i] = p4;;
}
pdst[i] = psrc2[i]; // right border column
psrc1 += width;
psrc2 += width;
psrc3 += width;
pdst += width;
}
}
// 3x3 median via four row-sections run in parallel with OpenMP.
// Fixes vs. original:
//  * 'offset' was a single variable shared by every OpenMP thread and written
//    inside the parallel loop -> data race; it is now computed per iteration.
//  * the section layout was hard-coded for a 290-row image (j * 72, height
//    74); it is now derived from 'height' (identical result at height == 290).
// Adjacent sections overlap by two rows so every interior row is filtered.
void medianFilt::median3_omp4(U16 *src, U16 *dst, S32 width, S32 height)
{
const S32 SECTIONS = 4;
const S32 interior = height - 2; // number of rows that actually get filtered
S32 j = 0;
if ((width < 1) || (height < 1))
{
return; // empty image: nothing to do
}
memcpy(dst, src, width * sizeof(U16)); // top border row
memcpy(dst + width * height - width, src + width * height - width, width * sizeof(U16)); // bottom border row
#pragma omp parallel for
for (j = 0; j < SECTIONS; j++)
{
const S32 r0 = 1 + (interior * j) / SECTIONS; // first interior row of this section
const S32 r1 = 1 + (interior * (j + 1)) / SECTIONS; // one past its last interior row
const S32 offset = (r0 - 1) * width; // section starts one row above r0
median3_section(src + offset, dst + offset, width, (r1 - r0) + 2);
}
}
// 3x3 median via four row-sections, one std::thread per section.
// Fixes vs. original:
//  * t1..t3 were detached and only t4 joined, so the function could return
//    while three threads were still writing 'dst' (and possibly after the
//    caller freed the buffers) -> all four threads are now joined.
//  * the section layout was hard-coded for a 290-row image; it is now
//    derived from 'height' (identical result at height == 290).
// Adjacent sections overlap by two rows so every interior row is filtered.
void medianFilt::median3_4thd(U16 *src, U16 *dst, S32 width, S32 height)
{
const S32 SECTIONS = 4;
const S32 interior = height - 2; // number of rows that actually get filtered
S32 j = 0;
if ((width < 1) || (height < 1))
{
return; // empty image: nothing to do
}
memcpy(dst, src, width * sizeof(U16)); // top border row
memcpy(dst + width * height - width, src + width * height - width, width * sizeof(U16)); // bottom border row
std::thread workers[SECTIONS];
for (j = 0; j < SECTIONS; j++)
{
const S32 r0 = 1 + (interior * j) / SECTIONS; // first interior row of this section
const S32 r1 = 1 + (interior * (j + 1)) / SECTIONS; // one past its last interior row
const S32 offset = (r0 - 1) * width; // section starts one row above r0
workers[j] = std::thread(median3_section, src + offset, dst + offset, width, (r1 - r0) + 2);
}
for (j = 0; j < SECTIONS; j++)
{
workers[j].join(); // result is complete only after every section finishes
}
}
// median SSE version: processes 8 u16 pixels per iteration with SSE
// compare-exchange intrinsics.  Border handling matches median3().
// Unaligned loads/stores (loadu/storeu) are required because the three
// column offsets (i-1, i, i+1) can never all be 16-byte aligned.
// NOTE(review): the per-row prologue loads 8 lanes starting at column 1,
// which assumes width >= MED_VEC_SZ + 2 — confirm with callers.
void medianFilt::median3_sse(U16 *src, U16 *dst, S32 width, S32 height)
{
#define MED_VEC_SZ 8
S32 i0 = 0;
S32 i = 0;
S32 i2 = 0;
S32 j = 0;
// Start column of the steady-state loop, chosen so the final 8-lane store
// ends exactly at column width-2.  The prologue below covers columns 1..8;
// any overlap with the first loop iteration recomputes identical values.
const S32 xend = (width - 2) % MED_VEC_SZ + 1;
__m128i v0;
__m128i v1;
__m128i v2;
__m128i v3;
__m128i v4;
__m128i v5;
__m128i v6;
__m128i v7;
__m128i v8;
__m128i vtmp;
U16 *psrc1 = NULL;
U16 *psrc2 = NULL;
U16 *psrc3 = NULL;
U16 *pdst = NULL;
memcpy(dst, src, width * sizeof(U16)); // top border row
psrc1 = src; // row j-1
psrc2 = psrc1 + width; // row j
psrc3 = psrc2 + width; // row j+1
pdst = dst + width;
for (j = 1; j < height - 1; j++)
{
pdst[0] = psrc2[0]; // left border column
// Prologue: filter columns 1..8 of this row.
v0 = _mm_loadu_si128((const __m128i*)(psrc1));
v1 = _mm_loadu_si128((const __m128i*)(psrc1 + 1));
v2 = _mm_loadu_si128((const __m128i*)(psrc1 + 2));
v3 = _mm_loadu_si128((const __m128i*)(psrc2));
v4 = _mm_loadu_si128((const __m128i*)(psrc2 + 1));
v5 = _mm_loadu_si128((const __m128i*)(psrc2 + 2));
v6 = _mm_loadu_si128((const __m128i*)(psrc3));
v7 = _mm_loadu_si128((const __m128i*)(psrc3 + 1));
v8 = _mm_loadu_si128((const __m128i*)(psrc3 + 2));
// 19-exchange median network, 8 pixels at a time; v4 holds the medians.
med_op_sse(v1, v2, vtmp);
med_op_sse(v4, v5, vtmp);
med_op_sse(v7, v8, vtmp);
med_op_sse(v0, v1, vtmp);
med_op_sse(v3, v4, vtmp);
med_op_sse(v6, v7, vtmp);
med_op_sse(v1, v2, vtmp);
med_op_sse(v4, v5, vtmp);
med_op_sse(v7, v8, vtmp);
med_op_sse(v0, v3, vtmp);
med_op_sse(v5, v8, vtmp);
med_op_sse(v4, v7, vtmp);
med_op_sse(v3, v6, vtmp);
med_op_sse(v1, v4, vtmp);
med_op_sse(v2, v5, vtmp);
med_op_sse(v4, v7, vtmp);
med_op_sse(v4, v2, vtmp);
med_op_sse(v6, v4, vtmp);
med_op_sse(v4, v2, vtmp);
_mm_storeu_si128((__m128i*)(pdst + 1), v4);
// Steady state: 8 columns per iteration from xend up to width-2.
for (i = xend; i < width - 1; i += MED_VEC_SZ)
{
i0 = i - 1;
i2 = i + 1;
v0 = _mm_loadu_si128((const __m128i*)(psrc1 + i0));
v1 = _mm_loadu_si128((const __m128i*)(psrc1 + i));
v2 = _mm_loadu_si128((const __m128i*)(psrc1 + i2));
v3 = _mm_loadu_si128((const __m128i*)(psrc2 + i0));
v4 = _mm_loadu_si128((const __m128i*)(psrc2 + i));
v5 = _mm_loadu_si128((const __m128i*)(psrc2 + i2));
v6 = _mm_loadu_si128((const __m128i*)(psrc3 + i0));
v7 = _mm_loadu_si128((const __m128i*)(psrc3 + i));
v8 = _mm_loadu_si128((const __m128i*)(psrc3 + i2));
med_op_sse(v1, v2, vtmp);
med_op_sse(v4, v5, vtmp);
med_op_sse(v7, v8, vtmp);
med_op_sse(v0, v1, vtmp);
med_op_sse(v3, v4, vtmp);
med_op_sse(v6, v7, vtmp);
med_op_sse(v1, v2, vtmp);
med_op_sse(v4, v5, vtmp);
med_op_sse(v7, v8, vtmp);
med_op_sse(v0, v3, vtmp);
med_op_sse(v5, v8, vtmp);
med_op_sse(v4, v7, vtmp);
med_op_sse(v3, v6, vtmp);
med_op_sse(v1, v4, vtmp);
med_op_sse(v2, v5, vtmp);
med_op_sse(v4, v7, vtmp);
med_op_sse(v4, v2, vtmp);
med_op_sse(v6, v4, vtmp);
med_op_sse(v4, v2, vtmp);
_mm_storeu_si128((__m128i*)(pdst + i), v4);
}
pdst[i] = psrc2[i]; // right border column (loop exits with i == width-1)
psrc1 += width;
psrc2 += width;
psrc3 += width;
pdst += width;
}
memcpy(pdst, psrc2, width * sizeof(U16)); // bottom border row
#undef MED_VEC_SZ
}
// median AVX version: identical structure to median3_sse but 16 u16 pixels
// per iteration using AVX2 256-bit compare-exchange intrinsics.
// NOTE(review): the per-row prologue loads 16 lanes starting at column 1,
// which assumes width >= MED_VEC_SZ + 2 — confirm with callers.
void medianFilt::median3_avx(U16 *src, U16 *dst, S32 width, S32 height)
{
#define MED_VEC_SZ 16
S32 i0 = 0;
S32 i = 0;
S32 i2 = 0;
S32 j = 0;
// Start column for the steady-state loop so the final 16-lane store ends
// exactly at column width-2; overlap with the prologue is recomputed
// identically and is therefore harmless.
const S32 xend = (width - 2) % MED_VEC_SZ + 1;
__m256i v0;
__m256i v1;
__m256i v2;
__m256i v3;
__m256i v4;
__m256i v5;
__m256i v6;
__m256i v7;
__m256i v8;
__m256i vtmp;
U16 *psrc1 = NULL;
U16 *psrc2 = NULL;
U16 *psrc3 = NULL;
U16 *pdst = NULL;
memcpy(dst, src, width * sizeof(U16)); // top border row
psrc1 = src; // row j-1
psrc2 = psrc1 + width; // row j
psrc3 = psrc2 + width; // row j+1
pdst = dst + width;
for (j = 1; j < height - 1; j++)
{
pdst[0] = psrc2[0]; // left border column
// Prologue: filter columns 1..16 of this row.
v0 = _mm256_loadu_si256((const __m256i*)(psrc1));
v1 = _mm256_loadu_si256((const __m256i*)(psrc1 + 1));
v2 = _mm256_loadu_si256((const __m256i*)(psrc1 + 2));
v3 = _mm256_loadu_si256((const __m256i*)(psrc2));
v4 = _mm256_loadu_si256((const __m256i*)(psrc2 + 1));
v5 = _mm256_loadu_si256((const __m256i*)(psrc2 + 2));
v6 = _mm256_loadu_si256((const __m256i*)(psrc3));
v7 = _mm256_loadu_si256((const __m256i*)(psrc3 + 1));
v8 = _mm256_loadu_si256((const __m256i*)(psrc3 + 2));
// 19-exchange median network, 16 pixels at a time; v4 holds the medians.
med_op_avx(v1, v2, vtmp);
med_op_avx(v4, v5, vtmp);
med_op_avx(v7, v8, vtmp);
med_op_avx(v0, v1, vtmp);
med_op_avx(v3, v4, vtmp);
med_op_avx(v6, v7, vtmp);
med_op_avx(v1, v2, vtmp);
med_op_avx(v4, v5, vtmp);
med_op_avx(v7, v8, vtmp);
med_op_avx(v0, v3, vtmp);
med_op_avx(v5, v8, vtmp);
med_op_avx(v4, v7, vtmp);
med_op_avx(v3, v6, vtmp);
med_op_avx(v1, v4, vtmp);
med_op_avx(v2, v5, vtmp);
med_op_avx(v4, v7, vtmp);
med_op_avx(v4, v2, vtmp);
med_op_avx(v6, v4, vtmp);
med_op_avx(v4, v2, vtmp);
_mm256_storeu_si256((__m256i*)(pdst + 1), v4);
// Steady state: 16 columns per iteration from xend up to width-2.
for (i = xend; i < width - 1; i += MED_VEC_SZ)
{
i0 = i - 1;
i2 = i + 1;
v0 = _mm256_loadu_si256((const __m256i*)(psrc1 + i0));
v1 = _mm256_loadu_si256((const __m256i*)(psrc1 + i));
v2 = _mm256_loadu_si256((const __m256i*)(psrc1 + i2));
v3 = _mm256_loadu_si256((const __m256i*)(psrc2 + i0));
v4 = _mm256_loadu_si256((const __m256i*)(psrc2 + i));
v5 = _mm256_loadu_si256((const __m256i*)(psrc2 + i2));
v6 = _mm256_loadu_si256((const __m256i*)(psrc3 + i0));
v7 = _mm256_loadu_si256((const __m256i*)(psrc3 + i));
v8 = _mm256_loadu_si256((const __m256i*)(psrc3 + i2));
med_op_avx(v1, v2, vtmp);
med_op_avx(v4, v5, vtmp);
med_op_avx(v7, v8, vtmp);
med_op_avx(v0, v1, vtmp);
med_op_avx(v3, v4, vtmp);
med_op_avx(v6, v7, vtmp);
med_op_avx(v1, v2, vtmp);
med_op_avx(v4, v5, vtmp);
med_op_avx(v7, v8, vtmp);
med_op_avx(v0, v3, vtmp);
med_op_avx(v5, v8, vtmp);
med_op_avx(v4, v7, vtmp);
med_op_avx(v3, v6, vtmp);
med_op_avx(v1, v4, vtmp);
med_op_avx(v2, v5, vtmp);
med_op_avx(v4, v7, vtmp);
med_op_avx(v4, v2, vtmp);
med_op_avx(v6, v4, vtmp);
med_op_avx(v4, v2, vtmp);
_mm256_storeu_si256((__m256i*)(pdst + i), v4);
}
pdst[i] = psrc2[i]; // right border column (loop exits with i == width-1)
psrc1 += width;
psrc2 += width;
psrc3 += width;
pdst += width;
}
memcpy(pdst, psrc2, width * sizeof(U16)); // bottom border row
#undef MED_VEC_SZ
}
// median AVX omp version: the AVX row kernel from median3_avx with rows
// distributed across OpenMP threads.  Borders are copied from src.
// NOTE(review): assumes width >= MED_VEC_SZ + 2 (prologue loads 16 lanes
// from column 1) — confirm with callers.
void medianFilt::median3_avx_omp(U16 *src, U16 *dst, S32 width, S32 height)
{
#define MED_VEC_SZ 16
S32 j = 0;
// Start column for the steady-state loop; see median3_avx for details.
const S32 xend = (width - 2) % MED_VEC_SZ + 1;
memcpy(dst, src, width * sizeof(U16)); // top border row
memcpy(dst + width * height - width, src + width * height - width, width * sizeof(U16)); // bottom border row
#pragma omp parallel for
for (j = 1; j < height - 1; j++)
{
// All state below is per-iteration (per-thread); pointers are derived
// from the loop index, so iterations are fully independent.
U16 *psrc2 = src + width * j; // current row
U16 *pdst = dst + width * j;
U16 *psrc1 = psrc2 - width; // row above
U16 *psrc3 = psrc2 + width; // row below
__m256i v0;
__m256i v1;
__m256i v2;
__m256i v3;
__m256i v4;
__m256i v5;
__m256i v6;
__m256i v7;
__m256i v8;
__m256i vtmp;
S32 i0 = 0;
S32 i = 0;
S32 i2 = 0;
pdst[0] = psrc2[0]; // left border column
// Prologue: filter columns 1..16 of this row.
v0 = _mm256_loadu_si256((const __m256i*)(psrc1));
v1 = _mm256_loadu_si256((const __m256i*)(psrc1 + 1));
v2 = _mm256_loadu_si256((const __m256i*)(psrc1 + 2));
v3 = _mm256_loadu_si256((const __m256i*)(psrc2));
v4 = _mm256_loadu_si256((const __m256i*)(psrc2 + 1));
v5 = _mm256_loadu_si256((const __m256i*)(psrc2 + 2));
v6 = _mm256_loadu_si256((const __m256i*)(psrc3));
v7 = _mm256_loadu_si256((const __m256i*)(psrc3 + 1));
v8 = _mm256_loadu_si256((const __m256i*)(psrc3 + 2));
// 19-exchange median network; v4 holds the medians.
med_op_avx(v1, v2, vtmp);
med_op_avx(v4, v5, vtmp);
med_op_avx(v7, v8, vtmp);
med_op_avx(v0, v1, vtmp);
med_op_avx(v3, v4, vtmp);
med_op_avx(v6, v7, vtmp);
med_op_avx(v1, v2, vtmp);
med_op_avx(v4, v5, vtmp);
med_op_avx(v7, v8, vtmp);
med_op_avx(v0, v3, vtmp);
med_op_avx(v5, v8, vtmp);
med_op_avx(v4, v7, vtmp);
med_op_avx(v3, v6, vtmp);
med_op_avx(v1, v4, vtmp);
med_op_avx(v2, v5, vtmp);
med_op_avx(v4, v7, vtmp);
med_op_avx(v4, v2, vtmp);
med_op_avx(v6, v4, vtmp);
med_op_avx(v4, v2, vtmp);
_mm256_storeu_si256((__m256i*)(pdst + 1), v4);
// Steady state: 16 columns per iteration from xend up to width-2.
for (i = xend; i < width - 1; i += MED_VEC_SZ)
{
i0 = i - 1;
i2 = i + 1;
v0 = _mm256_loadu_si256((const __m256i*)(psrc1 + i0));
v1 = _mm256_loadu_si256((const __m256i*)(psrc1 + i));
v2 = _mm256_loadu_si256((const __m256i*)(psrc1 + i2));
v3 = _mm256_loadu_si256((const __m256i*)(psrc2 + i0));
v4 = _mm256_loadu_si256((const __m256i*)(psrc2 + i));
v5 = _mm256_loadu_si256((const __m256i*)(psrc2 + i2));
v6 = _mm256_loadu_si256((const __m256i*)(psrc3 + i0));
v7 = _mm256_loadu_si256((const __m256i*)(psrc3 + i));
v8 = _mm256_loadu_si256((const __m256i*)(psrc3 + i2));
med_op_avx(v1, v2, vtmp);
med_op_avx(v4, v5, vtmp);
med_op_avx(v7, v8, vtmp);
med_op_avx(v0, v1, vtmp);
med_op_avx(v3, v4, vtmp);
med_op_avx(v6, v7, vtmp);
med_op_avx(v1, v2, vtmp);
med_op_avx(v4, v5, vtmp);
med_op_avx(v7, v8, vtmp);
med_op_avx(v0, v3, vtmp);
med_op_avx(v5, v8, vtmp);
med_op_avx(v4, v7, vtmp);
med_op_avx(v3, v6, vtmp);
med_op_avx(v1, v4, vtmp);
med_op_avx(v2, v5, vtmp);
med_op_avx(v4, v7, vtmp);
med_op_avx(v4, v2, vtmp);
med_op_avx(v6, v4, vtmp);
med_op_avx(v4, v2, vtmp);
_mm256_storeu_si256((__m256i*)(pdst + i), v4);
}
pdst[i] = psrc2[i]; // right border column
// The increments below have no effect across iterations: the pointers
// are loop-local and recomputed from j at the top of each iteration.
psrc1 += width;
psrc2 += width;
psrc3 += width;
pdst += width;
}
#undef MED_VEC_SZ
}
// AVX row kernel applied to one horizontal section: filters rows
// 1..height-2 of the section, leaving its first/last rows untouched
// (callers overlap adjacent sections by two rows).  Used by
// median3_avx_omp4.  Assumes width >= MED_VEC_SZ + 2 (see median3_avx).
static void median3_avx_section(U16 *src, U16 *dst, S32 width, S32 height)
{
#define MED_VEC_SZ 16
S32 i0 = 0;
S32 i = 0;
S32 i2 = 0;
S32 j = 0;
// Start column for the steady-state loop; see median3_avx for details.
const S32 xend = (width - 2) % MED_VEC_SZ + 1;
__m256i v0;
__m256i v1;
__m256i v2;
__m256i v3;
__m256i v4;
__m256i v5;
__m256i v6;
__m256i v7;
__m256i v8;
__m256i vtmp;
U16 *psrc1 = NULL;
U16 *psrc2 = NULL;
U16 *psrc3 = NULL;
U16 *pdst = NULL;
psrc1 = src; // row j-1
psrc2 = psrc1 + width; // row j
psrc3 = psrc2 + width; // row j+1
pdst = dst + width;
for (j = 1; j < height - 1; j++)
{
pdst[0] = psrc2[0]; // left border column
// Prologue: filter columns 1..16 of this row.
v0 = _mm256_loadu_si256((const __m256i*)(psrc1));
v1 = _mm256_loadu_si256((const __m256i*)(psrc1 + 1));
v2 = _mm256_loadu_si256((const __m256i*)(psrc1 + 2));
v3 = _mm256_loadu_si256((const __m256i*)(psrc2));
v4 = _mm256_loadu_si256((const __m256i*)(psrc2 + 1));
v5 = _mm256_loadu_si256((const __m256i*)(psrc2 + 2));
v6 = _mm256_loadu_si256((const __m256i*)(psrc3));
v7 = _mm256_loadu_si256((const __m256i*)(psrc3 + 1));
v8 = _mm256_loadu_si256((const __m256i*)(psrc3 + 2));
// 19-exchange median network; v4 holds the medians.
med_op_avx(v1, v2, vtmp);
med_op_avx(v4, v5, vtmp);
med_op_avx(v7, v8, vtmp);
med_op_avx(v0, v1, vtmp);
med_op_avx(v3, v4, vtmp);
med_op_avx(v6, v7, vtmp);
med_op_avx(v1, v2, vtmp);
med_op_avx(v4, v5, vtmp);
med_op_avx(v7, v8, vtmp);
med_op_avx(v0, v3, vtmp);
med_op_avx(v5, v8, vtmp);
med_op_avx(v4, v7, vtmp);
med_op_avx(v3, v6, vtmp);
med_op_avx(v1, v4, vtmp);
med_op_avx(v2, v5, vtmp);
med_op_avx(v4, v7, vtmp);
med_op_avx(v4, v2, vtmp);
med_op_avx(v6, v4, vtmp);
med_op_avx(v4, v2, vtmp);
_mm256_storeu_si256((__m256i*)(pdst + 1), v4);
// Steady state: 16 columns per iteration from xend up to width-2.
for (i = xend; i < width - 1; i += MED_VEC_SZ)
{
i0 = i - 1;
i2 = i + 1;
v0 = _mm256_loadu_si256((const __m256i*)(psrc1 + i0));
v1 = _mm256_loadu_si256((const __m256i*)(psrc1 + i));
v2 = _mm256_loadu_si256((const __m256i*)(psrc1 + i2));
v3 = _mm256_loadu_si256((const __m256i*)(psrc2 + i0));
v4 = _mm256_loadu_si256((const __m256i*)(psrc2 + i));
v5 = _mm256_loadu_si256((const __m256i*)(psrc2 + i2));
v6 = _mm256_loadu_si256((const __m256i*)(psrc3 + i0));
v7 = _mm256_loadu_si256((const __m256i*)(psrc3 + i));
v8 = _mm256_loadu_si256((const __m256i*)(psrc3 + i2));
med_op_avx(v1, v2, vtmp);
med_op_avx(v4, v5, vtmp);
med_op_avx(v7, v8, vtmp);
med_op_avx(v0, v1, vtmp);
med_op_avx(v3, v4, vtmp);
med_op_avx(v6, v7, vtmp);
med_op_avx(v1, v2, vtmp);
med_op_avx(v4, v5, vtmp);
med_op_avx(v7, v8, vtmp);
med_op_avx(v0, v3, vtmp);
med_op_avx(v5, v8, vtmp);
med_op_avx(v4, v7, vtmp);
med_op_avx(v3, v6, vtmp);
med_op_avx(v1, v4, vtmp);
med_op_avx(v2, v5, vtmp);
med_op_avx(v4, v7, vtmp);
med_op_avx(v4, v2, vtmp);
med_op_avx(v6, v4, vtmp);
med_op_avx(v4, v2, vtmp);
_mm256_storeu_si256((__m256i*)(pdst + i), v4);
}
pdst[i] = psrc2[i]; // right border column
psrc1 += width;
psrc2 += width;
psrc3 += width;
pdst += width;
}
#undef MED_VEC_SZ
}
// 3x3 median via four row-sections run in parallel with OpenMP, each
// section filtered with the AVX kernel (median3_avx_section).
// Fixes vs. original:
//  * 'offset' was a single variable shared by every OpenMP thread and written
//    inside the parallel loop -> data race; it is now computed per iteration.
//  * the section layout was hard-coded for a 290-row image (j * 72, height
//    74); it is now derived from 'height' (identical result at height == 290).
// Adjacent sections overlap by two rows so every interior row is filtered.
void medianFilt::median3_avx_omp4(U16 *src, U16 *dst, S32 width, S32 height)
{
const S32 SECTIONS = 4;
const S32 interior = height - 2; // number of rows that actually get filtered
S32 j = 0;
if ((width < 1) || (height < 1))
{
return; // empty image: nothing to do
}
memcpy(dst, src, width * sizeof(U16)); // top border row
memcpy(dst + width * height - width, src + width * height - width, width * sizeof(U16)); // bottom border row
#pragma omp parallel for
for (j = 0; j < SECTIONS; j++)
{
const S32 r0 = 1 + (interior * j) / SECTIONS; // first interior row of this section
const S32 r1 = 1 + (interior * (j + 1)) / SECTIONS; // one past its last interior row
const S32 offset = (r0 - 1) * width; // section starts one row above r0
median3_avx_section(src + offset, dst + offset, width, (r1 - r0) + 2);
}
}
使用了opencv读图并计算时间消耗
// Benchmark harness: loads a 16-bit test image with OpenCV, runs every
// median variant CYC_NUM times, and prints the average time per call.
// Note: all variants write into the same buffer, so at the end 'dst_'
// holds only the output of the last variant run (median3_avx_omp4).
void median1_testP()
{
Mat src, dst_, dst, delta;
src = imread("D:/d.pgm", -1); // -1: load unchanged, keeps 16-bit depth
medianBlur(src, dst, 3); // OpenCV reference result for comparison
dst_ = src.clone();
U16 *psrc = (U16 *)src.data;
U16 *pdst = (U16 *)dst_.data;
double t;
// Scalar single-core baseline.
t = (double)getTickCount();
for (int i = 0; i < CYC_NUM; i++)
{
medianFilt::median3(psrc, pdst, src.cols, src.rows);
}
t = ((double)getTickCount() - t) * 1000 / CYC_NUM / getTickFrequency();
printf("%.3fms\n", t);
// OpenMP row-parallel.
t = (double)getTickCount();
for (int i = 0; i < CYC_NUM; i++)
{
medianFilt::median3_omp(psrc, pdst, src.cols, src.rows);
}
t = ((double)getTickCount() - t) * 1000 / CYC_NUM / getTickFrequency();
printf("omp %.3fms\n", t);
// OpenMP with four row-sections.
t = (double)getTickCount();
for (int i = 0; i < CYC_NUM; i++)
{
medianFilt::median3_omp4(psrc, pdst, src.cols, src.rows);
}
t = ((double)getTickCount() - t) * 1000 / CYC_NUM / getTickFrequency();
printf("omp4 %.3fms\n", t);
// Four std::thread sections (thread creation cost dominates here).
t = (double)getTickCount();
for (int i = 0; i < CYC_NUM; i++)
{
medianFilt::median3_4thd(psrc, pdst, src.cols, src.rows);
}
t = ((double)getTickCount() - t) * 1000 / CYC_NUM / getTickFrequency();
printf("4thd %.3fms\n", t);
// SSE, 8 pixels per iteration.
t = (double)getTickCount();
for (int i = 0; i < CYC_NUM; i++)
{
medianFilt::median3_sse(psrc, pdst, src.cols, src.rows);
}
t = ((double)getTickCount() - t) * 1000 / CYC_NUM / getTickFrequency();
printf("sse %.3fms\n", t);
// AVX, 16 pixels per iteration.
t = (double)getTickCount();
for (int i = 0; i < CYC_NUM; i++)
{
medianFilt::median3_avx(psrc, pdst, src.cols, src.rows);
}
t = ((double)getTickCount() - t) * 1000 / CYC_NUM / getTickFrequency();
printf("avx %.3fms\n", t);
// AVX + OpenMP row-parallel.
t = (double)getTickCount();
for (int i = 0; i < CYC_NUM; i++)
{
medianFilt::median3_avx_omp(psrc, pdst, src.cols, src.rows);
}
t = ((double)getTickCount() - t) * 1000 / CYC_NUM / getTickFrequency();
printf("avx omp %.3fms\n", t);
// AVX + OpenMP with four row-sections.
t = (double)getTickCount();
for (int i = 0; i < CYC_NUM; i++)
{
medianFilt::median3_avx_omp4(psrc, pdst, src.cols, src.rows);
}
t = ((double)getTickCount() - t) * 1000 / CYC_NUM / getTickFrequency();
printf("avx omp4 %.3fms\n", t);
// Border-free ROI for inspecting the difference against cv::medianBlur
// in a debugger (borders differ: this code copies them, OpenCV replicates).
// NOTE(review): ROI appears hard-coded for a 500x290 test image — confirm.
Rect roi(1, 1, 498, 288);
delta = (dst_ - dst)(roi);
}
#if NEON_ENABLE
// median NEON version: same structure as median3_sse but using ARM NEON
// intrinsics, 8 u16 lanes per iteration.  NOTE(review): the author states
// this variant has not been tested — verify against median3() on target.
// Assumes width >= MED_VEC_SZ + 2 (prologue loads 8 lanes from column 1).
void medianFilt::median3_neon(U16 *src, U16 *dst, S32 width, S32 height)
{
#define MED_VEC_SZ 8
S32 i0 = 0;
S32 i = 0;
S32 i2 = 0;
S32 j = 0;
// Start column for the steady-state loop; see median3_sse for details.
const S32 xend = (width - 2) % MED_VEC_SZ + 1;
uint16x8_t v0;
uint16x8_t v1;
uint16x8_t v2;
uint16x8_t v3;
uint16x8_t v4;
uint16x8_t v5;
uint16x8_t v6;
uint16x8_t v7;
uint16x8_t v8;
uint16x8_t vtmp;
U16 *psrc1 = NULL;
U16 *psrc2 = NULL;
U16 *psrc3 = NULL;
U16 *pdst = NULL;
memcpy(dst, src, width * sizeof(U16)); // top border row
psrc1 = src; // row j-1
psrc2 = psrc1 + width; // row j
psrc3 = psrc2 + width; // row j+1
pdst = dst + width;
for (j = 1; j < height - 1; j++)
{
pdst[0] = psrc2[0]; // left border column
// Prologue: filter columns 1..8 of this row.
v0 = vld1q_u16(psrc1);
v1 = vld1q_u16(psrc1 + 1);
v2 = vld1q_u16(psrc1 + 2);
v3 = vld1q_u16(psrc2);
v4 = vld1q_u16(psrc2 + 1);
v5 = vld1q_u16(psrc2 + 2);
v6 = vld1q_u16(psrc3);
v7 = vld1q_u16(psrc3 + 1);
v8 = vld1q_u16(psrc3 + 2);
// 19-exchange median network; v4 holds the medians.
med_op_neon(v1, v2, vtmp);
med_op_neon(v4, v5, vtmp);
med_op_neon(v7, v8, vtmp);
med_op_neon(v0, v1, vtmp);
med_op_neon(v3, v4, vtmp);
med_op_neon(v6, v7, vtmp);
med_op_neon(v1, v2, vtmp);
med_op_neon(v4, v5, vtmp);
med_op_neon(v7, v8, vtmp);
med_op_neon(v0, v3, vtmp);
med_op_neon(v5, v8, vtmp);
med_op_neon(v4, v7, vtmp);
med_op_neon(v3, v6, vtmp);
med_op_neon(v1, v4, vtmp);
med_op_neon(v2, v5, vtmp);
med_op_neon(v4, v7, vtmp);
med_op_neon(v4, v2, vtmp);
med_op_neon(v6, v4, vtmp);
med_op_neon(v4, v2, vtmp);
vst1q_u16((pdst + 1), v4);
// Steady state: 8 columns per iteration from xend up to width-2.
for (i = xend; i < width - 1; i += MED_VEC_SZ)
{
i0 = i - 1;
i2 = i + 1;
v0 = vld1q_u16(psrc1 + i0);
v1 = vld1q_u16(psrc1 + i);
v2 = vld1q_u16(psrc1 + i2);
v3 = vld1q_u16(psrc2 + i0);
v4 = vld1q_u16(psrc2 + i);
v5 = vld1q_u16(psrc2 + i2);
v6 = vld1q_u16(psrc3 + i0);
v7 = vld1q_u16(psrc3 + i);
v8 = vld1q_u16(psrc3 + i2);
med_op_neon(v1, v2, vtmp);
med_op_neon(v4, v5, vtmp);
med_op_neon(v7, v8, vtmp);
med_op_neon(v0, v1, vtmp);
med_op_neon(v3, v4, vtmp);
med_op_neon(v6, v7, vtmp);
med_op_neon(v1, v2, vtmp);
med_op_neon(v4, v5, vtmp);
med_op_neon(v7, v8, vtmp);
med_op_neon(v0, v3, vtmp);
med_op_neon(v5, v8, vtmp);
med_op_neon(v4, v7, vtmp);
med_op_neon(v3, v6, vtmp);
med_op_neon(v1, v4, vtmp);
med_op_neon(v2, v5, vtmp);
med_op_neon(v4, v7, vtmp);
med_op_neon(v4, v2, vtmp);
med_op_neon(v6, v4, vtmp);
med_op_neon(v4, v2, vtmp);
vst1q_u16((pdst + i), v4);
}
pdst[i] = psrc2[i]; // right border column
psrc1 += width;
psrc2 += width;
psrc3 += width;
pdst += width;
}
memcpy(pdst, psrc2, width * sizeof(U16)); // bottom border row
#undef MED_VEC_SZ
}
#endif
现在使用测试代码测试速度(我的平台是4核CPU)
1 core : 0.157ms
omp : 0.043ms
omp 4 sections : 0.043ms
4threads : 2.919ms
sse : 0.154ms
avx : 0.095ms
avx omp : 0.026ms
avx omp 4 sections : 0.028ms
总结:
1.1 快速3X3中值滤波本身的计算是元素比较和排序,
属于控制密集型,而非运算密集型
所以带宽的耗时比计算的耗时明显
1.2 由于算法需要,数据IO使用的是loadu和storeu,而不是load和store。读写访问速度要慢一些。
2. 单核版本和SSE加速版本耗时几乎一样,这里是因为VS编译器的优化使得快速3X3中值滤波速度很快
计算速度瓶颈主要在内存访问
3. omp加速版本相比单核的快了3.65倍,接近4倍,和四核加速比较符合
4. omp分行加速和多行分块加速是几乎一样的,可见这里的额外开销不大
5. avx加速是sse加速的1.62倍,接近2倍,符合128位数据和256位数据特性
6. SIMD + multiThread : 这里测试了avx + omp
6.1 在avx的基础上增加omp多线程确实能增加3.65倍速度,接近4倍,和四核加速比较符合
6.2 avx + omp分行加速和多行分块加速是几乎一样的,可见这里的额外开销不大
7. std::thread的额外开销好像很大,分块并行反而更慢了,近20倍
8. neon版本在文末,暂时没有测试过