Using matrix transposition as an example, this article analyzes several optimization approaches: plain CPU code with column-sequential and row-sequential writes, NEON 8x8 and 16x8 block versions, and a cache-prefetch variant, and summarizes the lessons learned.
Test setup:
Array size: 640 * 480
Iterations: 1024
Test platform: RV1126
A transpose swaps a matrix's rows and columns, while matrices are normally stored row-major in memory, so during the conversion either the source or the destination matrix must be accessed non-sequentially. To make the run long and stable enough to time reliably, and to make profiling captures easier, the code is executed 1024 times.
#include <arm_neon.h> //this header is required for the NEON intrinsics
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <time.h>
//forward declarations of the transpose routines defined later in this article
int transposition_cpu_colseq(uint8_t *src, uint8_t *dst, const int w, const int h);
int transposition_cpu_rowseq(uint8_t *src, uint8_t *dst, const int w, const int h);
int transposition_neon_8x8(uint8_t *src, uint8_t *dst, int w, int h);
int transposition_neon_16x8(uint8_t *src, uint8_t *dst, int w, int h);
int transposition_neon_16x8_prefetch(uint8_t *src, uint8_t *dst, int w, int h);
typedef unsigned long long am_ms_t;
typedef unsigned long long am_us_t;
#define IMAGE_WIDTH (320)
#define IMAGE_HEIGH (240)
#define IMAGE_TEST (10240)
am_ms_t sys_get_ms(void)
{
struct timespec ts;
am_ms_t ms = 0;
clock_gettime(CLOCK_MONOTONIC_RAW, &ts);
ms = (am_ms_t)ts.tv_sec * 1000; //widen before multiplying to avoid 32-bit overflow
ms += ((am_ms_t)(ts.tv_nsec / 1000000));
return ms;
}
am_us_t sys_get_us(void)
{
am_us_t us = 0;
struct timespec ts;
clock_gettime(CLOCK_MONOTONIC_RAW, &ts);
us = (am_us_t)ts.tv_sec * 1000000; //widen before multiplying to avoid 32-bit overflow
us += (am_us_t)(ts.tv_nsec / 1000);
return us;
}
int main()
{
uint32_t size = IMAGE_WIDTH * IMAGE_HEIGH;
uint32_t i = 0;
uint8_t *psrc = NULL;
uint8_t *pdst0 = NULL;
posix_memalign((void **)&psrc, 64, size);
posix_memalign((void **)&pdst0, 64, size);
am_ms_t old_ms = 0;
memset(pdst0, 0, size);
srand(time(NULL));
for(i=0; i<size; i++)
{
*(psrc + i) = (rand()&0xff);
}
old_ms = sys_get_ms();
for (i=0; i<IMAGE_TEST; i++)
{
transposition_cpu_colseq(psrc, pdst0, IMAGE_WIDTH, IMAGE_HEIGH);
}
printf("trans col const:%llu ms\n", sys_get_ms()-old_ms);
old_ms = sys_get_ms();
for (i=0; i<IMAGE_TEST; i++)
{
transposition_cpu_rowseq(psrc, pdst0, IMAGE_WIDTH, IMAGE_HEIGH);
}
printf("trans row const:%llu ms\n", sys_get_ms()-old_ms);
old_ms = sys_get_ms();
for (i=0; i<IMAGE_TEST; i++)
{
transposition_neon_8x8(psrc, pdst0, IMAGE_WIDTH, IMAGE_HEIGH);
}
printf("trans neon 8x8 const:%llu ms\n", sys_get_ms()-old_ms);
old_ms = sys_get_ms();
for (i=0; i<IMAGE_TEST; i++)
{
transposition_neon_16x8(psrc, pdst0, IMAGE_WIDTH, IMAGE_HEIGH);
}
printf("trans neon 16x8 const:%llu ms\n", sys_get_ms()-old_ms);
old_ms = sys_get_ms();
for (i=0; i<IMAGE_TEST; i++)
{
transposition_neon_16x8_prefetch(psrc, pdst0, IMAGE_WIDTH, IMAGE_HEIGH);
}
printf("trans neon 16x8 prefetch const:%llu ms\n", sys_get_ms()-old_ms);
free(psrc);
free(pdst0);
return 0;
}
Compile options:
ASFLAGS := -g -ggdb -Os -mfpu=neon-vfpv4 -mfloat-abi=hard
CFLAGS := -g -ggdb -Os -mfpu=neon-vfpv4 -mfloat-abi=hard
CXXFLAGS := -g -ggdb -Os -mfpu=neon-vfpv4 -mfloat-abi=hard
LDFLAGS :=
ASFLAGS += -ftree-vectorize -fopenmp -ffast-math -finline-functions -funroll-all-loops
CFLAGS += -ftree-vectorize -fopenmp -ffast-math -finline-functions -funroll-all-loops
CXXFLAGS += -ftree-vectorize -fopenmp -ffast-math -finline-functions -funroll-all-loops
LDFLAGS += -fopenmp
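For reference, the RV1126's CPU is a Cortex-A7, so a cross build with the flags above might look like the following; the toolchain prefix, the -mcpu value, and the file name transpose.c are assumptions for illustration, not taken from the build files above:
arm-linux-gnueabihf-gcc -Os -mcpu=cortex-a7 -mfpu=neon-vfpv4 -mfloat-abi=hard \
    -ftree-vectorize -fopenmp -ffast-math -finline-functions -funroll-all-loops \
    transpose.c -o transpose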
int transposition_cpu_colseq(uint8_t *src, uint8_t *dst, const int w, const int h)
{
//sequential reads, strided writes
int x, y;
for (y = 0; y < h; y++)
{
for (x = 0; x < w; x++)
{
dst[x * h + y] = src[y * w + x];
}
}
return 0;
}
int transposition_cpu_rowseq(uint8_t *src, uint8_t *dst, const int w, const int h)
{
//strided reads, sequential writes
int x, y;
for(x=0; x<w; x++)
{
for(y=0; y<h; y++)
{
dst[x * h + y] = src[y * w + x];
}
}
return 0;
}
How the cache handles reads versus writes differs in implementation detail, and the exact read/write performance gap is determined by the chip's architecture. In practice, benchmark both variants and pick the faster one.
According to the cache chapter of Computer Systems: A Programmer's Perspective (《深入理解计算机系统》), handling a write is more complex for the cache than handling a read. On a read miss, the cache searches the lower levels of the hierarchy and loads the cache line containing the requested data. On a write miss (under a write-allocate policy), the line is first fetched from the lower levels into the current cache, the cached line is then updated, and the update is eventually propagated back down the hierarchy. Non-sequential writes therefore incur extra write-back work that non-sequential reads do not, which should make the write-strided version slower.
This contradicts my measured results, and I have not yet worked out the exact cause.
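One classic remedy worth trying here is cache blocking: walk the matrix in small tiles so that a tile's source rows and its destination columns both stay resident in cache. Below is a minimal sketch; transposition_cpu_blocked is a name introduced here for illustration, and the 32x32 tile size is an assumption that should be tuned per platform:
#define TILE (32) //tile size is an assumption; tune for the target cache
int transposition_cpu_blocked(uint8_t *src, uint8_t *dst, const int w, const int h)
{
    int bx, by, x, y;
    for (by = 0; by < h; by += TILE)
    {
        for (bx = 0; bx < w; bx += TILE)
        {
            //clamp the tile to the matrix edge
            int ymax = (by + TILE < h) ? (by + TILE) : h;
            int xmax = (bx + TILE < w) ? (bx + TILE) : w;
            for (y = by; y < ymax; y++)
            {
                for (x = bx; x < xmax; x++)
                {
                    dst[x * h + y] = src[y * w + x];
                }
            }
        }
    }
    return 0;
}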
int transposition_neon_8x8(uint8_t *src, uint8_t *dst, int w, int h)
{
uint8x8x4_t mat1;
uint8x8x4_t mat2;
uint8x8x2_t temp1;
uint8x8x2_t temp2;
uint8x8x2_t temp3;
uint8x8x2_t temp4;
uint16x4x4_t temp11;
uint16x4x4_t temp12;
uint16x4x2_t temp5;
uint16x4x2_t temp6;
uint16x4x2_t temp7;
uint16x4x2_t temp8;
uint32x2x4_t temp21;
uint32x2x4_t temp22;
uint32x2x2_t res1;
uint32x2x2_t res2;
uint32x2x2_t res3;
uint32x2x2_t res4;
int dw = w & 7;
int dh = h & 7;
int sw = w - dw;
int sh = h - dh;
int x, y;
for(y=0; y<sh; y+=8)
{
for(x=0; x<sw; x+=8)
{
//load an 8x8 block, one source row per 64-bit D register
mat1.val[0] = vld1_u8(src + (y + 0) * w + x);
mat1.val[1] = vld1_u8(src + (y + 1) * w + x);
mat1.val[2] = vld1_u8(src + (y + 2) * w + x);
mat1.val[3] = vld1_u8(src + (y + 3) * w + x);
mat2.val[0] = vld1_u8(src + (y + 4) * w + x);
mat2.val[1] = vld1_u8(src + (y + 5) * w + x);
mat2.val[2] = vld1_u8(src + (y + 6) * w + x);
mat2.val[3] = vld1_u8(src + (y + 7) * w + x);
//stage 1: vtrn_u8 transposes 2x2 blocks at byte granularity
temp1 = vtrn_u8(mat1.val[0], mat1.val[1]);
temp2 = vtrn_u8(mat1.val[2], mat1.val[3]);
temp3 = vtrn_u8(mat2.val[0], mat2.val[1]);
temp4 = vtrn_u8(mat2.val[2], mat2.val[3]);
temp11.val[0] = vreinterpret_u16_u8(temp1.val[0]);
temp11.val[1] = vreinterpret_u16_u8(temp1.val[1]);
temp11.val[2] = vreinterpret_u16_u8(temp2.val[0]);
temp11.val[3] = vreinterpret_u16_u8(temp2.val[1]);
temp12.val[0] = vreinterpret_u16_u8(temp3.val[0]);
temp12.val[1] = vreinterpret_u16_u8(temp3.val[1]);
temp12.val[2] = vreinterpret_u16_u8(temp4.val[0]);
temp12.val[3] = vreinterpret_u16_u8(temp4.val[1]);
//stage 2: vtrn_u16 transposes 2x2 blocks of 16-bit lanes
temp5 = vtrn_u16(temp11.val[0], temp11.val[2]);
temp6 = vtrn_u16(temp11.val[1], temp11.val[3]);
temp7 = vtrn_u16(temp12.val[0], temp12.val[2]);
temp8 = vtrn_u16(temp12.val[1], temp12.val[3]);
temp21.val[0] = vreinterpret_u32_u16(temp5.val[0]);
temp21.val[1] = vreinterpret_u32_u16(temp5.val[1]);
temp21.val[2] = vreinterpret_u32_u16(temp6.val[0]);
temp21.val[3] = vreinterpret_u32_u16(temp6.val[1]);
temp22.val[0] = vreinterpret_u32_u16(temp7.val[0]);
temp22.val[1] = vreinterpret_u32_u16(temp7.val[1]);
temp22.val[2] = vreinterpret_u32_u16(temp8.val[0]);
temp22.val[3] = vreinterpret_u32_u16(temp8.val[1]);
//stage 3: vtrn_u32 transposes 2x2 blocks of 32-bit lanes, completing the 8x8 transpose
res1 = vtrn_u32(temp21.val[0],temp22.val[0]);
res2 = vtrn_u32(temp21.val[1],temp22.val[1]);
res3 = vtrn_u32(temp21.val[2],temp22.val[2]);
res4 = vtrn_u32(temp21.val[3],temp22.val[3]);
mat1.val[0] = vreinterpret_u8_u32(res1.val[0]);
mat1.val[1] = vreinterpret_u8_u32(res2.val[0]);
mat1.val[2] = vreinterpret_u8_u32(res3.val[0]);
mat1.val[3] = vreinterpret_u8_u32(res4.val[0]);
mat2.val[0] = vreinterpret_u8_u32(res1.val[1]);
mat2.val[1] = vreinterpret_u8_u32(res2.val[1]);
mat2.val[2] = vreinterpret_u8_u32(res3.val[1]);
mat2.val[3] = vreinterpret_u8_u32(res4.val[1]);
//store the eight transposed rows; the vtrn cascade leaves them in a permuted register order
vst1_u8(dst + (x + 0) * h + y, mat1.val[0]);
vst1_u8(dst + (x + 1) * h + y, mat1.val[2]);
vst1_u8(dst + (x + 2) * h + y, mat1.val[1]);
vst1_u8(dst + (x + 3) * h + y, mat1.val[3]);
vst1_u8(dst + (x + 4) * h + y, mat2.val[0]);
vst1_u8(dst + (x + 5) * h + y, mat2.val[2]);
vst1_u8(dst + (x + 6) * h + y, mat2.val[1]);
vst1_u8(dst + (x + 7) * h + y, mat2.val[3]);
}
}
//scalar fallback for the leftover bottom rows
for(y=sh; y<h; y++)
{
for(x=0; x<w; x++)
{
dst[x * h + y] = src[ y * w + x];
}
}
//scalar fallback for the leftover right-hand columns
for(x=sw; x<w; x++)
{
for(y=0; y<sh; y++)
{
dst[x * h + y] = src[ y * w + x];
}
}
return 0;
}
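Permutation-heavy intrinsic code like the vtrn cascade above is easy to get subtly wrong, especially the store order, so it is worth validating the NEON path against the scalar version before timing it. A minimal sketch of such a check; check_transpose is a helper introduced here for illustration:
static int check_transpose(int w, int h)
{
    uint8_t *src = malloc((size_t)w * h);
    uint8_t *ref = malloc((size_t)w * h);
    uint8_t *out = malloc((size_t)w * h);
    int i, ok;
    for (i = 0; i < w * h; i++)
        src[i] = (uint8_t)(rand() & 0xff);
    transposition_cpu_rowseq(src, ref, w, h); //scalar reference
    transposition_neon_8x8(src, out, w, h);   //candidate under test
    ok = (memcmp(ref, out, (size_t)w * h) == 0);
    free(src); free(ref); free(out);
    return ok; //1 if the NEON result matches the scalar reference
}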
int transposition_neon_16x8(uint8_t *src, uint8_t *dst, int w, int h)
{
uint8_t *tmp_src = src;
uint8_t *tmp_dst = dst;
uint8x16x4_t mat[2];
uint8x16x2_t temp8x16x2[4];
uint16x8x4_t temp16x8x4[2];
uint16x8x2_t temp16x8x2[4];
uint32x4x4_t temp32x4x4[2];
uint32x4x2_t res[4];
int dw = w & 15;
int dh = h & 7;
int sw = w - dw;
int sh = h - dh;
int x, y;
//#pragma omp parallel for num_threads(2) schedule(dynamic)
for (y = 0; y < sh; y += 8)
{
for (x = 0; x < sw; x += 16)
{
//same vtrn cascade as the 8x8 version, widened to 128-bit Q registers (16 columns per pass)
mat[0].val[0] = vld1q_u8(tmp_src + (y + 0) * w + x);
mat[0].val[1] = vld1q_u8(tmp_src + (y + 1) * w + x);
mat[0].val[2] = vld1q_u8(tmp_src + (y + 2) * w + x);
mat[0].val[3] = vld1q_u8(tmp_src + (y + 3) * w + x);
mat[1].val[0] = vld1q_u8(tmp_src + (y + 4) * w + x);
mat[1].val[1] = vld1q_u8(tmp_src + (y + 5) * w + x);
mat[1].val[2] = vld1q_u8(tmp_src + (y + 6) * w + x);
mat[1].val[3] = vld1q_u8(tmp_src + (y + 7) * w + x);
temp8x16x2[0] = vtrnq_u8(mat[0].val[0], mat[0].val[1]);
temp8x16x2[1] = vtrnq_u8(mat[0].val[2], mat[0].val[3]);
temp8x16x2[2] = vtrnq_u8(mat[1].val[0], mat[1].val[1]);
temp8x16x2[3] = vtrnq_u8(mat[1].val[2], mat[1].val[3]);
temp16x8x4[0].val[0] = vreinterpretq_u16_u8(temp8x16x2[0].val[0]);
temp16x8x4[0].val[1] = vreinterpretq_u16_u8(temp8x16x2[0].val[1]);
temp16x8x4[0].val[2] = vreinterpretq_u16_u8(temp8x16x2[1].val[0]);
temp16x8x4[0].val[3] = vreinterpretq_u16_u8(temp8x16x2[1].val[1]);
temp16x8x4[1].val[0] = vreinterpretq_u16_u8(temp8x16x2[2].val[0]);
temp16x8x4[1].val[1] = vreinterpretq_u16_u8(temp8x16x2[2].val[1]);
temp16x8x4[1].val[2] = vreinterpretq_u16_u8(temp8x16x2[3].val[0]);
temp16x8x4[1].val[3] = vreinterpretq_u16_u8(temp8x16x2[3].val[1]);
temp16x8x2[0] = vtrnq_u16(temp16x8x4[0].val[0], temp16x8x4[0].val[2]);
temp16x8x2[1] = vtrnq_u16(temp16x8x4[0].val[1], temp16x8x4[0].val[3]);
temp16x8x2[2] = vtrnq_u16(temp16x8x4[1].val[0], temp16x8x4[1].val[2]);
temp16x8x2[3] = vtrnq_u16(temp16x8x4[1].val[1], temp16x8x4[1].val[3]);
temp32x4x4[0].val[0] = vreinterpretq_u32_u16(temp16x8x2[0].val[0]);
temp32x4x4[0].val[1] = vreinterpretq_u32_u16(temp16x8x2[0].val[1]);
temp32x4x4[0].val[2] = vreinterpretq_u32_u16(temp16x8x2[1].val[0]);
temp32x4x4[0].val[3] = vreinterpretq_u32_u16(temp16x8x2[1].val[1]);
temp32x4x4[1].val[0] = vreinterpretq_u32_u16(temp16x8x2[2].val[0]);
temp32x4x4[1].val[1] = vreinterpretq_u32_u16(temp16x8x2[2].val[1]);
temp32x4x4[1].val[2] = vreinterpretq_u32_u16(temp16x8x2[3].val[0]);
temp32x4x4[1].val[3] = vreinterpretq_u32_u16(temp16x8x2[3].val[1]);
res[0] = vtrnq_u32(temp32x4x4[0].val[0], temp32x4x4[1].val[0]);
res[1] = vtrnq_u32(temp32x4x4[0].val[1], temp32x4x4[1].val[1]);
res[2] = vtrnq_u32(temp32x4x4[0].val[2], temp32x4x4[1].val[2]);
res[3] = vtrnq_u32(temp32x4x4[0].val[3], temp32x4x4[1].val[3]);
//each transposed output row is 8 bytes, stored as one uint32x2_t half-register
vst1_u32((uint32_t*)(tmp_dst + (x + 0) * h + y), vget_low_u32(res[0].val[0]));
vst1_u32((uint32_t*)(tmp_dst + (x + 1) * h + y), vget_low_u32(res[2].val[0]));
vst1_u32((uint32_t*)(tmp_dst + (x + 2) * h + y), vget_low_u32(res[1].val[0]));
vst1_u32((uint32_t*)(tmp_dst + (x + 3) * h + y), vget_low_u32(res[3].val[0]));
vst1_u32((uint32_t*)(tmp_dst + (x + 4) * h + y), vget_low_u32(res[0].val[1]));
vst1_u32((uint32_t*)(tmp_dst + (x + 5) * h + y), vget_low_u32(res[2].val[1]));
vst1_u32((uint32_t*)(tmp_dst + (x + 6) * h + y), vget_low_u32(res[1].val[1]));
vst1_u32((uint32_t*)(tmp_dst + (x + 7) * h + y), vget_low_u32(res[3].val[1]));
vst1_u32((uint32_t*)(tmp_dst + (x + 8) * h + y), vget_high_u32(res[0].val[0]));
vst1_u32((uint32_t*)(tmp_dst + (x + 9) * h + y), vget_high_u32(res[2].val[0]));
vst1_u32((uint32_t*)(tmp_dst + (x + 10) * h + y), vget_high_u32(res[1].val[0]));
vst1_u32((uint32_t*)(tmp_dst + (x + 11) * h + y), vget_high_u32(res[3].val[0]));
vst1_u32((uint32_t*)(tmp_dst + (x + 12) * h + y), vget_high_u32(res[0].val[1]));
vst1_u32((uint32_t*)(tmp_dst + (x + 13) * h + y), vget_high_u32(res[2].val[1]));
vst1_u32((uint32_t*)(tmp_dst + (x + 14) * h + y), vget_high_u32(res[1].val[1]));
vst1_u32((uint32_t*)(tmp_dst + (x + 15) * h + y), vget_high_u32(res[3].val[1]));
}
}
for (y = sh; y < h; y++)
{
for (x = 0; x < w; x++)
dst[x * h + y] = src[y * w + x];
}
for (x = sw; x < w; x++)
{
for (y = 0; y < sh; y++)
{
dst[x * h + y] = src[y * w + x];
}
}
return 0;
}
int transposition_neon_16x8_prefetch(uint8_t *src, uint8_t *dst, int w, int h)
{
uint8_t *tmp_src = src;
uint8_t *tmp_dst = dst;
uint8x16x4_t mat[2];
uint8x16x2_t temp8x16x2[4];
uint16x8x4_t temp16x8x4[2];
uint16x8x2_t temp16x8x2[4];
uint32x4x4_t temp32x4x4[2];
uint32x4x2_t res[4];
int dw = w & 15;
int dh = h & 7;
int sw = w - dw;
int sh = h - dh;
int x, y;
//#pragma omp parallel for num_threads(2) schedule(dynamic)
for (y = 0; y < sh; y += 8)
{
for (x = 0; x < sw; x += 16)
{
//identical to transposition_neon_16x8, plus the prefetch hints at the end of the loop body
mat[0].val[0] = vld1q_u8(tmp_src + (y + 0) * w + x);
mat[0].val[1] = vld1q_u8(tmp_src + (y + 1) * w + x);
mat[0].val[2] = vld1q_u8(tmp_src + (y + 2) * w + x);
mat[0].val[3] = vld1q_u8(tmp_src + (y + 3) * w + x);
mat[1].val[0] = vld1q_u8(tmp_src + (y + 4) * w + x);
mat[1].val[1] = vld1q_u8(tmp_src + (y + 5) * w + x);
mat[1].val[2] = vld1q_u8(tmp_src + (y + 6) * w + x);
mat[1].val[3] = vld1q_u8(tmp_src + (y + 7) * w + x);
temp8x16x2[0] = vtrnq_u8(mat[0].val[0], mat[0].val[1]);
temp8x16x2[1] = vtrnq_u8(mat[0].val[2], mat[0].val[3]);
temp8x16x2[2] = vtrnq_u8(mat[1].val[0], mat[1].val[1]);
temp8x16x2[3] = vtrnq_u8(mat[1].val[2], mat[1].val[3]);
temp16x8x4[0].val[0] = vreinterpretq_u16_u8(temp8x16x2[0].val[0]);
temp16x8x4[0].val[1] = vreinterpretq_u16_u8(temp8x16x2[0].val[1]);
temp16x8x4[0].val[2] = vreinterpretq_u16_u8(temp8x16x2[1].val[0]);
temp16x8x4[0].val[3] = vreinterpretq_u16_u8(temp8x16x2[1].val[1]);
temp16x8x4[1].val[0] = vreinterpretq_u16_u8(temp8x16x2[2].val[0]);
temp16x8x4[1].val[1] = vreinterpretq_u16_u8(temp8x16x2[2].val[1]);
temp16x8x4[1].val[2] = vreinterpretq_u16_u8(temp8x16x2[3].val[0]);
temp16x8x4[1].val[3] = vreinterpretq_u16_u8(temp8x16x2[3].val[1]);
temp16x8x2[0] = vtrnq_u16(temp16x8x4[0].val[0], temp16x8x4[0].val[2]);
temp16x8x2[1] = vtrnq_u16(temp16x8x4[0].val[1], temp16x8x4[0].val[3]);
temp16x8x2[2] = vtrnq_u16(temp16x8x4[1].val[0], temp16x8x4[1].val[2]);
temp16x8x2[3] = vtrnq_u16(temp16x8x4[1].val[1], temp16x8x4[1].val[3]);
temp32x4x4[0].val[0] = vreinterpretq_u32_u16(temp16x8x2[0].val[0]);
temp32x4x4[0].val[1] = vreinterpretq_u32_u16(temp16x8x2[0].val[1]);
temp32x4x4[0].val[2] = vreinterpretq_u32_u16(temp16x8x2[1].val[0]);
temp32x4x4[0].val[3] = vreinterpretq_u32_u16(temp16x8x2[1].val[1]);
temp32x4x4[1].val[0] = vreinterpretq_u32_u16(temp16x8x2[2].val[0]);
temp32x4x4[1].val[1] = vreinterpretq_u32_u16(temp16x8x2[2].val[1]);
temp32x4x4[1].val[2] = vreinterpretq_u32_u16(temp16x8x2[3].val[0]);
temp32x4x4[1].val[3] = vreinterpretq_u32_u16(temp16x8x2[3].val[1]);
res[0] = vtrnq_u32(temp32x4x4[0].val[0], temp32x4x4[1].val[0]);
res[1] = vtrnq_u32(temp32x4x4[0].val[1], temp32x4x4[1].val[1]);
res[2] = vtrnq_u32(temp32x4x4[0].val[2], temp32x4x4[1].val[2]);
res[3] = vtrnq_u32(temp32x4x4[0].val[3], temp32x4x4[1].val[3]);
vst1_u32((uint32_t*)(tmp_dst + (x + 0) * h + y), vget_low_u32(res[0].val[0]));
vst1_u32((uint32_t*)(tmp_dst + (x + 1) * h + y), vget_low_u32(res[2].val[0]));
vst1_u32((uint32_t*)(tmp_dst + (x + 2) * h + y), vget_low_u32(res[1].val[0]));
vst1_u32((uint32_t*)(tmp_dst + (x + 3) * h + y), vget_low_u32(res[3].val[0]));
vst1_u32((uint32_t*)(tmp_dst + (x + 4) * h + y), vget_low_u32(res[0].val[1]));
vst1_u32((uint32_t*)(tmp_dst + (x + 5) * h + y), vget_low_u32(res[2].val[1]));
vst1_u32((uint32_t*)(tmp_dst + (x + 6) * h + y), vget_low_u32(res[1].val[1]));
vst1_u32((uint32_t*)(tmp_dst + (x + 7) * h + y), vget_low_u32(res[3].val[1]));
vst1_u32((uint32_t*)(tmp_dst + (x + 8) * h + y), vget_high_u32(res[0].val[0]));
vst1_u32((uint32_t*)(tmp_dst + (x + 9) * h + y), vget_high_u32(res[2].val[0]));
vst1_u32((uint32_t*)(tmp_dst + (x + 10) * h + y), vget_high_u32(res[1].val[0]));
vst1_u32((uint32_t*)(tmp_dst + (x + 11) * h + y), vget_high_u32(res[3].val[0]));
vst1_u32((uint32_t*)(tmp_dst + (x + 12) * h + y), vget_high_u32(res[0].val[1]));
vst1_u32((uint32_t*)(tmp_dst + (x + 13) * h + y), vget_high_u32(res[2].val[1]));
vst1_u32((uint32_t*)(tmp_dst + (x + 14) * h + y), vget_high_u32(res[1].val[1]));
vst1_u32((uint32_t*)(tmp_dst + (x + 15) * h + y), vget_high_u32(res[3].val[1]));
//prefetch the next 16-column block of each source row; rw=0 (read), locality=0 (non-temporal)
__builtin_prefetch(tmp_src + (y + 0) * w + (x + 16), 0, 0);
__builtin_prefetch(tmp_src + (y + 1) * w + (x + 16), 0, 0);
__builtin_prefetch(tmp_src + (y + 2) * w + (x + 16), 0, 0);
__builtin_prefetch(tmp_src + (y + 3) * w + (x + 16), 0, 0);
__builtin_prefetch(tmp_src + (y + 4) * w + (x + 16), 0, 0);
__builtin_prefetch(tmp_src + (y + 5) * w + (x + 16), 0, 0);
__builtin_prefetch(tmp_src + (y + 6) * w + (x + 16), 0, 0);
__builtin_prefetch(tmp_src + (y + 7) * w + (x + 16), 0, 0);
}
}
for (y = sh; y < h; y++)
{
for (x = 0; x < w; x++)
dst[x * h + y] = src[y * w + x];
}
for (x = sw; x < w; x++)
{
for (y = 0; y < sh; y++)
{
dst[x * h + y] = src[y * w + x];
}
}
return 0;
}
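A note on the hints: GCC's __builtin_prefetch(addr, rw, locality) takes rw = 0 for a read (1 for a write) and a locality degree from 0 (non-temporal, evict soon) to 3 (keep in all cache levels). Each block here is consumed exactly once, so locality 0 fits; if the data were going to be reused, a higher degree would be the natural choice, e.g.:
//keep the prefetched line resident across reuses (locality 3 = high temporal locality)
__builtin_prefetch(tmp_src + (y + 0) * w + (x + 16), 0, 3);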
As the figure above shows, neon 16x8 is 39 ms faster than 8x8, while adding prefetch actually made it 32 ms slower. My initial suspicion is that the image is small enough for the cache to hold the working set, so the prefetch hints only add overhead.
As a supplement, here is the same experiment on a 1024x1024 matrix. The results are as follows:
From these results, once the working set no longer fits in the cache, adding prefetch does help performance.
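To reproduce the supplementary experiment, only the test dimensions in the benchmark above need to change:
#define IMAGE_WIDTH (1024)
#define IMAGE_HEIGH (1024)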
Takeaways:
1) Because of the NEON hardware design, source data must first be loaded into the D/Q registers and the results must then be stored back to the destination memory, which adds extra data-movement steps;
2) The cost of a read and the cost of a write differ, and the difference depends on the CPU architecture;
3) NEON's parallel computation helps improve performance.
Tags: C++ performance tuning, NEON-accelerated image transposition