Image Matrix Transpose Performance Optimization

Table of Contents

  • Image Matrix Transpose Performance Optimization
    • 1. Background
    • 2. The main function
    • 3. CPU mode
      • 3.1 Sequential read, strided write
      • 3.2 Strided read, sequential write
      • 3.3 Execution time
      • 3.4 Conclusion
    • 4. NEON mode
      • 4.1 NEON 8x8 block
      • 4.2 NEON 16x8 block
      • 4.3 NEON 16x8 block with prefetch
      • 4.4 Test results
    • 5. Summary
    • 6. References

1. Background

Taking matrix transposition as an example, this article analyzes optimization lessons from several variants: two CPU-mode loop orders (sequential read with strided write, and strided read with sequential write), NEON mode with 8x8 and 16x8 blocks, and cache prefetching.

Test case:
Matrix size: 640 * 480
Iterations: 1024
Test platform: RV1126

A transpose swaps the rows and columns of a matrix, and a matrix is normally stored row-major, i.e. with each row contiguous in memory. Consequently, when converting between the source and destination matrices, the memory accesses to one of the two are necessarily non-contiguous. To make the program run long enough for the timing to be stable, and to make profiling convenient, the code is executed 1024 times.
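
Concretely, every implementation below realizes the same index mapping (w and h are the source width and height, as in the code):

src[y * w + x]  ->  dst[x * h + y]      (0 <= x < w, 0 <= y < h)

Stepping x by 1 advances the source address by 1 byte but the destination address by h bytes, so whichever matrix is walked along its rows forces strided access on the other.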

2. The main function

#include <arm_neon.h>	//this header is required for the NEON intrinsics
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <time.h>

typedef unsigned long long am_ms_t;
typedef unsigned long long am_us_t;

#define IMAGE_WIDTH (320)
#define IMAGE_HEIGH (240)

#define IMAGE_TEST  (10240)

//forward declarations for the transpose variants implemented below
int transposition_cpu_colseq(uint8_t *src, uint8_t *dst, const int w, const int h);
int transposition_cpu_rowseq(uint8_t *src, uint8_t *dst, const int w, const int h);
int transposition_neon_8x8(uint8_t *src, uint8_t *dst, int w, int h);
int transposition_neon_16x8(uint8_t *src, uint8_t *dst, int w, int h);
int transposition_neon_16x8_prefetch(uint8_t *src, uint8_t *dst, int w, int h);

am_ms_t sys_get_ms(void)
{
	struct timespec ts;
	am_ms_t ms = 0;
	
	clock_gettime(CLOCK_MONOTONIC_RAW, &ts);
	ms  = (am_ms_t)(ts.tv_sec * 1000);
	ms += ((am_ms_t)(ts.tv_nsec / 1000000));

	return ms;
}

am_us_t sys_get_us(void)
{
	am_us_t us = 0;
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC_RAW, &ts);

	us = (am_us_t)(ts.tv_sec * 1000000);
	us += (am_us_t)(ts.tv_nsec / 1000);

	return us;
}

int main()
{
    uint32_t size = IMAGE_WIDTH * IMAGE_HEIGH;
    uint32_t i = 0;
    
    uint8_t *psrc  = NULL;
    uint8_t *pdst0 = NULL;

    posix_memalign((void **)&psrc, 64, size);
    posix_memalign((void **)&pdst0, 64, size);

    am_ms_t old_ms = 0;
    
    memset(pdst0, 0, size);

    srand(time(NULL));
    for(i=0; i<size; i++)
    {
        *(psrc + i) = (rand()&0xff);
    }

    old_ms = sys_get_ms();
    for (i=0; i<IMAGE_TEST; i++) 
    {
        transposition_cpu_colseq(psrc, pdst0, IMAGE_WIDTH, IMAGE_HEIGH);
    }
    printf("trans col const:%llu ms\n", sys_get_ms()-old_ms);

    old_ms = sys_get_ms();
    for (i=0; i<IMAGE_TEST; i++) 
    {
        transposition_cpu_rowseq(psrc, pdst0, IMAGE_WIDTH, IMAGE_HEIGH);
    }
    printf("trans row const:%llu ms\n", sys_get_ms()-old_ms);

    old_ms = sys_get_ms();
    for (i=0; i<IMAGE_TEST; i++) 
    {
        transposition_neon_8x8(psrc, pdst0, IMAGE_WIDTH, IMAGE_HEIGH);
    }
    printf("trans neon 8x8 const:%llu ms\n", sys_get_ms()-old_ms);

    old_ms = sys_get_ms();
    for (i=0; i<IMAGE_TEST; i++) 
    {
        transposition_neon_16x8(psrc, pdst0, IMAGE_WIDTH, IMAGE_HEIGH);
    }
    printf("trans neon 16x8 const:%llu ms\n", sys_get_ms()-old_ms);
    
    old_ms = sys_get_ms();
    for (i=0; i<IMAGE_TEST; i++) 
    {
        transposition_neon_16x8_prefetch(psrc, pdst0, IMAGE_WIDTH, IMAGE_HEIGH);
    }
    printf("trans neon 16x8 prefetch const:%llu ms\n", sys_get_ms()-old_ms);

    free(psrc);
    free(pdst0);

    return 0;
}

Compile options

ASFLAGS        := -g -ggdb -Os -mfpu=neon-vfpv4 -mfloat-abi=hard
CFLAGS         := -g -ggdb -Os -mfpu=neon-vfpv4 -mfloat-abi=hard
CXXFLAGS       := -g -ggdb -Os -mfpu=neon-vfpv4 -mfloat-abi=hard
LDFLAGS        :=

ASFLAGS        += -ftree-vectorize -fopenmp -ffast-math -finline-functions -funroll-all-loops
CFLAGS         += -ftree-vectorize -fopenmp -ffast-math -finline-functions -funroll-all-loops
CXXFLAGS       += -ftree-vectorize -fopenmp -ffast-math -finline-functions -funroll-all-loops
LDFLAGS        += -fopenmp
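
For reference, a build rule using these flags might look like the following; the cross-compiler name arm-linux-gnueabihf-gcc is an assumption, so substitute whatever toolchain the RV1126 SDK actually provides:

arm-linux-gnueabihf-gcc $(CFLAGS) transpose.c -o transpose $(LDFLAGS)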

3. CPU mode

3.1 Sequential read, strided write

int transposition_cpu_colseq(uint8_t *src, uint8_t *dst, const int w, const int h)
{
    //sequential read, strided write
    int x, y;

    for (y = 0; y < h; y++)
    {
        for (x = 0; x < w; x++)
        {
            dst[x * h + y] = src[y * w + x];
        }
    }

    return 0;
}

3.2 Strided read, sequential write

int transposition_cpu_rowseq(uint8_t *src, uint8_t *dst, const int w, const int h)
{
    //strided read, sequential write
    int x, y;
    
    for(x=0; x<w; x++)
    {
        for(y=0; y<h; y++)
        {
            dst[x * h + y] = src[y * w + x];
        }
    }


    return 0;
}

3.3 Execution time

Sequential read, strided write (col): 3672 ms

Strided read, sequential write (row): 7370 ms

3.4 Conclusion

Cache-update mechanisms implement memory reads and writes differently, and the resulting read/write performance gap depends on the specific IC architecture. In practice, benchmark both variants and pick the faster one.

According to the caching chapters of Computer Systems: A Programmer's Perspective (《深入理解计算机系统》), the cache-update policy handles memory writes in a more involved way than reads. On a read miss, the cache fetches the line containing the requested data from the lower levels of the memory hierarchy. On a write miss, the line is first fetched from the lower levels into the current cache, then updated there, and eventually written back down the hierarchy. Non-contiguous writes therefore incur extra write-back work compared with non-contiguous reads, degrading performance.

That reasoning contradicts my measured results above, where the strided-write version was the faster one; I have not yet pinned down the exact cause.
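
A standard mitigation for this read/write tension, independent of SIMD, is cache blocking: walk the matrix in small tiles so that a tile's source rows and destination columns both stay resident in cache. Below is a minimal sketch of this idea, not part of the original experiment; the 8x8 tile size is an assumption chosen to match the NEON block size, and w and h are assumed to be multiples of 8 for brevity:

int transposition_cpu_blocked(uint8_t *src, uint8_t *dst, const int w, const int h)
{
    int x, y, bx, by;

    for (y = 0; y < h; y += 8)
    {
        for (x = 0; x < w; x += 8)
        {
            //transpose one 8x8 tile; each side touches at most 8 cache lines
            for (by = 0; by < 8; by++)
            {
                for (bx = 0; bx < 8; bx++)
                {
                    dst[(x + bx) * h + (y + by)] = src[(y + by) * w + (x + bx)];
                }
            }
        }
    }

    return 0;
}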

4. NEON mode

4.1 NEON 8x8 block

int transposition_neon_8x8(uint8_t *src, uint8_t *dst, int w, int h)
{
    uint8x8x4_t mat1;
    uint8x8x4_t mat2;
    
    uint8x8x2_t temp1;
    uint8x8x2_t temp2;
    uint8x8x2_t temp3;
    uint8x8x2_t temp4;
    
    uint16x4x4_t temp11;
    uint16x4x4_t temp12;
    uint16x4x2_t temp5;
    uint16x4x2_t temp6;
    uint16x4x2_t temp7;
    uint16x4x2_t temp8;
    
    uint32x2x4_t temp21;
    uint32x2x4_t temp22;
    
    uint32x2x2_t res1;
    uint32x2x2_t res2;
    uint32x2x2_t res3;
    uint32x2x2_t res4;


    int dw = w & 7;
    int dh = h & 7;
    int sw = w - dw;
    int sh = h - dh;
    int x, y;
    
    for(y=0; y<sh; y+=8)
    {
        for(x=0; x<sw; x+=8)
        {
            mat1.val[0] = vld1_u8(src + (y + 0) * w + x);
            mat1.val[1] = vld1_u8(src + (y + 1) * w + x);
            mat1.val[2] = vld1_u8(src + (y + 2) * w + x);
            mat1.val[3] = vld1_u8(src + (y + 3) * w + x);
            
            mat2.val[0] = vld1_u8(src + (y + 4) * w + x);
            mat2.val[1] = vld1_u8(src + (y + 5) * w + x);
            mat2.val[2] = vld1_u8(src + (y + 6) * w + x);
            mat2.val[3] = vld1_u8(src + (y + 7) * w + x);
            
            temp1 = vtrn_u8(mat1.val[0], mat1.val[1]);
            temp2 = vtrn_u8(mat1.val[2], mat1.val[3]);
            
            temp3 = vtrn_u8(mat2.val[0], mat2.val[1]);
            temp4 = vtrn_u8(mat2.val[2], mat2.val[3]);


            temp11.val[0] = vreinterpret_u16_u8(temp1.val[0]);
            temp11.val[1] = vreinterpret_u16_u8(temp1.val[1]);
            temp11.val[2] = vreinterpret_u16_u8(temp2.val[0]);
            temp11.val[3] = vreinterpret_u16_u8(temp2.val[1]);
            
            temp12.val[0] = vreinterpret_u16_u8(temp3.val[0]);
            temp12.val[1] = vreinterpret_u16_u8(temp3.val[1]);
            temp12.val[2] = vreinterpret_u16_u8(temp4.val[0]);
            temp12.val[3] = vreinterpret_u16_u8(temp4.val[1]);


            temp5 = vtrn_u16(temp11.val[0], temp11.val[2]);
            temp6 = vtrn_u16(temp11.val[1], temp11.val[3]);
            temp7 = vtrn_u16(temp12.val[0], temp12.val[2]);
            temp8 = vtrn_u16(temp12.val[1], temp12.val[3]);


            temp21.val[0] = vreinterpret_u32_u16(temp5.val[0]);
            temp21.val[1] = vreinterpret_u32_u16(temp5.val[1]);
            temp21.val[2] = vreinterpret_u32_u16(temp6.val[0]);
            temp21.val[3] = vreinterpret_u32_u16(temp6.val[1]);
            temp22.val[0] = vreinterpret_u32_u16(temp7.val[0]);
            temp22.val[1] = vreinterpret_u32_u16(temp7.val[1]);
            temp22.val[2] = vreinterpret_u32_u16(temp8.val[0]);
            temp22.val[3] = vreinterpret_u32_u16(temp8.val[1]);


            res1 = vtrn_u32(temp21.val[0],temp22.val[0]);
            res2 = vtrn_u32(temp21.val[1],temp22.val[1]);
            res3 = vtrn_u32(temp21.val[2],temp22.val[2]);
            res4 = vtrn_u32(temp21.val[3],temp22.val[3]);


            mat1.val[0] = vreinterpret_u8_u32(res1.val[0]);
            mat1.val[1] = vreinterpret_u8_u32(res2.val[0]);
            mat1.val[2] = vreinterpret_u8_u32(res3.val[0]);
            mat1.val[3] = vreinterpret_u8_u32(res4.val[0]);
            
            mat2.val[0] = vreinterpret_u8_u32(res1.val[1]);
            mat2.val[1] = vreinterpret_u8_u32(res2.val[1]);
            mat2.val[2] = vreinterpret_u8_u32(res3.val[1]);
            mat2.val[3] = vreinterpret_u8_u32(res4.val[1]);


            vst1_u8(dst + (x + 0) * h + y, mat1.val[0]);
            vst1_u8(dst + (x + 1) * h + y, mat1.val[2]);
            vst1_u8(dst + (x + 2) * h + y, mat1.val[1]);
            vst1_u8(dst + (x + 3) * h + y, mat1.val[3]);
            vst1_u8(dst + (x + 4) * h + y, mat2.val[0]);
            vst1_u8(dst + (x + 5) * h + y, mat2.val[2]);
            vst1_u8(dst + (x + 6) * h + y, mat2.val[1]);
            vst1_u8(dst + (x + 7) * h + y, mat2.val[3]);
        }
    }
    
    for(y=sh-1; y<h; y++)
    {
        for(x=0; x<w; x++)
        {
            dst[x * h + y] = src[ y * w + x];
        }
    }
    
    for(x=sw-1; x<w; x++)
    {    
        for(y=0; y<sh; y++)
        {
            dst[x * h + y] = src[ y * w + x];
        }
    }
    
    return 0;
}
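
To sanity-check the vtrn cascade (vtrn_u8 across adjacent rows, vtrn_u16 across row pairs, vtrn_u32 across row quads), it is worth comparing the NEON output against the scalar reference once. A minimal sketch follows; check_neon_8x8 is a hypothetical helper, not part of the original test program:

static int check_neon_8x8(void)
{
    static uint8_t src[IMAGE_WIDTH * IMAGE_HEIGH];  //test input
    static uint8_t ref[IMAGE_WIDTH * IMAGE_HEIGH];  //scalar reference output
    static uint8_t out[IMAGE_WIDTH * IMAGE_HEIGH];  //NEON output
    uint32_t i;

    for (i = 0; i < sizeof(src); i++)
        src[i] = (uint8_t)(rand() & 0xff);

    transposition_cpu_colseq(src, ref, IMAGE_WIDTH, IMAGE_HEIGH);
    transposition_neon_8x8(src, out, IMAGE_WIDTH, IMAGE_HEIGH);

    //identical buffers mean the intrinsic version matches the scalar one
    return (memcmp(ref, out, sizeof(ref)) == 0) ? 0 : -1;
}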

4.2 NEON 16x8 block

int transposition_neon_16x8(uint8_t *src, uint8_t *dst, int w, int h)
{
    uint8_t *tmp_src = src;
    uint8_t *tmp_dst = dst;


    uint8x16x4_t mat[2];
    
    uint8x16x2_t temp8x16x2[4];
        
    uint16x8x4_t temp16x8x4[2];
    
    uint16x8x2_t temp16x8x2[4];


    uint32x4x4_t temp32x4x4[2];
    
    uint32x4x2_t res[4];


    int dw = w & 15;
    int dh = h & 7;
    int sw = w - dw;
    int sh = h - dh;
    int x, y;


    //#pragma omp parallel for num_threads(2) schedule(dynamic)
    for (y = 0; y < sh; y += 8)
    {
        for (x = 0; x < sw; x += 16)
        {
            mat[0].val[0] = vld1q_u8(tmp_src + (y + 0) * w + x);
            mat[0].val[1] = vld1q_u8(tmp_src + (y + 1) * w + x);
            mat[0].val[2] = vld1q_u8(tmp_src + (y + 2) * w + x);
            mat[0].val[3] = vld1q_u8(tmp_src + (y + 3) * w + x);
            
            mat[1].val[0] = vld1q_u8(tmp_src + (y + 4) * w + x);
            mat[1].val[1] = vld1q_u8(tmp_src + (y + 5) * w + x);
            mat[1].val[2] = vld1q_u8(tmp_src + (y + 6) * w + x);
            mat[1].val[3] = vld1q_u8(tmp_src + (y + 7) * w + x);
            
            temp8x16x2[0] = vtrnq_u8(mat[0].val[0], mat[0].val[1]);
            temp8x16x2[1] = vtrnq_u8(mat[0].val[2], mat[0].val[3]);
            
            temp8x16x2[2] = vtrnq_u8(mat[1].val[0], mat[1].val[1]);
            temp8x16x2[3] = vtrnq_u8(mat[1].val[2], mat[1].val[3]);


            temp16x8x4[0].val[0] = vreinterpretq_u16_u8(temp8x16x2[0].val[0]);
            temp16x8x4[0].val[1] = vreinterpretq_u16_u8(temp8x16x2[0].val[1]);
            temp16x8x4[0].val[2] = vreinterpretq_u16_u8(temp8x16x2[1].val[0]);
            temp16x8x4[0].val[3] = vreinterpretq_u16_u8(temp8x16x2[1].val[1]);
            
            temp16x8x4[1].val[0] = vreinterpretq_u16_u8(temp8x16x2[2].val[0]);
            temp16x8x4[1].val[1] = vreinterpretq_u16_u8(temp8x16x2[2].val[1]);
            temp16x8x4[1].val[2] = vreinterpretq_u16_u8(temp8x16x2[3].val[0]);
            temp16x8x4[1].val[3] = vreinterpretq_u16_u8(temp8x16x2[3].val[1]);


            temp16x8x2[0] = vtrnq_u16(temp16x8x4[0].val[0], temp16x8x4[0].val[2]);
            temp16x8x2[1] = vtrnq_u16(temp16x8x4[0].val[1], temp16x8x4[0].val[3]);
            temp16x8x2[2] = vtrnq_u16(temp16x8x4[1].val[0], temp16x8x4[1].val[2]);
            temp16x8x2[3] = vtrnq_u16(temp16x8x4[1].val[1], temp16x8x4[1].val[3]);


            temp32x4x4[0].val[0] = vreinterpretq_u32_u16(temp16x8x2[0].val[0]);
            temp32x4x4[0].val[1] = vreinterpretq_u32_u16(temp16x8x2[0].val[1]);
            temp32x4x4[0].val[2] = vreinterpretq_u32_u16(temp16x8x2[1].val[0]);
            temp32x4x4[0].val[3] = vreinterpretq_u32_u16(temp16x8x2[1].val[1]);
            temp32x4x4[1].val[0] = vreinterpretq_u32_u16(temp16x8x2[2].val[0]);
            temp32x4x4[1].val[1] = vreinterpretq_u32_u16(temp16x8x2[2].val[1]);
            temp32x4x4[1].val[2] = vreinterpretq_u32_u16(temp16x8x2[3].val[0]);
            temp32x4x4[1].val[3] = vreinterpretq_u32_u16(temp16x8x2[3].val[1]);


            res[0] = vtrnq_u32(temp32x4x4[0].val[0], temp32x4x4[1].val[0]);
            res[1] = vtrnq_u32(temp32x4x4[0].val[1], temp32x4x4[1].val[1]);
            res[2] = vtrnq_u32(temp32x4x4[0].val[2], temp32x4x4[1].val[2]);
            res[3] = vtrnq_u32(temp32x4x4[0].val[3], temp32x4x4[1].val[3]);


            vst1_u32((uint32_t*)(tmp_dst + (x +  0) * h + y), vget_low_u32(res[0].val[0]));
            vst1_u32((uint32_t*)(tmp_dst + (x +  1) * h + y), vget_low_u32(res[2].val[0]));
            vst1_u32((uint32_t*)(tmp_dst + (x +  2) * h + y), vget_low_u32(res[1].val[0]));
            vst1_u32((uint32_t*)(tmp_dst + (x +  3) * h + y), vget_low_u32(res[3].val[0]));
            
            vst1_u32((uint32_t*)(tmp_dst + (x +  4) * h + y), vget_low_u32(res[0].val[1]));
            vst1_u32((uint32_t*)(tmp_dst + (x +  5) * h + y), vget_low_u32(res[2].val[1]));
            vst1_u32((uint32_t*)(tmp_dst + (x +  6) * h + y), vget_low_u32(res[1].val[1]));
            vst1_u32((uint32_t*)(tmp_dst + (x +  7) * h + y), vget_low_u32(res[3].val[1]));
            
            vst1_u32((uint32_t*)(tmp_dst + (x +  8) * h + y), vget_high_u32(res[0].val[0]));
            vst1_u32((uint32_t*)(tmp_dst + (x +  9) * h + y), vget_high_u32(res[2].val[0]));
            vst1_u32((uint32_t*)(tmp_dst + (x + 10) * h + y), vget_high_u32(res[1].val[0]));
            vst1_u32((uint32_t*)(tmp_dst + (x + 11) * h + y), vget_high_u32(res[3].val[0]));
            
            vst1_u32((uint32_t*)(tmp_dst + (x + 12) * h + y), vget_high_u32(res[0].val[1]));
            vst1_u32((uint32_t*)(tmp_dst + (x + 13) * h + y), vget_high_u32(res[2].val[1]));
            vst1_u32((uint32_t*)(tmp_dst + (x + 14) * h + y), vget_high_u32(res[1].val[1]));
            vst1_u32((uint32_t*)(tmp_dst + (x + 15) * h + y), vget_high_u32(res[3].val[1]));
        }
    }


    for (y = sh; y < h; y++)
    {
        for (x = 0; x < w; x++)
            dst[x * h + y] = src[y * w + x];
    }


    for (x = sw; x < w; x++)
    {
        for (y = 0; y < sh; y++)
        {
            dst[x * h + y] = src[y * w + x];
        }
    }
    
    return 0;
}
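
Compared with the 8x8 version, this variant loads full 128-bit Q registers (vld1q_u8) and transposes two 8x8 tiles per pass, halving the number of load instructions per byte. The cost is that each Q register ends up holding data belonging to two destination tiles, which is why every store splits the register into its low and high halves (vget_low_u32 / vget_high_u32) and writes them to destination rows 8 apart.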

4.3 NEON 16x8 block with prefetch

int transposition_neon_16x8_prefetch(uint8_t *src, uint8_t *dst, int w, int h)
{
    uint8_t *tmp_src = src;
    uint8_t *tmp_dst = dst;


    uint8x16x4_t mat[2];
    
    uint8x16x2_t temp8x16x2[4];
        
    uint16x8x4_t temp16x8x4[2];
    
    uint16x8x2_t temp16x8x2[4];


    uint32x4x4_t temp32x4x4[2];
    
    uint32x4x2_t res[4];


    int dw = w & 15;
    int dh = h & 7;
    int sw = w - dw;
    int sh = h - dh;
    int x, y;


    //#pragma omp parallel for num_threads(2) schedule(dynamic)
    for (y = 0; y < sh; y += 8)
    {
        for (x = 0; x < sw; x += 16)
        {
            mat[0].val[0] = vld1q_u8(tmp_src + (y + 0) * w + x);
            mat[0].val[1] = vld1q_u8(tmp_src + (y + 1) * w + x);
            mat[0].val[2] = vld1q_u8(tmp_src + (y + 2) * w + x);
            mat[0].val[3] = vld1q_u8(tmp_src + (y + 3) * w + x);
            
            mat[1].val[0] = vld1q_u8(tmp_src + (y + 4) * w + x);
            mat[1].val[1] = vld1q_u8(tmp_src + (y + 5) * w + x);
            mat[1].val[2] = vld1q_u8(tmp_src + (y + 6) * w + x);
            mat[1].val[3] = vld1q_u8(tmp_src + (y + 7) * w + x);
            
            temp8x16x2[0] = vtrnq_u8(mat[0].val[0], mat[0].val[1]);
            temp8x16x2[1] = vtrnq_u8(mat[0].val[2], mat[0].val[3]);
            
            temp8x16x2[2] = vtrnq_u8(mat[1].val[0], mat[1].val[1]);
            temp8x16x2[3] = vtrnq_u8(mat[1].val[2], mat[1].val[3]);


            temp16x8x4[0].val[0] = vreinterpretq_u16_u8(temp8x16x2[0].val[0]);
            temp16x8x4[0].val[1] = vreinterpretq_u16_u8(temp8x16x2[0].val[1]);
            temp16x8x4[0].val[2] = vreinterpretq_u16_u8(temp8x16x2[1].val[0]);
            temp16x8x4[0].val[3] = vreinterpretq_u16_u8(temp8x16x2[1].val[1]);
            
            temp16x8x4[1].val[0] = vreinterpretq_u16_u8(temp8x16x2[2].val[0]);
            temp16x8x4[1].val[1] = vreinterpretq_u16_u8(temp8x16x2[2].val[1]);
            temp16x8x4[1].val[2] = vreinterpretq_u16_u8(temp8x16x2[3].val[0]);
            temp16x8x4[1].val[3] = vreinterpretq_u16_u8(temp8x16x2[3].val[1]);


            temp16x8x2[0] = vtrnq_u16(temp16x8x4[0].val[0], temp16x8x4[0].val[2]);
            temp16x8x2[1] = vtrnq_u16(temp16x8x4[0].val[1], temp16x8x4[0].val[3]);
            temp16x8x2[2] = vtrnq_u16(temp16x8x4[1].val[0], temp16x8x4[1].val[2]);
            temp16x8x2[3] = vtrnq_u16(temp16x8x4[1].val[1], temp16x8x4[1].val[3]);


            temp32x4x4[0].val[0] = vreinterpretq_u32_u16(temp16x8x2[0].val[0]);
            temp32x4x4[0].val[1] = vreinterpretq_u32_u16(temp16x8x2[0].val[1]);
            temp32x4x4[0].val[2] = vreinterpretq_u32_u16(temp16x8x2[1].val[0]);
            temp32x4x4[0].val[3] = vreinterpretq_u32_u16(temp16x8x2[1].val[1]);
            temp32x4x4[1].val[0] = vreinterpretq_u32_u16(temp16x8x2[2].val[0]);
            temp32x4x4[1].val[1] = vreinterpretq_u32_u16(temp16x8x2[2].val[1]);
            temp32x4x4[1].val[2] = vreinterpretq_u32_u16(temp16x8x2[3].val[0]);
            temp32x4x4[1].val[3] = vreinterpretq_u32_u16(temp16x8x2[3].val[1]);


            res[0] = vtrnq_u32(temp32x4x4[0].val[0], temp32x4x4[1].val[0]);
            res[1] = vtrnq_u32(temp32x4x4[0].val[1], temp32x4x4[1].val[1]);
            res[2] = vtrnq_u32(temp32x4x4[0].val[2], temp32x4x4[1].val[2]);
            res[3] = vtrnq_u32(temp32x4x4[0].val[3], temp32x4x4[1].val[3]);


            vst1_u32((uint32_t*)(tmp_dst + (x +  0) * h + y), vget_low_u32(res[0].val[0]));
            vst1_u32((uint32_t*)(tmp_dst + (x +  1) * h + y), vget_low_u32(res[2].val[0]));
            vst1_u32((uint32_t*)(tmp_dst + (x +  2) * h + y), vget_low_u32(res[1].val[0]));
            vst1_u32((uint32_t*)(tmp_dst + (x +  3) * h + y), vget_low_u32(res[3].val[0]));
            
            vst1_u32((uint32_t*)(tmp_dst + (x +  4) * h + y), vget_low_u32(res[0].val[1]));
            vst1_u32((uint32_t*)(tmp_dst + (x +  5) * h + y), vget_low_u32(res[2].val[1]));
            vst1_u32((uint32_t*)(tmp_dst + (x +  6) * h + y), vget_low_u32(res[1].val[1]));
            vst1_u32((uint32_t*)(tmp_dst + (x +  7) * h + y), vget_low_u32(res[3].val[1]));
            
            vst1_u32((uint32_t*)(tmp_dst + (x +  8) * h + y), vget_high_u32(res[0].val[0]));
            vst1_u32((uint32_t*)(tmp_dst + (x +  9) * h + y), vget_high_u32(res[2].val[0]));
            vst1_u32((uint32_t*)(tmp_dst + (x + 10) * h + y), vget_high_u32(res[1].val[0]));
            vst1_u32((uint32_t*)(tmp_dst + (x + 11) * h + y), vget_high_u32(res[3].val[0]));
            
            vst1_u32((uint32_t*)(tmp_dst + (x + 12) * h + y), vget_high_u32(res[0].val[1]));
            vst1_u32((uint32_t*)(tmp_dst + (x + 13) * h + y), vget_high_u32(res[2].val[1]));
            vst1_u32((uint32_t*)(tmp_dst + (x + 14) * h + y), vget_high_u32(res[1].val[1]));
            vst1_u32((uint32_t*)(tmp_dst + (x + 15) * h + y), vget_high_u32(res[3].val[1]));


            //Hint the next 16-column block of all 8 source rows into cache.
            //__builtin_prefetch(addr, rw, locality): rw=0 prefetches for read,
            //locality=0 marks the data as streaming, i.e. no need to keep it cached.
            __builtin_prefetch(tmp_src + (y + 0) * w + (x + 16), 0, 0);
            __builtin_prefetch(tmp_src + (y + 1) * w + (x + 16), 0, 0);
            __builtin_prefetch(tmp_src + (y + 2) * w + (x + 16), 0, 0);
            __builtin_prefetch(tmp_src + (y + 3) * w + (x + 16), 0, 0);
            __builtin_prefetch(tmp_src + (y + 4) * w + (x + 16), 0, 0);
            __builtin_prefetch(tmp_src + (y + 5) * w + (x + 16), 0, 0);
            __builtin_prefetch(tmp_src + (y + 6) * w + (x + 16), 0, 0);
            __builtin_prefetch(tmp_src + (y + 7) * w + (x + 16), 0, 0);
        }
    }


    for (y = sh; y < h; y++)
    {
        for (x = 0; x < w; x++)
            dst[x * h + y] = src[y * w + x];
    }


    for (x = sw; x < w; x++)
    {
        for (y = 0; y < sh; y++)
        {
            dst[x * h + y] = src[y * w + x];
        }
    }
    
    return 0;
}

4.4 Test results

(Screenshots: timing output of the NEON test runs.)

From the measurements, NEON 16x8 is 39 ms faster than 8x8, while adding prefetch actually costs an extra 32 ms. My initial suspicion is that the image is small enough for the cache to hold the data, so the prefetches only add overhead.

As a supplement, here is a test with a 1024x1024 matrix. (Screenshot: 1024x1024 timing results.)
As those results show, when the cache is no longer sufficient, adding prefetch does help performance.
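
To reproduce the 1024x1024 run with the test program above, it should be enough to change the two size macros (and possibly lower IMAGE_TEST, since each iteration then moves far more data):

#define IMAGE_WIDTH (1024)
#define IMAGE_HEIGH (1024)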

5. Summary

1) Because of the NEON hardware design, the source data must first be loaded into the D/Q registers, and after the computation the results must be stored back to the destination memory, which adds extra data-movement steps;
2) The CPU's cost for reads and writes is not the same, and the difference varies with the architecture;
3) NEON's parallel computation helps improve performance.

6. References

C++性能调优 (C++ performance tuning)
neon加速图像转置 (NEON-accelerated image transpose)
