arm neon矩阵相乘

本文主要参考了《写一个基于NEON指令的矩阵乘法(一)》的原理,写了一个矩阵乘法作为练习。参考链接中的部分 NEON intrinsics 与《NEON Programmer's Guide》中的写法不一致,因此做了相应修改。记录下来,以备后用。

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>
#include <arm_neon.h>

/*
 * Elapsed time from t0 to t1, expressed in milliseconds.
 *
 * The seconds and microseconds fields are differenced separately and then
 * each difference is scaled to milliseconds before being summed.
 */
double sub_time(struct timeval t1, struct timeval t0)
{
    const double sec_delta = t1.tv_sec - t0.tv_sec;
    const double usec_delta = t1.tv_usec - t0.tv_usec;
    const double ms_from_sec = sec_delta * 1000;
    const double ms_from_usec = usec_delta / 1000;

    return ms_from_sec + ms_from_usec;
}

#define N 4

/*
 * Benchmark a 4x4 single-precision matrix product computed two ways:
 * a plain triple loop and a NEON-intrinsics kernel. Each variant is
 * repeated 10000 times; the elapsed time reported by sub_time() and the
 * resulting matrix are printed for both.
 *
 * Returns 0.
 */
int main(void)
{
    float a1[16][16] = {{1,2,3,4},{2,3,4,5},{3,4,5,6},{4,5,6,7}};
    float c1[16][16] = {{2,2,3,4},{2,5,4,5},{3,4,7,6},{4,9,6,7}};
    float a2[16][16] = {{1,2,3,4},{2,3,4,5},{3,4,5,6},{4,5,6,7}};
    float c2[16][16] = {{2,2,3,4},{2,5,4,5},{3,4,7,6},{4,9,6,7}};
    float d[N][N] = {{0}};
    float e[N][N] = {{0}};
    int i, j, k, m;

    struct timeval t1, t0;

    /* Scalar reference: d = a1 * c1, repeated for timing. */
    gettimeofday(&t0, NULL);
    for (i=0;i<10000;i++)
    {
        for (j=0;j<N;j++)
        {
            for(k=0;k<N;k++)
            {
                /*
                 * Accumulate into a fresh local on every repetition.
                 * Adding straight into d[j][k] would carry the sum over
                 * all 10000 repetitions and print 10000x the product,
                 * which would not match the NEON result below.
                 */
                float sum = 0.0f;

                for (m=0;m<N;m++)
                {
                    sum += a1[j][m] * c1[m][k];
                }
                d[j][k] = sum;
            }
        }
    }

    gettimeofday(&t1, NULL);
    printf("basic time used: %0.3f.\n", sub_time(t1,t0));

    for (j=0;j<N;j++)
    {
        for(k=0;k<N;k++)
        {
            printf("%f\t", d[j][k]);
        }
        printf("\n");
    }

    /*
     * NEON variant: e = a2 * c2, repeated for timing.
     * This kernel is written for exactly four rows of four floats; it
     * loads/stores 4-lane vectors and therefore assumes N == 4.
     */
    gettimeofday(&t0, NULL);
    for (i=0;i<10000;i++)
    {
        /* One accumulator vector per output row, reset every repetition. */
        float32x4_t vc0 = vdupq_n_f32(0.0f);
        float32x4_t vc1 = vdupq_n_f32(0.0f);
        float32x4_t vc2 = vdupq_n_f32(0.0f);
        float32x4_t vc3 = vdupq_n_f32(0.0f);

        for (j=0;j<4;j++)
        {
            /* Compute the 4x4 result block directly with NEON:
             * row j of c2 is scaled by a2[r][j] and added into row r. */
            float32x4_t vb = vld1q_f32(&c2[j][0]);

            vc0 = vmlaq_f32(vc0, vdupq_n_f32(a2[0][j]), vb);
            vc1 = vmlaq_f32(vc1, vdupq_n_f32(a2[1][j]), vb);
            vc2 = vmlaq_f32(vc2, vdupq_n_f32(a2[2][j]), vb);
            vc3 = vmlaq_f32(vc3, vdupq_n_f32(a2[3][j]), vb);
        }

        vst1q_f32(&e[0][0], vc0);
        vst1q_f32(&e[1][0], vc1);
        vst1q_f32(&e[2][0], vc2);
        vst1q_f32(&e[3][0], vc3);
    }

    gettimeofday(&t1, NULL);
    printf("neon time used: %0.3f.\n", sub_time(t1,t0));

    for (j=0;j<N;j++)
    {
        for(k=0;k<N;k++)
        {
            printf("%f\t", e[j][k]);
        }
        printf("\n");
    }

    return 0;
}

运行结果:
N = 4时,

| 优化级别 | 正常矩阵乘法 | neon矩阵乘法 |
| --- | --- | --- |
| -O0 | 60ms | 14ms |
| -O3 | 1.8ms | 2.8ms |

看来在 -O0 下,neon 优化的效果还是比较明显的;但在 -O3 下 neon 版本反而更慢,原因可能是最外层的循环导致 neon 的效率降低(每次迭代都要反复 load 和 store)。

你可能感兴趣的:(嵌入式,性能优化,NEON)