主要参考了《写一个基于NEON指令的矩阵乘法(一)》的原理,写了一个矩阵乘法作为练习。该参考链接里有些 NEON intrinsics 的用法与《NEON Programmer’s Guide》有冲突,所以需要改一下。记录下来,留以备用。
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>
#include <arm_neon.h>
/*
 * Elapsed time from t0 to t1, expressed in milliseconds.
 *
 * t1: later timestamp.
 * t0: earlier timestamp.
 *
 * The seconds and microseconds fields are subtracted separately and then
 * both scaled to milliseconds, so a negative microsecond difference simply
 * reduces the total.
 */
double sub_time(struct timeval t1, struct timeval t0)
{
    const double sec_delta = t1.tv_sec - t0.tv_sec;
    const double usec_delta = t1.tv_usec - t0.tv_usec;

    return 1000 * sec_delta + usec_delta / 1000;
}
#define N 4

/* The NEON path in main() keeps one float32x4_t per result row and loops
 * over exactly four rows, so it is only valid for 4x4 matrices. */
#if N != 4
#error "the NEON kernel in main() handles exactly 4x4 matrices"
#endif

/*
 * Times a 4x4 single-precision matrix product computed two ways, 10000
 * repetitions each: a plain triple loop and a NEON-intrinsics version.
 * After each timed section the elapsed time (as returned by sub_time) and
 * the resulting matrix are printed so the two results can be compared.
 *
 * Returns 0.
 */
int main(void)
{
    /* Only the top-left 4x4 corner of each 16x16 array is given explicit
     * values; the remaining elements are zero-initialized and never read. */
    float a1[16][16] = {{1,2,3,4},{2,3,4,5},{3,4,5,6},{4,5,6,7}};
    float c1[16][16] = {{2,2,3,4},{2,5,4,5},{3,4,7,6},{4,9,6,7}};
    float a2[16][16] = {{1,2,3,4},{2,3,4,5},{3,4,5,6},{4,5,6,7}};
    float c2[16][16] = {{2,2,3,4},{2,5,4,5},{3,4,7,6},{4,9,6,7}};
    float d[N][N] = {{0}};
    float e[N][N] = {{0}};
    int i, j, k, m;
    struct timeval t1, t0;

    /* ---- Scalar reference implementation ---- */
    gettimeofday(&t0, NULL);
    for (i = 0; i < 10000; i++) {
        for (j = 0; j < N; j++) {
            for (k = 0; k < N; k++) {
                /* Accumulate into a local that starts at zero on every
                 * repetition. Adding straight into d[j][k] would carry the
                 * previous repetitions' sums forward and leave d at 10000
                 * times the real product. */
                float sum = 0.0f;
                for (m = 0; m < N; m++) {
                    sum += a1[j][m] * c1[m][k];
                }
                d[j][k] = sum;
            }
        }
    }
    gettimeofday(&t1, NULL);
    printf("basic time used: %0.3f.\n", sub_time(t1,t0));
    for (j = 0; j < N; j++) {
        for (k = 0; k < N; k++) {
            printf("%f\t", d[j][k]);
        }
        printf("\n");
    }

    /* ---- NEON implementation ---- */
    gettimeofday(&t0, NULL);
    for (i = 0; i < 10000; i++) {
        /* One accumulator vector per row of the result, reset each repetition. */
        float32x4_t vc0 = vdupq_n_f32(0.0f);
        float32x4_t vc1 = vdupq_n_f32(0.0f);
        float32x4_t vc2 = vdupq_n_f32(0.0f);
        float32x4_t vc3 = vdupq_n_f32(0.0f);
        for (j = 0; j < 4; j++) {
            /* Compute the whole 4x4 block directly with NEON: load row j of
             * c2 once, then add a2[r][j] * (row j of c2) into result row r. */
            float32x4_t vb = vld1q_f32(&c2[j][0]);
            vc0 = vmlaq_f32(vc0, vdupq_n_f32(a2[0][j]), vb);
            vc1 = vmlaq_f32(vc1, vdupq_n_f32(a2[1][j]), vb);
            vc2 = vmlaq_f32(vc2, vdupq_n_f32(a2[2][j]), vb);
            vc3 = vmlaq_f32(vc3, vdupq_n_f32(a2[3][j]), vb);
        }
        /* Write the four finished rows back to e. */
        vst1q_f32(&e[0][0], vc0);
        vst1q_f32(&e[1][0], vc1);
        vst1q_f32(&e[2][0], vc2);
        vst1q_f32(&e[3][0], vc3);
    }
    gettimeofday(&t1, NULL);
    printf("neon time used: %0.3f.\n", sub_time(t1,t0));
    for (j = 0; j < N; j++) {
        for (k = 0; k < N; k++) {
            printf("%f\t", e[j][k]);
        }
        printf("\n");
    }
    return 0;
}
运行结果:
N = 4时,
| 优化级别 | 正常矩阵乘法 | neon矩阵乘法 |
|---|---|---|
| -O0 | 60ms | 14ms |
| -O3 | 1.8ms | 2.8ms |
看来在 -O0 下 neon 优化的效果还是比较明显的(60ms 对 14ms);但在 -O3 下 neon 版本反而比普通版本慢(2.8ms 对 1.8ms),原因是最外面的循环导致 neon 的效率降低(反复 load 和 store)。