矩阵乘法优化:4x4矩阵块优化方法

MMult_4x4_3.h

一次计算C中的4x4小块

0.24gflops

2.1%

1

MMult_4x4_4.h

一次计算C中的4x4小块

0.24gflops

2.1%

1

MMult_4x4_5.h

一次计算C中的4x4小块,将16个循环合并一个

0.25gflops

2.2%

1

MMult_4x4_6.h

一次计算C中的4x4小块(我们在寄存器中累加C的元素,并对a的元素使用寄存器)

1.75gflops

16.0%

1

MMult_4x4_7.h

在MMult_4x4_6的基础上用指针来寻址B中的元素

1.75gflops

16.0%

1

MMult_4x4_8.h

使用更多的寄存器

1.75gflops

16.0%

1

MMult_4x4_10.h

NEON指令集优化

2.6gflops

23.8%

1

MMult_4x4_11.h

NEON指令集优化,并且为了保持在较小问题规模下所获得的性能,我们对矩阵C(以及相应的A和B)进行分块

2.6gflops

23.8%

1

MMult_4x4_13.h

NEON指令集优化,对矩阵A和B进行打包(Pack),这样就可以连续访问内存

2.6gflops

23.8%

1

MMult_4x4_3:

一次计算C矩阵的16个元素:

void AddDot( int k, float *x, int incx,  float *y, float *gamma )
{
  int p;
  for ( p=0; p

MMult_4x4_4:

将上一步对 AddDot 的调用直接展开为乘加计算,并写成循环操作;把两个计算函数合并成一个,并显式地将C矩阵中各个索引的元素作为计算结果:

void AddDot4x4( int k, float *a, int lda,  float *b, int ldb, float *c, int ldc )
{
  int p;

  /* First row */
  //  AddDot( k, &A( 0, 0 ), lda, &B( 0, 0 ), &C( 0, 0 ) );
  for ( p=0; p

MMult_4x4_5:

上一步一次for循环只计算1个C中元素(共16个循环),这一步将16个循环合并为一个,一次for循环计算C中4x4小块的全部16个元素:

void AddDot4x4( int k, float *a, int lda,  float *b, int ldb, float *c, int ldc )
{
  int p;

  for ( p=0; p

MMult_4x4_6:

将A和C矩阵中元素送入寄存器来优化计算。每次只计算1个C中元素:

void AddDot4x4( int k, float *a, int lda,  float *b, int ldb, float *c, int ldc )
{
  int p;
  register float 
       c_00_reg,   c_01_reg,   c_02_reg,   c_03_reg,  
       c_10_reg,   c_11_reg,   c_12_reg,   c_13_reg,  
       c_20_reg,   c_21_reg,   c_22_reg,   c_23_reg,  
       c_30_reg,   c_31_reg,   c_32_reg,   c_33_reg,
       a_0p_reg,
       a_1p_reg,
       a_2p_reg,
       a_3p_reg;

  c_00_reg = 0.0;   c_01_reg = 0.0;   c_02_reg = 0.0;   c_03_reg = 0.0;
  c_10_reg = 0.0;   c_11_reg = 0.0;   c_12_reg = 0.0;   c_13_reg = 0.0;
  c_20_reg = 0.0;   c_21_reg = 0.0;   c_22_reg = 0.0;   c_23_reg = 0.0;
  c_30_reg = 0.0;   c_31_reg = 0.0;   c_32_reg = 0.0;   c_33_reg = 0.0;

  for ( p=0; p

MMult_4x4_7:

将A和C矩阵中元素送入寄存器,并用指针来寻址B矩阵中的元素,通过指针自增(++)来获取B矩阵中的下一个元素:

void AddDot4x4( int k, float *a, int lda,  float *b, int ldb, float *c, int ldc )
{
  int p;
  register float 

       c_00_reg,   c_01_reg,   c_02_reg,   c_03_reg,  
       c_10_reg,   c_11_reg,   c_12_reg,   c_13_reg,  
       c_20_reg,   c_21_reg,   c_22_reg,   c_23_reg,  
       c_30_reg,   c_31_reg,   c_32_reg,   c_33_reg,

       a_0p_reg,
       a_1p_reg,
       a_2p_reg,
       a_3p_reg;
  float 
    /* Point to the current elements in the four columns of B */
    *b_p0_pntr, *b_p1_pntr, *b_p2_pntr, *b_p3_pntr; 

  c_00_reg = 0.0;   c_01_reg = 0.0;   c_02_reg = 0.0;   c_03_reg = 0.0;
  c_10_reg = 0.0;   c_11_reg = 0.0;   c_12_reg = 0.0;   c_13_reg = 0.0;
  c_20_reg = 0.0;   c_21_reg = 0.0;   c_22_reg = 0.0;   c_23_reg = 0.0;
  c_30_reg = 0.0;   c_31_reg = 0.0;   c_32_reg = 0.0;   c_33_reg = 0.0;

  for ( p=0; p

MMult_4x4_8:

将A,B,C矩阵中元素都送入寄存器中来加速计算:

void AddDot4x4( int k, float *a, int lda,  float *b, int ldb, float *c, int ldc )
{
  int p;
       c_00_reg,   c_01_reg,   c_02_reg,   c_03_reg,  
       c_10_reg,   c_11_reg,   c_12_reg,   c_13_reg,  
       c_20_reg,   c_21_reg,   c_22_reg,   c_23_reg,  
       c_30_reg,   c_31_reg,   c_32_reg,   c_33_reg,
       a_0p_reg,
       a_1p_reg,
       a_2p_reg,
       a_3p_reg,
       b_p0_reg,
       b_p1_reg,
       b_p2_reg,
       b_p3_reg;

  float 
    /* Point to the current elements in the four rows of A */
    *a_0p_pntr, *a_1p_pntr, *a_2p_pntr, *a_3p_pntr;
  
  a_0p_pntr = &A( 0, 0);
  a_1p_pntr = &A( 1, 0);
  a_2p_pntr = &A( 2, 0);
  a_3p_pntr = &A( 3, 0);

  c_00_reg = 0.0;   c_01_reg = 0.0;   c_02_reg = 0.0;   c_03_reg = 0.0;
  c_10_reg = 0.0;   c_11_reg = 0.0;   c_12_reg = 0.0;   c_13_reg = 0.0;
  c_20_reg = 0.0;   c_21_reg = 0.0;   c_22_reg = 0.0;   c_23_reg = 0.0;
  c_30_reg = 0.0;   c_31_reg = 0.0;   c_32_reg = 0.0;   c_33_reg = 0.0;

  for ( p=0; p

你可能感兴趣的:(算法,开发语言,c++,矩阵)