FPGA硬件加速vivado hls------------004 矩阵乘法

矩阵乘法的代码如下

/*
 * Dense matrix product: AB = A * B, where A is N x M and B is M x P.
 * N, M and P are compile-time constants defined outside this function.
 */
void matrixmul(int A[N][M], int B[M][P], int AB[N][P]) {
  /* A is reshaped along its second dimension and B along its first,
     i.e. along the index k that the dot product below walks over. */
  #pragma HLS ARRAY_RESHAPE variable=A complete dim=2
  #pragma HLS ARRAY_RESHAPE variable=B complete dim=1
  row: for (int r = 0; r < N; r++) {
    col: for (int c = 0; c < P; c++) {
      #pragma HLS PIPELINE II=1
      /* dot product of row r of A with column c of B */
      int dot = 0;
      product: for (int k = 0; k < M; k++) {
        dot = dot + A[r][k] * B[k][c];
      }
      AB[r][c] = dot;
    }
  }
}

FPGA硬件加速vivado hls------------004 矩阵乘法_第1张图片
这次把pipeline放在row的循环:
FPGA硬件加速vivado hls------------004 矩阵乘法_第2张图片
为了进一步优化，下面改用分块矩阵乘法（blockmatmul）的实现方式。
代码如下
头文件

#ifndef _BLOCK_MM_H_
#define _BLOCK_MM_H_
#include "hls_stream.h"
// NOTE(review): the names of the three standard headers below were lost from
// the source text (everything between '<' and '>' was stripped); these are
// the presumed originals -- confirm against the project.
#include <iostream>
#include <iomanip>
#include <vector>
using namespace std;

// Element type of all matrices.
typedef int DTYPE;
// Matrix edge length (matrices are SIZE x SIZE).
const int SIZE = 8;
// Edge length of the tile produced by one blockmatmul call.
const int BLOCK_SIZE = 4;

// One stream word: BLOCK_SIZE matrix elements.
typedef struct { DTYPE a[BLOCK_SIZE]; } blockvec;

// One BLOCK_SIZE x BLOCK_SIZE output tile.
typedef struct { DTYPE out[BLOCK_SIZE][BLOCK_SIZE]; } blockmat;

// Computes one output tile from streamed input words. `it` is the call
// counter; it is an int (not DTYPE) so the prototype keeps matching the
// definition even if DTYPE is changed to another element type.
void blockmatmul(hls::stream<blockvec> &Arows, hls::stream<blockvec> &Bcols,
                 blockmat &ABpartial, int it);
#endif

主代码

#include "block_mm.h"
void blockmatmul(hls::stream &Arows, hls::stream &Bcols,
        blockmat &ABpartial, int it) {
  #pragma HLS DATAFLOW
  int counter = it % (SIZE/BLOCK_SIZE);
  static DTYPE A[BLOCK_SIZE][SIZE];
  if(counter == 0){ //only load the A rows when necessary
    loadA: for(int i = 0; i < SIZE; i++) {
      blockvec tempA = Arows.read();
      for(int j = 0; j < BLOCK_SIZE; j++) {
        #pragma HLS PIPELINE II=1
        A[j][i] = tempA.a[j];
      }
    }
  }
  DTYPE AB[BLOCK_SIZE][BLOCK_SIZE] = { 0 };
  partialsum: for(int k=0; k < SIZE; k++) {
    blockvec tempB = Bcols.read();
    for(int i = 0; i < BLOCK_SIZE; i++) {
      for(int j = 0; j < BLOCK_SIZE; j++) {
        AB[i][j] = AB[i][j] +  A[i][k] * tempB.a[j];
      }
    }
  }
  writeoutput: for(int i = 0; i < BLOCK_SIZE; i++) {
    for(int j = 0; j < BLOCK_SIZE; j++) {
      ABpartial.out[i][j] = AB[i][j];
    }
  }
}

testbench

#include "block_mm.h"
#include <cstdlib>
#include <iostream>
using namespace std;

void matmatmul_sw(DTYPE A[SIZE][SIZE], DTYPE B[SIZE][SIZE],
      DTYPE out[SIZE][SIZE]){
 DTYPE sum = 0;
 for(int i = 0; i < SIZE; i++){
  for(int j = 0;j strm_matrix1("strm_matrix1");
 hls::stream strm_matrix2("strm_matrix2");
 blockvec strm_matrix1_element, strm_matrix2_element;
 blockmat block_out;
 DTYPE A[SIZE][SIZE], B[SIZE][SIZE];
 DTYPE matrix_swout[SIZE][SIZE], matrix_hwout[SIZE][SIZE];

 initmatrices: for(int i = 0; i < SIZE; i++){
  for(int j = 0; j < SIZE; j++){
   A[i][j] = rand() % 512;
   B[i][j] = rand() % 512;
   matrix_swout[i][j] = 0;
   matrix_hwout[i][j] = 0;
  }

  int row, col, it = 0;
    for(int it1 = 0; it1 < SIZE; it1 = it1 + BLOCK_SIZE) {
      for(int it2 = 0; it2 < SIZE; it2 = it2 + BLOCK_SIZE) {
        row = it1; //row + BLOCK_SIZE * factor_row;
        col = it2; //col + BLOCK_SIZE * factor_col;

        for(int k = 0; k < SIZE; k++) {
          for(int i = 0; i < BLOCK_SIZE; i++) {
            if(it % (SIZE/BLOCK_SIZE) == 0) strm_matrix1_element.a[i] = A[row+i][k];
            strm_matrix2_element.a[i] = B[k][col+i];
          }
          if(it % (SIZE/BLOCK_SIZE) == 0) strm_matrix1.write(strm_matrix1_element);
          strm_matrix2.write(strm_matrix2_element);
        }
        blockmatmul(strm_matrix1, strm_matrix2, block_out, it);

        for(int i = 0; i < BLOCK_SIZE; i++)
          for(int j = 0; j < BLOCK_SIZE; j++)
            matrix_hwout[row+i][col+j] = block_out.out[i][j];
        it = it + 1;
      }
    }

    matmatmul_sw(A, B, matrix_swout);

    for(int i = 0; i

你可能感兴趣的:(FPGA硬件加速vivado hls------------004 矩阵乘法)