1. Generate the matrix-multiplication accelerator IP core in Vivado HLS.
2. Complete the Block Design in Vivado.
3. Call the IP from a Jupyter Notebook.
The download link for the complete project files is at the end of this article.
PYNQ-Z2 board, USB cable, Ethernet cable
SD card, card reader
Reference link for setting up the board
#define MAT_A_ROWS 32
#define MAT_A_COLS 32
#define MAT_B_ROWS 32
#define MAT_B_COLS 32
typedef int mat_a_t;
typedef int mat_b_t;
typedef int result_t;
void matrixmul(
    mat_a_t a[MAT_A_ROWS][MAT_A_COLS],
    mat_b_t b[MAT_B_ROWS][MAT_B_COLS],
    result_t res[MAT_A_ROWS][MAT_B_COLS]);
mul.h defines the dimensions of the input matrices a and b, the dimensions of the output matrix, and the element data types.
#include "mul.h"
void matrixmul(
mat_a_t a[MAT_A_ROWS][MAT_A_COLS],
mat_b_t b[MAT_B_ROWS][MAT_B_COLS],
result_t res[MAT_A_ROWS][MAT_B_COLS])
{
int tempA[MAT_A_ROWS][MAT_A_COLS];
int tempB[MAT_B_ROWS][MAT_B_COLS];
int tempAB[MAT_A_ROWS][MAT_B_COLS];
for (int ia = 0; ia<MAT_A_ROWS ;ia++){
for(int ja = 0; ja< MAT_A_COLS; ja++){
tempA[ia][ja] = a[ia][ja];
}
}
for (int ib = 0; ib<MAT_B_ROWS ;ib++){
for(int jb = 0; jb< MAT_B_COLS; jb++){
tempB[ib][jb] = b[ib][jb];
}
}
/* for each row and column of AB */
row: for(int i = 0; i < MAT_A_ROWS; ++i) {
col: for(int j = 0; j < MAT_B_COLS; ++j) {
/* compute (AB)i,j */
int ABij = 0;
product: for(int k = 0; k < MAT_A_COLS; ++k) {
ABij += tempA[i][k] * tempB[k][j];
}
tempAB[i][j] = ABij;
}
}
for (int iab = 0; iab<MAT_A_ROWS ;iab++){
for(int jab = 0; jab< MAT_B_COLS; jab++){
res[iab][jab] = tempAB[iab][jab];
}
}
}
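The original post does not show a C testbench, but one is needed to run C simulation in Vivado HLS before synthesis. Below is a minimal sketch; the file name mul_test.c and the choice of the constant 8 (matching the on-board test later in this post) are assumptions, not taken from the project files.

/* Hypothetical mul_test.c for C simulation (not part of the original listing):
   multiplies two constant matrices and checks the result against the expected value. */
#include <stdio.h>
#include "mul.h"

int main(void)
{
    mat_a_t a[MAT_A_ROWS][MAT_A_COLS];
    mat_b_t b[MAT_B_ROWS][MAT_B_COLS];
    result_t hw[MAT_A_ROWS][MAT_B_COLS];
    int errors = 0;

    /* fill a and b with the same constant used later on the board */
    for (int i = 0; i < MAT_A_ROWS; i++)
        for (int j = 0; j < MAT_A_COLS; j++) {
            a[i][j] = 8;
            b[i][j] = 8;
        }

    matrixmul(a, b, hw);

    /* every element should be 32 * 8 * 8 = 2048 */
    for (int i = 0; i < MAT_A_ROWS; i++)
        for (int j = 0; j < MAT_B_COLS; j++)
            if (hw[i][j] != MAT_A_COLS * 8 * 8)
                errors++;

    printf("%s: %d mismatches\n", errors ? "FAIL" : "PASS", errors);
    return errors;
}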
The detailed configuration of the PYNQ-Z2 board is covered by the link at the top of this article; from here on it is assumed that the board boots normally.
import pynq.lib.dma
import numpy as np
from pynq import Xlnk

# load the bitstream and get handles to the two DMA engines
mmol = pynq.Overlay("./mul.bit")
dma0 = mmol.axi_dma_0
dma1 = mmol.axi_dma_1

# allocate physically contiguous buffers for the DMA transfers
xlnk = Xlnk()
a = xlnk.cma_array(shape=(32, 32), dtype=np.int32)
b = xlnk.cma_array(shape=(32, 32), dtype=np.int32)
res = xlnk.cma_array(shape=(32, 32), dtype=np.int32)

for i in range(32):
    for j in range(32):
        a[i][j] = 8
        b[i][j] = 8

# stream a and b into the IP, read the product back, and wait for completion
dma0.sendchannel.transfer(a)
dma1.sendchannel.transfer(b)
dma0.recvchannel.transfer(res)
dma0.sendchannel.wait()
dma1.sendchannel.wait()
dma0.recvchannel.wait()

print(res)
The input matrices a and b are 32×32 with every element equal to 8, so every element of the product should be 32 × 8 × 8 = 2048. The printed result matches this, showing that the multiplication IP core is called correctly.
Project download link
[Updated 2020-06-18: fixes the TLAST pin warning in the Block Design]
The fix is to change the port data types so that each stream element carries the AXI4-Stream side-channel signals.
mul.c:
#include "mul.h"
void matrixmul(
mat_a_t a[SIZE],
mat_b_t b[SIZE],
result_t res[SIZE])
{
int tempA[MAT_A_ROWS][MAT_A_COLS];
int tempB[MAT_B_ROWS][MAT_B_COLS];
int tempAB[MAT_A_ROWS][MAT_B_COLS];
for (int ia = 0; ia<MAT_A_ROWS ;ia++){
for(int ja = 0; ja< MAT_A_COLS; ja++){
tempA[ia][ja] = a[ia*MAT_A_ROWS+ja].data;
}
}
for (int ib = 0; ib<MAT_B_ROWS ;ib++){
for(int jb = 0; jb< MAT_B_COLS; jb++){
tempB[ib][jb] = b[ib*MAT_A_ROWS+jb].data;
}
}
/* for each row and column of AB */
row: for(int i = 0; i < MAT_A_ROWS; ++i) {
col: for(int j = 0; j < MAT_B_COLS; ++j) {
/* compute (AB)i,j */
int ABij = 0;
product: for(int k = 0; k < MAT_A_COLS; ++k) {
ABij += tempA[i][k] * tempB[k][j];
}
tempAB[i][j] = ABij;
}
}
for (int iab = 0; iab<MAT_A_ROWS ;iab++){
for(int jab = 0; jab< MAT_B_COLS; jab++){
res[iab*MAT_A_ROWS+jab]=push_stream<int>(tempAB[iab][jab],iab==(MAT_A_ROWS-1)&&jab==(MAT_B_COLS-1));
}
}
}
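Note that the listing above does not show any interface directives. For the IP to connect to the AXI DMA blocks, the ports a, b, and res must be mapped to AXI4-Stream interfaces; this can be done from the Vivado HLS Directives pane or with pragmas at the top of the matrixmul body. The sketch below shows what such pragmas could look like; the exact directives used in the original project, including the choice of an AXI4-Lite interface for block-level control, are assumptions.

/* Sketch only: interface pragmas mapping the ports to AXI4-Stream.
   Not shown in the original listing; they may have been set from the Directives pane instead. */
#pragma HLS INTERFACE axis port=a
#pragma HLS INTERFACE axis port=b
#pragma HLS INTERFACE axis port=res
/* block-level control over AXI4-Lite so the PS can start the core */
#pragma HLS INTERFACE s_axilite port=return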
mul.h:
#ifndef __MATRIXMUL_H__
#define __MATRIXMUL_H__
#include <ap_axi_sdata.h>   // defines the ap_axis stream struct
#include <assert.h>         // for the (optional) assert below
// Uncomment this line to compare TB vs HW C-model and/or RTL
//#define HW_COSIM
#define MAT_A_ROWS 32
#define MAT_A_COLS 32
#define MAT_B_ROWS 32
#define MAT_B_COLS 32
#define SIZE 1024
typedef ap_axis<32,0,0,0> mat_a_t;
typedef ap_axis<32,0,0,0> mat_b_t;
typedef ap_axis<32,0,0,0> result_t;
// Prototype of top level function for C-synthesis
void matrixmul(
    mat_a_t a[SIZE],
    mat_b_t b[SIZE],
    result_t res[SIZE]);
template <typename T>
ap_axis<32,0,0,0> push_stream(T const &v, bool last = false)
{
#pragma HLS INLINE
    ap_axis<32,0,0,0> e;
    //assert(sizeof(T) == sizeof(int));
    union
    {
        int oval;
        T   ival;
    } converter;
    converter.ival = v;
    e.data = converter.oval;
    e.strb = -1;          // assert all byte-strobe bits: every byte is valid
    e.keep = 15;          // 0b1111: keep all 4 bytes of the 32-bit TDATA
    e.last = last ? 1 : 0;
    return e;
}
#endif // __MATRIXMUL_H__ not defined
The ap_axis data type carries a last field, and the last field of the final output beat must be set to 1. Alternatively, you can define your own struct as the port data type, as long as it contains a last member. With this change, validating the Block Design no longer reports the warning about a missing TLAST signal.
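As a sketch of the second option, a hand-rolled stream element only needs the data payload plus a last member; the type and member names below are illustrative, not taken from the original project.

/* Sketch only: a minimal user-defined stream element with a TLAST side channel. */
#include <ap_int.h>

struct axis_word_t {
    int        data;   // 32-bit payload, matches the DMA data width
    ap_uint<1> last;   // mapped to TLAST on the AXI4-Stream interface
};

// Used just like ap_axis: set last to 1 on the final element of the frame.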