This post documents my first end-to-end engineering flow for a neural-network accelerator design, organized as follows.
Python environment: Python 3.6.13, TensorFlow 1.14.0, NumPy 1.16.0
Hardware environment: Vivado 2019, Vivado HLS 2019, PYNQ-Z1 (xc7z020clg400-1)
This network performs image recognition. To the human eye an image has colors, whereas in the computer an image is stored as numbers: in the simplest black-and-white case each pixel is a grayscale value between 0 and 1, with 0 representing white, 1 representing black, and values in between representing shades of gray; a color image is represented by mixing three channels, red, green and blue (RGB). The traditional approach to digit recognition is: every pixel of the image, i.e. every entry of the pixel matrix, contributes a piece of evidence for each of the possible classes (0-9); the evidence for each class is summed over the whole image, and the class with the highest total is taken as the recognition result, i.e. y = softmax(Wx + b), where softmax normalizes the result so that back-propagation during training is convenient.
Code:
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets("MNIST_data/", one_hot=True)
import tensorflow as tf
x = tf.placeholder(tf.float32, [None, 784])
W = tf.Variable(tf.zeros([784, 10]))
b = tf.Variable(tf.zeros([10]))
y = tf.nn.softmax(tf.matmul(x, W) + b)
y_ = tf.placeholder(tf.float32, [None, 10])
cross_entropy = tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(y), reduction_indices=[1]))
train_step = tf.train.GradientDescentOptimizer(0.5).minimize(cross_entropy)
sess = tf.InteractiveSession()
init = tf.global_variables_initializer()
sess.run(init)
for _ in range(1000):
    batch_xs, batch_ys = mnist.train.next_batch(100)
    sess.run(train_step, {x: batch_xs, y_: batch_ys})
#print(sess.run(tf.matmul(x, W) + b, {x: mnist.test.images}))
correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
print(sess.run(accuracy, {x: mnist.test.images, y_: mnist.test.labels}))
Convolution and pooling layers repeatedly extract information from the image, and the fully connected layers at the end produce the output. Because a convolutional network should also generalize (its accuracy should remain high across different data sets), dropout is added to prevent overfitting: during training a random fraction of the neurons is temporarily dropped.
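To make the dropout step concrete, here is a minimal NumPy sketch of inverted dropout (my own illustration, not part of the project code); tf.nn.dropout with keep_prob behaves in essentially this way:
import numpy as np

def dropout(activations, keep_prob, rng=np.random):
    # Randomly zero a fraction (1 - keep_prob) of the activations and scale the
    # survivors by 1/keep_prob ("inverted dropout") so the expected value is unchanged.
    mask = rng.random_sample(activations.shape) < keep_prob
    return activations * mask / keep_prob

h = np.array([0.5, 1.2, 0.0, 2.3])
print(dropout(h, keep_prob=0.5))   # roughly half of the entries become 0, the rest are doubled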
The following is the data-dump code. After training, the parameters of every layer of the network need to be saved so that the hardware side can use them later. Note that, to stay consistent with how the C functions on the hardware side read the files, the values have to be written out according to the dimensionality of the stored tensor rather than dumped naively.
import numpy as np

def Record_Tensor(tensor, name):
    print("Recording tensor " + name + " ...")
    f = open('./record/' + name + '.dat', 'w')
    array = tensor.eval()
    #print("The range: [" + str(np.min(array)) + ":" + str(np.max(array)) + "]")
    if np.size(np.shape(array)) == 1:
        Record_Array1D(array, name, f)
    elif np.size(np.shape(array)) == 2:
        Record_Array2D(array, name, f)
    elif np.size(np.shape(array)) == 3:
        Record_Array3D(array, name, f)
    else:
        Record_Array4D(array, name, f)
    f.close()

def Record_Array1D(array, name, f):
    for i in range(np.shape(array)[0]):
        f.write(str(array[i]) + "\n")

def Record_Array2D(array, name, f):
    for i in range(np.shape(array)[0]):
        for j in range(np.shape(array)[1]):
            f.write(str(array[i][j]) + "\n")

def Record_Array3D(array, name, f):
    for i in range(np.shape(array)[0]):
        for j in range(np.shape(array)[1]):
            for k in range(np.shape(array)[2]):
                f.write(str(array[i][j][k]) + "\n")

def Record_Array4D(array, name, f):
    for i in range(np.shape(array)[0]):
        for j in range(np.shape(array)[1]):
            for k in range(np.shape(array)[2]):
                for l in range(np.shape(array)[3]):
                    f.write(str(array[i][j][k][l]) + "\n")
Next, the network itself is built with the TensorFlow API.
def weight_variable(shape):
    initial = tf.compat.v1.truncated_normal(shape, stddev=0.1)
    return tf.Variable(initial)

def bias_variable(shape):
    initial = tf.constant(0.1, shape=shape)
    return tf.Variable(initial)

def conv2d(x, W):
    return tf.nn.conv2d(x, W, strides=[1, 1, 1, 1], padding='SAME')

def max_pool_2x2(x):
    return tf.nn.max_pool(x, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')

#First Convolutional Layer
with tf.name_scope('1st_CNN'):
    W_conv1 = weight_variable([3, 3, 1, 16])
    b_conv1 = bias_variable([16])
    x_image = tf.reshape(x, [-1, 28, 28, 1])
    h_conv1 = tf.nn.relu(conv2d(x_image, W_conv1) + b_conv1)
    h_pool1 = max_pool_2x2(h_conv1)

#Second Convolutional Layer
with tf.name_scope('2nd_CNN'):
    W_conv2 = weight_variable([3, 3, 16, 32])
    b_conv2 = bias_variable([32])
    h_conv2 = tf.nn.relu(conv2d(h_pool1, W_conv2) + b_conv2)
    h_pool2 = max_pool_2x2(h_conv2)

#Densely Connected Layer
with tf.name_scope('Densely_NN'):
    W_fc1 = weight_variable([7 * 7 * 32, 128])
    b_fc1 = bias_variable([128])
    h_pool2_flat = tf.reshape(h_pool2, [-1, 7 * 7 * 32])
    h_fc1 = tf.nn.relu(tf.matmul(h_pool2_flat, W_fc1) + b_fc1)

#Dropout
with tf.name_scope('Dropout'):
    keep_prob = tf.placeholder("float")
    h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob)

#Readout Layer
with tf.name_scope('Softmax'):
    W_fc2 = weight_variable([128, 10])
    b_fc2 = bias_variable([10])
    y_conv = tf.nn.softmax(tf.matmul(h_fc1_drop, W_fc2) + b_fc2)

with tf.name_scope('Loss'):
    cross_entropy = -tf.reduce_sum(y_ * tf.log(y_conv))

with tf.name_scope('Train'):
    train_step = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy)

with tf.name_scope('Accuracy'):
    correct_prediction = tf.equal(tf.argmax(y_conv, 1), tf.argmax(y_, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))

merged = tf.summary.merge_all()
writer = tf.summary.FileWriter("logs/", sess.graph)
tf.global_variables_initializer().run()

for i in range(10000):
    batch = mnist.train.next_batch(50)
    if i % 20 == 0:
        train_accuracy = accuracy.eval(feed_dict={x: batch[0], y_: batch[1], keep_prob: 1.0})
        print("step %d, training accuracy %g" % (i, train_accuracy))
    train_step.run(feed_dict={x: batch[0], y_: batch[1], keep_prob: 0.5})

print("test accuracy %g" % accuracy.eval(feed_dict={x: mnist.test.images, y_: mnist.test.labels, keep_prob: 1.0}))
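After training, the learned parameters can be written out with the Record_Tensor function defined above. A minimal sketch (the parameter names match the network above; the output file names are my own choice and should match whatever the hardware-side loader expects):
# Dump every trained parameter of the network to ./record/*.dat
Record_Tensor(W_conv1, "W_conv1")
Record_Tensor(b_conv1, "b_conv1")
Record_Tensor(W_conv2, "W_conv2")
Record_Tensor(b_conv2, "b_conv2")
Record_Tensor(W_fc1, "W_fc1")
Record_Tensor(b_fc1, "b_fc1")
Record_Tensor(W_fc2, "W_fc2")
Record_Tensor(b_fc2, "b_fc2")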
The .dat files saved from the network cannot be read by C directly; they have to be converted from text to binary (.dat to .bin). The C program below does this conversion, or NumPy's ndarray.tofile can be used on the Python side instead (see the Python sketch after the C program).
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
char* filename_to_bin(char *filename_i)
{
    int filename_length = 0;
    while (filename_i[filename_length] != '\0') filename_length++;
    //printf("filename length=%d\n", filename_length);
    char *filename_bin = (char *)malloc(filename_length + 1);
    int i = 0;
    while (!(filename_i[i] == '.' && filename_i[i+1] == 'd' && filename_i[i+2] == 'a'
             && filename_i[i+3] == 't' && filename_i[i+4] == '\0')) // not at ".dat\0" yet
    {
        if (i == filename_length - 1) // reached the end without finding ".dat"
        {
            free(filename_bin);
            filename_bin = NULL;
            return filename_bin;
        }
        filename_bin[i] = filename_i[i];
        i++;
    }
    // replace the ".dat" suffix with ".bin"
    filename_bin[i] = '.'; filename_bin[i+1] = 'b'; filename_bin[i+2] = 'i'; filename_bin[i+3] = 'n';
    filename_bin[i+4] = '\0';
    return filename_bin;
}

int main(int argc, char *argv[])
{
    for (int i = 1; i < argc; i++)
    {
        char *filename_i = argv[i];
        char *filename_bin = filename_to_bin(filename_i);
        if (filename_bin == NULL)
        {
            printf("%s is not a dat file\n", filename_i);
            break;
        }
        FILE *fp_IN;
        if ((fp_IN = fopen(filename_i, "r")) == NULL)
        {
            printf("File %s cannot be opened\n", filename_i);
            break;
        }
        FILE *fp_OUT = fopen(filename_bin, "wb");
        if (fp_OUT == NULL)
        {
            printf("File %s cannot be created\n", filename_bin);
            break;
        }
        char str[20];
        while (fgets(str, 20, fp_IN))
        {
            //printf("%s=%f\n", str, atof(str));
            float tp = atof(str);
            fwrite(&tp, sizeof(float), 1, fp_OUT);
        }
        fclose(fp_IN);
        fflush(fp_OUT);
        fclose(fp_OUT);
        free(filename_bin);
    }
    return 0;
}
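As mentioned above, the same conversion can also be done in Python with NumPy's tofile. A minimal sketch, assuming the .dat files live in ./record/ and contain one decimal value per line, as written by Record_Tensor:
import glob
import os
import numpy as np

# Convert every ./record/*.dat text file into a raw little-endian float32 .bin file.
for dat_path in glob.glob('./record/*.dat'):
    values = np.loadtxt(dat_path, dtype=np.float32)     # one value per line
    bin_path = os.path.splitext(dat_path)[0] + '.bin'
    values.astype('<f4').tofile(bin_path)               # raw 4-byte floats, no header
    print(dat_path, '->', bin_path, values.size, 'values')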
The network above involves a large amount of computation, so a parallel hardware design can execute it much faster. The rest of this post covers the hardware-side code. The design could be written directly in Verilog or generated through HLS; this project uses HLS, so the next step is to write the HLS code.
The code for the conv core is as follows:
#include "conv_core.h"

//Feature: [H][W][C]
//Kernel:  [Ky][Kx][CHin][CHout]
void Conv(ap_uint<16> CHin, ap_uint<16> Hin, ap_uint<16> Win, ap_uint<16> CHout,
          ap_uint<8> Kx, ap_uint<8> Ky, ap_uint<8> Sx, ap_uint<8> Sy, ap_uint<1> mode, ap_uint<1> relu_en,
          Dtype_f feature_in[], Dtype_w W[], Dtype_w bias[], Dtype_f feature_out[]
         ) //mode: 0:VALID, 1:SAME
{
    ap_uint<8> pad_x, pad_y;
    if (mode == 0)
    {
        pad_x = 0; pad_y = 0;
    }
    else
    {
        pad_x = (Kx - 1) / 2; pad_y = (Ky - 1) / 2;
    }
    ap_uint<16> Hout, Wout;
    Wout = (Win + 2 * pad_x - Kx) / Sx + 1;
    Hout = (Hin + 2 * pad_y - Ky) / Sy + 1;

    for (int cout = 0; cout < CHout; cout++)
        for (int i = 0; i < Hout; i++)
            for (int j = 0; j < Wout; j++)
            {
                Dtype_acc sum = 0;
                for (int ii = 0; ii < Ky; ii++)
                    for (int jj = 0; jj < Kx; jj++)
                    {
                        ap_int<16> h = i * Sy - pad_y + ii;
                        ap_int<16> w = j * Sx - pad_x + jj;
                        if (h >= 0 && w >= 0 && h < Hin && w < Win)
                        {
                            for (int cin = 0; cin < CHin; cin++)
                            {
                                Dtype_mul tp = feature_in[h * CHin * Win + w * CHin + cin]
                                             * W[ii * Kx * CHin * CHout + jj * CHin * CHout + cin * CHout + cout];
                                sum += tp;
                            }
                        }
                    }
                sum += bias[cout];
                if (relu_en && sum < 0)
                    sum = 0;
                feature_out[i * Wout * CHout + j * CHout + cout] = sum;
            }
}
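To sanity-check the C simulation of this core, I compare it against a small NumPy reference model (my own addition, not part of the original project) that uses the same [H][W][C] feature layout and [Ky][Kx][CHin][CHout] weight layout as the HLS code:
import numpy as np

def conv_ref(feature_in, W, bias, Sx=1, Sy=1, mode=1, relu_en=True):
    # Reference for the HLS Conv core: feature_in is [H][W][CHin],
    # W is [Ky][Kx][CHin][CHout]; mode 0 = VALID, 1 = SAME.
    Hin, Win, CHin = feature_in.shape
    Ky, Kx, _, CHout = W.shape
    pad_y, pad_x = ((Ky - 1) // 2, (Kx - 1) // 2) if mode else (0, 0)
    Hout = (Hin + 2 * pad_y - Ky) // Sy + 1
    Wout = (Win + 2 * pad_x - Kx) // Sx + 1
    out = np.zeros((Hout, Wout, CHout), dtype=np.float32)
    for i in range(Hout):
        for j in range(Wout):
            for ii in range(Ky):
                for jj in range(Kx):
                    h, w = i * Sy - pad_y + ii, j * Sx - pad_x + jj
                    if 0 <= h < Hin and 0 <= w < Win:
                        # accumulate over input channels for all output channels at once
                        out[i, j, :] += feature_in[h, w, :] @ W[ii, jj, :, :]
    out += bias
    return np.maximum(out, 0) if relu_en else out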
The code for the pool core:
#include "pool_core.h"

#define max(a,b) ((a>b)?a:b)
#define min(a,b) ((a>b)?b:a)

void Pool(ap_uint<16> CHin, ap_uint<16> Hin, ap_uint<16> Win,
          ap_uint<8> Kx, ap_uint<8> Ky, ap_uint<2> mode,
          Dtype_f feature_in[], Dtype_f feature_out[]
         ) //mode: 0:MEAN, 1:MIN, 2:MAX
{
    ap_uint<16> Hout, Wout;
    Wout = Win / Kx;
    Hout = Hin / Ky;

    for (int c = 0; c < CHin; c++)
        for (int i = 0; i < Hout; i++)
            for (int j = 0; j < Wout; j++)
            {
                Dtype_f sum;
                if (mode == 0)
                    sum = 0;
                else if (mode == 1)
                    sum = 99999999999999999;   // "+infinity" seed for MIN
                else
                    sum = -99999999999999999;  // "-infinity" seed for MAX
                for (int ii = 0; ii < Ky; ii++)
                    for (int jj = 0; jj < Kx; jj++)
                    {
                        ap_int<16> h = i * Ky + ii;
                        ap_int<16> w = j * Kx + jj;
                        switch (mode)
                        {
                            case 0: { sum += feature_in[h * CHin * Win + w * CHin + c]; break; }
                            case 1: { sum = min(sum, feature_in[h * CHin * Win + w * CHin + c]); break; }
                            case 2: { sum = max(sum, feature_in[h * CHin * Win + w * CHin + c]); break; }
                            default: break;
                        }
                    }
                if (mode == 0)
                    sum = sum / (Kx * Ky);
                feature_out[i * Wout * CHin + j * CHin + c] = sum;
            }
}
At this point the design can already be synthesized: Vivado HLS will generate a circuit from the code as written. However, since the tool decides the microarchitecture on its own, the result may not reach the expected performance, so directives (pragmas) need to be added. The following code uses a simple 4x4 matrix multiplication to show how directives are added to existing C code:
void Matrix_mult(float A[4][4], float B[4][4], float C[4][4]) {
#pragma HLS INTERFACE s_axilite port=return
#pragma HLS INTERFACE m_axi port=C
#pragma HLS INTERFACE m_axi port=B
#pragma HLS INTERFACE m_axi port=A
    for (int i = 0; i < 4; i++) {
#pragma HLS UNROLL
        for (int j = 0; j < 4; j++)
        {
#pragma HLS UNROLL
            float temp = 0;
            for (int k = 0; k < 4; k++)
            {
#pragma HLS UNROLL
                temp += A[i][k] * B[k][j];
            }
            C[i][j] = temp;
        }
    }
}
This step requires basic PYNQ-Z1 development knowledge; consult the official documentation if you are not familiar with it.
PYNQ hardware drivers can be written in Python. For anyone with no PYNQ experience, a good warm-up is to call an official IP to light the LEDs, as in the following code:
from pynq import Overlay
import numpy as np

ol = Overlay("testAAA.bit")
# ol.ip_dict lists the IP cores contained in the loaded design
ol.download()
gpio = ol.axi_gpio_0
gpio.write(0, 15)
gpio.write(0, 4)

# A neural network involves a large number of array operations, but an array that is
# contiguous in virtual address space is not necessarily contiguous in physical memory.
# Xlnk is therefore used to allocate the arrays; for PYNQ, Xlnk plays roughly the role
# that NumPy plays for Python.
from pynq import Xlnk
xlnk = Xlnk()
input_buffer = xlnk.cma_array(shape=(4000,), cacheable=0, dtype=np.int32)
# Arrays allocated this way are physically contiguous; cacheable controls whether
# writes go through the cache or directly to RAM.
input_buffer.physical_address
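As a quick check (my own example), an ordinary NumPy array can be copied into such a buffer with normal slicing, and the physical base address is what later gets written into the accelerator's registers:
# Fill the physically contiguous buffer from an ordinary NumPy array.
data = np.arange(4000, dtype=np.int32)
input_buffer[:] = data

# The hardware is given the physical base address of the buffer,
# e.g. by writing it into one of the IP's AXI-Lite registers.
print(hex(input_buffer.physical_address))
print(input_buffer[:5])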
Once the basic PYNQ IP-calling flow is familiar, the CPU-side driver can be developed.
In fact, generating an IP with HLS also produces a hardware driver, saved under a directory such as Documents/frb/source2/PYNQ/hls/pool_core/solution1/impl/ip/drivers/Pool_v1_0/src$ (again using my own paths as an example). That driver is not directly usable here, though, because it is written in C while we need a Python version, so it has to be ported to Python by hand based on the generated C files.
The driver header file for the pool core is shown below:
#ifndef XPOOL_H
#define XPOOL_H

#ifdef __cplusplus
extern "C" {
#endif

#ifndef __linux__
#include "xil_types.h"
#include "xil_assert.h"
#include "xstatus.h"
#include "xil_io.h"
#else
#include <stdint.h>
#include <assert.h>
#include <dirent.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>
#endif
#include "xpool_hw.h"
/**************************** Type Definitions ******************************/
#ifdef __linux__
typedef uint8_t u8;
typedef uint16_t u16;
typedef uint32_t u32;
#else
typedef struct {
u16 DeviceId;
u32 Axilites_BaseAddress;
} XPool_Config;
#endif
typedef struct {
u32 Axilites_BaseAddress;
u32 IsReady;
} XPool;
/***************** Macros (Inline Functions) Definitions *********************/
#ifndef __linux__
#define XPool_WriteReg(BaseAddress, RegOffset, Data) \
Xil_Out32((BaseAddress) + (RegOffset), (u32)(Data))
#define XPool_ReadReg(BaseAddress, RegOffset) \
Xil_In32((BaseAddress) + (RegOffset))
#else
#define XPool_WriteReg(BaseAddress, RegOffset, Data) \
*(volatile u32*)((BaseAddress) + (RegOffset)) = (u32)(Data)
#define XPool_ReadReg(BaseAddress, RegOffset) \
*(volatile u32*)((BaseAddress) + (RegOffset))
#define Xil_AssertVoid(expr) assert(expr)
#define Xil_AssertNonvoid(expr) assert(expr)
#define XST_SUCCESS 0
#define XST_DEVICE_NOT_FOUND 2
#define XST_OPEN_DEVICE_FAILED 3
#define XIL_COMPONENT_IS_READY 1
#endif
/************************** Function Prototypes *****************************/
#ifndef __linux__
int XPool_Initialize(XPool *InstancePtr, u16 DeviceId);
XPool_Config* XPool_LookupConfig(u16 DeviceId);
int XPool_CfgInitialize(XPool *InstancePtr, XPool_Config *ConfigPtr);
#else
int XPool_Initialize(XPool *InstancePtr, const char* InstanceName);
int XPool_Release(XPool *InstancePtr);
#endif
// start/stop control for the accelerator
void XPool_Start(XPool *InstancePtr);
u32 XPool_IsDone(XPool *InstancePtr);
u32 XPool_IsIdle(XPool *InstancePtr);
u32 XPool_IsReady(XPool *InstancePtr);
void XPool_EnableAutoRestart(XPool *InstancePtr);
void XPool_DisableAutoRestart(XPool *InstancePtr);
// setter/getter pairs for reading and writing the parameter registers
void XPool_Set_CHin_V(XPool *InstancePtr, u32 Data);
u32 XPool_Get_CHin_V(XPool *InstancePtr);
void XPool_Set_Hin_V(XPool *InstancePtr, u32 Data);
u32 XPool_Get_Hin_V(XPool *InstancePtr);
void XPool_Set_Win_V(XPool *InstancePtr, u32 Data);
u32 XPool_Get_Win_V(XPool *InstancePtr);
void XPool_Set_Kx_V(XPool *InstancePtr, u32 Data);
u32 XPool_Get_Kx_V(XPool *InstancePtr);
void XPool_Set_Ky_V(XPool *InstancePtr, u32 Data);
u32 XPool_Get_Ky_V(XPool *InstancePtr);
void XPool_Set_mode_V(XPool *InstancePtr, u32 Data);
u32 XPool_Get_mode_V(XPool *InstancePtr);
// These tell the hardware the physical base address to read the input feature map from
// and where to store the computed result; the base address of a buffer can be obtained
// via array.physical_address
void XPool_Set_feature_in(XPool *InstancePtr, u32 Data);
u32 XPool_Get_feature_in(XPool *InstancePtr);
void XPool_Set_feature_out(XPool *InstancePtr, u32 Data);
u32 XPool_Get_feature_out(XPool *InstancePtr);
void XPool_InterruptGlobalEnable(XPool *InstancePtr);
void XPool_InterruptGlobalDisable(XPool *InstancePtr);
void XPool_InterruptEnable(XPool *InstancePtr, u32 Mask);
void XPool_InterruptDisable(XPool *InstancePtr, u32 Mask);
void XPool_InterruptClear(XPool *InstancePtr, u32 Mask);
u32 XPool_InterruptGetEnabled(XPool *InstancePtr);
u32 XPool_InterruptGetStatus(XPool *InstancePtr);
#ifdef __cplusplus
}
#endif
#endif
The above is only the header; the file that actually has to be ported is xpool_hw.h, whose key content is shown below. It tells us which register holds each of the Pool parameters.
#define XPOOL_AXILITES_ADDR_AP_CTRL 0x00
#define XPOOL_AXILITES_ADDR_GIE 0x04
#define XPOOL_AXILITES_ADDR_IER 0x08
#define XPOOL_AXILITES_ADDR_ISR 0x0c
#define XPOOL_AXILITES_ADDR_CHIN_V_DATA 0x10
#define XPOOL_AXILITES_BITS_CHIN_V_DATA 16
#define XPOOL_AXILITES_ADDR_HIN_V_DATA 0x18
#define XPOOL_AXILITES_BITS_HIN_V_DATA 16
#define XPOOL_AXILITES_ADDR_WIN_V_DATA 0x20
#define XPOOL_AXILITES_BITS_WIN_V_DATA 16
#define XPOOL_AXILITES_ADDR_KX_V_DATA 0x28
#define XPOOL_AXILITES_BITS_KX_V_DATA 8
#define XPOOL_AXILITES_ADDR_KY_V_DATA 0x30
#define XPOOL_AXILITES_BITS_KY_V_DATA 8
#define XPOOL_AXILITES_ADDR_MODE_V_DATA 0x38
#define XPOOL_AXILITES_BITS_MODE_V_DATA 2
#define XPOOL_AXILITES_ADDR_FEATURE_IN_DATA 0x40
#define XPOOL_AXILITES_BITS_FEATURE_IN_DATA 32
#define XPOOL_AXILITES_ADDR_FEATURE_OUT_DATA 0x48
#define XPOOL_AXILITES_BITS_FEATURE_OUT_DATA 32
The ported Python code is shown next.
Note: this part also requires reading xpool.c to see how the original C functions perform their register reads and writes.
def RunPool(pool, Kx, Ky, mode, feature_in, feature_out):
    # shape[0] and shape[1] are the feature-map height and width; shape[2] is the channel count
    pool.write(0x10, feature_in.shape[2])
    pool.write(0x18, feature_in.shape[0])
    pool.write(0x20, feature_in.shape[1])
    pool.write(0x28, Kx)
    pool.write(0x30, Ky)
    pool.write(0x38, mode)
    pool.write(0x40, feature_in.physical_address)
    pool.write(0x48, feature_out.physical_address)
    pool.write(0, (pool.read(0) & 0x80) | 0x01)  # keep the auto-restart bit, set ap_start
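RunPool only starts the accelerator, so the caller still has to wait for it to finish. Below is a minimal polling sketch of mine; it assumes the standard HLS block-level control bits in the AP_CTRL register at offset 0x00 (bit 0 ap_start, bit 1 ap_done):
import time

def wait_for_done(ip, timeout_s=1.0, poll_s=0.001):
    # Poll the AP_CTRL register (offset 0x00) until the accelerator reports done.
    deadline = time.time() + timeout_s
    while time.time() < deadline:
        if ip.read(0x00) & 0x2:      # ap_done is set by the hardware when it finishes
            return True
        time.sleep(poll_s)
    raise RuntimeError("accelerator did not finish within the timeout")

# Example: run one max-pooling pass and wait for it to complete
# RunPool(pool, 2, 2, 2, feature_in, feature_out)
# wait_for_done(pool)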
After that, the driver can be run on the PYNQ board, and the output looks as shown below.
Note: it has to be run as the root user.
Only the last step remains: write the Python code for the LeNet-5 model, calling the pool and conv drivers written above to build the complete inference pipeline, and load the parameters trained with the TensorFlow network from the earlier sections.
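As a sketch of that last step (parameter loading only; the full LeNet-5 pipeline and a RunConv counterpart to RunPool are assumed, and the file names follow the Record_Tensor dump above), the .bin parameters can be loaded into physically contiguous buffers like this:
import numpy as np
from pynq import Xlnk

xlnk = Xlnk()

def load_param(name, shape):
    # Read a .bin file produced by the dat-to-bin conversion into a CMA buffer.
    buf = xlnk.cma_array(shape=shape, cacheable=0, dtype=np.float32)
    buf[:] = np.fromfile('./record/' + name + '.bin', dtype=np.float32).reshape(shape)
    return buf

# Weights and biases of the first conv layer, matching the TensorFlow shapes above
W_conv1 = load_param('W_conv1', (3, 3, 1, 16))
b_conv1 = load_param('b_conv1', (16,))
# ... the remaining layers are loaded the same way, then fed to the conv/pool drivers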
The demo result is as follows:
The figure above shows a handwritten digit 2.
Running the finished code correctly recognizes the digit as a 2, and with that the project is complete.