After several days of work, I finally got a CNN running on an FPGA (with no optimization yet; code optimization will follow).
The network implemented here is LeNet. Concretely: a (28,28) input passes through 32 5x5 convolution kernels to produce a 24x24x32 feature map, which pooling reduces to 12x12x32. The second convolution applies 16 3x3x32 kernels and outputs a 10x10x16 feature map; after pooling, the result is flattened into a 400x1 vector, fully connected to 120 neurons, those 120 neurons fully connected to 84 neurons, and finally 10 output neurons, one per class (digits 0-9). The sizes follow from the valid-convolution formula, worked out below.
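For a valid (no-padding, stride-1) convolution the output size is N_out = N_in - K + 1, and each 2x2 max pooling halves the spatial size:

conv1: 28 - 5 + 1 = 24  ->  24x24x32
pool1: 24 / 2 = 12      ->  12x12x32
conv2: 12 - 3 + 1 = 10  ->  10x10x16
pool2: 10 / 2 = 5       ->  5x5x16 = 400 values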
Since the vast majority of the computation is concentrated in the two convolutional layers (see the rough MAC count below), only the convolutional layers are implemented on the FPGA; the fully connected layers are run by the ARM CPU.
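A rough multiply-accumulate (MAC) count per image, derived from the layer sizes above, makes the split concrete:

conv1: 24*24*32 * (5*5)    = 460,800 MACs
conv2: 10*10*16 * (3*3*32) = 460,800 MACs
fc:    400*120 + 120*84 + 84*10 = 58,920 MACs

The two convolutions thus account for roughly 94% of the MACs.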
The ZYNQ CPU code is as follows:
#include <stdio.h>
#include <stdlib.h>
#include "platform.h"
#include "xil_printf.h"
#include "xtop_function_hw.h"
#include "xtop_function.h"
#include "xil_cache.h"
#include "xtime_l.h"
float W1[120][400]={
#include"F:\\cnn_parameter\\W1.h"
};
float b1[120]={
#include"F:\\cnn_parameter\\b1.h"
};
float W2[84][120]={
#include"F:\\cnn_parameter\\W2.h"
};
float b2[84]={
#include"F:\\cnn_parameter\\b2.h"
};
float W3[10][84]={
#include"F:\\cnn_parameter\\W3.h"
};
float b3[10]={
#include"F:\\cnn_parameter\\b3.h"
};
// Fully connected layers, run on the ARM CPU.
// The accelerator returns Q8.8 fixed-point shorts, so each input is
// rescaled by 1/256 before the floating-point matrix multiplies.
void fullconnected(short *in, float *out) {
    float inter1[120];
    float inter2[84];
    int i, j;
    // fc1: 400 -> 120, ReLU
    for (i = 0; i < 120; i++) {
        inter1[i] = b1[i];
        for (j = 0; j < 400; j++)
            inter1[i] += ((float)in[j] / 256.0f) * W1[i][j];
        inter1[i] = (inter1[i] > 0) ? inter1[i] : 0;
    }
    // fc2: 120 -> 84, ReLU
    for (i = 0; i < 84; i++) {
        inter2[i] = b2[i];
        for (j = 0; j < 120; j++)
            inter2[i] += inter1[j] * W2[i][j];
        inter2[i] = (inter2[i] > 0) ? inter2[i] : 0;
    }
    // fc3: 84 -> 10, raw class scores (no activation)
    for (i = 0; i < 10; i++) {
        out[i] = b3[i];
        for (j = 0; j < 84; j++)
            out[i] += inter2[j] * W3[i][j];
    }
}
int main()
{
    //Xil_DCacheDisable();
    int i, j;
    float out[10];
    XTime tEnd, tCur;
    u32 tUsed;
    float fm[28][28] = {
#include "F:\\cnn_parameter\\img.h"
    };
    short *src  = (short *)malloc(784 * sizeof(short));
    short *dest = (short *)malloc(400 * sizeof(short));
    // Convert the input image to Q8.8 fixed point for the accelerator.
    for (i = 0; i < 28; i++)
        for (j = 0; j < 28; j++)
            src[i * 28 + j] = (short)(fm[i][j] * 256);
    // Flush the source buffer so the accelerator sees the data in DDR.
    Xil_DCacheFlushRange((u32)src, 784 * sizeof(short));
    XTime_GetTime(&tCur);
    XTop_function HlsXtop_function;
    XTop_function_Config *ExamplePtr;
    printf("Look up the device configuration.\n");
    ExamplePtr = XTop_function_LookupConfig(XPAR_TOP_FUNCTION_0_DEVICE_ID);
    if (!ExamplePtr) {
        printf("ERROR: Lookup of accelerator configuration failed.\n\r");
        return XST_FAILURE;
    }
    printf("Initialize the device.\n");
    long status = XTop_function_CfgInitialize(&HlsXtop_function, ExamplePtr);
    if (status != XST_SUCCESS) {
        printf("ERROR: Could not initialize accelerator.\n\r");
        return -1;
    }
    // Pass the buffer addresses to the accelerator's AXI master ports.
    XTop_function_Set_pixel_in_V(&HlsXtop_function, (u32)src);
    XTop_function_Set_pixel_out_V(&HlsXtop_function, (u32)dest);
    XTop_function_Start(&HlsXtop_function);
    while (XTop_function_IsDone(&HlsXtop_function) == 0)
        ;
    // Invalidate the destination range so the CPU reads fresh accelerator output.
    Xil_DCacheInvalidateRange((u32)dest, sizeof(short) * 400);
    fullconnected(dest, out);
    XTime_GetTime(&tEnd);
    tUsed = (u32)(((tEnd - tCur) * 1000000) / COUNTS_PER_SECOND);
    xil_printf("***********************************\n");
    printf("time used is %u us\n", tUsed);
    // Print channel 9 of the 5x5x16 pooled output as a quick sanity check.
    for (i = 0; i < 5; i++) {
        for (j = 0; j < 5; j++)
            printf("%f,", (float)dest[i * 5 * 16 + j * 16 + 9] / 256.0);
        printf("\n");
    }
    printf("\n\n");
    for (i = 0; i < 10; i++)
        printf("%f\n", out[i]);
    free(src);
    free(dest);
    return 0;
}
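Worth noting: the CPU and the accelerator exchange data as 16-bit Q8.8 fixed-point values (8 integer bits, 8 fraction bits), which is why the image is multiplied by 256 on the way in and the results are divided by 256 on the way out; this matches the ap_fixed<16,8> type used on the HLS side. A minimal standalone sketch of that convention (illustration only, not part of the project code):

#include <stdio.h>

/* Q8.8: value = raw / 256, range about [-128, 128), resolution 1/256. */
static short float_to_q88(float x) { return (short)(x * 256.0f); }
static float q88_to_float(short q) { return (float)q / 256.0f; }

int main(void) {
    float x = 0.7071f;
    short q = float_to_q88(x);
    /* prints: 0.707100 -> 181 -> 0.707031 */
    printf("%f -> %d -> %f\n", x, q, q88_to_float(q));
    return 0;
}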
The HLS code is as follows:
#include"cnn.h"
#include
// conv1: 28x28 input, 32 5x5 kernels -> 24x24x32, ReLU.
// The channel loop is innermost; tmp[] holds one running sum per kernel,
// initialized with the bias at the first tap and written out (with ReLU)
// at the last tap.
void conv1(data_t map_in[Ni][Ni], const data_t filter[M1][K1][K1],
           const data_t bias[M1], data_t map_out[N1][N1][M1]) {
    int row, col, kx, ky, m;
    data_t tmp[M1];
LOOP_R:
    for (row = 0; row < N1; row++)
LOOP_C:
        for (col = 0; col < N1; col++)
LOOP_KX:
            for (kx = 0; kx < K1; kx++)
LOOP_KY:
                for (ky = 0; ky < K1; ky++)
LOOP_N:
                    for (m = 0; m < M1; m++) {
                        if (kx == 0 && ky == 0)
                            tmp[m] = bias[m];
                        tmp[m] += map_in[row + kx][col + ky] * filter[m][kx][ky];
                        if (kx == K1 - 1 && ky == K1 - 1)
                            map_out[row][col][m] = (tmp[m] > 0) ? tmp[m] : (data_t)0;
                    }
}
// maxpool1: 2x2 max pooling with stride 2, 24x24x32 -> 12x12x32.
void maxpool1(data_t map_in[N1][N1][C1], data_t map_out[N2][N2][C1]) {
    int row, col, n;
    int kx, ky;
    data_t tmp;
LOOP_R:
    for (row = 0; row < N2; row++)
LOOP_C:
        for (col = 0; col < N2; col++)
LOOP_N:
            for (n = 0; n < C1; n++) {
                tmp = map_in[2 * row][2 * col][n];
LOOP_KX:
                for (kx = 0; kx < 2; kx++)
LOOP_KY:
                    for (ky = 0; ky < 2; ky++)
                        tmp = (map_in[2 * row + kx][2 * col + ky][n] > tmp)
                                  ? map_in[2 * row + kx][2 * col + ky][n] : tmp;
                map_out[row][col][n] = tmp;
            }
}
// conv2: 12x12x32 input, 16 3x3x32 kernels -> 10x10x16, ReLU.
// Here the accumulation is done directly in map_out, so the kernel and
// input-channel loops sit inside the output-channel loop.
void conv2(data_t map_in[N2][N2][C1], const data_t filter[M2][K2][K2][C1],
           const data_t bias[M2], data_t map_out[N3][N3][C2]) {
    int row, col, n, m, kx, ky;
LOOP_R:
    for (row = 0; row < N3; row++)
LOOP_C:
        for (col = 0; col < N3; col++)
LOOP_M:
            for (m = 0; m < M2; m++) {
                map_out[row][col][m] = bias[m];
LOOP_KX:
                for (kx = 0; kx < K2; kx++)
LOOP_KY:
                    for (ky = 0; ky < K2; ky++)
LOOP_N:
                        for (n = 0; n < C1; n++)
                            map_out[row][col][m] += filter[m][kx][ky][n] * map_in[row + kx][col + ky][n];
                map_out[row][col][m] = (map_out[row][col][m] > 0) ? map_out[row][col][m] : (data_t)0;
            }
}
// maxpool2: 2x2 max pooling with stride 2, 10x10x16 -> 5x5x16.
// tmp is seeded at the first tap and written out at the last one.
void maxpool2(data_t map_in[N3][N3][C2], data_t map_out[N4][N4][C2])
{
    int row, col, n;
    int kx, ky;
    data_t tmp;
LOOP_R:
    for (row = 0; row < N4; row++)
LOOP_C:
        for (col = 0; col < N4; col++)
LOOP_N:
            for (n = 0; n < C2; n++) {
LOOP_KX:
                for (kx = 0; kx < 2; kx++)
LOOP_KY:
                    for (ky = 0; ky < 2; ky++) {
                        if (kx == 0 && ky == 0)
                            tmp = map_in[row * 2][col * 2][n];
                        else
                            tmp = (map_in[row * 2 + kx][col * 2 + ky][n] > tmp)
                                      ? map_in[row * 2 + kx][col * 2 + ky][n] : tmp;
                        if (kx == 1 && ky == 1)
                            map_out[row][col][n] = tmp;
                    }
            }
}
// Top-level HLS function: two convolutions plus two poolings.
// pixel_in/pixel_out are AXI master ports into DDR; the control
// registers live on an AXI-Lite slave bus.
void top_function(data_t *pixel_in, data_t *pixel_out) {
#pragma HLS INTERFACE m_axi depth=784 port=pixel_in offset=slave bundle=INPUT
#pragma HLS INTERFACE m_axi depth=400 port=pixel_out offset=slave bundle=OUTPUT
#pragma HLS INTERFACE s_axilite port=return bundle=CTRL_BUS
    data_t fm_in[Ni][Ni];
    // Burst-read the 28x28 input image from DDR into on-chip memory.
    memcpy(fm_in, (const data_t *)pixel_in, 784 * sizeof(data_t));
    // Convolution weights and biases, baked into the design as constants.
    const data_t filter1[M1][K1][K1] = {
#include "F:\\cnn_parameter\\filter1.h"
    };
    const data_t bias1[M1] = {
#include "F:\\cnn_parameter\\bias1.h"
    };
    const data_t filter2[M2][K2][K2][C1] = {
#include "F:\\cnn_parameter\\filter2.h"
    };
    const data_t bias2[M2] = {
#include "F:\\cnn_parameter\\bias2.h"
    };
    // Intermediate feature maps.
    data_t conv1_out[N1][N1][C1];
    data_t pool1_out[N2][N2][C1];
    data_t conv2_out[N3][N3][C2];
    data_t pool2_out[N4][N4][C2];
    // Convolution and max-pooling pipeline.
    conv1(fm_in, filter1, bias1, conv1_out);
    maxpool1(conv1_out, pool1_out);
    conv2(pool1_out, filter2, bias2, conv2_out);
    maxpool2(conv2_out, pool2_out);
    // Burst-write the 5x5x16 result back to DDR.
    memcpy(pixel_out, (const data_t *)pool2_out, N4 * N4 * C2 * sizeof(data_t));
}
Here is the header file:
#pragma once
#include <string.h>
#include "ap_fixed.h"
#include "hls_half.h"
#define Ni 28 // input image size: 28x28
#define N1 24 // conv1 output: 24x24x32
#define N2 12 // pool1 output: 12x12x32
#define N3 10 // conv2 output: 10x10x16
#define N4 5  // pool2 output: 5x5x16
#define K1 5  // conv1 kernel size: 32 x (5x5)
#define K2 3  // conv2 kernel size: 16 x (3x3x32)
#define M1 32 // number of kernels in conv1
#define M2 16 // number of kernels in conv2
#define C1 M1 // conv1 output channels
#define C2 M2 // conv2 output channels
#define FIN (C2*N4*N4) //400
#define F1 120
#define F2 84
#define FOUT 10
//typedef float data_t;
typedef ap_fixed<16,8,AP_RND,AP_SAT> data_t; // Q8.8: 16 bits, 8 integer; round to nearest, saturate
//typedef half data_t;
void top_function(data_t *pixel_in,data_t *pixel_out);
//C1=M1,C2=M2
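For completeness, a minimal C-simulation testbench for top_function could look like the sketch below. This is hypothetical (no such file exists in the project); it assumes the same img.h used on the ARM side is available and mirrors the on-board sanity check so the values can be compared against Keras.

#include <stdio.h>
#include "cnn.h"

// Hypothetical C-simulation testbench: feed the same image the ARM code
// uses and dump channel 9 of the 5x5x16 output.
int main() {
    static float fm[28][28] = {
#include "F:\\cnn_parameter\\img.h"
    };
    static data_t in[784], out[400];
    for (int i = 0; i < 28; i++)
        for (int j = 0; j < 28; j++)
            in[i * 28 + j] = (data_t)fm[i][j];
    top_function(in, out);
    for (int i = 0; i < 5; i++) {
        for (int j = 0; j < 5; j++)
            printf("%f,", (float)out[i * 5 * 16 + j * 16 + 9]);
        printf("\n");
    }
    return 0;
}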
Run results:
Keras run results:
The error introduced by the fixed-point arithmetic is acceptable.
The runtime, however, is far too slow: about 120 ms per image, versus 75 ms for a pure C++ implementation and only about 650 us for Python Keras (on TensorFlow). The code therefore needs further optimization; one possible first step is sketched below.
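As a possible first step (an assumption about the planned optimization, not something already done), Vivado HLS can pipeline conv1's kernel loops and unroll the channel loop, provided the accumulator, bias, filter, and output arrays are partitioned so all 32 channels can be read and written in the same cycle:

#include "cnn.h"

// Hypothetical optimized variant of conv1 (untested sketch): pipeline the
// innermost kernel loop at II=1 and fully unroll the channel loop, so the
// 32 accumulators update in parallel. Resource use and the achieved II
// would still have to be confirmed in the synthesis report.
void conv1_opt(data_t map_in[Ni][Ni], const data_t filter[M1][K1][K1],
               const data_t bias[M1], data_t map_out[N1][N1][M1]) {
    data_t tmp[M1];
#pragma HLS ARRAY_PARTITION variable=tmp complete dim=1
#pragma HLS ARRAY_PARTITION variable=filter complete dim=1
#pragma HLS ARRAY_PARTITION variable=bias complete dim=1
#pragma HLS ARRAY_PARTITION variable=map_out complete dim=3
    for (int row = 0; row < N1; row++)
        for (int col = 0; col < N1; col++)
            for (int kx = 0; kx < K1; kx++)
                for (int ky = 0; ky < K1; ky++) {
#pragma HLS PIPELINE II=1
                    for (int m = 0; m < M1; m++) { // auto-unrolled under PIPELINE
                        if (kx == 0 && ky == 0)
                            tmp[m] = bias[m];
                        tmp[m] += map_in[row + kx][col + ky] * filter[m][kx][ky];
                        if (kx == K1 - 1 && ky == K1 - 1)
                            map_out[row][col][m] = (tmp[m] > 0) ? tmp[m] : (data_t)0;
                    }
                }
}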