神经网络的FPGA实现:基础卷积操作(一)

卷积核 kernel_size=3*3
输入特征图fmap[width,height]=[9,9]
Verilog HDL
Xilinx VIVADO

源文件

`timescale 1ns / 1ps
// conv_pe: processing element for one 3x3 convolution window.
//
// Nine dsp48_macro_0 instances each take one 8-bit kernel coefficient (A),
// one 8-bit feature-map sample (B) and a constant zero (C) and return a
// 17-bit result (P). The nine results are sign-extended from bit 16 to
// 32 bits, summed, and registered into sum_data when the delayed enable
// arrives. valid_out is high for one clock together with each new sum.
//
// NOTE(review): input_en is delayed by four register stages before the sum
// is captured; this presumably matches the pipeline latency configured in the
// dsp48_macro_0 IP, which is not visible here - confirm against the IP setup.
module conv_pe(
	input clk,
	input rst,              // synchronous reset, active high
	input input_en,         // input enable: marks the cycle in which a window is presented
	input [7:0] kernel_00,  // 3x3 kernel coefficients, kernel_<row><col>
	input [7:0] kernel_01,
	input [7:0] kernel_02,
	input [7:0] kernel_10,
	input [7:0] kernel_11,
	input [7:0] kernel_12,
	input [7:0] kernel_20,
	input [7:0] kernel_21,
	input [7:0] kernel_22,

	input [7:0] fmap_00,    // 3x3 feature-map window under the kernel, fmap_<row><col>
	input [7:0] fmap_01,
	input [7:0] fmap_02,
	input [7:0] fmap_10,
	input [7:0] fmap_11,
	input [7:0] fmap_12,
	input [7:0] fmap_20,
	input [7:0] fmap_21,
	input [7:0] fmap_22,

	output valid_out,       // output valid: sum_data was updated on the previous clock edge
	output [31:0] sum_data  // registered 32-bit sum of the nine products
    );

// ---------------------------------------------------------------------------
// Enable pipeline: en_pipe[0] is input_en delayed by one clock,
// en_pipe[3] is input_en delayed by four clocks.
// ---------------------------------------------------------------------------
reg [3:0] en_pipe;

always @(posedge clk) begin
	if (rst)
		en_pipe <= 4'b0000;
	else
		en_pipe <= {en_pipe[2:0], input_en};
end

// ---------------------------------------------------------------------------
// Multiplier outputs, one per window position.
// ---------------------------------------------------------------------------
wire [16:0] dsp_00, dsp_01, dsp_02;
wire [16:0] dsp_10, dsp_11, dsp_12;
wire [16:0] dsp_20, dsp_21, dsp_22;

// Sign-extend a 17-bit DSP result to 32 bits by replicating bit 16.
function [31:0] sext17;
	input [16:0] value;
	begin
		sext17 = {{15{value[16]}}, value};
	end
endfunction

// Partial sums per kernel row; 32-bit wrap-around addition is associative,
// so grouping by row yields the same total as one flat nine-term sum.
wire [31:0] row0_sum = sext17(dsp_00) + sext17(dsp_01) + sext17(dsp_02);
wire [31:0] row1_sum = sext17(dsp_10) + sext17(dsp_11) + sext17(dsp_12);
wire [31:0] row2_sum = sext17(dsp_20) + sext17(dsp_21) + sext17(dsp_22);

// ---------------------------------------------------------------------------
// Output registers.
// ---------------------------------------------------------------------------
reg        valid_q;  // drives valid_out
reg [31:0] sum_q;    // drives sum_data; holds its value between updates

always @(posedge clk) begin
	if (rst) begin
		valid_q <= 1'b0;
		sum_q   <= 32'd0;
	end else begin
		valid_q <= 1'b0;               // default: no new result this cycle
		if (en_pipe[3]) begin          // delayed enable arrived: capture the window sum
			sum_q   <= row0_sum + row1_sum + row2_sum;
			valid_q <= 1'b1;
		end
	end
end

assign valid_out = valid_q;
assign sum_data  = sum_q;

// ---------------------------------------------------------------------------
// One DSP macro per kernel/feature-map pair. Port widths as wired here:
// A[7:0], B[7:0], C[1:0] (tied to zero), P[16:0].
// ---------------------------------------------------------------------------
dsp48_macro_0 uut_dsp48_1(
	.CLK(clk),
	.A(kernel_00),
	.B(fmap_00),
	.C(2'd0),
	.P(dsp_00)
	);

dsp48_macro_0 uut_dsp48_2(
	.CLK(clk),
	.A(kernel_01),
	.B(fmap_01),
	.C(2'd0),
	.P(dsp_01)
	);

dsp48_macro_0 uut_dsp48_3(
	.CLK(clk),
	.A(kernel_02),
	.B(fmap_02),
	.C(2'd0),
	.P(dsp_02)
	);

dsp48_macro_0 uut_dsp48_4(
	.CLK(clk),
	.A(kernel_10),
	.B(fmap_10),
	.C(2'd0),
	.P(dsp_10)
	);

dsp48_macro_0 uut_dsp48_5(
	.CLK(clk),
	.A(kernel_11),
	.B(fmap_11),
	.C(2'd0),
	.P(dsp_11)
	);

dsp48_macro_0 uut_dsp48_6(
	.CLK(clk),
	.A(kernel_12),
	.B(fmap_12),
	.C(2'd0),
	.P(dsp_12)
	);

dsp48_macro_0 uut_dsp48_7(
	.CLK(clk),
	.A(kernel_20),
	.B(fmap_20),
	.C(2'd0),
	.P(dsp_20)
	);

dsp48_macro_0 uut_dsp48_8(
	.CLK(clk),
	.A(kernel_21),
	.B(fmap_21),
	.C(2'd0),
	.P(dsp_21)
	);

dsp48_macro_0 uut_dsp48_9(
	.CLK(clk),
	.A(kernel_22),
	.B(fmap_22),
	.C(2'd0),
	.P(dsp_22)
	);

// ---------------------------------------------------------------------------
// Running count of valid outputs since reset. It drives no port and is not
// read anywhere in this module; it is only observable in simulation/debug.
// ---------------------------------------------------------------------------
reg [31:0] conv_pe_count;

always @(posedge clk) begin : count_valid_outputs
	if (rst)
		conv_pe_count <= 0;
	else if (valid_out)
		conv_pe_count <= conv_pe_count + 1;
end
endmodule

仿真文件

`timescale 1ns / 1ps

// tb_conv_pe: testbench for conv_pe.
//
// Loads a 9x9 feature map from a hex text file, slides a 3x3 window over it
// (stride 1, no padding -> 7x7 output positions), feeds each window together
// with a fixed 3x3 kernel to the DUT, and writes every valid result to a text
// file, seven values per line. When all 49 results have been written the
// output file is closed and the simulation is finished.
module tb_conv_pe;

localparam FMAP_W  = 9;                 // feature-map width  (elements per row)
localparam FMAP_H  = 9;                 // feature-map height (rows)
localparam K_SIZE  = 3;                 // kernel is K_SIZE x K_SIZE
localparam OUT_W   = FMAP_W - K_SIZE + 1; // 7 output columns
localparam OUT_H   = FMAP_H - K_SIZE + 1; // 7 output rows

// FSM state encoding for the window generator.
localparam S_LOAD    = 0;  // present one window and pulse input_en
localparam S_ADVANCE = 1;  // drop input_en, wrap column / advance row
localparam S_DONE    = 2;  // all windows issued, drive zeros

reg clk;
reg rst;

// Hold reset for 1000 time units, then release it.
initial begin
    rst = 1;
    #1000
    rst = 0;
end

// Free-running clock with a 10 time-unit period.
initial begin : clk1_blk
    clk = 0;
    forever #5 clk = ~clk;
end

reg  [7:0] kernel_00, kernel_01, kernel_02,
           kernel_10, kernel_11, kernel_12,
           kernel_20, kernel_21, kernel_22;   // 3x3 kernel
reg  [7:0] fmap_00, fmap_01, fmap_02,
           fmap_10, fmap_11, fmap_12,
           fmap_20, fmap_21, fmap_22;         // current 3x3 feature-map window
reg        input_en;                          // input enable to the DUT
wire       valid_out;                         // DUT output valid
wire [31:0] sum_data;                         // DUT convolution result

// Device under test.
conv_pe uut_conv_pe(
    .clk(clk),
    .rst(rst),
    .input_en(input_en),
    .kernel_00(kernel_00),
    .kernel_01(kernel_01),
    .kernel_02(kernel_02),
    .kernel_10(kernel_10),
    .kernel_11(kernel_11),
    .kernel_12(kernel_12),
    .kernel_20(kernel_20),
    .kernel_21(kernel_21),
    .kernel_22(kernel_22),

    .fmap_00(fmap_00),
    .fmap_01(fmap_01),
    .fmap_02(fmap_02),
    .fmap_10(fmap_10),
    .fmap_11(fmap_11),
    .fmap_12(fmap_12),
    .fmap_20(fmap_20),
    .fmap_21(fmap_21),
    .fmap_22(fmap_22),
    .valid_out(valid_out),
    .sum_data(sum_data)
);

// 9x9 input feature map stored row-major: element (row, col) is fmap[col + 9*row].
// Declared with an ascending range so that $readmemh, called without explicit
// addresses, fills index 0 first irrespective of which default start address
// (left-hand vs. lowest) the simulator implements.
reg [7:0] fmap [0:FMAP_W*FMAP_H-1];
initial begin
    $readmemh("F://vivado_project//conv_pe//fmap.txt", fmap);
end

integer i, j;        // i: window column offset (0..6), j: window row offset (0..6)
reg [2:0] states;

// Window generator: one window is issued every two clocks
// (S_LOAD presents it with input_en=1, S_ADVANCE deasserts input_en).
always @(posedge clk) begin
    if (rst) begin
        i        <= 0;
        j        <= 0;
        states   <= S_LOAD;
        input_en <= 0;

        fmap_00 <= 8'd0;
        fmap_01 <= 8'd0;
        fmap_02 <= 8'd0;
        fmap_10 <= 8'd0;
        fmap_11 <= 8'd0;
        fmap_12 <= 8'd0;
        fmap_20 <= 8'd0;
        fmap_21 <= 8'd0;
        fmap_22 <= 8'd0;

        // Fixed kernel:  1 0 -1 / 2 0 -2 / 1 0 -1 (negatives as 8-bit two's complement).
        kernel_00 <= 8'd1;
        kernel_01 <= 8'd0;
        kernel_02 <= -8'd1;
        kernel_10 <= 8'd2;
        kernel_11 <= 8'd0;
        kernel_12 <= -8'd2;
        kernel_20 <= 8'd1;
        kernel_21 <= 8'd0;
        kernel_22 <= -8'd1;
    end
    else begin
        case (states)
            S_LOAD: begin
                if (i < OUT_W) begin
                    input_en <= 1;
                    // Window row 0, e.g. i=0,j=0 -> fmap[0], fmap[1], fmap[2]
                    fmap_00 <= fmap[i + 0 + FMAP_W * j];
                    fmap_01 <= fmap[i + 1 + FMAP_W * j];
                    fmap_02 <= fmap[i + 2 + FMAP_W * j];
                    // Window row 1, e.g. i=0,j=0 -> fmap[9], fmap[10], fmap[11]
                    fmap_10 <= fmap[i + 0 + FMAP_W * (j + 1)];
                    fmap_11 <= fmap[i + 1 + FMAP_W * (j + 1)];
                    fmap_12 <= fmap[i + 2 + FMAP_W * (j + 1)];
                    // Window row 2, e.g. i=0,j=0 -> fmap[18], fmap[19], fmap[20]
                    fmap_20 <= fmap[i + 0 + FMAP_W * (j + 2)];
                    fmap_21 <= fmap[i + 1 + FMAP_W * (j + 2)];
                    fmap_22 <= fmap[i + 2 + FMAP_W * (j + 2)];
                    i      <= i + 1;      // slide the window one column to the right
                    states <= S_ADVANCE;
                end
                else begin
                    input_en <= 0;
                end
            end

            S_ADVANCE: begin
                input_en <= 0;
                if (i == OUT_W) begin     // finished one output row
                    i <= 0;
                    j <= j + 1;           // next round: restart at column 0 of the next row
                end
                if (j < OUT_H) begin
                    states <= S_LOAD;
                end
                // Last window of the last row was just issued: this later
                // assignment overrides the S_LOAD transition above.
                if (j == OUT_H - 1 && i == OUT_W) begin
                    states <= S_DONE;
                end
            end

            S_DONE: begin
                // All windows issued: keep the enable low and drive zeros.
                input_en <= 0;
                fmap_00 <= 8'd0;
                fmap_01 <= 8'd0;
                fmap_02 <= 8'd0;
                fmap_10 <= 8'd0;
                fmap_11 <= 8'd0;
                fmap_12 <= 8'd0;
                fmap_20 <= 8'd0;
                fmap_21 <= 8'd0;
                fmap_22 <= 8'd0;
            end
        endcase
    end
end

integer end_temp;   // file descriptor of the result file
integer count;      // results written on the current output line (0..6)
integer written;    // total results written since reset

initial begin
    end_temp = $fopen("F://vivado_project//conv_pe//result.txt", "w");
    if (end_temp == 0) begin
        // $fopen returns 0 on failure; writing to descriptor 0 would lose all results.
        $display("tb_conv_pe: ERROR - could not open result file for writing");
        $finish;
    end
end

// Result logger: append each valid DUT output as a signed decimal, break the
// line after every OUT_W values, and close the file once the full output map
// has been written so the data is flushed to disk.
always @(posedge clk) begin
    if (rst) begin
        count   <= 0;
        written <= 0;
    end
    else if (valid_out) begin
        $fwrite(end_temp, "%d", $signed(sum_data));
        if (count == OUT_W - 1) begin
            count <= 0;
            $fwrite(end_temp, "\n");
        end
        else begin
            count <= count + 1;
        end

        written <= written + 1;
        if (written == OUT_W * OUT_H - 1) begin
            $fclose(end_temp);
            $finish;
        end
    end
end
endmodule

你可能感兴趣的:(通信与FPGA)