本文是我近段时间的学习总结,主要参考了Xilinx的技术文档以及部分网上其他资料。文档主要包括ug998《Introduction to FPGA Design with Vivado High-Level Synthesis》,ug871《Vivado Design Suite Tutorial: High-Level Synthesis》,ug902《Vivado Design Suite User Guide: High-Level Synthesis》。受限于个人的FPGA水平,且对于Vivado HLS了解不多,如有错误及不当之处,还请指正。
HLS是FPGA代码的综合技术。FPGA的基本知识可以从《FPGA学习之基本结构》得到。至于为什么需要HLS,Xilinx的文档《Introduction to FPGA Design with Vivado High-Level Synthesis》中的两幅图可以很好地回答这一问题。
上图表明,虽然FPGA具有较高的性能,但采用RTL设计FPGA代码需要较长的开发时间。
然而,采用HLS之后,FPGA的开发时间大大缩短了,甚至短于DSP和GPU。
在Xilinx Documentation的分类中,《Introduction to FPGA Design with Vivado High-Level Synthesis》这本用户手册被分到了Methodology Guides中。与大部分文档不同,这本文档的角度很不一样:它尽可能用简单的方式介绍FPGA,面向的对象是软件工程师。在之前的文档中是没有Methodology Guides这个分类的,这大抵也说明了HLS的意义。
FPGA等的产生使得开发具有了更强的灵活性和高效性,HLS的逐步完善使得FPGA开发的高效性更进一步。技术的发展使得人们可以把精力放在设计上,而更少地去关注底层的东西。
Vivado的HLS工具的前世今生可以从《AutoESL与Xilinx那些人和事》中看到,这篇文章写得很有趣。HLS是高层次综合(High-Level Synthesis)的简称,“综合”即“Synthesis”。ug627《XST User Guide》中解释说,综合是将程序代码翻译为称为NGC的特殊网表文件的过程,这样才能够对其进行实现。
至于“层次”,或许可以这样理解。书中一般把FPGA设计分为以下几个级别(对于这个分级实际上没有一个特定的说法,可以参考第13章抽象级别的描述):
一般认为RTL级及以下设计是可用的,“层次”即从什么角度去描述想要实现的功能。譬如,a xor b采用门级描述就是a、b是一个异或门的输入;而采用高一点层次描述就是a+b(一位二进制加法,不计进位)。显然,越低层次的描述越困难,后文例子中也能发现这一点。
HLS就是从高层次描述,之后综合成可用的网表文件的技术。这里的“高”指采用C、C++等编写程序,而不是传统的HDL语言。然而,实际上Vivado套件中是预先采用Vivado HLS这个软件将C程序转换成为Verilog HDL或者VHDL代码,之后进行下一步操作的,并不是直接综合C代码。
对于Vivado HLS(注意是“HLS”,不是Vivado这个软件)的使用,《Vivado Design Suite Tutorial: High-Level Synthesis》是一本很好的入门指南。通过几个具体的例子,文档手把手地介绍了Vivado HLS的使用方式以及功能。《Vivado Design Suite User Guide: High-Level Synthesis》则致力于教你如何编写合适的C代码以及test bench。本节介绍其功能,使用方法参考Xilinx文档。以下内容来自上述两文档。
C Synthesis
Vivado HLS实现的最基本的功能是将C代码综合为HDL代码。下面是其中的一个例子(代码为Xilinx例程)
c代码
1 *******************************************************************************/ 2 #include "fir.h" 3 4 void fir ( 5 data_t *y, 6 coef_t c[N], 7 data_t x 8 ) { 9 #pragma HLS INTERFACE ap_vld port=x 10 11 #pragma HLS RESOURCE variable=c core=RAM_1P_BRAM 12 13 14 static data_t shift_reg[N]; 15 acc_t acc; 16 data_t data; 17 int i; 18 19 acc=0; 20 Shift_Accum_Loop: for (i=N-1;i>=0;i--) { 21 if (i==0) { 22 shift_reg[0]=x; 23 data = x; 24 } else { 25 shift_reg[i]=shift_reg[i-1]; 26 data = shift_reg[i]; 27 } 28 acc+=data*c[i];; 29 } 30 *y=acc; 31 }
程序中采用了移位寄存器,实现的是一个FIR滤波器。实际上实现了以下功能:$y[n]=\sum_{i=0}^{N-1} c[i]\,x[n-i]$,其中 $x[n-i]$ 保存在 shift_reg[i] 中。
Verilog代码(可复杂了,不要点开看……),分为三个部分
// ============================================================== // RTL generated by Vivado(TM) HLS - High-Level Synthesis from C, C++ and SystemC // Version: 2014.1 // Copyright (C) 2014 Xilinx Inc. All rights reserved. // // =========================================================== `timescale 1 ns / 1 ps (* CORE_GENERATION_INFO="fir,hls_ip_2014_1,{HLS_INPUT_TYPE=c,HLS_INPUT_FLOAT=0,HLS_INPUT_FIXED=0,HLS_INPUT_PART=xc7k160tfbg484-2,HLS_INPUT_CLOCK=10.000000,HLS_INPUT_ARCH=others,HLS_SYN_CLOCK=8.430000,HLS_SYN_LAT=78,HLS_SYN_TPT=none,HLS_SYN_MEM=0,HLS_SYN_DSP=0,HLS_SYN_FF=0,HLS_SYN_LUT=0}" *) module fir ( ap_clk, ap_rst, ap_start, ap_done, ap_idle, ap_ready, y, y_ap_vld, c_address0, c_ce0, c_q0, x ); parameter ap_const_logic_1 = 1'b1; parameter ap_const_logic_0 = 1'b0; parameter ap_ST_st1_fsm_0 = 3'b000; parameter ap_ST_st2_fsm_1 = 3'b1; parameter ap_ST_st3_fsm_2 = 3'b10; parameter ap_ST_st4_fsm_3 = 3'b11; parameter ap_ST_st5_fsm_4 = 3'b100; parameter ap_ST_st6_fsm_5 = 3'b101; parameter ap_ST_st7_fsm_6 = 3'b110; parameter ap_ST_st8_fsm_7 = 3'b111; parameter ap_const_lv1_0 = 1'b0; parameter ap_const_lv32_0 = 32'b00000000000000000000000000000000; parameter ap_const_lv5_A = 5'b1010; parameter ap_const_lv4_0 = 4'b0000; parameter ap_const_lv32_4 = 32'b100; parameter ap_const_lv5_0 = 5'b00000; parameter ap_const_lv4_F = 4'b1111; parameter ap_const_lv5_1F = 5'b11111; parameter ap_true = 1'b1; input ap_clk; input ap_rst; input ap_start; output ap_done; output ap_idle; output ap_ready; output [31:0] y; output y_ap_vld; output [3:0] c_address0; output c_ce0; input [31:0] c_q0; input [31:0] x; reg ap_done; reg ap_idle; reg ap_ready; reg y_ap_vld; reg c_ce0; reg [2:0] ap_CS_fsm = 3'b000; reg [3:0] shift_reg_address0; reg shift_reg_ce0; reg shift_reg_we0; reg [31:0] shift_reg_d0; wire [31:0] shift_reg_q0; wire [31:0] i_cast_fu_127_p1; reg [31:0] i_cast_reg_190; wire [0:0] tmp_1_fu_139_p2; reg [0:0] tmp_1_reg_199; wire [0:0] tmp_fu_131_p3; wire [4:0] i_1_fu_168_p2; reg [4:0] 
i_1_reg_218; reg [31:0] c_load_reg_223; wire [31:0] grp_fu_174_p2; reg [31:0] tmp_6_reg_228; wire [31:0] acc_1_fu_179_p2; reg [31:0] acc_reg_91; reg [4:0] i_reg_104; reg [31:0] data1_reg_116; wire [63:0] tmp_3_fu_155_p1; wire [63:0] tmp_4_fu_160_p1; wire [63:0] tmp_5_fu_164_p1; wire [3:0] tmp_7_fu_145_p1; wire [3:0] tmp_2_fu_149_p2; wire [31:0] grp_fu_174_p0; wire [31:0] grp_fu_174_p1; wire grp_fu_174_ce; reg [2:0] ap_NS_fsm; fir_shift_reg #( .DataWidth( 32 ), .AddressRange( 11 ), .AddressWidth( 4 )) shift_reg_U( .clk( ap_clk ), .reset( ap_rst ), .address0( shift_reg_address0 ), .ce0( shift_reg_ce0 ), .we0( shift_reg_we0 ), .d0( shift_reg_d0 ), .q0( shift_reg_q0 ) ); fir_mul_32s_32s_32_3 #( .ID( 0 ), .NUM_STAGE( 3 ), .din0_WIDTH( 32 ), .din1_WIDTH( 32 ), .dout_WIDTH( 32 )) fir_mul_32s_32s_32_3_U0( .clk( ap_clk ), .reset( ap_rst ), .din0( grp_fu_174_p0 ), .din1( grp_fu_174_p1 ), .ce( grp_fu_174_ce ), .dout( grp_fu_174_p2 ) ); /// the current state (ap_CS_fsm) of the state machine. /// always @ (posedge ap_clk) begin : ap_ret_ap_CS_fsm if (ap_rst == 1'b1) begin ap_CS_fsm <= ap_ST_st1_fsm_0; end else begin ap_CS_fsm <= ap_NS_fsm; end end /// assign process. /// always @(posedge ap_clk) begin if ((ap_ST_st8_fsm_7 == ap_CS_fsm)) begin acc_reg_91 <= acc_1_fu_179_p2; end else if (((ap_ST_st1_fsm_0 == ap_CS_fsm) & ~(ap_start == ap_const_logic_0))) begin acc_reg_91 <= ap_const_lv32_0; end end /// assign process. /// always @(posedge ap_clk) begin if (((ap_ST_st3_fsm_2 == ap_CS_fsm) & (tmp_1_reg_199 == ap_const_lv1_0))) begin data1_reg_116 <= shift_reg_q0; end else if (((ap_ST_st2_fsm_1 == ap_CS_fsm) & (tmp_fu_131_p3 == ap_const_lv1_0) & ~(tmp_1_fu_139_p2 == ap_const_lv1_0))) begin data1_reg_116 <= x; end end /// assign process. /// always @(posedge ap_clk) begin if ((ap_ST_st8_fsm_7 == ap_CS_fsm)) begin i_reg_104 <= i_1_reg_218; end else if (((ap_ST_st1_fsm_0 == ap_CS_fsm) & ~(ap_start == ap_const_logic_0))) begin i_reg_104 <= ap_const_lv5_A; end end /// assign process. 
/// always @(posedge ap_clk) begin if ((ap_ST_st4_fsm_3 == ap_CS_fsm)) begin c_load_reg_223 <= c_q0; end end /// assign process. /// always @(posedge ap_clk) begin if ((ap_ST_st3_fsm_2 == ap_CS_fsm)) begin i_1_reg_218 <= i_1_fu_168_p2; end end /// assign process. /// always @(posedge ap_clk) begin if ((ap_ST_st2_fsm_1 == ap_CS_fsm)) begin i_cast_reg_190 <= i_cast_fu_127_p1; end end /// assign process. /// always @(posedge ap_clk) begin if (((ap_ST_st2_fsm_1 == ap_CS_fsm) & (tmp_fu_131_p3 == ap_const_lv1_0))) begin tmp_1_reg_199 <= tmp_1_fu_139_p2; end end /// assign process. /// always @(posedge ap_clk) begin if ((ap_ST_st7_fsm_6 == ap_CS_fsm)) begin tmp_6_reg_228 <= grp_fu_174_p2; end end /// ap_done assign process. /// always @ (ap_CS_fsm or tmp_fu_131_p3) begin if (((ap_ST_st2_fsm_1 == ap_CS_fsm) & ~(tmp_fu_131_p3 == ap_const_lv1_0))) begin ap_done = ap_const_logic_1; end else begin ap_done = ap_const_logic_0; end end /// ap_idle assign process. /// always @ (ap_start or ap_CS_fsm) begin if ((~(ap_const_logic_1 == ap_start) & (ap_ST_st1_fsm_0 == ap_CS_fsm))) begin ap_idle = ap_const_logic_1; end else begin ap_idle = ap_const_logic_0; end end /// ap_ready assign process. /// always @ (ap_CS_fsm or tmp_fu_131_p3) begin if (((ap_ST_st2_fsm_1 == ap_CS_fsm) & ~(tmp_fu_131_p3 == ap_const_lv1_0))) begin ap_ready = ap_const_logic_1; end else begin ap_ready = ap_const_logic_0; end end /// c_ce0 assign process. /// always @ (ap_CS_fsm) begin if ((ap_ST_st3_fsm_2 == ap_CS_fsm)) begin c_ce0 = ap_const_logic_1; end else begin c_ce0 = ap_const_logic_0; end end /// shift_reg_address0 assign process. 
/// always @ (ap_CS_fsm or tmp_1_fu_139_p2 or tmp_fu_131_p3 or tmp_3_fu_155_p1 or tmp_4_fu_160_p1) begin if ((ap_ST_st3_fsm_2 == ap_CS_fsm)) begin shift_reg_address0 = tmp_4_fu_160_p1; end else if (((ap_ST_st2_fsm_1 == ap_CS_fsm) & (tmp_fu_131_p3 == ap_const_lv1_0) & ~(tmp_1_fu_139_p2 == ap_const_lv1_0))) begin shift_reg_address0 = ap_const_lv4_0; end else if (((ap_ST_st2_fsm_1 == ap_CS_fsm) & (tmp_fu_131_p3 == ap_const_lv1_0) & (tmp_1_fu_139_p2 == ap_const_lv1_0))) begin shift_reg_address0 = tmp_3_fu_155_p1; end else begin shift_reg_address0 = 'bx; end end /// shift_reg_ce0 assign process. /// always @ (ap_CS_fsm or tmp_1_fu_139_p2 or tmp_fu_131_p3) begin if ((((ap_ST_st2_fsm_1 == ap_CS_fsm) & (tmp_fu_131_p3 == ap_const_lv1_0) & (tmp_1_fu_139_p2 == ap_const_lv1_0)) | (ap_ST_st3_fsm_2 == ap_CS_fsm) | ((ap_ST_st2_fsm_1 == ap_CS_fsm) & (tmp_fu_131_p3 == ap_const_lv1_0) & ~(tmp_1_fu_139_p2 == ap_const_lv1_0)))) begin shift_reg_ce0 = ap_const_logic_1; end else begin shift_reg_ce0 = ap_const_logic_0; end end /// shift_reg_d0 assign process. /// always @ (ap_CS_fsm or x or shift_reg_q0 or tmp_1_fu_139_p2 or tmp_fu_131_p3) begin if ((ap_ST_st3_fsm_2 == ap_CS_fsm)) begin shift_reg_d0 = shift_reg_q0; end else if (((ap_ST_st2_fsm_1 == ap_CS_fsm) & (tmp_fu_131_p3 == ap_const_lv1_0) & ~(tmp_1_fu_139_p2 == ap_const_lv1_0))) begin shift_reg_d0 = x; end else begin shift_reg_d0 = 'bx; end end /// shift_reg_we0 assign process. /// always @ (ap_CS_fsm or tmp_1_fu_139_p2 or tmp_1_reg_199 or tmp_fu_131_p3) begin if ((((ap_ST_st3_fsm_2 == ap_CS_fsm) & (tmp_1_reg_199 == ap_const_lv1_0)) | ((ap_ST_st2_fsm_1 == ap_CS_fsm) & (tmp_fu_131_p3 == ap_const_lv1_0) & ~(tmp_1_fu_139_p2 == ap_const_lv1_0)))) begin shift_reg_we0 = ap_const_logic_1; end else begin shift_reg_we0 = ap_const_logic_0; end end /// y_ap_vld assign process. 
/// always @ (ap_CS_fsm or tmp_fu_131_p3) begin if (((ap_ST_st2_fsm_1 == ap_CS_fsm) & ~(tmp_fu_131_p3 == ap_const_lv1_0))) begin y_ap_vld = ap_const_logic_1; end else begin y_ap_vld = ap_const_logic_0; end end always @ (ap_start or ap_CS_fsm or tmp_fu_131_p3) begin case (ap_CS_fsm) ap_ST_st1_fsm_0 : if (~(ap_start == ap_const_logic_0)) begin ap_NS_fsm = ap_ST_st2_fsm_1; end else begin ap_NS_fsm = ap_ST_st1_fsm_0; end ap_ST_st2_fsm_1 : if (~(tmp_fu_131_p3 == ap_const_lv1_0)) begin ap_NS_fsm = ap_ST_st1_fsm_0; end else begin ap_NS_fsm = ap_ST_st3_fsm_2; end ap_ST_st3_fsm_2 : ap_NS_fsm = ap_ST_st4_fsm_3; ap_ST_st4_fsm_3 : ap_NS_fsm = ap_ST_st5_fsm_4; ap_ST_st5_fsm_4 : ap_NS_fsm = ap_ST_st6_fsm_5; ap_ST_st6_fsm_5 : ap_NS_fsm = ap_ST_st7_fsm_6; ap_ST_st7_fsm_6 : ap_NS_fsm = ap_ST_st8_fsm_7; ap_ST_st8_fsm_7 : ap_NS_fsm = ap_ST_st2_fsm_1; default : ap_NS_fsm = 'bx; endcase end assign acc_1_fu_179_p2 = (tmp_6_reg_228 + acc_reg_91); assign c_address0 = tmp_5_fu_164_p1; assign grp_fu_174_ce = ap_const_logic_1; assign grp_fu_174_p0 = c_load_reg_223; assign grp_fu_174_p1 = data1_reg_116; assign i_1_fu_168_p2 = (i_reg_104 + ap_const_lv5_1F); assign i_cast_fu_127_p1 = $signed(i_reg_104); assign tmp_1_fu_139_p2 = (i_reg_104 == ap_const_lv5_0? 1'b1: 1'b0); assign tmp_2_fu_149_p2 = (tmp_7_fu_145_p1 + ap_const_lv4_F); assign tmp_3_fu_155_p1 = $unsigned(tmp_2_fu_149_p2); assign tmp_4_fu_160_p1 = $unsigned(i_cast_reg_190); assign tmp_5_fu_164_p1 = $unsigned(i_cast_reg_190); assign tmp_7_fu_145_p1 = i_reg_104[3:0]; assign tmp_fu_131_p3 = i_reg_104[ap_const_lv32_4]; assign y = acc_reg_91; endmodule //fir
// ============================================================== // File generated by Vivado(TM) HLS - High-Level Synthesis from C, C++ and SystemC // Version: 2014.1 // Copyright (C) 2014 Xilinx Inc. All rights reserved. // // ============================================================== `timescale 1 ns / 1 ps module fir_shift_reg_ram (addr0, ce0, d0, we0, q0, clk); parameter DWIDTH = 32; parameter AWIDTH = 4; parameter MEM_SIZE = 11; input[AWIDTH-1:0] addr0; input ce0; input[DWIDTH-1:0] d0; input we0; output reg[DWIDTH-1:0] q0; input clk; (* ram_style = "distributed" *)reg [DWIDTH-1:0] ram[MEM_SIZE-1:0]; initial begin $readmemh("./fir_shift_reg_ram.dat", ram); end always @(posedge clk) begin if (ce0) begin if (we0) begin ram[addr0] <= d0; q0 <= d0; end else q0 <= ram[addr0]; end end endmodule `timescale 1 ns / 1 ps module fir_shift_reg( reset, clk, address0, ce0, we0, d0, q0); parameter DataWidth = 32'd32; parameter AddressRange = 32'd11; parameter AddressWidth = 32'd4; input reset; input clk; input[AddressWidth - 1:0] address0; input ce0; input we0; input[DataWidth - 1:0] d0; output[DataWidth - 1:0] q0; fir_shift_reg_ram fir_shift_reg_ram_U( .clk( clk ), .addr0( address0 ), .ce0( ce0 ), .d0( d0 ), .we0( we0 ), .q0( q0 )); endmodule
// ============================================================== // File generated by Vivado(TM) HLS - High-Level Synthesis from C, C++ and SystemC // Version: 2014.1 // Copyright (C) 2014 Xilinx Inc. All rights reserved. // // ============================================================== `timescale 1 ns / 1 ps module fir_mul_32s_32s_32_3_Mul3S_0(clk, ce, a, b, p); input clk; input ce; input[32 - 1 : 0] a; // synthesis attribute keep a "true" input[32 - 1 : 0] b; // synthesis attribute keep b "true" output[32 - 1 : 0] p; reg[32 - 1 : 0] a_reg; reg[32 - 1 : 0] b_reg; wire [32 - 1 : 0] tmp_product; reg[32 - 1 : 0] buff0; assign p = buff0; assign tmp_product = $signed(a_reg) * $signed(b_reg); always @ (posedge clk) begin if (ce) begin a_reg <= a; b_reg <= b; buff0 <= tmp_product; end end endmodule `timescale 1 ns / 1 ps module fir_mul_32s_32s_32_3( clk, reset, ce, din0, din1, dout); parameter ID = 32'd1; parameter NUM_STAGE = 32'd1; parameter din0_WIDTH = 32'd1; parameter din1_WIDTH = 32'd1; parameter dout_WIDTH = 32'd1; input clk; input reset; input ce; input[din0_WIDTH - 1:0] din0; input[din1_WIDTH - 1:0] din1; output[dout_WIDTH - 1:0] dout; fir_mul_32s_32s_32_3_Mul3S_0 fir_mul_32s_32s_32_3_Mul3S_0_U( .clk( clk ), .ce( ce ), .a( din0 ), .b( din1 ), .p( dout )); endmodule
Optimization&analysis
通过指定不同的Directive,可以得到不同要求下的优化结果。下面的报告是指定Loop Unroll前后的对比。可以看出solution3中只需要约1/5的时钟周期就能完成计算,随之而来的是成倍的资源需求。
上述优化是以吞吐量为目标的;利用流水线结构以及循环展开(Unrolled loop)可以优化吞吐量,理由如下图所示。
吞吐量优化正是利用了FPGA全并行的特性。其他优化目标的具体内容可以参考ug902《Vivado Design Suite User Guide: High-Level Synthesis》。
RTL验证及导出
RTL验证的流程很简单,写好testbench,软件就能自动验证了,之后自动对比输出值和目标值,相等就通过了。
至于导出,如果所有的事情Vivado HLS都做完了,那么还要Vivado干什么呢?点击RTL export,就能以Vivado工程的形式或者IP core将生成的代码导出,进行下一步处理了。
懂得不多,没什么好总结的。越来越觉得Xilinx的文档无所不有了,或许正是这个原因导致网上的其他资料比较少吧。Xilinx还有很多入门视频,不过我都打不开……这个网站上有很多资料,不过比较混乱……所以还是看文档吧。