高层次综合(HLS)-简介

本文是我近段时间的学习总结,主要参考了Xilinx的技术文档以及部分网上其他资料。文档主要包括ug998《Introduction to FPGA Design with Vivado High-Level Synthesis》,ug871《Vivado Design Suite Tutorial: High-Level Synthesis》,ug902《Vivado Design Suite User Guide: High-Level Synthesis》。受限于个人的FPGA水平,且对于Vivado HLS了解不多,如有错误及不当之处,还请指正。

 

1.为什么要HLS?

    HLS是FPGA代码的综合技术。FPGA的基本知识可以从《FPGA学习之基本结构》一文中得到。Xilinx的文档《Introduction to FPGA Design with Vivado High-Level Synthesis》中的两幅图可以很好地回答这一问题。

image

    上图表明,虽然FPGA具有很高的性能,但采用RTL设计FPGA代码需要较长的开发时间。

image

    然而,采用HLS之后,FPGA开发的时间大大降低了,甚至低于DSP和GPU。

    在Xilinx Documentation的分类中,《Introduction to FPGA Design with Vivado High-Level Synthesis》这本用户手册被分到了Methodology Guides中。与大部分文档不同,这本文档的角度很不一样:它尽可能用简单的方式介绍FPGA,面向的对象是软件工程师。在之前的文档中是没有Methodology Guides这个分类的,这大抵也说明了HLS的意义。

    FPGA等的产生使得开发具有了更强的灵活性和高效性,HLS的逐步完善使得FPGA的开发高效性更进一步。技术的发展使得人们可以把精力放在设计上,而更少的去关注底层的东西。

2.HLS是什么?

    Vivado的HLS工具的前世今生可以从《AutoESL与Xilinx那些人和事》中看到,这篇文章写得很有趣。HLS是高层次综合(High-Level Synthesis)的简称,“综合”即“Synthesis”。ug627《XST User Guide》中将综合解释为:把程序代码翻译成称为NGC的特殊网表文件,这样才能够对其进行实现。

    至于“层次”,或许可以这样理解。书中一般把FPGA设计分为以下几个级别(对于这个分级实际上没有一个特定的说法,可以参考第13章抽象级别的描述):

  • 系统级
  • 算法级
  • RTL级
  • 门级、开关级

    一般认为RTL级及以下设计是可用的,“层次”即从什么角度去描述想要实现的功能。譬如,a xor b采用门级描述就是a,b是一个异或门的输入;而采用高一点层次描述就是a+b。显然,越低层次的描述越困难,后文例子中也能发现这一点。

    HLS就是从高层次描述,之后综合成可用的网表文件的技术。这里的“高”指采用C、C++等编写程序,而不是传统的HDL语言。然而,实际上Vivado套件中是预先采用Vivado HLS这个软件将C程序转换成为Verilog HDL或者VHDL代码,之后进行下一步操作的,并不是直接综合C代码。

3.Vivado HLS功能

    对于Vivado HLS(注意是“HLS”,不是Vivado这个软件)的使用,《Vivado Design Suite Tutorial: High-Level Synthesis》是一本很好的入门指南。通过几个具体的例子,文档手把手地介绍了Vivado HLS的使用方式以及功能。《Vivado Design Suite User Guide: High-Level Synthesis》则致力于教你如何编写合适的C代码以及test bench。本节介绍其功能,使用方法参考Xilinx文档。以下内容来自上述两文档。

image

 

C Synthesis

    Vivado HLS实现的最基本的功能是将C代码综合为HDL代码。下面是其中的一个例子(代码为Xilinx例程)

c代码

高层次综合(HLS)-简介
 1 *******************************************************************************/

 2 #include "fir.h"

 3 

 4 void fir (

 5   data_t *y,

 6   coef_t c[N],

 7   data_t x

 8   ) {

 9 #pragma HLS INTERFACE ap_vld port=x

10 

11 #pragma HLS RESOURCE variable=c core=RAM_1P_BRAM

12 

13 

14   static data_t shift_reg[N];

15   acc_t acc;

16   data_t data;

17   int i;

18   

19   acc=0;

20   Shift_Accum_Loop: for (i=N-1;i>=0;i--) {

21         if (i==0) {

22             shift_reg[0]=x;

23             data = x;

24     } else {

25             shift_reg[i]=shift_reg[i-1];

26             data = shift_reg[i];

27     }

28     acc+=data*c[i];;       

29   }

30   *y=acc;

31 }
View Code

    程序中采用了移位寄存器,是一个fir滤波器。实际上实现了以下功能(其中 $x[n-i]$ 为延迟 $i$ 拍的输入,即移位寄存器中的第 $i$ 个元素):

$$y[n]=\sum_{i=0}^{N-1} c[i]\,x[n-i]$$

verilog代码(可复杂了,不要点开看……),分了三个部分

高层次综合(HLS)-简介
// ==============================================================

// RTL generated by Vivado(TM) HLS - High-Level Synthesis from C, C++ and SystemC

// Version: 2014.1

// Copyright (C) 2014 Xilinx Inc. All rights reserved.

// 

// ===========================================================



`timescale 1 ns / 1 ps 



(* CORE_GENERATION_INFO="fir,hls_ip_2014_1,{HLS_INPUT_TYPE=c,HLS_INPUT_FLOAT=0,HLS_INPUT_FIXED=0,HLS_INPUT_PART=xc7k160tfbg484-2,HLS_INPUT_CLOCK=10.000000,HLS_INPUT_ARCH=others,HLS_SYN_CLOCK=8.430000,HLS_SYN_LAT=78,HLS_SYN_TPT=none,HLS_SYN_MEM=0,HLS_SYN_DSP=0,HLS_SYN_FF=0,HLS_SYN_LUT=0}" *)



// fir: top-level module of the generated FIR design.
//
// Control is an 8-state FSM (ap_CS_fsm / ap_NS_fsm). st1 waits for ap_start;
// st2..st8 form one pass of the tap loop and st8 always returns to st2.
// The loop counter i_reg_104 is loaded with 10 when the FSM leaves st1 and is
// stepped by adding 5'b11111 on every pass. When bit 4 of the counter
// (tmp_fu_131_p3) is set in st2, the loop ends: ap_done, ap_ready and
// y_ap_vld are raised and the FSM returns to st1.
//
// Ports:
//   ap_clk, ap_rst                          clock and synchronous reset of the FSM state
//   ap_start, ap_done, ap_idle, ap_ready    block-level handshake signals
//   y, y_ap_vld                             result (driven from acc_reg_91) and its valid flag
//   c_address0, c_ce0, c_q0                 address / enable outputs and data input for coefficient reads
//   x                                       input sample
module fir (

        ap_clk,

        ap_rst,

        ap_start,

        ap_done,

        ap_idle,

        ap_ready,

        y,

        y_ap_vld,

        c_address0,

        c_ce0,

        c_q0,

        x

);



parameter    ap_const_logic_1 = 1'b1;

parameter    ap_const_logic_0 = 1'b0;

// FSM state encodings (binary, st1 = 0 ... st8 = 7).
parameter    ap_ST_st1_fsm_0 = 3'b000;

parameter    ap_ST_st2_fsm_1 = 3'b1;

parameter    ap_ST_st3_fsm_2 = 3'b10;

parameter    ap_ST_st4_fsm_3 = 3'b11;

parameter    ap_ST_st5_fsm_4 = 3'b100;

parameter    ap_ST_st6_fsm_5 = 3'b101;

parameter    ap_ST_st7_fsm_6 = 3'b110;

parameter    ap_ST_st8_fsm_7 = 3'b111;

parameter    ap_const_lv1_0 = 1'b0;

parameter    ap_const_lv32_0 = 32'b00000000000000000000000000000000;

// Initial value of the loop counter (decimal 10).
parameter    ap_const_lv5_A = 5'b1010;

parameter    ap_const_lv4_0 = 4'b0000;

// Used as a bit index into i_reg_104 (selects bit 4).
parameter    ap_const_lv32_4 = 32'b100;

parameter    ap_const_lv5_0 = 5'b00000;

// All-ones constants: adding them decrements a 4-bit / 5-bit value by one (modulo wrap).
parameter    ap_const_lv4_F = 4'b1111;

parameter    ap_const_lv5_1F = 5'b11111;

parameter    ap_true = 1'b1;



input   ap_clk;

input   ap_rst;

input   ap_start;

output   ap_done;

output   ap_idle;

output   ap_ready;

output  [31:0] y;

output   y_ap_vld;

output  [3:0] c_address0;

output   c_ce0;

input  [31:0] c_q0;

input  [31:0] x;



reg ap_done;

reg ap_idle;

reg ap_ready;

reg y_ap_vld;

reg c_ce0;

reg   [2:0] ap_CS_fsm = 3'b000;

reg   [3:0] shift_reg_address0;

reg    shift_reg_ce0;

reg    shift_reg_we0;

reg   [31:0] shift_reg_d0;

wire   [31:0] shift_reg_q0;

wire   [31:0] i_cast_fu_127_p1;

reg   [31:0] i_cast_reg_190;

wire   [0:0] tmp_1_fu_139_p2;

reg   [0:0] tmp_1_reg_199;

wire   [0:0] tmp_fu_131_p3;

wire   [4:0] i_1_fu_168_p2;

reg   [4:0] i_1_reg_218;

reg   [31:0] c_load_reg_223;

wire   [31:0] grp_fu_174_p2;

reg   [31:0] tmp_6_reg_228;

wire   [31:0] acc_1_fu_179_p2;

reg   [31:0] acc_reg_91;

reg   [4:0] i_reg_104;

reg   [31:0] data1_reg_116;

wire   [63:0] tmp_3_fu_155_p1;

wire   [63:0] tmp_4_fu_160_p1;

wire   [63:0] tmp_5_fu_164_p1;

wire   [3:0] tmp_7_fu_145_p1;

wire   [3:0] tmp_2_fu_149_p2;

wire   [31:0] grp_fu_174_p0;

wire   [31:0] grp_fu_174_p1;

wire    grp_fu_174_ce;

reg   [2:0] ap_NS_fsm;





// Internal storage: 11 entries of 32 bits, accessed through a single address port.
fir_shift_reg #(

    .DataWidth( 32 ),

    .AddressRange( 11 ),

    .AddressWidth( 4 ))

shift_reg_U(

    .clk( ap_clk ),

    .reset( ap_rst ),

    .address0( shift_reg_address0 ),

    .ce0( shift_reg_ce0 ),

    .we0( shift_reg_we0 ),

    .d0( shift_reg_d0 ),

    .q0( shift_reg_q0 )

);



// Multiplier instance (NUM_STAGE = 3). Its inputs are c_load_reg_223 and
// data1_reg_116 (see the assigns at the end of the module); its output is
// captured into tmp_6_reg_228 in st7.
fir_mul_32s_32s_32_3 #(

    .ID( 0 ),

    .NUM_STAGE( 3 ),

    .din0_WIDTH( 32 ),

    .din1_WIDTH( 32 ),

    .dout_WIDTH( 32 ))

fir_mul_32s_32s_32_3_U0(

    .clk( ap_clk ),

    .reset( ap_rst ),

    .din0( grp_fu_174_p0 ),

    .din1( grp_fu_174_p1 ),

    .ce( grp_fu_174_ce ),

    .dout( grp_fu_174_p2 )

);







/// the current state (ap_CS_fsm) of the state machine. ///

always @ (posedge ap_clk)

begin : ap_ret_ap_CS_fsm

    if (ap_rst == 1'b1) begin

        ap_CS_fsm <= ap_ST_st1_fsm_0;

    end else begin

        ap_CS_fsm <= ap_NS_fsm;

    end

end



/// assign process. ///
// Accumulator: cleared when the FSM is in st1 with ap_start set, updated with
// the running sum (acc_1_fu_179_p2) at the end of every loop pass (st8).

always @(posedge ap_clk)

begin

    if ((ap_ST_st8_fsm_7 == ap_CS_fsm)) begin

        acc_reg_91 <= acc_1_fu_179_p2;

    end else if (((ap_ST_st1_fsm_0 == ap_CS_fsm) & ~(ap_start == ap_const_logic_0))) begin

        acc_reg_91 <= ap_const_lv32_0;

    end

end



/// assign process. ///
// Multiplier data operand: the storage read data in st3 when the registered
// "counter == 0" flag is clear, or the input x in st2 when the counter is 0.

always @(posedge ap_clk)

begin

    if (((ap_ST_st3_fsm_2 == ap_CS_fsm) & (tmp_1_reg_199 == ap_const_lv1_0))) begin

        data1_reg_116 <= shift_reg_q0;

    end else if (((ap_ST_st2_fsm_1 == ap_CS_fsm) & (tmp_fu_131_p3 == ap_const_lv1_0) & ~(tmp_1_fu_139_p2 == ap_const_lv1_0))) begin

        data1_reg_116 <= x;

    end

end



/// assign process. ///
// Loop counter: loaded with 10 on start, replaced by the decremented value
// (i_1_reg_218) at the end of every loop pass.

always @(posedge ap_clk)

begin

    if ((ap_ST_st8_fsm_7 == ap_CS_fsm)) begin

        i_reg_104 <= i_1_reg_218;

    end else if (((ap_ST_st1_fsm_0 == ap_CS_fsm) & ~(ap_start == ap_const_logic_0))) begin

        i_reg_104 <= ap_const_lv5_A;

    end

end



/// assign process. ///
// Coefficient read data is captured in st4 (c_ce0 is asserted in st3).

always @(posedge ap_clk)

begin

    if ((ap_ST_st4_fsm_3 == ap_CS_fsm)) begin

        c_load_reg_223 <= c_q0;

    end

end



/// assign process. ///

always @(posedge ap_clk)

begin

    if ((ap_ST_st3_fsm_2 == ap_CS_fsm)) begin

        i_1_reg_218 <= i_1_fu_168_p2;

    end

end



/// assign process. ///

always @(posedge ap_clk)

begin

    if ((ap_ST_st2_fsm_1 == ap_CS_fsm)) begin

        i_cast_reg_190 <= i_cast_fu_127_p1;

    end

end



/// assign process. ///
// Registers the "counter == 0" comparison so it can be used in st3.

always @(posedge ap_clk)

begin

    if (((ap_ST_st2_fsm_1 == ap_CS_fsm) & (tmp_fu_131_p3 == ap_const_lv1_0))) begin

        tmp_1_reg_199 <= tmp_1_fu_139_p2;

    end

end



/// assign process. ///

always @(posedge ap_clk)

begin

    if ((ap_ST_st7_fsm_6 == ap_CS_fsm)) begin

        tmp_6_reg_228 <= grp_fu_174_p2;

    end

end



/// ap_done assign process. ///

always @ (ap_CS_fsm or tmp_fu_131_p3)

begin

    if (((ap_ST_st2_fsm_1 == ap_CS_fsm) & ~(tmp_fu_131_p3 == ap_const_lv1_0))) begin

        ap_done = ap_const_logic_1;

    end else begin

        ap_done = ap_const_logic_0;

    end

end



/// ap_idle assign process. ///

always @ (ap_start or ap_CS_fsm)

begin

    if ((~(ap_const_logic_1 == ap_start) & (ap_ST_st1_fsm_0 == ap_CS_fsm))) begin

        ap_idle = ap_const_logic_1;

    end else begin

        ap_idle = ap_const_logic_0;

    end

end



/// ap_ready assign process. ///

always @ (ap_CS_fsm or tmp_fu_131_p3)

begin

    if (((ap_ST_st2_fsm_1 == ap_CS_fsm) & ~(tmp_fu_131_p3 == ap_const_lv1_0))) begin

        ap_ready = ap_const_logic_1;

    end else begin

        ap_ready = ap_const_logic_0;

    end

end



/// c_ce0 assign process. ///

always @ (ap_CS_fsm)

begin

    if ((ap_ST_st3_fsm_2 == ap_CS_fsm)) begin

        c_ce0 = ap_const_logic_1;

    end else begin

        c_ce0 = ap_const_logic_0;

    end

end



/// shift_reg_address0 assign process. ///
// st3: address is the registered counter value (tmp_4_fu_160_p1).
// st2, counter == 0: address 0.
// st2, counter != 0: address is counter[3:0] + 4'b1111 (tmp_3_fu_155_p1).
// Any other state: don't-care.

always @ (ap_CS_fsm or tmp_1_fu_139_p2 or tmp_fu_131_p3 or tmp_3_fu_155_p1 or tmp_4_fu_160_p1)

begin

    if ((ap_ST_st3_fsm_2 == ap_CS_fsm)) begin

        shift_reg_address0 = tmp_4_fu_160_p1;

    end else if (((ap_ST_st2_fsm_1 == ap_CS_fsm) & (tmp_fu_131_p3 == ap_const_lv1_0) & ~(tmp_1_fu_139_p2 == ap_const_lv1_0))) begin

        shift_reg_address0 = ap_const_lv4_0;

    end else if (((ap_ST_st2_fsm_1 == ap_CS_fsm) & (tmp_fu_131_p3 == ap_const_lv1_0) & (tmp_1_fu_139_p2 == ap_const_lv1_0))) begin

        shift_reg_address0 = tmp_3_fu_155_p1;

    end else begin

        shift_reg_address0 = 'bx;

    end

end



/// shift_reg_ce0 assign process. ///

always @ (ap_CS_fsm or tmp_1_fu_139_p2 or tmp_fu_131_p3)

begin

    if ((((ap_ST_st2_fsm_1 == ap_CS_fsm) & (tmp_fu_131_p3 == ap_const_lv1_0) & (tmp_1_fu_139_p2 == ap_const_lv1_0)) | (ap_ST_st3_fsm_2 == ap_CS_fsm) | ((ap_ST_st2_fsm_1 == ap_CS_fsm) & (tmp_fu_131_p3 == ap_const_lv1_0) & ~(tmp_1_fu_139_p2 == ap_const_lv1_0)))) begin

        shift_reg_ce0 = ap_const_logic_1;

    end else begin

        shift_reg_ce0 = ap_const_logic_0;

    end

end



/// shift_reg_d0 assign process. ///
// Write data: the storage read data in st3, or the input x in st2 when the
// counter is 0; don't-care otherwise.

always @ (ap_CS_fsm or x or shift_reg_q0 or tmp_1_fu_139_p2 or tmp_fu_131_p3)

begin

    if ((ap_ST_st3_fsm_2 == ap_CS_fsm)) begin

        shift_reg_d0 = shift_reg_q0;

    end else if (((ap_ST_st2_fsm_1 == ap_CS_fsm) & (tmp_fu_131_p3 == ap_const_lv1_0) & ~(tmp_1_fu_139_p2 == ap_const_lv1_0))) begin

        shift_reg_d0 = x;

    end else begin

        shift_reg_d0 = 'bx;

    end

end



/// shift_reg_we0 assign process. ///
// Write enable: in st3 when the registered "counter == 0" flag is clear, or
// in st2 when the counter is 0 (the write of x to address 0).

always @ (ap_CS_fsm or tmp_1_fu_139_p2 or tmp_1_reg_199 or tmp_fu_131_p3)

begin

    if ((((ap_ST_st3_fsm_2 == ap_CS_fsm) & (tmp_1_reg_199 == ap_const_lv1_0)) | ((ap_ST_st2_fsm_1 == ap_CS_fsm) & (tmp_fu_131_p3 == ap_const_lv1_0) & ~(tmp_1_fu_139_p2 == ap_const_lv1_0)))) begin

        shift_reg_we0 = ap_const_logic_1;

    end else begin

        shift_reg_we0 = ap_const_logic_0;

    end

end



/// y_ap_vld assign process. ///

always @ (ap_CS_fsm or tmp_fu_131_p3)

begin

    if (((ap_ST_st2_fsm_1 == ap_CS_fsm) & ~(tmp_fu_131_p3 == ap_const_lv1_0))) begin

        y_ap_vld = ap_const_logic_1;

    end else begin

        y_ap_vld = ap_const_logic_0;

    end

end

/// the next state (ap_NS_fsm) of the state machine. ///
// st1 -> st2 on ap_start; st2 -> st1 when bit 4 of the counter is set,
// otherwise st2 -> st3 -> ... -> st8 -> st2.
always @ (ap_start or ap_CS_fsm or tmp_fu_131_p3)

begin

    case (ap_CS_fsm)

        ap_ST_st1_fsm_0 : 

            if (~(ap_start == ap_const_logic_0)) begin

                ap_NS_fsm = ap_ST_st2_fsm_1;

            end else begin

                ap_NS_fsm = ap_ST_st1_fsm_0;

            end

        ap_ST_st2_fsm_1 : 

            if (~(tmp_fu_131_p3 == ap_const_lv1_0)) begin

                ap_NS_fsm = ap_ST_st1_fsm_0;

            end else begin

                ap_NS_fsm = ap_ST_st3_fsm_2;

            end

        ap_ST_st3_fsm_2 : 

            ap_NS_fsm = ap_ST_st4_fsm_3;

        ap_ST_st4_fsm_3 : 

            ap_NS_fsm = ap_ST_st5_fsm_4;

        ap_ST_st5_fsm_4 : 

            ap_NS_fsm = ap_ST_st6_fsm_5;

        ap_ST_st6_fsm_5 : 

            ap_NS_fsm = ap_ST_st7_fsm_6;

        ap_ST_st7_fsm_6 : 

            ap_NS_fsm = ap_ST_st8_fsm_7;

        ap_ST_st8_fsm_7 : 

            ap_NS_fsm = ap_ST_st2_fsm_1;

        default : 

            ap_NS_fsm = 'bx;

    endcase

end

// Combinational datapath.
assign acc_1_fu_179_p2 = (tmp_6_reg_228 + acc_reg_91);

assign c_address0 = tmp_5_fu_164_p1;

// The multiplier clock enable is tied high.
assign grp_fu_174_ce = ap_const_logic_1;

assign grp_fu_174_p0 = c_load_reg_223;

assign grp_fu_174_p1 = data1_reg_116;

// Counter minus one (adding the all-ones 5-bit constant).
assign i_1_fu_168_p2 = (i_reg_104 + ap_const_lv5_1F);

assign i_cast_fu_127_p1 = $signed(i_reg_104);

// 1 when the loop counter equals 0.
assign tmp_1_fu_139_p2 = (i_reg_104 == ap_const_lv5_0? 1'b1: 1'b0);

assign tmp_2_fu_149_p2 = (tmp_7_fu_145_p1 + ap_const_lv4_F);

assign tmp_3_fu_155_p1 = $unsigned(tmp_2_fu_149_p2);

assign tmp_4_fu_160_p1 = $unsigned(i_cast_reg_190);

assign tmp_5_fu_164_p1 = $unsigned(i_cast_reg_190);

assign tmp_7_fu_145_p1 = i_reg_104[3:0];

// Bit 4 of the 5-bit loop counter; used as the loop-exit condition in st2.
assign tmp_fu_131_p3 = i_reg_104[ap_const_lv32_4];

assign y = acc_reg_91;





endmodule //fir
View Code
高层次综合(HLS)-简介
// ==============================================================

// File generated by Vivado(TM) HLS - High-Level Synthesis from C, C++ and SystemC

// Version: 2014.1

// Copyright (C) 2014 Xilinx Inc. All rights reserved.

// 

// ==============================================================



`timescale 1 ns / 1 ps

// fir_shift_reg_ram: single-port memory with a registered read output.
//
// On a rising clock edge with ce0 set:
//   - we0 set:   d0 is stored at addr0 and also copied to q0;
//   - we0 clear: q0 is loaded with the word stored at addr0.
// With ce0 clear nothing changes. The array is initialised from
// "./fir_shift_reg_ram.dat" at time zero.
module fir_shift_reg_ram #(
    parameter DWIDTH   = 32,   // word width in bits
    parameter AWIDTH   = 4,    // address width in bits
    parameter MEM_SIZE = 11    // number of words
) (
    input  wire [AWIDTH-1:0] addr0,
    input  wire              ce0,
    input  wire [DWIDTH-1:0] d0,
    input  wire              we0,
    output reg  [DWIDTH-1:0] q0,
    input  wire              clk
);

    (* ram_style = "distributed" *) reg [DWIDTH-1:0] ram [MEM_SIZE-1:0];

    initial begin
        $readmemh("./fir_shift_reg_ram.dat", ram);
    end

    always @(posedge clk) begin
        if (ce0 && we0) begin
            // Write access: the written word is also presented on q0.
            ram[addr0] <= d0;
            q0         <= d0;
        end else if (ce0) begin
            // Read access.
            q0 <= ram[addr0];
        end
    end

endmodule





`timescale 1 ns / 1 ps

// fir_shift_reg: wrapper that exposes fir_shift_reg_ram under the port names
// used by the top-level module. The reset input is accepted but not connected
// to anything; the parameters only size this wrapper's own ports.
module fir_shift_reg #(
    parameter DataWidth    = 32'd32,
    parameter AddressRange = 32'd11,
    parameter AddressWidth = 32'd4
) (
    input  wire                      reset,
    input  wire                      clk,
    input  wire [AddressWidth - 1:0] address0,
    input  wire                      ce0,
    input  wire                      we0,
    input  wire [DataWidth - 1:0]    d0,
    output wire [DataWidth - 1:0]    q0
);

    fir_shift_reg_ram fir_shift_reg_ram_U (
        .addr0 ( address0 ),
        .ce0   ( ce0      ),
        .we0   ( we0      ),
        .d0    ( d0       ),
        .q0    ( q0       ),
        .clk   ( clk      )
    );

endmodule
View Code
高层次综合(HLS)-简介
// ==============================================================

// File generated by Vivado(TM) HLS - High-Level Synthesis from C, C++ and SystemC

// Version: 2014.1

// Copyright (C) 2014 Xilinx Inc. All rights reserved.

// 

// ==============================================================





`timescale 1 ns / 1 ps



// fir_mul_32s_32s_32_3_Mul3S_0: registered signed 32x32 multiplier.
//
// While ce is high, every rising clock edge captures a and b into input
// registers and loads p with the low 32 bits of the signed product of the
// previously captured operands. With ce low all registers hold.
module fir_mul_32s_32s_32_3_Mul3S_0(clk, ce, a, b, p);

    input             clk;
    input             ce;
    input      [31:0] a; // synthesis attribute keep a "true"
    input      [31:0] b; // synthesis attribute keep b "true"
    output reg [31:0] p;

    // Operand registers.
    reg [31:0] lhs_q;
    reg [31:0] rhs_q;

    always @(posedge clk) begin
        if (ce) begin
            lhs_q <= a;
            rhs_q <= b;
            // Uses the operand values from before this edge.
            p     <= $signed(lhs_q) * $signed(rhs_q);
        end
    end

endmodule



`timescale 1 ns / 1 ps

// fir_mul_32s_32s_32_3: wrapper around fir_mul_32s_32s_32_3_Mul3S_0.
//
// din0/din1 feed the multiplier operands and dout carries its product. The
// reset input and the ID / NUM_STAGE parameters are not used inside this
// module; the width parameters only size this wrapper's own ports.
module fir_mul_32s_32s_32_3 #(
    parameter ID         = 32'd1,
    parameter NUM_STAGE  = 32'd1,
    parameter din0_WIDTH = 32'd1,
    parameter din1_WIDTH = 32'd1,
    parameter dout_WIDTH = 32'd1
) (
    input  wire                    clk,
    input  wire                    reset,
    input  wire                    ce,
    input  wire [din0_WIDTH - 1:0] din0,
    input  wire [din1_WIDTH - 1:0] din1,
    output wire [dout_WIDTH - 1:0] dout
);

    fir_mul_32s_32s_32_3_Mul3S_0 fir_mul_32s_32s_32_3_Mul3S_0_U (
        .clk ( clk  ),
        .ce  ( ce   ),
        .a   ( din0 ),
        .b   ( din1 ),
        .p   ( dout )
    );

endmodule
View Code

Optimization&analysis

    通过指定不同的Directive,可以得到不同要求下的优化结果。下面的报告是指定Loop Unroll前后的对比。可以看出solution3中只需要约1/5的时钟周期就能完成计算,随之而来的是成倍的资源需求。

image

    上述优化是以吞吐量为目标的。利用流水线结构以及循环展开(Unrolled loop)可以优化吞吐量,理由如下图所示:

image

 

image

    吞吐量优化正是利用了FPGA全并行的特性。其他优化目标的具体内容可以参考ug902《Vivado Design Suite User Guide: High-Level Synthesis》。

RTL验证及导出

    RTL验证的流程很简单,写好testbench,软件就能自动验证了,之后自动对比输出值和目标值,相等就通过了。

    至于导出,如果所有的事情Vivado HLS都做完了,那么还要Vivado干什么呢?点击RTL export,就能以Vivado工程的形式或者IP core将生成的代码导出,进行下一步处理了。

4.总结

     懂得不多,没什么好总结的。越来越觉得Xilinx的文档无所不有了,或许正是这个原因导致网上的其他资料比较少吧。Xilinx还有很多入门视频,不过我都打不开……这个网站上有很多资料,不过比较混乱……所以还是看文档吧。

 

你可能感兴趣的:(简介)