Vortex GPGPU的硬件代码分析(Cache篇3)

文章目录

  • 前言
  • 一、VX_cache.sv代码部分解读3——bank
    • 1.1 bank request dispatch
      • 1.1.1 解释VX_stream_xbar模块
      • 1.1.2 解释VX_stream_arb模块
      • 1.1.3 解释VX_generic_arbiter模块
      • 1.1.4 解释VX_priority_arbiter模块
      • 1.1.5 解释VX_rr_arbiter模块
      • 1.1.6 解释VX_fair_arbiter模块
      • 1.1.7 解释VX_matrix_arbiter模块
      • 1.1.8 解释VX_cyclic_arbiter模块
      • 1.1.9 解释VX_popcount模块
    • 1.2 bank access
    • 1.3 bank response gather
  • 总结


前言

前面已经分析了Vortex GPGPU的架构:Vortex GPGPU的硬件设计和代码结构分析

前面也分析了Vortex GPGPU中关于Cache设计的一部分代码:
1、Vortex GPGPU的硬件代码分析(Cache篇1)
2、Vortex GPGPU的硬件代码分析(Cache篇2)

本文接着分析VX_cache.sv代码


一、VX_cache.sv代码部分解读3——bank

1.1 bank request dispatch

    ///
    
    // Per-bank core request channel. The valid/idx/ready signals are wired to the
    // output side of the request crossbar (req_xbar) further down in this listing,
    // and the payload fields are unpacked from that crossbar's data_out.
    wire [NUM_BANKS-1:0]                        per_bank_core_req_valid;
    wire [NUM_BANKS-1:0][`CS_LINE_ADDR_WIDTH-1:0] per_bank_core_req_addr;
    wire [NUM_BANKS-1:0]                        per_bank_core_req_rw;
    wire [NUM_BANKS-1:0][WORD_SEL_WIDTH-1:0]    per_bank_core_req_wsel;
    wire [NUM_BANKS-1:0][WORD_SIZE-1:0]         per_bank_core_req_byteen;
    wire [NUM_BANKS-1:0][`CS_WORD_WIDTH-1:0]    per_bank_core_req_data;
    wire [NUM_BANKS-1:0][TAG_WIDTH-1:0]         per_bank_core_req_tag;
    wire [NUM_BANKS-1:0][REQ_SEL_WIDTH-1:0]     per_bank_core_req_idx;   // driven by req_xbar.sel_out
    wire [NUM_BANKS-1:0]                        per_bank_core_req_ready;
    
    // Per-bank core response channel (its drivers/consumers are not part of
    // this excerpt).
    wire [NUM_BANKS-1:0]                        per_bank_core_rsp_valid;
    wire [NUM_BANKS-1:0][`CS_WORD_WIDTH-1:0]    per_bank_core_rsp_data;
    wire [NUM_BANKS-1:0][TAG_WIDTH-1:0]         per_bank_core_rsp_tag;
    wire [NUM_BANKS-1:0][REQ_SEL_WIDTH-1:0]     per_bank_core_rsp_idx;
    wire [NUM_BANKS-1:0]                        per_bank_core_rsp_ready;

    // Per-bank memory request channel (its drivers/consumers are not part of
    // this excerpt).
    wire [NUM_BANKS-1:0]                        per_bank_mem_req_valid;    
    wire [NUM_BANKS-1:0][`CS_MEM_ADDR_WIDTH-1:0] per_bank_mem_req_addr;
    wire [NUM_BANKS-1:0]                        per_bank_mem_req_rw;
    wire [NUM_BANKS-1:0][WORD_SEL_WIDTH-1:0]    per_bank_mem_req_wsel;
    wire [NUM_BANKS-1:0][WORD_SIZE-1:0]         per_bank_mem_req_byteen;
    wire [NUM_BANKS-1:0][`CS_WORD_WIDTH-1:0]    per_bank_mem_req_data;
    wire [NUM_BANKS-1:0][MSHR_ADDR_WIDTH-1:0]   per_bank_mem_req_id;
    wire [NUM_BANKS-1:0]                        per_bank_mem_req_ready;

    // One memory-response ready bit per bank.
    wire [NUM_BANKS-1:0]                        per_bank_mem_rsp_ready;
    
    // Memory-response back-pressure: with a single bank its ready bit is used
    // directly; otherwise the ready bit is picked with the index returned by
    // `CS_MEM_TAG_TO_BANK_ID(mem_rsp_tag_s).
    // NOTE(review): the macro is defined outside this excerpt; presumably it
    // extracts the owning bank's id from the response tag - confirm.
    if (NUM_BANKS == 1) begin
        assign mem_rsp_ready_s = per_bank_mem_rsp_ready;
    end else begin
        assign mem_rsp_ready_s = per_bank_mem_rsp_ready[`CS_MEM_TAG_TO_BANK_ID(mem_rsp_tag_s)];
    end

    // Bank requests dispatch
    //
    // Splits every core request address into word-select / bank-select / line
    // address fields, packs the request into one payload vector, routes it to
    // the addressed bank through a crossbar, and unpacks it on the bank side.

    wire [NUM_REQS-1:0][CORE_REQ_DATAW-1:0]  core_req_data_in;    
    wire [NUM_BANKS-1:0][CORE_REQ_DATAW-1:0] core_req_data_out;
    wire [NUM_REQS-1:0][LINE_ADDR_WIDTH-1:0] core_req_line_addr;
    wire [NUM_REQS-1:0][BANK_SEL_WIDTH-1:0]  core_req_bid;
    wire [NUM_REQS-1:0][WORD_SEL_WIDTH-1:0]  core_req_wsel;

    for (genvar i = 0; i < NUM_REQS; ++i) begin
        // Word select: the lowest WORD_SEL_BITS bits of the request address.
        // With a single word per line there is nothing to select, so tie to 0.
        if (WORDS_PER_LINE > 1) begin
            assign core_req_wsel[i] = core_req_addr[i][0 +: WORD_SEL_BITS];
        end else begin
            assign core_req_wsel[i] = '0;
        end
        // Line address: LINE_ADDR_WIDTH bits starting right above the
        // bank-select and word-select bits.
        assign core_req_line_addr[i] = core_req_addr[i][(BANK_SEL_BITS + WORD_SEL_BITS) +: LINE_ADDR_WIDTH];
    end

    // Bank id: BANK_SEL_BITS bits sitting directly above the word-select bits.
    // With a single bank every request targets bank 0.
    if (NUM_BANKS > 1) begin
        for (genvar i = 0; i < NUM_REQS; ++i) begin
            assign core_req_bid[i] = core_req_addr[i][WORD_SEL_BITS +: BANK_SEL_BITS];
        end
    end else begin
        assign core_req_bid = '0;
    end

    // Pack each request into a single vector so the crossbar can carry it as
    // an opaque payload. The field order here must match the unpacking
    // concatenation after the crossbar.
    for (genvar i = 0; i < NUM_REQS; ++i) begin
        assign core_req_data_in[i] = {
            core_req_line_addr[i],
            core_req_rw[i],
            core_req_wsel[i],
            core_req_byteen[i],            
            core_req_data[i],
            core_req_tag[i]};
    end


    // NOTE(review): `RESET_RELAY is defined outside this excerpt; presumably it
    // declares req_xbar_reset as a locally relayed copy of reset - confirm.
    `RESET_RELAY (req_xbar_reset, reset);

     // Request crossbar: NUM_REQS inputs, NUM_BANKS outputs, steered by the
     // per-request bank id. An output buffer setting of 2 is requested only
     // when there are more than 4 requesters; otherwise 0 is passed.
     VX_stream_xbar #(
        .NUM_INPUTS  (NUM_REQS),
        .NUM_OUTPUTS (NUM_BANKS),
        .DATAW       (CORE_REQ_DATAW),
        .PERF_CTR_BITS (`PERF_CTR_BITS),
        .OUT_BUF     ((NUM_REQS > 4) ? 2 : 0)
    ) req_xbar (
        .clk       (clk),
        .reset     (req_xbar_reset),

        // The collision performance counter is left unconnected here.
        `UNUSED_PIN(collisions),
        .valid_in  (core_req_valid),
        .data_in   (core_req_data_in),
        .sel_in    (core_req_bid),
        .ready_in  (core_req_ready),
        .valid_out (per_bank_core_req_valid),
        .data_out  (core_req_data_out),
        .sel_out   (per_bank_core_req_idx),
        .ready_out (per_bank_core_req_ready)
    );

    // Unpack the routed payload into the per-bank request fields, using the
    // same field order as the packing concatenation above.
    for (genvar i = 0; i < NUM_BANKS; ++i) begin
        assign {
            per_bank_core_req_addr[i],
            per_bank_core_req_rw[i],
            per_bank_core_req_wsel[i],
            per_bank_core_req_byteen[i],            
            per_bank_core_req_data[i],
            per_bank_core_req_tag[i]} = core_req_data_out[i];
    end

还是先整理这里所有的变量涉及的位宽常数:

    wire [NUM_BANKS-1:0]                        per_bank_core_req_valid;
    wire [NUM_BANKS-1:0][`CS_LINE_ADDR_WIDTH-1:0] per_bank_core_req_addr;
    wire [NUM_BANKS-1:0]                        per_bank_core_req_rw;
    wire [NUM_BANKS-1:0][WORD_SEL_WIDTH-1:0]    per_bank_core_req_wsel;
    wire [NUM_BANKS-1:0][WORD_SIZE-1:0]         per_bank_core_req_byteen;
    wire [NUM_BANKS-1:0][`CS_WORD_WIDTH-1:0]    per_bank_core_req_data;
    wire [NUM_BANKS-1:0][TAG_WIDTH-1:0]         per_bank_core_req_tag;
    wire [NUM_BANKS-1:0][REQ_SEL_WIDTH-1:0]     per_bank_core_req_idx;
    wire [NUM_BANKS-1:0]                        per_bank_core_req_ready;
    
    wire [NUM_BANKS-1:0]                        per_bank_core_rsp_valid;
    wire [NUM_BANKS-1:0][`CS_WORD_WIDTH-1:0]    per_bank_core_rsp_data;
    wire [NUM_BANKS-1:0][TAG_WIDTH-1:0]         per_bank_core_rsp_tag;
    wire [NUM_BANKS-1:0][REQ_SEL_WIDTH-1:0]     per_bank_core_rsp_idx;
    wire [NUM_BANKS-1:0]                        per_bank_core_rsp_ready;

    wire [NUM_BANKS-1:0]                        per_bank_mem_req_valid;    
    wire [NUM_BANKS-1:0][`CS_MEM_ADDR_WIDTH-1:0] per_bank_mem_req_addr;
    wire [NUM_BANKS-1:0]                        per_bank_mem_req_rw;
    wire [NUM_BANKS-1:0][WORD_SEL_WIDTH-1:0]    per_bank_mem_req_wsel;
    wire [NUM_BANKS-1:0][WORD_SIZE-1:0]         per_bank_mem_req_byteen;
    wire [NUM_BANKS-1:0][`CS_WORD_WIDTH-1:0]    per_bank_mem_req_data;
    wire [NUM_BANKS-1:0][MSHR_ADDR_WIDTH-1:0]   per_bank_mem_req_id;
    wire [NUM_BANKS-1:0]                        per_bank_mem_req_ready;

    wire [NUM_BANKS-1:0]                        per_bank_mem_rsp_ready;
常量 推导
CS_LINE_ADDR_WIDTH 其表达式为CS_MEM_ADDR_WIDTH-CLOG2(NUM_BANKS),其中CS_MEM_ADDR_WIDTH的表达式为MEM_ADDR_WIDTH-CLOG2(LINE_SIZE)=32-log2(64)=32-6=26,那么CS_LINE_ADDR_WIDTH=26-log2(1)=26,所以CS_LINE_ADDR_WIDTH=26(这与后文CORE_REQ_DATAW=26+1+4+4+32+1=68相互印证)
WORD_SEL_WIDTH 表达式为`UP(`CS_WORD_SEL_BITS),其中`define CS_WORD_SEL_BITS `CLOG2(`CS_WORDS_PER_LINE),`define CS_WORDS_PER_LINE (LINE_SIZE / WORD_SIZE),所以WORD_SEL_WIDTH=log2(64/4)=4
WORD_SIZE WORD_SIZE=4
CS_WORD_WIDTH `define CS_WORD_WIDTH (8 * WORD_SIZE),因此CS_WORD_WIDTH=8*4=32
TAG_WIDTH TAG_WIDTH = UUID_WIDTH + 1,因此TAG_WIDTH=0+1=1
REQ_SEL_WIDTH 表达式为CLOG2(NUM_REQS),所以REQ_SEL_WIDTH=log2(4)=2
CS_MEM_ADDR_WIDTH CS_MEM_ADDR_WIDTH的表达式为MEM_ADDR_WIDTH-CLOG2(LINE_SIZE)=32-log2(64)=32-6=26
MSHR_ADDR_WIDTH MSHR_ADDR_WIDTH=LOG2UP(MSHR_SIZE)=log2(8)=3

另外NUM_BANKS=1
关于中间一段assign赋值参考这里关于地址各个field的解释。

1.1.1 解释VX_stream_xbar模块

关于例化该模块时的参数为:

        .NUM_INPUTS  (NUM_REQS), // NUM_REQS = 4
        .NUM_OUTPUTS (NUM_BANKS), // NUM_BANKS = 1
        .DATAW       (CORE_REQ_DATAW), // CORE_REQ_DATAW = 68
        .PERF_CTR_BITS (`PERF_CTR_BITS), 
        .OUT_BUF     ((NUM_REQS > 4) ? 2 : 0) // OUT_BUF传入为0

该模块见于hw/rtl/libs/VX_stream_xbar.sv,代码如下:

`include "VX_define.vh"

`TRACING_OFF
// VX_stream_xbar: routes NUM_INPUTS valid/ready streams to NUM_OUTPUTS
// valid/ready streams. Each input names its destination output with sel_in;
// each output reports which input it is carrying on sel_out. The module also
// counts input collisions into the `collisions` performance counter.
//
// Exactly one of four generate branches is elaborated, depending on whether
// NUM_INPUTS and NUM_OUTPUTS equal 1.
//
// The trailing "// N" notes on the parameters are the article's annotations:
// the value each parameter takes in the cache's req_xbar instance. Likewise,
// "taken"/"not taken" on the generate branches refers to that instance.
module VX_stream_xbar #(
    parameter NUM_INPUTS    = 4, // 4 
    parameter NUM_OUTPUTS   = 4, // 1
    parameter DATAW         = 4, // 68
    parameter IN_WIDTH      = `LOG2UP(NUM_INPUTS), // 2
    parameter OUT_WIDTH     = `LOG2UP(NUM_OUTPUTS), // 1
    parameter ARBITER       = "P",
    parameter OUT_BUF       = 0, // 0
    parameter MAX_FANOUT    = `MAX_FANOUT,
    parameter PERF_CTR_BITS = `CLOG2(NUM_INPUTS+1)
) (
    input wire                              clk,
    input wire                              reset,

    // Running total of detected input collisions (see the bottom of the module).
    output wire [PERF_CTR_BITS-1:0]         collisions,

    // Input side: one stream per input; sel_in[i] is the output it targets.
    input wire [NUM_INPUTS-1:0]             valid_in,
    input wire [NUM_INPUTS-1:0][DATAW-1:0]  data_in,
    input wire [NUM_INPUTS-1:0][OUT_WIDTH-1:0] sel_in,
    output wire [NUM_INPUTS-1:0]            ready_in,

    // Output side: one stream per output; sel_out[i] is the index of the
    // input currently forwarded on output i.
    output wire [NUM_OUTPUTS-1:0]           valid_out,
    output wire [NUM_OUTPUTS-1:0][DATAW-1:0] data_out,  
    output wire [NUM_OUTPUTS-1:0][IN_WIDTH-1:0] sel_out,
    input  wire [NUM_OUTPUTS-1:0]           ready_out
);
    `UNUSED_VAR (clk)
    `UNUSED_VAR (reset)

    if (NUM_INPUTS != 1) begin  // taken in the worked example

        if (NUM_OUTPUTS != 1) begin  // not taken in the worked example

            // (#inputs > 1) and (#outputs > 1)
            // One arbiter per output; each sees only the inputs that target it.

            wire [NUM_OUTPUTS-1:0][NUM_INPUTS-1:0] per_output_ready_in;

            for (genvar i = 0; i < NUM_OUTPUTS; ++i) begin

                // Mask the input valids down to those addressed to output i.
                wire [NUM_INPUTS-1:0] valid_in_q;
                for (genvar j = 0; j < NUM_INPUTS; ++j) begin
                    assign valid_in_q[j] = valid_in[j] && (sel_in[j] == i);
                end

                `RESET_RELAY (slice_reset, reset);

                VX_stream_arb #(
                    .NUM_INPUTS  (NUM_INPUTS),
                    .NUM_OUTPUTS (1),
                    .DATAW       (DATAW),
                    .ARBITER     (ARBITER),
                    .MAX_FANOUT  (MAX_FANOUT),
                    .OUT_BUF     (OUT_BUF)
                ) xbar_arb (
                    .clk       (clk),
                    .reset     (slice_reset),
                    .valid_in  (valid_in_q),
                    .data_in   (data_in),
                    .ready_in  (per_output_ready_in[i]),
                    .valid_out (valid_out[i]),
                    .data_out  (data_out[i]),
                    .sel_out   (sel_out[i]),
                    .ready_out (ready_out[i])
                );
            end

            // An input's ready comes from the arbiter of the output it selects.
            for (genvar i = 0; i < NUM_INPUTS; ++i) begin
                assign ready_in[i] = per_output_ready_in[sel_in[i]][i];
            end

        end else begin  // taken in the worked example

            // (#inputs > 1) and (#outputs == 1)
            // A single arbiter merges all inputs; sel_in carries no information.

            VX_stream_arb #(
                .NUM_INPUTS  (NUM_INPUTS),  // 4
                .NUM_OUTPUTS (1),           // 1
                .DATAW       (DATAW),       // 68
                .ARBITER     (ARBITER),     // `P`
                .MAX_FANOUT  (MAX_FANOUT), 
                .OUT_BUF     (OUT_BUF)      // 0
            ) xbar_arb (
                .clk       (clk),
                .reset     (reset),
                .valid_in  (valid_in),
                .data_in   (data_in),
                .ready_in  (ready_in),
                .valid_out (valid_out),
                .data_out  (data_out),
                .sel_out   (sel_out),
                .ready_out (ready_out)
            );

            `UNUSED_VAR (sel_in)
        end

    end else if (NUM_OUTPUTS != 1) begin // not taken in the worked example

        // (#inputs == 1) and (#outputs > 1)
        // Demultiplex the single input onto the output chosen by sel_in.

        logic [NUM_OUTPUTS-1:0] valid_out_r, ready_out_r;
        logic [NUM_OUTPUTS-1:0][DATAW-1:0] data_out_r;
        // Only the selected output sees the input's valid.
        always @(*) begin
            valid_out_r = '0;
            valid_out_r[sel_in] = valid_in;
        end
        // The data is replicated to every output; valid gates who consumes it.
        assign data_out_r = {NUM_OUTPUTS{data_in}};
        assign ready_in = ready_out_r[sel_in];

        for (genvar i = 0; i < NUM_OUTPUTS; ++i) begin
            
            `RESET_RELAY (out_buf_reset, reset);

            VX_elastic_buffer #(
                .DATAW   (DATAW),
                .SIZE    (`TO_OUT_BUF_SIZE(OUT_BUF)),
                .OUT_REG (`TO_OUT_BUF_REG(OUT_BUF))
            ) out_buf (
                .clk       (clk),
                .reset     (out_buf_reset),
                .valid_in  (valid_out_r[i]),
                .ready_in  (ready_out_r[i]),
                .data_in   (data_out_r[i]),
                .data_out  (data_out[i]),
                .valid_out (valid_out[i]),
                .ready_out (ready_out[i])
            );
        end

        // Only one input exists, so the source index is always 0.
        assign sel_out = 0;

    end else begin

        // (#inputs == 1) and (#outputs == 1)
        // Plain pass-through via one elastic buffer.

        VX_elastic_buffer #(
            .DATAW   (DATAW),
            .SIZE    (`TO_OUT_BUF_SIZE(OUT_BUF)),
            .OUT_REG (`TO_OUT_BUF_REG(OUT_BUF))
        ) out_buf (
            .clk       (clk),
            .reset     (reset),
            .valid_in  (valid_in),
            .ready_in  (ready_in),
            .data_in   (data_in),
            .data_out  (data_out),
            .valid_out (valid_out),
            .ready_out (ready_out)
        );

        `UNUSED_VAR (sel_in)
        assign sel_out = 0;

    end

    // compute inputs collision
    // we have a collision when there exists a valid transfer with multiple input candidates
    // we count the unique duplicates each cycle.
    
    reg [NUM_INPUTS-1:0] per_cycle_collision, per_cycle_collision_r;
    wire [`CLOG2(NUM_INPUTS+1)-1:0] collision_count;
    reg [PERF_CTR_BITS-1:0] collisions_r;

    // Bit i is set when input i and some higher-indexed input (i+j) are both
    // valid, select the same output, and at least one of the two is being
    // accepted (ready) this cycle. Pairs are only examined once because j
    // starts at 1 and indexes upwards from i.
    always @(*) begin
        per_cycle_collision = 0;
        for (integer i = 0; i < NUM_INPUTS; ++i) begin
            for (integer j = 1; j < (NUM_INPUTS-i); ++j) begin
                per_cycle_collision[i] |= valid_in[i]
                                       && valid_in[j+i] 
                                       && (sel_in[i] == sel_in[j+i])
                                       && (ready_in[i] | ready_in[j+i]);
            end
        end
    end
    
    // Pass the collision flags through `BUFFER, then count the set bits with
    // `POP_COUNT (both macros are defined outside this module).
    `BUFFER(per_cycle_collision_r, per_cycle_collision);    
    `POP_COUNT(collision_count, per_cycle_collision_r);

    // Accumulate the per-cycle count; cleared on reset. The count is cast to
    // the counter width before the addition.
    always @(posedge clk) begin
        if (reset) begin
            collisions_r <= '0;
        end else begin
            collisions_r <= collisions_r + PERF_CTR_BITS'(collision_count);
        end
    end

    assign collisions = collisions_r;

endmodule
`TRACING_ON

上述代码中已经根据参数值标记了被选中的generate分支,可以看到这里例化的是VX_stream_arb模块。代码最后的per_cycle_collision用于统计冲突:在同一个周期内,若两个有效输入(valid_in)选择了同一个输出(sel_in相等),并且其中至少有一个被接收(ready_in为1),就记为一次冲突。单独标记一下:

                per_cycle_collision[i] |= valid_in[i]
                                       && valid_in[j+i] 
                                       && (sel_in[i] == sel_in[j+i])
                                       && (ready_in[i] | ready_in[j+i]);

另外BUFFER宏函数定义如下:

// BUFFER_EX(dst, src, ena, latency): instantiates a VX_pipe_register named
// __<dst>__ that carries src to dst. Both DATAW and RESETW are set to the bit
// width of dst, DEPTH is set to `latency`, and `ena` drives the enable pin.
// The macro relies on signals named clk and reset being in scope at the
// point of use.
`define BUFFER_EX(dst, src, ena, latency) \
    VX_pipe_register #( \
        .DATAW  ($bits(dst)), \
        .RESETW ($bits(dst)), \
        .DEPTH  (latency) \
    ) __``dst``__ ( \
        .clk      (clk), \
        .reset    (reset), \
        .enable   (ena), \
        .data_in  (src), \
        .data_out (dst) \
    )

// BUFFER(dst, src): BUFFER_EX with the enable tied high and a latency of 1.
`define BUFFER(dst, src) `BUFFER_EX(dst, src, 1'b1, 1)

直观来看,BUFFER(dst, src)就是一个使能恒为1、延迟为1拍(DEPTH=1)的流水线寄存器VX_pipe_register。
POP_COUNT宏函数定义如下:

// POP_COUNT_EX(out, in, model): instantiates a VX_popcount named __<out>__
// with N set to the bit width of `in` and MODEL set to `model`; `in` drives
// data_in and `out` receives data_out.
`define POP_COUNT_EX(out, in, model) \
    VX_popcount #( \
        .N ($bits(in)), \
        .MODEL (model) \
    ) __``out``__ ( \
        .data_in  (in), \
        .data_out (out) \
    )

// POP_COUNT(out, in): POP_COUNT_EX with MODEL fixed to 1.
`define POP_COUNT(out, in) `POP_COUNT_EX(out, in, 1)

其中VX_popcount的详细解释见后。

1.1.2 解释VX_stream_arb模块

VX_stream_arb的定义见于hw/rtl/libs/VX_stream_arb.sv,其代码如下:

`include "VX_platform.vh"

`TRACING_OFF
// VX_stream_arb: connects NUM_INPUTS valid/ready streams to NUM_OUTPUTS
// valid/ready streams.
//   - More inputs than outputs: inputs are arbitrated down onto the outputs.
//   - More outputs than inputs: each input is spread across several outputs.
//   - Equal counts: each input is buffered straight to the same-numbered output.
// The module instantiates itself recursively to split wide cases into batches;
// the leaves are a VX_generic_arbiter plus VX_elastic_buffer(s).
//
// NUM_REQS, LOG_NUM_REQS and NUM_REQS_W are derived from the other parameters
// through the `CDIV, `CLOG2 and `UP macros (defined outside this file).
// NOTE(review): `CDIV is presumably a ceiling division, making NUM_REQS the
// number of inputs per output - confirm against the macro definition.
module VX_stream_arb #(
    parameter NUM_INPUTS    = 1,
    parameter NUM_OUTPUTS   = 1,
    parameter DATAW         = 1,
    parameter `STRING ARBITER = "P",
    parameter MAX_FANOUT    = `MAX_FANOUT,
    parameter OUT_BUF       = 0 ,
    parameter NUM_REQS      = `CDIV(NUM_INPUTS, NUM_OUTPUTS),
    parameter LOG_NUM_REQS  = `CLOG2(NUM_REQS),
    parameter NUM_REQS_W    = `UP(LOG_NUM_REQS)
) (
    input  wire clk,
    input  wire reset,

    input  wire [NUM_INPUTS-1:0]             valid_in,
    input  wire [NUM_INPUTS-1:0][DATAW-1:0]  data_in,
    output wire [NUM_INPUTS-1:0]             ready_in,

    output wire [NUM_OUTPUTS-1:0]            valid_out,
    output wire [NUM_OUTPUTS-1:0][DATAW-1:0] data_out,
    // Per output: index of the input (within its group) that was selected.
    output wire [NUM_OUTPUTS-1:0][NUM_REQS_W-1:0] sel_out,
    input  wire [NUM_OUTPUTS-1:0]            ready_out
);
    if (NUM_INPUTS > NUM_OUTPUTS) begin

        if (NUM_OUTPUTS > 1) begin

            // (#inputs > #outputs) and (#outputs > 1)
            // Cut the inputs into one contiguous batch of up to NUM_REQS per
            // output and arbitrate each batch independently.
            
            for (genvar i = 0; i < NUM_OUTPUTS; ++i) begin

                // The last batch may be shorter: its end is clamped to NUM_INPUTS.
                localparam BATCH_BEGIN = i * NUM_REQS;
                localparam BATCH_END   = `MIN(BATCH_BEGIN + NUM_REQS, NUM_INPUTS);
                localparam BATCH_SIZE  = BATCH_END - BATCH_BEGIN;

                `RESET_RELAY (slice_reset, reset);

                VX_stream_arb #(
                    .NUM_INPUTS  (BATCH_SIZE),
                    .NUM_OUTPUTS (1),
                    .DATAW       (DATAW),
                    .ARBITER     (ARBITER),
                    .MAX_FANOUT  (MAX_FANOUT),
                    .OUT_BUF     (OUT_BUF)
                ) arb_slice (
                    .clk       (clk),
                    .reset     (slice_reset),
                    .valid_in  (valid_in[BATCH_END-1: BATCH_BEGIN]),
                    .ready_in  (ready_in[BATCH_END-1: BATCH_BEGIN]),
                    .data_in   (data_in[BATCH_END-1: BATCH_BEGIN]),
                    .data_out  (data_out[i]),
                    .sel_out   (sel_out[i]),
                    .valid_out (valid_out[i]),
                    .ready_out (ready_out[i])
                );
            end

        end else if (MAX_FANOUT != 0 && (NUM_INPUTS > MAX_FANOUT)) begin

            // (#inputs > max_fanout) and (#outputs == 1)
            // Two-level tree: first arbitrate within batches of at most
            // MAX_FANOUT inputs, then arbitrate among the batch winners.

            localparam NUM_BATCHES = `CDIV(NUM_INPUTS, MAX_FANOUT);
            localparam LOG_NUM_REQS2 = `CLOG2(MAX_FANOUT);
            localparam LOG_NUM_REQS3 = `CLOG2(NUM_BATCHES);

            // Batch winners; each carries its in-batch index alongside the data.
            wire [NUM_BATCHES-1:0]                  valid_tmp;
            wire [NUM_BATCHES-1:0][DATAW+LOG_NUM_REQS2-1:0] data_tmp;
            wire [NUM_BATCHES-1:0]                  ready_tmp;            
                        
            for (genvar i = 0; i < NUM_BATCHES; ++i) begin

                localparam BATCH_BEGIN = i * MAX_FANOUT;
                localparam BATCH_END   = `MIN(BATCH_BEGIN + MAX_FANOUT, NUM_INPUTS);
                localparam BATCH_SIZE  = BATCH_END - BATCH_BEGIN;

                wire [DATAW-1:0] data_tmp_u;
                wire [`LOG2UP(BATCH_SIZE)-1:0] sel_tmp_u;

                `RESET_RELAY (slice_reset, reset);

                // NOTE(review): there is no else branch, so when MAX_FANOUT == 1
                // nothing in this listing drives valid_tmp[i], data_tmp_u,
                // sel_tmp_u or the batch's ready_in - confirm that configuration
                // is never used.
                if (MAX_FANOUT != 1) begin
                    VX_stream_arb #(
                        .NUM_INPUTS  (BATCH_SIZE),
                        .NUM_OUTPUTS (1),   
                        .DATAW       (DATAW),
                        .ARBITER     (ARBITER),
                        .MAX_FANOUT  (MAX_FANOUT),
                        .OUT_BUF     (OUT_BUF)
                    ) fanout_slice_arb (
                        .clk       (clk),
                        .reset     (slice_reset),
                        .valid_in  (valid_in[BATCH_END-1: BATCH_BEGIN]),
                        .data_in   (data_in[BATCH_END-1: BATCH_BEGIN]),
                        .ready_in  (ready_in[BATCH_END-1: BATCH_BEGIN]),   
                        .valid_out (valid_tmp[i]),   
                        .data_out  (data_tmp_u),
                        .sel_out   (sel_tmp_u),
                        .ready_out (ready_tmp[i])
                    );
                end

                // Append the in-batch index (cast to LOG_NUM_REQS2 bits) below
                // the data so it travels through the second level.
                assign data_tmp[i] = {data_tmp_u, LOG_NUM_REQS2'(sel_tmp_u)};
            end

            wire [DATAW+LOG_NUM_REQS2-1:0] data_out_u;
            wire [LOG_NUM_REQS3-1:0] sel_out_u;

            // Second level: pick one batch winner.
            VX_stream_arb #(
                .NUM_INPUTS  (NUM_BATCHES),
                .NUM_OUTPUTS (1),   
                .DATAW       (DATAW + LOG_NUM_REQS2),
                .ARBITER     (ARBITER),
                .MAX_FANOUT  (MAX_FANOUT),
                .OUT_BUF     (OUT_BUF)
            ) fanout_join_arb (
                .clk       (clk),
                .reset     (reset),
                .valid_in  (valid_tmp),
                .ready_in  (ready_tmp),
                .data_in   (data_tmp),
                .data_out  (data_out_u),
                .sel_out   (sel_out_u),
                .valid_out (valid_out),
                .ready_out (ready_out)
            );

            // Split the payload back apart: upper bits are the data, and the
            // final selection is {batch index, index within the batch}.
            assign data_out = data_out_u[LOG_NUM_REQS2 +: DATAW];
            assign sel_out = {sel_out_u, data_out_u[0 +: LOG_NUM_REQS2]};

        end else begin

            // (#inputs <= max_fanout) and (#outputs == 1)
            // Leaf case: one arbiter chooses among the valid inputs and the
            // chosen data plus its index go through one elastic buffer.

            wire                    valid_in_r;
            wire [DATAW-1:0]        data_in_r;
            wire                    ready_in_r;
        
            wire                    arb_valid;
            wire [NUM_REQS_W-1:0]   arb_index;
            wire [NUM_REQS-1:0]     arb_onehot;
            wire                    arb_ready;

            VX_generic_arbiter #(
                .NUM_REQS    (NUM_REQS),
                .LOCK_ENABLE (1),
                .TYPE        (ARBITER)
            ) arbiter (
                .clk          (clk),
                .reset        (reset),
                .requests     (valid_in),
                .grant_valid  (arb_valid),
                .grant_index  (arb_index),
                .grant_onehot (arb_onehot),
                .grant_unlock (arb_ready)
            );

            // The grant becomes the buffer's input valid, the granted input's
            // data is muxed out by index, and the arbiter's grant_unlock is
            // driven by the buffer's ready.
            assign valid_in_r = arb_valid;
            assign data_in_r  = data_in[arb_index];
            assign arb_ready  = ready_in_r;

            // Only the granted input sees ready, and only while the buffer
            // can accept.
            for (genvar i = 0; i < NUM_REQS; ++i) begin
                assign ready_in[i] = ready_in_r & arb_onehot[i];
            end

            // NOTE(review): the buffer is sized with LOG_NUM_REQS while
            // arb_index is NUM_REQS_W bits wide; the two presumably coincide
            // whenever NUM_REQS > 1, which should hold in this branch - confirm.
            VX_elastic_buffer #(
                .DATAW   (LOG_NUM_REQS + DATAW),
                .SIZE    (`TO_OUT_BUF_SIZE(OUT_BUF)),
                .OUT_REG (`TO_OUT_BUF_REG(OUT_BUF))
            ) out_buf (
                .clk       (clk),
                .reset     (reset),
                .valid_in  (valid_in_r),
                .ready_in  (ready_in_r),
                .data_in   ({arb_index, data_in_r}),
                .data_out  ({sel_out, data_out}),
                .valid_out (valid_out),
                .ready_out (ready_out)
            );
        end

    end else if (NUM_OUTPUTS > NUM_INPUTS) begin

        if (NUM_INPUTS > 1) begin

            // (#inputs > 1) and (#outputs > #inputs)
            // Give each input its own contiguous batch of up to NUM_REQS outputs.

            for (genvar i = 0; i < NUM_INPUTS; ++i) begin

                localparam BATCH_BEGIN = i * NUM_REQS;
                localparam BATCH_END   = `MIN(BATCH_BEGIN + NUM_REQS, NUM_OUTPUTS);
                localparam BATCH_SIZE  = BATCH_END - BATCH_BEGIN;

                `RESET_RELAY (slice_reset, reset);

                VX_stream_arb #(
                    .NUM_INPUTS  (1),
                    .NUM_OUTPUTS (BATCH_SIZE),
                    .DATAW       (DATAW),
                    .ARBITER     (ARBITER),
                    .MAX_FANOUT  (MAX_FANOUT),
                    .OUT_BUF     (OUT_BUF)
                ) arb_slice (
                    .clk       (clk),
                    .reset     (slice_reset),
                    .valid_in  (valid_in[i]),
                    .ready_in  (ready_in[i]),
                    .data_in   (data_in[i]),
                    .data_out  (data_out[BATCH_END-1: BATCH_BEGIN]),
                    .valid_out (valid_out[BATCH_END-1: BATCH_BEGIN]),
                    .ready_out (ready_out[BATCH_END-1: BATCH_BEGIN]),
                    `UNUSED_PIN (sel_out)
                );

                // Every output in the batch reports input i as its source.
                // NOTE(review): sel_out is also assigned 0 for the whole
                // (#outputs > #inputs) branch further down, so in this
                // sub-branch it appears to have two drivers - confirm.
                for (genvar j = BATCH_BEGIN; j < BATCH_END; ++j) begin
                    assign sel_out[j] = i;
                end
            end

        end else if (MAX_FANOUT != 0 && (NUM_OUTPUTS > MAX_FANOUT)) begin

            // (#inputs == 1) and (#outputs > max_fanout)
            // Two-level tree: fork the input to NUM_BATCHES intermediate
            // streams, then fork each of those to at most MAX_FANOUT outputs.

            localparam NUM_BATCHES = `CDIV(NUM_OUTPUTS, MAX_FANOUT);

            wire [NUM_BATCHES-1:0]            valid_tmp;
            wire [NUM_BATCHES-1:0][DATAW-1:0] data_tmp;
            wire [NUM_BATCHES-1:0]            ready_tmp;

            VX_stream_arb #(
                .NUM_INPUTS  (1),
                .NUM_OUTPUTS (NUM_BATCHES),
                .DATAW       (DATAW),
                .ARBITER     (ARBITER),
                .MAX_FANOUT  (MAX_FANOUT),
                .OUT_BUF     (OUT_BUF)
            ) fanout_fork_arb (
                .clk       (clk),
                .reset     (reset),
                .valid_in  (valid_in),
                .ready_in  (ready_in),
                .data_in   (data_in),               
                .data_out  (data_tmp),
                .valid_out (valid_tmp),
                .ready_out (ready_tmp),
                `UNUSED_PIN (sel_out)
            );
            
            for (genvar i = 0; i < NUM_BATCHES; ++i) begin

                localparam BATCH_BEGIN = i * MAX_FANOUT;
                localparam BATCH_END   = `MIN(BATCH_BEGIN + MAX_FANOUT, NUM_OUTPUTS);
                localparam BATCH_SIZE  = BATCH_END - BATCH_BEGIN;

                `RESET_RELAY (slice_reset, reset);

                VX_stream_arb #(
                    .NUM_INPUTS  (1),
                    .NUM_OUTPUTS (BATCH_SIZE), 
                    .DATAW       (DATAW),
                    .ARBITER     (ARBITER),
                    .MAX_FANOUT  (MAX_FANOUT),
                    .OUT_BUF     (OUT_BUF)
                ) fanout_slice_arb (
                    .clk       (clk),
                    .reset     (slice_reset),
                    .valid_in  (valid_tmp[i]),
                    .ready_in  (ready_tmp[i]),
                    .data_in   (data_tmp[i]),
                    .data_out  (data_out[BATCH_END-1: BATCH_BEGIN]),
                    .valid_out (valid_out[BATCH_END-1: BATCH_BEGIN]),
                    .ready_out (ready_out[BATCH_END-1: BATCH_BEGIN]),
                    `UNUSED_PIN (sel_out)
                );
            end

        end else begin

            // (#inputs == 1) and (#outputs <= max_fanout)
            // Leaf case: the arbiter chooses among the output buffers that
            // are ready, and the single input is steered to the chosen one.

            wire [NUM_OUTPUTS-1:0]  ready_in_r;        
        
            wire [NUM_OUTPUTS-1:0]  arb_requests;
            wire                    arb_valid;
            wire [NUM_OUTPUTS-1:0]  arb_onehot;
            wire                    arb_ready;

            VX_generic_arbiter #(
                .NUM_REQS    (NUM_OUTPUTS),
                .LOCK_ENABLE (1),
                .TYPE        (ARBITER)
            ) arbiter (
                .clk          (clk),
                .reset        (reset),
                .requests     (arb_requests),
                .grant_valid  (arb_valid),
                `UNUSED_PIN (grant_index),
                .grant_onehot (arb_onehot),
                .grant_unlock (arb_ready)
            );

            // Here the roles are reversed relative to the merge case: ready
            // buffers are the requesters, the input's valid drives the
            // arbiter's grant_unlock, and the input is ready whenever a grant
            // exists.
            assign arb_requests = ready_in_r;
            assign arb_ready    = valid_in[0];
            assign ready_in     = arb_valid;

            // Only the granted buffer sees the input's valid; the data is
            // presented to all of them.
            for (genvar i = 0; i < NUM_OUTPUTS; ++i) begin
                VX_elastic_buffer #(
                    .DATAW    (DATAW),
                    .SIZE     (`TO_OUT_BUF_SIZE(OUT_BUF)),
                    .OUT_REG  (`TO_OUT_BUF_REG(OUT_BUF))
                ) out_buf (
                    .clk       (clk),
                    .reset     (reset),
                    .valid_in  (valid_in && arb_onehot[i]),
                    .ready_in  (ready_in_r[i]),
                    .data_in   (data_in),
                    .data_out  (data_out[i]),
                    .valid_out (valid_out[i]),
                    .ready_out (ready_out[i])
                );
            end
        end

        // Applies to all three (#outputs > #inputs) sub-branches above.
        assign sel_out = 0;
    
    end else begin

        // #Inputs == #Outputs
        // One elastic buffer per lane; lane i always comes from input i.

        for (genvar i = 0; i < NUM_OUTPUTS; ++i) begin

            // The third macro argument is (NUM_OUTPUTS > 1).
            // NOTE(review): presumably this enables the reset relay only when
            // there is more than one lane - confirm against the macro.
            `RESET_RELAY_EN (out_buf_reset, reset, (NUM_OUTPUTS > 1));

            VX_elastic_buffer #(
                .DATAW   (DATAW),
                .SIZE    (`TO_OUT_BUF_SIZE(OUT_BUF)),
                .OUT_REG (`TO_OUT_BUF_REG(OUT_BUF))
            ) out_buf (
                .clk       (clk),
                .reset     (out_buf_reset),
                .valid_in  (valid_in[i]),
                .ready_in  (ready_in[i]),
                .data_in   (data_in[i]),
                .data_out  (data_out[i]),
                .valid_out (valid_out[i]),
                .ready_out (ready_out[i])
            );
            assign sel_out[i] = NUM_REQS_W'(i);
        end
    end
    
endmodule
`TRACING_ON

根据前述例化时的parameter参数:

            VX_stream_arb #(
                .NUM_INPUTS  (NUM_INPUTS),  // 4
                .NUM_OUTPUTS (1),           // 1
                .DATAW       (DATAW),       // 68
                .ARBITER     (ARBITER),     // `P`
                .MAX_FANOUT  (MAX_FANOUT),  // 20
                .OUT_BUF     (OUT_BUF)      // 0
            )

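由于NUM_INPUTS(4)大于NUM_OUTPUTS(1)、NUM_OUTPUTS不大于1,且NUM_INPUTS没有超过MAX_FANOUT,因此generate最终选中的是注释为“(#inputs <= max_fanout) and (#outputs == 1)”的分支,其代码是: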

            wire                    valid_in_r;
            wire [DATAW-1:0]        data_in_r;
            wire                    ready_in_r;
        
            wire                    arb_valid;
            wire [NUM_REQS_W-1:0]   arb_index;
            wire [NUM_REQS-1:0]     arb_onehot;
            wire                    arb_ready;

            VX_generic_arbiter #(
                .NUM_REQS    (NUM_REQS),
                .LOCK_ENABLE (1),
                .TYPE        (ARBITER)
            ) arbiter (
                .clk          (clk),
                .reset        (reset),
                .requests     (valid_in),
                .grant_valid  (arb_valid),
                .grant_index  (arb_index),
                .grant_onehot (arb_onehot),
                .grant_unlock (arb_ready)
            );

            assign valid_in_r = arb_valid;
            assign data_in_r  = data_in[arb_index];
            assign arb_ready  = ready_in_r;

            for (genvar i = 0; i < NUM_REQS; ++i) begin
                assign ready_in[i] = ready_in_r & arb_onehot[i];
            end

            VX_elastic_buffer #(
                .DATAW   (LOG_NUM_REQS + DATAW),
                .SIZE    (`TO_OUT_BUF_SIZE(OUT_BUF)),
                .OUT_REG (`TO_OUT_BUF_REG(OUT_BUF))
            ) out_buf (
                .clk       (clk),
                .reset     (reset),
                .valid_in  (valid_in_r),
                .ready_in  (ready_in_r),
                .data_in   ({arb_index, data_in_r}),
                .data_out  ({sel_out, data_out}),
                .valid_out (valid_out),
                .ready_out (ready_out)
            );
        end

1.1.3 解释VX_generic_arbiter模块

VX_generic_arbiter的定义见于hw/rtl/libs/VX_generic_arbiter.sv,其代码如下:

`include "VX_platform.vh"

`TRACING_OFF
// VX_generic_arbiter
//
// Thin dispatcher that elaborates exactly one arbiter implementation,
// chosen at compile time by the TYPE string:
//   "P" -> VX_priority_arbiter (combinational; clk/reset/unlock unused)
//   "R" -> VX_rr_arbiter
//   "F" -> VX_fair_arbiter
//   "M" -> VX_matrix_arbiter
//   "C" -> VX_cyclic_arbiter
// Any other value triggers an elaboration-time `ERROR.
//
// Parameters:
//   NUM_REQS     - number of request lines
//   LOCK_ENABLE  - forwarded to the clocked arbiters; ignored for "P"
//   TYPE         - implementation selector (see above)
//   LOG_NUM_REQS - width of grant_index
//
// Ports:
//   requests     - one bit per requester
//   grant_index  - binary index of the granted requester
//   grant_onehot - one-hot mask of the granted requester
//   grant_valid  - a grant is present
//   grant_unlock - forwarded to the clocked arbiters; ignored for "P"
module VX_generic_arbiter #(
    parameter NUM_REQS     = 1,
    parameter LOCK_ENABLE  = 0,
    parameter `STRING TYPE  = "P",
    parameter LOG_NUM_REQS = `LOG2UP(NUM_REQS)
) (
    input  wire                     clk,
    input  wire                     reset,    
    input  wire [NUM_REQS-1:0]      requests,
    output wire [LOG_NUM_REQS-1:0]  grant_index,
    output wire [NUM_REQS-1:0]      grant_onehot,   
    output wire                     grant_valid,
    input  wire                     grant_unlock
);
    // Connections below follow the port declaration order of this module.

    if (TYPE == "P") begin // this is the branch taken in the walkthrough

        // Fixed-priority arbitration is stateless, so the sequential
        // inputs and the lock option have no consumer here.
        `UNUSED_VAR (clk)
        `UNUSED_VAR (reset)
        `UNUSED_VAR (grant_unlock)
        `UNUSED_PARAM (LOCK_ENABLE)

        VX_priority_arbiter #(
            .NUM_REQS (NUM_REQS)
        ) priority_arbiter (
            .requests     (requests),
            .grant_index  (grant_index),
            .grant_onehot (grant_onehot),
            .grant_valid  (grant_valid)
        );

    end else if (TYPE == "R") begin

        VX_rr_arbiter #(
            .NUM_REQS    (NUM_REQS),
            .LOCK_ENABLE (LOCK_ENABLE)
        ) rr_arbiter (
            .clk          (clk),
            .reset        (reset),
            .requests     (requests),
            .grant_index  (grant_index),
            .grant_onehot (grant_onehot),
            .grant_valid  (grant_valid),
            .grant_unlock (grant_unlock)
        );

    end else if (TYPE == "F") begin

        VX_fair_arbiter #(
            .NUM_REQS    (NUM_REQS),
            .LOCK_ENABLE (LOCK_ENABLE)
        ) fair_arbiter (
            .clk          (clk),
            .reset        (reset),
            .requests     (requests),
            .grant_index  (grant_index),
            .grant_onehot (grant_onehot),
            .grant_valid  (grant_valid),
            .grant_unlock (grant_unlock)
        );

    end else if (TYPE == "M") begin

        VX_matrix_arbiter #(
            .NUM_REQS    (NUM_REQS),
            .LOCK_ENABLE (LOCK_ENABLE)
        ) matrix_arbiter (
            .clk          (clk),
            .reset        (reset),
            .requests     (requests),
            .grant_index  (grant_index),
            .grant_onehot (grant_onehot),
            .grant_valid  (grant_valid),
            .grant_unlock (grant_unlock)
        );

    end else if (TYPE == "C") begin

        VX_cyclic_arbiter #(
            .NUM_REQS    (NUM_REQS),
            .LOCK_ENABLE (LOCK_ENABLE)
        ) cyclic_arbiter (
            .clk          (clk),
            .reset        (reset),
            .requests     (requests),
            .grant_index  (grant_index),
            .grant_onehot (grant_onehot),
            .grant_valid  (grant_valid),
            .grant_unlock (grant_unlock)
        );

    end else begin

        // Unknown selector: fail loudly at elaboration.
        `ERROR(("invalid parameter"));

    end

endmodule
`TRACING_ON

整理上述提到的各种各样的arbiter

TYPE 选择的仲裁器 使用的参数
"P" VX_priority_arbiter requests, grant_valid, grant_index, grant_onehot
"R" VX_rr_arbiter clk, reset, requests, grant_valid, grant_index, grant_onehot, grant_unlock
"F" VX_fair_arbiter clk, reset, requests, grant_valid, grant_index, grant_onehot, grant_unlock
"M" VX_matrix_arbiter clk, reset, requests, grant_valid, grant_index, grant_onehot, grant_unlock
"C" VX_cyclic_arbiter clk, reset, requests, grant_valid, grant_index, grant_onehot, grant_unlock
其他 报告错误 -

在例化该模块时使用"P",当然还是都拿来分析一下!

1.1.4 解释VX_priority_arbiter模块

代码如下:

`include "VX_platform.vh"

`TRACING_OFF
// VX_priority_arbiter
//
// Stateless fixed-priority arbiter.
//   NUM_REQS == 1 : the single request is passed straight through.
//   NUM_REQS  > 1 : delegates to VX_priority_encoder (default parameters).
//
// Ports:
//   requests     - one bit per requester
//   grant_index  - binary index of the granted requester
//   grant_onehot - one-hot mask of the granted requester
//   grant_valid  - a grant is present
module VX_priority_arbiter #(
    parameter NUM_REQS     = 1,
    parameter LOG_NUM_REQS = `LOG2UP(NUM_REQS)
) (
    input  wire [NUM_REQS-1:0]      requests,
    output wire [LOG_NUM_REQS-1:0]  grant_index,
    output wire [NUM_REQS-1:0]      grant_onehot,
    output wire                     grant_valid
);
    if (NUM_REQS != 1) begin

        // General case: let the encoder produce index, one-hot and valid.
        VX_priority_encoder #(
            .N (NUM_REQS)
        ) priority_encoder (
            .data_in   (requests),
            .onehot    (grant_onehot),
            .index     (grant_index),
            .valid_out (grant_valid)
        );

    end else begin

        // Degenerate case: nothing to arbitrate between.
        assign grant_valid  = requests[0];
        assign grant_onehot = requests;
        assign grant_index  = '0;

    end

endmodule
`TRACING_ON

其中VX_priority_encoder定义如下:

`include "VX_platform.vh"

`TRACING_OFF
// VX_priority_encoder
//
// Finds the lowest-indexed set bit of the (optionally mirrored) input and
// reports it both as a one-hot mask and as a binary index.
//
// Parameters:
//   N       - input width
//   REVERSE - when non-zero, the input bit order is mirrored before encoding;
//             onehot/index then refer to positions in the mirrored vector
//             (nothing in this module mirrors them back)
//   MODEL   - implementation choice for N > 2 (1, 2, 3, anything else = loop)
//   LN      - width of index
//
// Ports:
//   data_in   - bits to encode
//   onehot    - one-hot mask of the selected bit
//   index     - binary index of the selected bit
//   valid_out - at least one input bit is set
//
// When no input bit is set, onehot is all-zero for N == 2 and MODEL 1/2/3;
// the fallback loop implementation leaves onehot/index as 'x in that case.
module VX_priority_encoder #( 
    parameter N       = 1,  
    parameter REVERSE = 0,
    parameter MODEL   = 1,
    parameter LN      = `LOG2UP(N)
) (
    input  wire [N-1:0]  data_in,  
    output wire [N-1:0]  onehot,
    output wire [LN-1:0] index,
    output wire          valid_out
);
    wire [N-1:0] reversed; 

    // Optional bit-order mirroring of the input.
    if (REVERSE != 0) begin
        for (genvar i = 0; i < N; ++i) begin
            assign reversed[N-i-1] = data_in[i];
        end        
    end else begin
        assign reversed = data_in;
    end

    if (N == 1) begin

        // Single bit: it is its own one-hot and its own valid.
        assign onehot    = reversed;
        assign index     = '0;
        assign valid_out = reversed;

    end else if (N == 2) begin

        // Bit 0 wins when set. Bit 1 is only reported when it is actually
        // set, so that onehot is all-zero (not 2'b10) when nothing is set,
        // matching the MODEL 1/2/3 implementations below.
        assign onehot    = {reversed[1] & ~reversed[0], reversed[0]};
        assign index     = ~reversed[0];
        assign valid_out = (| reversed);

    end else if (MODEL == 1) begin

        wire [N-1:0] scan_lo;

        // NOTE(review): assumes VX_scan with OP=2 is a prefix-OR running from
        // bit 0 upward, so scan_lo[i] = |reversed[i:0] -- confirm in VX_scan.
        VX_scan #(
            .N  (N),
            .OP (2)
        ) scan (
            .data_in  (reversed),
            .data_out (scan_lo)
        );

        // NOTE(review): VX_lzc with REVERSE=1 is expected to return the index
        // of the lowest set bit -- confirm in VX_lzc.
        VX_lzc #(
            .N       (N),
            .REVERSE (1)
        ) lzc (
            .data_in  (reversed),
            .data_out (index),
            `UNUSED_PIN (valid_out)
        );

        // A bit is selected where the scan is 1 and the scan bit below it is 0
        // (bit 0 has no predecessor, hence the constant 1).
        assign onehot    = scan_lo & {(~scan_lo[N-2:0]), 1'b1};
        assign valid_out = scan_lo[N-1];

    end else if (MODEL == 2) begin

    `IGNORE_WARNINGS_BEGIN
        wire [N-1:0] higher_pri_regs;
    `IGNORE_WARNINGS_END
        // Ripple chain: higher_pri_regs[i] is 1 when any bit below i is set.
        assign higher_pri_regs[N-1:1] = higher_pri_regs[N-2:0] | reversed[N-2:0];
        assign higher_pri_regs[0]     = 1'b0;
        // Keep a bit only if no lower-indexed bit is set.
        assign onehot[N-1:0] = reversed[N-1:0] & ~higher_pri_regs[N-1:0];

        VX_lzc #(
            .N       (N),
            .REVERSE (1)
        ) lzc (
            .data_in   (reversed),
            .data_out  (index),
            .valid_out (valid_out)
        );

    end else if (MODEL == 3) begin

        // x & -x isolates the lowest set bit (two's-complement identity).
        assign onehot = reversed & -reversed;

        VX_lzc #(
            .N       (N),
            .REVERSE (1)
        ) lzc (
            .data_in   (reversed),
            .data_out  (index),
            .valid_out (valid_out)
        );

    end else begin

        reg [LN-1:0] index_r;
        reg [N-1:0]  onehot_r;

        // Behavioural fallback: iterate from the top bit down so that the
        // last assignment -- the lowest set bit -- wins. Outputs stay 'x
        // when no bit is set.
        always @(*) begin
            index_r  = 'x;
            onehot_r = 'x;
            for (integer i = N-1; i >= 0; --i) begin
                if (reversed[i]) begin
                    index_r     = LN'(i);
                    onehot_r    = '0;
                    onehot_r[i] = 1'b1;
                end
            end
        end        

        assign index  = index_r;
        assign onehot = onehot_r;
        assign valid_out = (| reversed);

    end    

endmodule
`TRACING_ON

其中VX_lzc模块的功能是用于计算数据中的前导零或尾随零的数量,取决于REVERSE参数。

N REVERSE 操作 结果
1 任意 data_out 设为 0,valid_out 直接赋值为 data_in data_out = 0, valid_out = data_in
>1 0 计算前导零,通过 VX_find_first 模块计算 data_out = 计算结果, valid_out = 有效性
>1 1 计算尾随零,通过 VX_find_first 模块计算 data_out = 计算结果, valid_out = 有效性

其中VX_scan模块的功能是用于进行扫描操作(例如前缀和、前缀逻辑运算等),并支持不同的运算操作和方向。

参数 操作类型 描述
N 任意 数据位宽
OP 0 XOR 扫描
1 AND 扫描(特殊情况优化:2, 3, 4 位)
2 OR 扫描
REVERSE 0 从低位到高位扫描
1 从高位到低位扫描

综上分析可以得到VX_priority_arbiter模块的功能:当NUM_REQS为1时,直接输出请求结果;否则,调用VX_priority_encoder模块进行优先级编码,输出授权索引(grant_index)、独热编码(grant_onehot)和有效信号(grant_valid)。

1.1.5 解释VX_rr_arbiter模块

代码不贴了,太长了。
该模块实现了一个轮询(round-robin)仲裁器,用于从多个请求中轮流选择一个。该模块根据请求的数量 (NUM_REQS) 和是否启用锁定 (LOCK_ENABLE) 来调整其行为。
该模块用到VX_onehot_encoder模块,其功能是将one-hot(独热)编码的输入转换为其对应的二进制索引。根据N的值和MODEL的设置,模块的实现方式有所不同:

条件 说明
N == 1 直接将 data_in 赋值给 data_out,valid_out 与 data_in 相同。
N == 2 选择 data_in 的一个比特作为 data_out,valid_out 表示 data_in 中是否有有效位。
MODEL == 1 使用分层地址生成方法,将 one-hot 编码转换为二进制索引。处理非2的幂次的输入。
MODEL == 2 且 REVERSE == 0 使用逐位掩码方法找到设置位的索引。
其他情况 使用循环检测输入位并确定索引,支持 REVERSE 参数来决定索引计算的方向。

1.1.6 解释VX_fair_arbiter模块

VX_fair_arbiter模块是一个公平调度器,用于在多个请求之间进行优先级调度。模块的行为会根据参数和输入信号的不同而有所变化。

条件 功能说明
NUM_REQS == 1 当请求数量为1时,模块直接将请求信号 requests 赋值给 grant_onehot,并且 grant_valid 与 requests[0] 相同。grant_index 始终为0。
NUM_REQS > 1 当请求数量大于1时,模块使用一个内部缓冲寄存器 buffer 和请求信号 requests 来管理请求。具体行为如下:
- buffer 用于存储当前被授予的请求状态。
- buffer_qual 是 buffer 与 requests 的按位与操作结果,用于限定已激活的请求。
- requests_qual 是当 buffer 不全为0时,用 buffer_qual 替代 requests,否则直接使用 requests。
- buffer_n 是未授予的请求信号,用于更新 buffer
- 在时钟上升沿,若 reset 信号有效,则 buffer 被重置为0;若 LOCK_ENABLE 关闭或 grant_unlock 为高,则 buffer 被更新为 buffer_n
- 调用 VX_priority_arbiter 模块进行优先级调度,得到 grant_index、grant_onehot 和 grant_valid。

1.1.7 解释VX_matrix_arbiter模块

VX_matrix_arbiter模块实现了一种基于矩阵的优先级仲裁算法,用于在多个请求中选择一个有效的请求。

条件 功能说明
NUM_REQS == 1 当只有一个请求时,模块直接将 requests 赋值给 grant_onehot,并且 grant_valid 与 requests[0] 相同。grant_index 始终为0。
NUM_REQS > 1 当请求数量大于1时,模块使用矩阵状态和优先级判断来选择请求。具体行为如下:
- 状态矩阵: state[i][j] 用于跟踪请求的优先级。
- 优先级矩阵: pri[j][i] 根据 requests 和 state 判断请求 i 是否优先于请求 j。
- 未授权请求: grant_unqual[i] 表示请求 i 是否被授权。
- 状态更新: state 在每个时钟周期内根据 grant_unqual 更新,用于记录请求的状态。
- 锁定机制: 如果 LOCK_ENABLE 为0,则 grant_onehot 直接赋值为 grant_unqual;否则,使用 grant_unlock 信号来决定是否更新 grant_onehot
- 优先级编码: 使用 VX_onehot_encoder 模块将 grant_unqual 编码为 grant_index
- 有效信号: grant_valid 取决于 requests 是否有有效请求。

VX_matrix_arbiter 模块通过矩阵状态和优先级算法,实现了在多个请求之间的公平且有序的调度。

1.1.8 解释VX_cyclic_arbiter模块

VX_cyclic_arbiter模块实现了一个循环优先级仲裁器,用于在多个请求中周期性地选择一个有效的请求。

条件 功能说明
NUM_REQS == 1 当只有一个请求时,模块直接将 requests 赋值给 grant_onehot,并且 grant_valid 与 requests[0] 相同。grant_index 始终为0。
NUM_REQS > 1 当请求数量大于1时,模块执行循环仲裁,具体行为如下:
- 计算是否为2的幂: IS_POW2 用于确定请求数量是否为2的幂,这影响仲裁的循环行为。
- 轮询索引更新: grant_index_r 是当前的仲裁索引,按时钟周期递增,循环回到0。
- 索引更新规则: 在非2的幂请求数时,当索引达到最大值时会重置为0。否则,在仲裁器不锁定或 grant_unlock 信号激活时,索引递增。
- 输出编码: grant_onehot_r 根据 grant_index_r 生成一位热编码的授权信号。
- 输出赋值: grant_index 赋值为当前的仲裁索引,grant_onehot 赋值为一位热编码,grant_valid 表示当前索引处的请求是否有效。

1.1.9 解释VX_popcount模块

VX_popcount的定义见于/hw/rtl/libs/VX_popcount.sv,其代码如下:

`include "VX_platform.vh"

`TRACING_OFF
// VX_popcount63
//
// Lookup-table population count: 6-bit input -> 3-bit count of set bits.
// All 64 input values are enumerated; there is intentionally no default item.
module VX_popcount63(
    input  wire [5:0] data_in,
    output wire [2:0] data_out
);
    reg [2:0] ones;

    // Case items are grouped by result: each line lists every input value
    // that contains exactly that many ones.
    always @(*) begin
        case (data_in)
        6'd0:
            ones = 3'd0;
        6'd1, 6'd2, 6'd4, 6'd8, 6'd16, 6'd32:
            ones = 3'd1;
        6'd3,  6'd5,  6'd6,  6'd9,  6'd10, 6'd12, 6'd17, 6'd18,
        6'd20, 6'd24, 6'd33, 6'd34, 6'd36, 6'd40, 6'd48:
            ones = 3'd2;
        6'd7,  6'd11, 6'd13, 6'd14, 6'd19, 6'd21, 6'd22, 6'd25, 6'd26, 6'd28,
        6'd35, 6'd37, 6'd38, 6'd41, 6'd42, 6'd44, 6'd49, 6'd50, 6'd52, 6'd56:
            ones = 3'd3;
        6'd15, 6'd23, 6'd27, 6'd29, 6'd30, 6'd39, 6'd43, 6'd45,
        6'd46, 6'd51, 6'd53, 6'd54, 6'd57, 6'd58, 6'd60:
            ones = 3'd4;
        6'd31, 6'd47, 6'd55, 6'd59, 6'd61, 6'd62:
            ones = 3'd5;
        6'd63:
            ones = 3'd6;
        endcase
    end

    assign data_out = ones;
endmodule

// VX_popcount32
//
// Lookup-table population count: 3-bit input -> 2-bit count of set bits.
// All 8 input values are enumerated; there is intentionally no default item.
module VX_popcount32(
    input  wire [2:0] data_in,
    output wire [1:0] data_out
);
    reg [1:0] ones;

    // Case items are grouped by the number of set bits in the input.
    always @(*) begin
        case (data_in)
        3'd0:             ones = 2'd0;
        3'd1, 3'd2, 3'd4: ones = 2'd1;
        3'd3, 3'd5, 3'd6: ones = 2'd2;
        3'd7:             ones = 2'd3;
        endcase
    end

    assign data_out = ones;
endmodule

// VX_sum33
//
// Lookup-table adder: two 3-bit operands -> 4-bit sum (0..14).
// The table is indexed by {data_in1, data_in2}; all 64 combinations are
// enumerated and there is intentionally no default item.
module VX_sum33(
    input  wire [2:0] data_in1,
    input  wire [2:0] data_in2,
    output wire [3:0] data_out
);
    reg [3:0] total;

    // Case items are grouped by result: each group lists every packed
    // operand pair (data_in1 * 8 + data_in2) that adds up to that value.
    always @(*) begin
        case ({data_in1, data_in2})
        6'd0:
            total = 4'd0;
        6'd1,  6'd8:
            total = 4'd1;
        6'd2,  6'd9,  6'd16:
            total = 4'd2;
        6'd3,  6'd10, 6'd17, 6'd24:
            total = 4'd3;
        6'd4,  6'd11, 6'd18, 6'd25, 6'd32:
            total = 4'd4;
        6'd5,  6'd12, 6'd19, 6'd26, 6'd33, 6'd40:
            total = 4'd5;
        6'd6,  6'd13, 6'd20, 6'd27, 6'd34, 6'd41, 6'd48:
            total = 4'd6;
        6'd7,  6'd14, 6'd21, 6'd28, 6'd35, 6'd42, 6'd49, 6'd56:
            total = 4'd7;
        6'd15, 6'd22, 6'd29, 6'd36, 6'd43, 6'd50, 6'd57:
            total = 4'd8;
        6'd23, 6'd30, 6'd37, 6'd44, 6'd51, 6'd58:
            total = 4'd9;
        6'd31, 6'd38, 6'd45, 6'd52, 6'd59:
            total = 4'd10;
        6'd39, 6'd46, 6'd53, 6'd60:
            total = 4'd11;
        6'd47, 6'd54, 6'd61:
            total = 4'd12;
        6'd55, 6'd62:
            total = 4'd13;
        6'd63:
            total = 4'd14;
        endcase
    end

    assign data_out = total;
endmodule

// VX_popcount
//
// Parameterized population count: data_out is the number of set bits in
// data_in.
//
// Parameters:
//   MODEL - selects the implementation for N > 18 in the generic synthesis
//           path (1 = pairwise adder tree, otherwise a sequential-style loop)
//   N     - input width
//   M     - output width, CLOG2(N+1) by default
//
// Implementation selection:
//   - not SYNTHESIS, or QUARTUS : $countones
//   - otherwise, by N           : wire / LUT modules (VX_popcount32,
//                                 VX_popcount63, VX_sum33) / adder tree / loop
module VX_popcount #(
    parameter MODEL = 1,
    parameter N     = 1,
    parameter M     = `CLOG2(N+1)  
) (
    input  wire [N-1:0] data_in,
    output wire [M-1:0] data_out
);
    `UNUSED_PARAM (MODEL)    

`ifndef SYNTHESIS
    // Simulation: use the built-in system function.
    assign data_out = $countones(data_in);
`elsif QUARTUS
    // Quartus synthesis: also uses the built-in.
    assign data_out = $countones(data_in);
`else
    if (N == 1) begin

        // One bit: the count is the bit itself.
        assign data_out = data_in;

    end else if (N <= 3) begin

        // Zero-pad to 3 bits and use the 3-bit lookup table.
        reg [2:0] t_in;
        wire [1:0] t_out;
        always @(*) begin
            t_in = '0;
            t_in[N-1:0] = data_in;
        end
        VX_popcount32 pc32(t_in, t_out);
        assign data_out = t_out[M-1:0];    
    
    end else if (N <= 6) begin
    
        // Zero-pad to 6 bits and use the 6-bit lookup table.
        reg [5:0] t_in;
        wire [2:0] t_out;
        always @(*) begin
            t_in = '0;
            t_in[N-1:0] = data_in;
        end
        VX_popcount63 pc63(t_in, t_out);
        assign data_out = t_out[M-1:0];
    
    end else if (N <= 9) begin
    
        // Zero-pad to 9 bits; count the low 6 and high 3 bits separately,
        // then add the two partial counts with the 3+3 lookup adder
        // (the 2-bit count is zero-extended to 3 bits).
        reg [8:0] t_in;
        wire [4:0] t1_out;
        wire [3:0] t2_out;
        always @(*) begin
            t_in = '0;
            t_in[N-1:0] = data_in;
        end
        VX_popcount63 pc63(t_in[5:0], t1_out[2:0]);
        VX_popcount32 pc32(t_in[8:6], t1_out[4:3]);
        VX_sum33 sum33(t1_out[2:0], {1'b0, t1_out[4:3]}, t2_out);
        assign data_out = t2_out[M-1:0];

    end else if (N <= 12) begin
    
        // Zero-pad to 12 bits; two 6-bit counts added by the lookup adder.
        reg [11:0] t_in;
        wire [5:0] t1_out;
        wire [3:0] t2_out;
        always @(*) begin
            t_in = '0;
            t_in[N-1:0] = data_in;
        end
        VX_popcount63 pc63a(t_in[5:0],  t1_out[2:0]);
        VX_popcount63 pc63b(t_in[11:6], t1_out[5:3]);
        VX_sum33 sum33(t1_out[2:0], t1_out[5:3], t2_out);
        assign data_out = t2_out[M-1:0];

    end else if (N <= 18) begin
    
        // Zero-pad to 18 bits and take three 6-bit counts (t1_out holds
        // three 3-bit values). The three counts are then summed column-wise:
        // pc32a/b/c count the ones in bit 0 / bit 1 / bit 2 of the three
        // partial counts, and the final expression weights those column
        // counts by 1, 2 and 4 respectively.
        reg [17:0] t_in;
        wire [8:0] t1_out;
        wire [5:0] t2_out;
        always @(*) begin
            t_in = '0;
            t_in[N-1:0] = data_in;
        end
        VX_popcount63 pc63a(t_in[5:0],   t1_out[2:0]);
        VX_popcount63 pc63b(t_in[11:6],  t1_out[5:3]);
        VX_popcount63 pc63c(t_in[17:12], t1_out[8:6]);
        VX_popcount32 pc32a({t1_out[0], t1_out[3], t1_out[6]}, t2_out[1:0]);
        VX_popcount32 pc32b({t1_out[1], t1_out[4], t1_out[7]}, t2_out[3:2]);
        VX_popcount32 pc32c({t1_out[2], t1_out[5], t1_out[8]}, t2_out[5:4]);
        assign data_out = {2'b0,t2_out[1:0]} + {1'b0,t2_out[3:2],1'b0} + {t2_out[5:4],2'b0};

    end else if (MODEL == 1) begin

        // Pairwise adder tree over PN leaves, where PN is N rounded up to a
        // power of two. Level j has PN >> (j+1) nodes; tmp[j][i] is the sum
        // held by node i of level j.
        localparam PN = 1 << `CLOG2(N);
        localparam LOGPN = `CLOG2(PN);

    `IGNORE_UNOPTFLAT_BEGIN
        wire [M-1:0] tmp [LOGPN-1:0][PN-1:0];    
    `IGNORE_UNOPTFLAT_END

        for (genvar j = 0; j < LOGPN; ++j) begin
            // D: bits needed by this level's operands; Q: bits of this level's
            // result (the last level is sized to the output width M).
            localparam D = j + 1;
            localparam Q = (D < LOGPN) ? (D + 1) : M;
            for (genvar i = 0; i < (1 << (LOGPN-j-1)); ++i) begin                
                // Indices of the two children of node i in the level below.
                localparam l = i * 2;
                localparam r = i * 2 + 1;
                wire [Q-1:0] res;                
                if (j == 0) begin                    
                    // Leaf level reads data_in directly; positions at or
                    // beyond N contribute zero.
                    if (r < N) begin
                        assign res = data_in[l] + data_in[r];
                    end else if (l < N) begin
                        assign res = 2'(data_in[l]);
                    end else begin
                        assign res = 2'b0;
                    end
                end else begin
                    // Inner level: add the two child sums, each cast to D bits.
                    assign res = D'(tmp[j-1][l]) + D'(tmp[j-1][r]);
                end
                assign tmp[j][i] = M'(res);
            end
        end

        // The root of the tree is the total count.
        assign data_out = tmp[LOGPN-1][0];
    
    end else begin

        // Fallback: accumulate one bit at a time.
        reg [M-1:0] cnt_r;

        always @(*) begin
            cnt_r = '0;
            for (integer i = 0; i < N; ++i) begin
                cnt_r = cnt_r + M'(data_in[i]);
            end
        end

        assign data_out = cnt_r;
    
    end
`endif

endmodule
`TRACING_ON

分别解释:

1、VX_popcount63计算6位输入数据中1的个数。
2、VX_popcount32计算3位输入数据中1的个数。
3、VX_sum33计算两个3位数据的和。
4、VX_popcount计算任意宽度的输入数据中1的个数,这是一个参数化模块,能够处理不同位宽的输入数据。根据N的值选择合适的实现方式(如使用VX_popcount32、VX_popcount63 或更复杂的组合),最终输出data_out为M位,表示输入数据中1的个数。

1.2 bank access

    // Banks access
    // One VX_cache_bank instance per bank. Each bank is wired to its own
    // slice of the per_bank_* request/response buses; the memory response
    // channel (mem_rsp_*_s) is shared and is steered to a bank by its tag.
    for (genvar i = 0; i < NUM_BANKS; ++i) begin
        // Line address emitted by this bank, before conversion to a memory address.
        wire [`CS_LINE_ADDR_WIDTH-1:0] curr_bank_mem_req_addr;
        // Memory response valid as seen by this bank.
        wire curr_bank_mem_rsp_valid;

        if (NUM_BANKS == 1) begin
            // Single bank: every memory response belongs to it.
            assign curr_bank_mem_rsp_valid = mem_rsp_valid_s;
        end else begin
            // Multiple banks: accept the response only when the bank id
            // extracted from the response tag matches this bank.
            assign curr_bank_mem_rsp_valid = mem_rsp_valid_s && (`CS_MEM_TAG_TO_BANK_ID(mem_rsp_tag_s) == i);
        end

        // Per-bank reset derived from the module reset via the RESET_RELAY macro.
        `RESET_RELAY (bank_reset, reset);
        
        // The bank's output buffers are forced to 0 when the corresponding
        // *_REQ_BUF_ENABLE parameter is set; otherwise the configured
        // CORE_OUT_BUF / MEM_OUT_BUF value is passed through.
        VX_cache_bank #(                
            .BANK_ID      (i),
            .INSTANCE_ID  (INSTANCE_ID),
            .CACHE_SIZE   (CACHE_SIZE),
            .LINE_SIZE    (LINE_SIZE),
            .NUM_BANKS    (NUM_BANKS),
            .NUM_WAYS     (NUM_WAYS),
            .WORD_SIZE    (WORD_SIZE),
            .NUM_REQS     (NUM_REQS),
            .CRSQ_SIZE    (CRSQ_SIZE),
            .MSHR_SIZE    (MSHR_SIZE),
            .MREQ_SIZE    (MREQ_SIZE),
            .WRITE_ENABLE (WRITE_ENABLE),
            .UUID_WIDTH   (UUID_WIDTH),
            .TAG_WIDTH    (TAG_WIDTH),
            .CORE_OUT_BUF (CORE_REQ_BUF_ENABLE ? 0 : CORE_OUT_BUF),
            .MEM_OUT_BUF  (MEM_REQ_BUF_ENABLE ? 0 : MEM_OUT_BUF)
        ) bank (          
            .clk                (clk),
            .reset              (bank_reset),

                    
            // Core request
            .core_req_valid     (per_bank_core_req_valid[i]),
            .core_req_addr      (per_bank_core_req_addr[i]),
            .core_req_rw        (per_bank_core_req_rw[i]),
            .core_req_wsel      (per_bank_core_req_wsel[i]),
            .core_req_byteen    (per_bank_core_req_byteen[i]),
            .core_req_data      (per_bank_core_req_data[i]),
            .core_req_tag       (per_bank_core_req_tag[i]),
            .core_req_idx       (per_bank_core_req_idx[i]),
            .core_req_ready     (per_bank_core_req_ready[i]),

            // Core response                
            .core_rsp_valid     (per_bank_core_rsp_valid[i]),
            .core_rsp_data      (per_bank_core_rsp_data[i]),
            .core_rsp_tag       (per_bank_core_rsp_tag[i]),
            .core_rsp_idx       (per_bank_core_rsp_idx[i]),
            .core_rsp_ready     (per_bank_core_rsp_ready[i]),

            // Memory request
            .mem_req_valid      (per_bank_mem_req_valid[i]),
            .mem_req_addr       (curr_bank_mem_req_addr),
            .mem_req_rw         (per_bank_mem_req_rw[i]),
            .mem_req_wsel       (per_bank_mem_req_wsel[i]),
            .mem_req_byteen     (per_bank_mem_req_byteen[i]),
            .mem_req_data       (per_bank_mem_req_data[i]),
            .mem_req_id         (per_bank_mem_req_id[i]),
            .mem_req_ready      (per_bank_mem_req_ready[i]),

            // Memory response (data and request id are shared by all banks;
            // only the valid above is bank-specific)
            .mem_rsp_valid      (curr_bank_mem_rsp_valid),
            .mem_rsp_data       (mem_rsp_data_s),
            .mem_rsp_id         (`CS_MEM_TAG_TO_REQ_ID(mem_rsp_tag_s)),
            .mem_rsp_ready      (per_bank_mem_rsp_ready[i]),

            // initialization    
            .init_enable        (init_enable),
            .init_line_sel      (init_line_sel)
        );

        if (NUM_BANKS == 1) begin
            // Single bank: the bank's line address is used unchanged.
            assign per_bank_mem_req_addr[i] = curr_bank_mem_req_addr;
        end else begin
            // Multiple banks: combine the line address with the bank index.
            // NOTE(review): presumably CS_LINE_TO_MEM_ADDR re-inserts the bank
            // bits into the address -- confirm against the macro definition.
            assign per_bank_mem_req_addr[i] = `CS_LINE_TO_MEM_ADDR(curr_bank_mem_req_addr, i);
        end
    end   

这里最重要的无疑是VX_cache_bank模块。因为bank结构比较庞大,当前这一章的内容已经足够多,因此这块放到后面分析!

1.3 bank response gather

    // Bank responses gather
    // Routes each bank's core response to the requester named by its
    // per_bank_core_rsp_idx, using a NUM_BANKS -> NUM_REQS stream crossbar.

    // Crossbar payloads: one per bank on the input side, one per requester
    // on the output side.
    wire [NUM_BANKS-1:0][CORE_RSP_DATAW-1:0] core_rsp_data_in;
    wire [NUM_REQS-1:0][CORE_RSP_DATAW-1:0]  core_rsp_data_out;

    // Pack {data, tag} of every bank response into a single payload word.
    for (genvar i = 0; i < NUM_BANKS; ++i) begin
        assign core_rsp_data_in[i] = {per_bank_core_rsp_data[i], per_bank_core_rsp_tag[i]};
    end

    // Reset for the crossbar, derived from the module reset via RESET_RELAY.
    `RESET_RELAY (rsp_xbar_reset, reset);

    // sel_in carries the destination requester index of each bank response;
    // the collisions and sel_out outputs are left unconnected.
    VX_stream_xbar #(
        .NUM_INPUTS  (NUM_BANKS),
        .NUM_OUTPUTS (NUM_REQS),
        .DATAW       (CORE_RSP_DATAW)
    ) rsp_xbar (
        .clk       (clk),
        .reset     (rsp_xbar_reset),
        `UNUSED_PIN (collisions),
        .valid_in  (per_bank_core_rsp_valid),
        .data_in   (core_rsp_data_in),
        .sel_in    (per_bank_core_rsp_idx),
        .ready_in  (per_bank_core_rsp_ready),
        .valid_out (core_rsp_valid_s),
        .data_out  (core_rsp_data_out),
        .ready_out (core_rsp_ready_s),
        `UNUSED_PIN (sel_out)
    );

    // Unpack each requester's payload back into data and tag, mirroring
    // the packing order used above.
    for (genvar i = 0; i < NUM_REQS; ++i) begin
        assign {core_rsp_data_s[i], core_rsp_tag_s[i]} = core_rsp_data_out[i];
    end

关于这一段,核心模块已经在bank request dispatch中讲解。回过头来看,这一章的代码在于收集对bank的访问请求、随后执行对bank的访问,最后将bank的响应收集。根据细节上的代码可以知道,作者在设计访问请求的过程中考虑了多个请求在同一拍出现的情况,并为此设置了如何选择的解决方案(即各类arbiter)。


总结

详细讲解了bank request dispatch模块中的功能,并大致梳理了对bank访问的请求-应答流程。

你可能感兴趣的:(Vortex,GPGPU硬件代码分析,架构开发,缓存)