前面已经分析了Vortex GPGPU
的架构:Vortex GPGPU的硬件设计和代码结构分析
前面也分析了Vortex GPGPU
中关于Cache
设计的一部分代码:
1、Vortex GPGPU的硬件代码分析(Cache篇1)
2、Vortex GPGPU的硬件代码分析(Cache篇2)
本文接着分析VX_cache.sv
代码
///
wire [NUM_BANKS-1:0] per_bank_core_req_valid;
wire [NUM_BANKS-1:0][`CS_LINE_ADDR_WIDTH-1:0] per_bank_core_req_addr;
wire [NUM_BANKS-1:0] per_bank_core_req_rw;
wire [NUM_BANKS-1:0][WORD_SEL_WIDTH-1:0] per_bank_core_req_wsel;
wire [NUM_BANKS-1:0][WORD_SIZE-1:0] per_bank_core_req_byteen;
wire [NUM_BANKS-1:0][`CS_WORD_WIDTH-1:0] per_bank_core_req_data;
wire [NUM_BANKS-1:0][TAG_WIDTH-1:0] per_bank_core_req_tag;
wire [NUM_BANKS-1:0][REQ_SEL_WIDTH-1:0] per_bank_core_req_idx;
wire [NUM_BANKS-1:0] per_bank_core_req_ready;
wire [NUM_BANKS-1:0] per_bank_core_rsp_valid;
wire [NUM_BANKS-1:0][`CS_WORD_WIDTH-1:0] per_bank_core_rsp_data;
wire [NUM_BANKS-1:0][TAG_WIDTH-1:0] per_bank_core_rsp_tag;
wire [NUM_BANKS-1:0][REQ_SEL_WIDTH-1:0] per_bank_core_rsp_idx;
wire [NUM_BANKS-1:0] per_bank_core_rsp_ready;
wire [NUM_BANKS-1:0] per_bank_mem_req_valid;
wire [NUM_BANKS-1:0][`CS_MEM_ADDR_WIDTH-1:0] per_bank_mem_req_addr;
wire [NUM_BANKS-1:0] per_bank_mem_req_rw;
wire [NUM_BANKS-1:0][WORD_SEL_WIDTH-1:0] per_bank_mem_req_wsel;
wire [NUM_BANKS-1:0][WORD_SIZE-1:0] per_bank_mem_req_byteen;
wire [NUM_BANKS-1:0][`CS_WORD_WIDTH-1:0] per_bank_mem_req_data;
wire [NUM_BANKS-1:0][MSHR_ADDR_WIDTH-1:0] per_bank_mem_req_id;
wire [NUM_BANKS-1:0] per_bank_mem_req_ready;
wire [NUM_BANKS-1:0] per_bank_mem_rsp_ready;
if (NUM_BANKS == 1) begin
assign mem_rsp_ready_s = per_bank_mem_rsp_ready;
end else begin
assign mem_rsp_ready_s = per_bank_mem_rsp_ready[`CS_MEM_TAG_TO_BANK_ID(mem_rsp_tag_s)];
end
// Bank requests dispatch
wire [NUM_REQS-1:0][CORE_REQ_DATAW-1:0] core_req_data_in;
wire [NUM_BANKS-1:0][CORE_REQ_DATAW-1:0] core_req_data_out;
wire [NUM_REQS-1:0][LINE_ADDR_WIDTH-1:0] core_req_line_addr;
wire [NUM_REQS-1:0][BANK_SEL_WIDTH-1:0] core_req_bid;
wire [NUM_REQS-1:0][WORD_SEL_WIDTH-1:0] core_req_wsel;
for (genvar i = 0; i < NUM_REQS; ++i) begin
if (WORDS_PER_LINE > 1) begin
assign core_req_wsel[i] = core_req_addr[i][0 +: WORD_SEL_BITS];
end else begin
assign core_req_wsel[i] = '0;
end
assign core_req_line_addr[i] = core_req_addr[i][(BANK_SEL_BITS + WORD_SEL_BITS) +: LINE_ADDR_WIDTH];
end
if (NUM_BANKS > 1) begin
for (genvar i = 0; i < NUM_REQS; ++i) begin
assign core_req_bid[i] = core_req_addr[i][WORD_SEL_BITS +: BANK_SEL_BITS];
end
end else begin
assign core_req_bid = '0;
end
for (genvar i = 0; i < NUM_REQS; ++i) begin
assign core_req_data_in[i] = {
core_req_line_addr[i],
core_req_rw[i],
core_req_wsel[i],
core_req_byteen[i],
core_req_data[i],
core_req_tag[i]};
end
`RESET_RELAY (req_xbar_reset, reset);
VX_stream_xbar #(
.NUM_INPUTS (NUM_REQS),
.NUM_OUTPUTS (NUM_BANKS),
.DATAW (CORE_REQ_DATAW),
.PERF_CTR_BITS (`PERF_CTR_BITS),
.OUT_BUF ((NUM_REQS > 4) ? 2 : 0)
) req_xbar (
.clk (clk),
.reset (req_xbar_reset),
`UNUSED_PIN(collisions),
.valid_in (core_req_valid),
.data_in (core_req_data_in),
.sel_in (core_req_bid),
.ready_in (core_req_ready),
.valid_out (per_bank_core_req_valid),
.data_out (core_req_data_out),
.sel_out (per_bank_core_req_idx),
.ready_out (per_bank_core_req_ready)
);
for (genvar i = 0; i < NUM_BANKS; ++i) begin
assign {
per_bank_core_req_addr[i],
per_bank_core_req_rw[i],
per_bank_core_req_wsel[i],
per_bank_core_req_byteen[i],
per_bank_core_req_data[i],
per_bank_core_req_tag[i]} = core_req_data_out[i];
end
还是先整理这里所有的变量涉及的位宽常数:
wire [NUM_BANKS-1:0] per_bank_core_req_valid;
wire [NUM_BANKS-1:0][`CS_LINE_ADDR_WIDTH-1:0] per_bank_core_req_addr;
wire [NUM_BANKS-1:0] per_bank_core_req_rw;
wire [NUM_BANKS-1:0][WORD_SEL_WIDTH-1:0] per_bank_core_req_wsel;
wire [NUM_BANKS-1:0][WORD_SIZE-1:0] per_bank_core_req_byteen;
wire [NUM_BANKS-1:0][`CS_WORD_WIDTH-1:0] per_bank_core_req_data;
wire [NUM_BANKS-1:0][TAG_WIDTH-1:0] per_bank_core_req_tag;
wire [NUM_BANKS-1:0][REQ_SEL_WIDTH-1:0] per_bank_core_req_idx;
wire [NUM_BANKS-1:0] per_bank_core_req_ready;
wire [NUM_BANKS-1:0] per_bank_core_rsp_valid;
wire [NUM_BANKS-1:0][`CS_WORD_WIDTH-1:0] per_bank_core_rsp_data;
wire [NUM_BANKS-1:0][TAG_WIDTH-1:0] per_bank_core_rsp_tag;
wire [NUM_BANKS-1:0][REQ_SEL_WIDTH-1:0] per_bank_core_rsp_idx;
wire [NUM_BANKS-1:0] per_bank_core_rsp_ready;
wire [NUM_BANKS-1:0] per_bank_mem_req_valid;
wire [NUM_BANKS-1:0][`CS_MEM_ADDR_WIDTH-1:0] per_bank_mem_req_addr;
wire [NUM_BANKS-1:0] per_bank_mem_req_rw;
wire [NUM_BANKS-1:0][WORD_SEL_WIDTH-1:0] per_bank_mem_req_wsel;
wire [NUM_BANKS-1:0][WORD_SIZE-1:0] per_bank_mem_req_byteen;
wire [NUM_BANKS-1:0][`CS_WORD_WIDTH-1:0] per_bank_mem_req_data;
wire [NUM_BANKS-1:0][MSHR_ADDR_WIDTH-1:0] per_bank_mem_req_id;
wire [NUM_BANKS-1:0] per_bank_mem_req_ready;
wire [NUM_BANKS-1:0] per_bank_mem_rsp_ready;
常量 | 推导 |
---|---|
CS_LINE_ADDR_WIDTH | 其表达式为CS_MEM_ADDR_WIDTH-CLOG2(NUM_BANKS),其中CS_MEM_ADDR_WIDTH的表达式为MEM_ADDR_WIDTH-CLOG2(LINE_SIZE)=32-log2(64)=32-6=26,那么CS_LINE_ADDR_WIDTH=26-log2(1)=26,所以CS_LINE_ADDR_WIDTH=26 |
WORD_SEL_WIDTH | UP(CS_WORD_SEL_BITS);define CS_WORD_SEL_BITS CLOG2(CS_WORDS_PER_LINE);define CS_WORDS_PER_LINE (LINE_SIZE / WORD_SIZE),所以WORD_SEL_WIDTH=log2(64/4)=4 |
WORD_SIZE | WORD_SIZE=4 |
CS_WORD_WIDTH | define CS_WORD_WIDTH (8 * WORD_SIZE),因此CS_WORD_WIDTH=8*4=32 |
TAG_WIDTH | TAG_WIDTH = UUID_WIDTH + 1,因此TAG_WIDTH=0+1=1 |
REQ_SEL_WIDTH | 表达式为CLOG2(NUM_REQS),所以REQ_SEL_WIDTH=log2(4)=2 |
CS_MEM_ADDR_WIDTH | CS_MEM_ADDR_WIDTH的表达式为MEM_ADDR_WIDTH-CLOG2(LINE_SIZE)=32-log2(64)=26 |
MSHR_ADDR_WIDTH | MSHR_ADDR_WIDTH=LOG2UP(MSHR_SIZE)=log2(8)=3 |

验证:请求打包后的位宽CORE_REQ_DATAW=行地址(26)+rw(1)+wsel(4)+byteen(4)+data(32)+tag(1)=68,与后文例化参数中的DATAW=68一致。
另外NUM_BANKS=1
。
关于中间一段assign赋值
参考这里关于地址各个field
的解释。
例化VX_stream_xbar模块(即上面代码中的req_xbar)时的参数为:
.NUM_INPUTS (NUM_REQS), // NUM_REQS = 4
.NUM_OUTPUTS (NUM_BANKS), // NUM_BANKS = 1
.DATAW (CORE_REQ_DATAW), // CORE_REQ_DATAW = 68
.PERF_CTR_BITS (`PERF_CTR_BITS),
.OUT_BUF ((NUM_REQS > 4) ? 2 : 0) // OUT_BUF传入为0
该模块见于hw/rtl/libs/VX_stream_xbar.sv
,代码如下:
`include "VX_define.vh"
`TRACING_OFF
// VX_stream_xbar: valid/ready stream crossbar.
// Each input i presents a payload (data_in[i]) together with the index of the
// output it targets (sel_in[i]). For every output the competing inputs are
// arbitrated by a VX_stream_arb; sel_out[o] reports which input won output o.
// The module additionally accumulates a "collisions" performance counter.
//
// The trailing "(article: ...)" remarks give the values used by the req_xbar
// instance in VX_cache that this walkthrough analyses (NUM_REQS=4, NUM_BANKS=1).
module VX_stream_xbar #(
parameter NUM_INPUTS = 4, // number of input streams (article: 4)
parameter NUM_OUTPUTS = 4, // number of output streams (article: 1)
parameter DATAW = 4, // payload width of one stream (article: 68)
parameter IN_WIDTH = `LOG2UP(NUM_INPUTS), // width of one sel_out entry (article: 2)
parameter OUT_WIDTH = `LOG2UP(NUM_OUTPUTS), // width of one sel_in entry (article: 1)
parameter ARBITER = "P", // arbiter TYPE string forwarded to VX_stream_arb
parameter OUT_BUF = 0, // output buffer mode, decoded by `TO_OUT_BUF_SIZE / `TO_OUT_BUF_REG (article: 0)
parameter MAX_FANOUT = `MAX_FANOUT,
parameter PERF_CTR_BITS = `CLOG2(NUM_INPUTS+1)
) (
input wire clk,
input wire reset,
// running total of per-cycle input collisions (see the counter at the bottom)
output wire [PERF_CTR_BITS-1:0] collisions,
// input side: one valid/data/sel/ready group per input stream
input wire [NUM_INPUTS-1:0] valid_in,
input wire [NUM_INPUTS-1:0][DATAW-1:0] data_in,
input wire [NUM_INPUTS-1:0][OUT_WIDTH-1:0] sel_in,
output wire [NUM_INPUTS-1:0] ready_in,
// output side: one valid/data/sel/ready group per output stream
output wire [NUM_OUTPUTS-1:0] valid_out,
output wire [NUM_OUTPUTS-1:0][DATAW-1:0] data_out,
output wire [NUM_OUTPUTS-1:0][IN_WIDTH-1:0] sel_out,
input wire [NUM_OUTPUTS-1:0] ready_out
);
// NOTE(review): clk and reset are marked unused here although the collision
// counter at the end of the module uses both; presumably this only silences
// lint warnings for configurations where the datapath is combinational.
`UNUSED_VAR (clk)
`UNUSED_VAR (reset)
if (NUM_INPUTS != 1) begin // taken in the article's configuration
if (NUM_OUTPUTS != 1) begin // not taken in the article's configuration
// (#inputs > 1) and (#outputs > 1)
// per_output_ready_in[o][i]: output o's arbiter is ready to accept input i
wire [NUM_OUTPUTS-1:0][NUM_INPUTS-1:0] per_output_ready_in;
for (genvar i = 0; i < NUM_OUTPUTS; ++i) begin
// only the inputs that target output i take part in its arbitration
wire [NUM_INPUTS-1:0] valid_in_q;
for (genvar j = 0; j < NUM_INPUTS; ++j) begin
assign valid_in_q[j] = valid_in[j] && (sel_in[j] == i);
end
`RESET_RELAY (slice_reset, reset);
VX_stream_arb #(
.NUM_INPUTS (NUM_INPUTS),
.NUM_OUTPUTS (1),
.DATAW (DATAW),
.ARBITER (ARBITER),
.MAX_FANOUT (MAX_FANOUT),
.OUT_BUF (OUT_BUF)
) xbar_arb (
.clk (clk),
.reset (slice_reset),
.valid_in (valid_in_q),
.data_in (data_in),
.ready_in (per_output_ready_in[i]),
.valid_out (valid_out[i]),
.data_out (data_out[i]),
.sel_out (sel_out[i]),
.ready_out (ready_out[i])
);
end
// an input is ready when the arbiter of the output it selected accepts it
for (genvar i = 0; i < NUM_INPUTS; ++i) begin
assign ready_in[i] = per_output_ready_in[sel_in[i]][i];
end
end else begin // taken in the article's configuration
// (#inputs >= 1) and (#outputs == 1)
// single output: every input targets it, so sel_in carries no information
VX_stream_arb #(
.NUM_INPUTS (NUM_INPUTS), // article: 4
.NUM_OUTPUTS (1), // article: 1
.DATAW (DATAW), // article: 68
.ARBITER (ARBITER), // article: "P"
.MAX_FANOUT (MAX_FANOUT),
.OUT_BUF (OUT_BUF) // article: 0
) xbar_arb (
.clk (clk),
.reset (reset),
.valid_in (valid_in),
.data_in (data_in),
.ready_in (ready_in),
.valid_out (valid_out),
.data_out (data_out),
.sel_out (sel_out),
.ready_out (ready_out)
);
`UNUSED_VAR (sel_in)
end
end else if (NUM_OUTPUTS != 1) begin // not taken in the article's configuration
// (#inputs == 1) and (#outputs > 1)
// pure demultiplexer: the single input is steered to the output named by sel_in
logic [NUM_OUTPUTS-1:0] valid_out_r, ready_out_r;
logic [NUM_OUTPUTS-1:0][DATAW-1:0] data_out_r;
always @(*) begin
valid_out_r = '0;
valid_out_r[sel_in] = valid_in;
end
// the payload is broadcast; only the selected output sees valid asserted
assign data_out_r = {NUM_OUTPUTS{data_in}};
assign ready_in = ready_out_r[sel_in];
for (genvar i = 0; i < NUM_OUTPUTS; ++i) begin
`RESET_RELAY (out_buf_reset, reset);
VX_elastic_buffer #(
.DATAW (DATAW),
.SIZE (`TO_OUT_BUF_SIZE(OUT_BUF)),
.OUT_REG (`TO_OUT_BUF_REG(OUT_BUF))
) out_buf (
.clk (clk),
.reset (out_buf_reset),
.valid_in (valid_out_r[i]),
.ready_in (ready_out_r[i]),
.data_in (data_out_r[i]),
.data_out (data_out[i]),
.valid_out (valid_out[i]),
.ready_out (ready_out[i])
);
end
// only one input exists, so the winning input index is always 0
assign sel_out = 0;
end else begin
// (#inputs == 1) and (#outputs == 1)
// degenerate case: a single elastic buffer between the input and the output
VX_elastic_buffer #(
.DATAW (DATAW),
.SIZE (`TO_OUT_BUF_SIZE(OUT_BUF)),
.OUT_REG (`TO_OUT_BUF_REG(OUT_BUF))
) out_buf (
.clk (clk),
.reset (reset),
.valid_in (valid_in),
.ready_in (ready_in),
.data_in (data_in),
.data_out (data_out),
.valid_out (valid_out),
.ready_out (ready_out)
);
`UNUSED_VAR (sel_in)
assign sel_out = 0;
end
// compute inputs collision
// we have a collision when there exists a valid transfer with multiple input candidates
// we count the unique duplicates each cycle.
reg [NUM_INPUTS-1:0] per_cycle_collision, per_cycle_collision_r;
wire [`CLOG2(NUM_INPUTS+1)-1:0] collision_count;
reg [PERF_CTR_BITS-1:0] collisions_r;
always @(*) begin
per_cycle_collision = 0;
// input i collides when a higher-numbered input (j+i) is also valid, targets
// the same output, and at least one of the two is being accepted this cycle
for (integer i = 0; i < NUM_INPUTS; ++i) begin
for (integer j = 1; j < (NUM_INPUTS-i); ++j) begin
per_cycle_collision[i] |= valid_in[i]
&& valid_in[j+i]
&& (sel_in[i] == sel_in[j+i])
&& (ready_in[i] | ready_in[j+i]);
end
end
end
// `BUFFER (defined elsewhere) registers the flags before they are counted
`BUFFER(per_cycle_collision_r, per_cycle_collision);
`POP_COUNT(collision_count, per_cycle_collision_r);
// accumulate the per-cycle count; cleared by reset
always @(posedge clk) begin
if (reset) begin
collisions_r <= '0;
end else begin
collisions_r <= collisions_r + PERF_CTR_BITS'(collision_count);
end
end
assign collisions = collisions_r;
endmodule
`TRACING_ON
上述代码中已经根据参数值标记了使用的分支,可以看到使用了VX_stream_arb
模块。最后的cycle collision
是指在一个周期内存在多个有效输入竞争一个transfer
时的情况。单独标记一下:
per_cycle_collision[i] |= valid_in[i]
&& valid_in[j+i]
&& (sel_in[i] == sel_in[j+i])
&& (ready_in[i] | ready_in[j+i]);
另外BUFFER
宏函数定义如下:
// BUFFER_EX(dst, src, ena, latency): instantiates a VX_pipe_register named
// __<dst>__ that drives `dst` from `src`.
//   - DATAW and RESETW are both set to $bits(dst)
//   - DEPTH is set to `latency`
//   - `ena` is connected to the register's enable port
// The macro connects signals literally named `clk` and `reset`, so both must
// exist in the scope where the macro is expanded.
`define BUFFER_EX(dst, src, ena, latency) \
VX_pipe_register #( \
.DATAW ($bits(dst)), \
.RESETW ($bits(dst)), \
.DEPTH (latency) \
) __``dst``__ ( \
.clk (clk), \
.reset (reset), \
.enable (ena), \
.data_in (src), \
.data_out (dst) \
)
// BUFFER(dst, src): BUFFER_EX with the enable tied to 1'b1 and a latency of 1.
`define BUFFER(dst, src) `BUFFER_EX(dst, src, 1'b1, 1)
直观来看就是寄存器。
POP_COUNT
宏函数定义如下:
// POP_COUNT_EX(out, in, model): instantiates a VX_popcount named __<out>__
// with N = $bits(in) and MODEL = model, connecting `in` to data_in and `out`
// to data_out.
`define POP_COUNT_EX(out, in, model) \
VX_popcount #( \
.N ($bits(in)), \
.MODEL (model) \
) __``out``__ ( \
.data_in (in), \
.data_out (out) \
)
// POP_COUNT(out, in): POP_COUNT_EX with MODEL fixed to 1.
`define POP_COUNT(out, in) `POP_COUNT_EX(out, in, 1)
其中VX_popcount
的详细解释见后。
VX_stream_arb
的定义见于hw/rtl/libs/VX_stream_arb.sv
,其代码如下:
`include "VX_platform.vh"
`TRACING_OFF
// VX_stream_arb: valid/ready stream arbiter that connects NUM_INPUTS streams
// to NUM_OUTPUTS streams. The generate structure below picks one of three
// top-level shapes:
//   - NUM_INPUTS  > NUM_OUTPUTS : inputs are arbitrated onto fewer outputs
//   - NUM_OUTPUTS > NUM_INPUTS  : inputs are distributed over more outputs
//   - NUM_INPUTS == NUM_OUTPUTS : one elastic buffer per stream, no arbitration
// Wide configurations are built by instantiating this module recursively.
module VX_stream_arb #(
parameter NUM_INPUTS = 1,
parameter NUM_OUTPUTS = 1,
parameter DATAW = 1,
parameter `STRING ARBITER = "P",
parameter MAX_FANOUT = `MAX_FANOUT,
parameter OUT_BUF = 0 ,
// NOTE(review): assuming `CDIV is a ceiling division, this evaluates to 1
// whenever NUM_OUTPUTS > NUM_INPUTS; see the note in that branch below.
parameter NUM_REQS = `CDIV(NUM_INPUTS, NUM_OUTPUTS),
parameter LOG_NUM_REQS = `CLOG2(NUM_REQS),
parameter NUM_REQS_W = `UP(LOG_NUM_REQS)
) (
input wire clk,
input wire reset,
input wire [NUM_INPUTS-1:0] valid_in,
input wire [NUM_INPUTS-1:0][DATAW-1:0] data_in,
output wire [NUM_INPUTS-1:0] ready_in,
output wire [NUM_OUTPUTS-1:0] valid_out,
output wire [NUM_OUTPUTS-1:0][DATAW-1:0] data_out,
// per output: index of the input (within that output's group) that was granted
output wire [NUM_OUTPUTS-1:0][NUM_REQS_W-1:0] sel_out,
input wire [NUM_OUTPUTS-1:0] ready_out
);
if (NUM_INPUTS > NUM_OUTPUTS) begin
if (NUM_OUTPUTS > 1) begin
// (#inputs > #outputs) and (#outputs > 1)
// split the inputs into NUM_OUTPUTS contiguous batches of up to NUM_REQS
// inputs and give each batch its own single-output arbiter
for (genvar i = 0; i < NUM_OUTPUTS; ++i) begin
localparam BATCH_BEGIN = i * NUM_REQS;
localparam BATCH_END = `MIN(BATCH_BEGIN + NUM_REQS, NUM_INPUTS);
localparam BATCH_SIZE = BATCH_END - BATCH_BEGIN;
`RESET_RELAY (slice_reset, reset);
VX_stream_arb #(
.NUM_INPUTS (BATCH_SIZE),
.NUM_OUTPUTS (1),
.DATAW (DATAW),
.ARBITER (ARBITER),
.MAX_FANOUT (MAX_FANOUT),
.OUT_BUF (OUT_BUF)
) arb_slice (
.clk (clk),
.reset (slice_reset),
.valid_in (valid_in[BATCH_END-1: BATCH_BEGIN]),
.ready_in (ready_in[BATCH_END-1: BATCH_BEGIN]),
.data_in (data_in[BATCH_END-1: BATCH_BEGIN]),
.data_out (data_out[i]),
.sel_out (sel_out[i]),
.valid_out (valid_out[i]),
.ready_out (ready_out[i])
);
end
end else if (MAX_FANOUT != 0 && (NUM_INPUTS > MAX_FANOUT)) begin
// (#inputs > max_fanout) and (#outputs == 1)
// two-level tree: first-level arbiters reduce each batch of MAX_FANOUT inputs
// to one stream, then a join arbiter selects among the batches
localparam NUM_BATCHES = `CDIV(NUM_INPUTS, MAX_FANOUT);
localparam LOG_NUM_REQS2 = `CLOG2(MAX_FANOUT);
localparam LOG_NUM_REQS3 = `CLOG2(NUM_BATCHES);
wire [NUM_BATCHES-1:0] valid_tmp;
// each intermediate payload carries the data plus the in-batch winner index
wire [NUM_BATCHES-1:0][DATAW+LOG_NUM_REQS2-1:0] data_tmp;
wire [NUM_BATCHES-1:0] ready_tmp;
for (genvar i = 0; i < NUM_BATCHES; ++i) begin
localparam BATCH_BEGIN = i * MAX_FANOUT;
localparam BATCH_END = `MIN(BATCH_BEGIN + MAX_FANOUT, NUM_INPUTS);
localparam BATCH_SIZE = BATCH_END - BATCH_BEGIN;
wire [DATAW-1:0] data_tmp_u;
wire [`LOG2UP(BATCH_SIZE)-1:0] sel_tmp_u;
`RESET_RELAY (slice_reset, reset);
// NOTE(review): when MAX_FANOUT == 1 no arbiter is instantiated here, which
// leaves data_tmp_u, sel_tmp_u, valid_tmp[i] and ready_in for this batch
// undriven, and LOG_NUM_REQS2 becomes 0 (zero-width cast/select below).
// MAX_FANOUT == 1 looks unsupported - confirm before using it.
if (MAX_FANOUT != 1) begin
VX_stream_arb #(
.NUM_INPUTS (BATCH_SIZE),
.NUM_OUTPUTS (1),
.DATAW (DATAW),
.ARBITER (ARBITER),
.MAX_FANOUT (MAX_FANOUT),
.OUT_BUF (OUT_BUF)
) fanout_slice_arb (
.clk (clk),
.reset (slice_reset),
.valid_in (valid_in[BATCH_END-1: BATCH_BEGIN]),
.data_in (data_in[BATCH_END-1: BATCH_BEGIN]),
.ready_in (ready_in[BATCH_END-1: BATCH_BEGIN]),
.valid_out (valid_tmp[i]),
.data_out (data_tmp_u),
.sel_out (sel_tmp_u),
.ready_out (ready_tmp[i])
);
end
// append the in-batch index below the payload so it survives the join stage
assign data_tmp[i] = {data_tmp_u, LOG_NUM_REQS2'(sel_tmp_u)};
end
wire [DATAW+LOG_NUM_REQS2-1:0] data_out_u;
wire [LOG_NUM_REQS3-1:0] sel_out_u;
VX_stream_arb #(
.NUM_INPUTS (NUM_BATCHES),
.NUM_OUTPUTS (1),
.DATAW (DATAW + LOG_NUM_REQS2),
.ARBITER (ARBITER),
.MAX_FANOUT (MAX_FANOUT),
.OUT_BUF (OUT_BUF)
) fanout_join_arb (
.clk (clk),
.reset (reset),
.valid_in (valid_tmp),
.ready_in (ready_tmp),
.data_in (data_tmp),
.data_out (data_out_u),
.sel_out (sel_out_u),
.valid_out (valid_out),
.ready_out (ready_out)
);
// split the joined payload back into data and the full winner index
// {batch index, index within batch}
assign data_out = data_out_u[LOG_NUM_REQS2 +: DATAW];
assign sel_out = {sel_out_u, data_out_u[0 +: LOG_NUM_REQS2]};
end else begin
// (#inputs <= max_fanout) and (#outputs == 1)
// leaf case (the one used by the article's 4-input / 1-output xbar):
// one arbiter picks an input, its payload and index go through an elastic buffer
wire valid_in_r;
wire [DATAW-1:0] data_in_r;
wire ready_in_r;
wire arb_valid;
wire [NUM_REQS_W-1:0] arb_index;
wire [NUM_REQS-1:0] arb_onehot;
wire arb_ready;
VX_generic_arbiter #(
.NUM_REQS (NUM_REQS),
.LOCK_ENABLE (1),
.TYPE (ARBITER)
) arbiter (
.clk (clk),
.reset (reset),
.requests (valid_in),
.grant_valid (arb_valid),
.grant_index (arb_index),
.grant_onehot (arb_onehot),
.grant_unlock (arb_ready)
);
assign valid_in_r = arb_valid;
assign data_in_r = data_in[arb_index];
// the arbiter is unlocked when the buffer accepts the granted request
assign arb_ready = ready_in_r;
// only the granted input sees ready
for (genvar i = 0; i < NUM_REQS; ++i) begin
assign ready_in[i] = ready_in_r & arb_onehot[i];
end
VX_elastic_buffer #(
.DATAW (LOG_NUM_REQS + DATAW),
.SIZE (`TO_OUT_BUF_SIZE(OUT_BUF)),
.OUT_REG (`TO_OUT_BUF_REG(OUT_BUF))
) out_buf (
.clk (clk),
.reset (reset),
.valid_in (valid_in_r),
.ready_in (ready_in_r),
.data_in ({arb_index, data_in_r}),
.data_out ({sel_out, data_out}),
.valid_out (valid_out),
.ready_out (ready_out)
);
end
end else if (NUM_OUTPUTS > NUM_INPUTS) begin
if (NUM_INPUTS > 1) begin
// (#inputs > 1) and (#outputs > #inputs)
// NOTE(review): the batch bounds use NUM_REQS, which is derived from
// CDIV(NUM_INPUTS, NUM_OUTPUTS). If that is 1 here (see the parameter note),
// every batch has size 1 and outputs with index >= NUM_INPUTS are never
// driven. Verify whether CDIV(NUM_OUTPUTS, NUM_INPUTS) was intended.
for (genvar i = 0; i < NUM_INPUTS; ++i) begin
localparam BATCH_BEGIN = i * NUM_REQS;
localparam BATCH_END = `MIN(BATCH_BEGIN + NUM_REQS, NUM_OUTPUTS);
localparam BATCH_SIZE = BATCH_END - BATCH_BEGIN;
`RESET_RELAY (slice_reset, reset);
VX_stream_arb #(
.NUM_INPUTS (1),
.NUM_OUTPUTS (BATCH_SIZE),
.DATAW (DATAW),
.ARBITER (ARBITER),
.MAX_FANOUT (MAX_FANOUT),
.OUT_BUF (OUT_BUF)
) arb_slice (
.clk (clk),
.reset (slice_reset),
.valid_in (valid_in[i]),
.ready_in (ready_in[i]),
.data_in (data_in[i]),
.data_out (data_out[BATCH_END-1: BATCH_BEGIN]),
.valid_out (valid_out[BATCH_END-1: BATCH_BEGIN]),
.ready_out (ready_out[BATCH_END-1: BATCH_BEGIN]),
`UNUSED_PIN (sel_out)
);
// NOTE(review): sel_out is also assigned 0 at the end of the enclosing
// (#outputs > #inputs) branch, so in this sub-branch it has two drivers.
for (genvar j = BATCH_BEGIN; j < BATCH_END; ++j) begin
assign sel_out[j] = i;
end
end
end else if (MAX_FANOUT != 0 && (NUM_OUTPUTS > MAX_FANOUT)) begin
// (#inputs == 1) and (#outputs > max_fanout)
// two-level tree: a fork stage spreads the input over NUM_BATCHES streams,
// then one slice per batch spreads each stream over up to MAX_FANOUT outputs
localparam NUM_BATCHES = `CDIV(NUM_OUTPUTS, MAX_FANOUT);
wire [NUM_BATCHES-1:0] valid_tmp;
wire [NUM_BATCHES-1:0][DATAW-1:0] data_tmp;
wire [NUM_BATCHES-1:0] ready_tmp;
VX_stream_arb #(
.NUM_INPUTS (1),
.NUM_OUTPUTS (NUM_BATCHES),
.DATAW (DATAW),
.ARBITER (ARBITER),
.MAX_FANOUT (MAX_FANOUT),
.OUT_BUF (OUT_BUF)
) fanout_fork_arb (
.clk (clk),
.reset (reset),
.valid_in (valid_in),
.ready_in (ready_in),
.data_in (data_in),
.data_out (data_tmp),
.valid_out (valid_tmp),
.ready_out (ready_tmp),
`UNUSED_PIN (sel_out)
);
for (genvar i = 0; i < NUM_BATCHES; ++i) begin
localparam BATCH_BEGIN = i * MAX_FANOUT;
localparam BATCH_END = `MIN(BATCH_BEGIN + MAX_FANOUT, NUM_OUTPUTS);
localparam BATCH_SIZE = BATCH_END - BATCH_BEGIN;
`RESET_RELAY (slice_reset, reset);
VX_stream_arb #(
.NUM_INPUTS (1),
.NUM_OUTPUTS (BATCH_SIZE),
.DATAW (DATAW),
.ARBITER (ARBITER),
.MAX_FANOUT (MAX_FANOUT),
.OUT_BUF (OUT_BUF)
) fanout_slice_arb (
.clk (clk),
.reset (slice_reset),
.valid_in (valid_tmp[i]),
.ready_in (ready_tmp[i]),
.data_in (data_tmp[i]),
.data_out (data_out[BATCH_END-1: BATCH_BEGIN]),
.valid_out (valid_out[BATCH_END-1: BATCH_BEGIN]),
.ready_out (ready_out[BATCH_END-1: BATCH_BEGIN]),
`UNUSED_PIN (sel_out)
);
end
end else begin
// (#inputs == 1) and (#outputs <= max_fanout)
// leaf case: the arbiter chooses among the output buffers that can accept
// data, and the single input is written into the chosen buffer
wire [NUM_OUTPUTS-1:0] ready_in_r;
wire [NUM_OUTPUTS-1:0] arb_requests;
wire arb_valid;
wire [NUM_OUTPUTS-1:0] arb_onehot;
wire arb_ready;
VX_generic_arbiter #(
.NUM_REQS (NUM_OUTPUTS),
.LOCK_ENABLE (1),
.TYPE (ARBITER)
) arbiter (
.clk (clk),
.reset (reset),
.requests (arb_requests),
.grant_valid (arb_valid),
`UNUSED_PIN (grant_index),
.grant_onehot (arb_onehot),
.grant_unlock (arb_ready)
);
// here the "requesters" are the output buffers that are ready
assign arb_requests = ready_in_r;
assign arb_ready = valid_in[0];
// the input is ready as soon as any output buffer has been granted
assign ready_in = arb_valid;
for (genvar i = 0; i < NUM_OUTPUTS; ++i) begin
VX_elastic_buffer #(
.DATAW (DATAW),
.SIZE (`TO_OUT_BUF_SIZE(OUT_BUF)),
.OUT_REG (`TO_OUT_BUF_REG(OUT_BUF))
) out_buf (
.clk (clk),
.reset (reset),
.valid_in (valid_in && arb_onehot[i]),
.ready_in (ready_in_r[i]),
.data_in (data_in),
.data_out (data_out[i]),
.valid_out (valid_out[i]),
.ready_out (ready_out[i])
);
end
end
// applies to every (#outputs > #inputs) sub-branch above
assign sel_out = 0;
end else begin
// #Inputs == #Outputs
// one-to-one: stream i passes through its own elastic buffer and reports i
for (genvar i = 0; i < NUM_OUTPUTS; ++i) begin
`RESET_RELAY_EN (out_buf_reset, reset, (NUM_OUTPUTS > 1));
VX_elastic_buffer #(
.DATAW (DATAW),
.SIZE (`TO_OUT_BUF_SIZE(OUT_BUF)),
.OUT_REG (`TO_OUT_BUF_REG(OUT_BUF))
) out_buf (
.clk (clk),
.reset (out_buf_reset),
.valid_in (valid_in[i]),
.ready_in (ready_in[i]),
.data_in (data_in[i]),
.data_out (data_out[i]),
.valid_out (valid_out[i]),
.ready_out (ready_out[i])
);
assign sel_out[i] = NUM_REQS_W'(i);
end
end
endmodule
`TRACING_ON
根据前述例化时的parameter参数:
VX_stream_arb #(
.NUM_INPUTS (NUM_INPUTS), // 4
.NUM_OUTPUTS (1), // 1
.DATAW (DATAW), // 68
.ARBITER (ARBITER), // `P`
.MAX_FANOUT (MAX_FANOUT), // 20
.OUT_BUF (OUT_BUF) // 0
)
被执行的分支代码是:
wire valid_in_r;
wire [DATAW-1:0] data_in_r;
wire ready_in_r;
wire arb_valid;
wire [NUM_REQS_W-1:0] arb_index;
wire [NUM_REQS-1:0] arb_onehot;
wire arb_ready;
VX_generic_arbiter #(
.NUM_REQS (NUM_REQS),
.LOCK_ENABLE (1),
.TYPE (ARBITER)
) arbiter (
.clk (clk),
.reset (reset),
.requests (valid_in),
.grant_valid (arb_valid),
.grant_index (arb_index),
.grant_onehot (arb_onehot),
.grant_unlock (arb_ready)
);
assign valid_in_r = arb_valid;
assign data_in_r = data_in[arb_index];
assign arb_ready = ready_in_r;
for (genvar i = 0; i < NUM_REQS; ++i) begin
assign ready_in[i] = ready_in_r & arb_onehot[i];
end
VX_elastic_buffer #(
.DATAW (LOG_NUM_REQS + DATAW),
.SIZE (`TO_OUT_BUF_SIZE(OUT_BUF)),
.OUT_REG (`TO_OUT_BUF_REG(OUT_BUF))
) out_buf (
.clk (clk),
.reset (reset),
.valid_in (valid_in_r),
.ready_in (ready_in_r),
.data_in ({arb_index, data_in_r}),
.data_out ({sel_out, data_out}),
.valid_out (valid_out),
.ready_out (ready_out)
);
end
VX_generic_arbiter
的定义见于hw/rtl/libs/VX_generic_arbiter.sv
,其代码如下:
`include "VX_platform.vh"
`TRACING_OFF
// VX_generic_arbiter: thin dispatcher that instantiates one concrete arbiter
// implementation according to the TYPE string:
//   "P" -> VX_priority_arbiter (combinational; clk/reset/grant_unlock unused)
//   "R" -> VX_rr_arbiter
//   "F" -> VX_fair_arbiter
//   "M" -> VX_matrix_arbiter
//   "C" -> VX_cyclic_arbiter
// Any other TYPE raises an elaboration error through `ERROR.
//
// Ports:
//   requests     - one request bit per requester
//   grant_index  - binary index of the granted requester
//   grant_onehot - one-hot form of the grant
//   grant_valid  - a grant is being issued
//   grant_unlock - forwarded to the arbiters that take LOCK_ENABLE
module VX_generic_arbiter #(
    parameter NUM_REQS      = 1,
    parameter LOCK_ENABLE   = 0,
    parameter `STRING TYPE  = "P",
    parameter LOG_NUM_REQS  = `LOG2UP(NUM_REQS)
) (
    input  wire                    clk,
    input  wire                    reset,
    input  wire [NUM_REQS-1:0]     requests,
    output wire [LOG_NUM_REQS-1:0] grant_index,
    output wire [NUM_REQS-1:0]     grant_onehot,
    output wire                    grant_valid,
    input  wire                    grant_unlock
);

generate
    if (TYPE == "P") begin
        // Fixed-priority arbiter: purely combinational, so the clocking and
        // locking inputs are explicitly marked as unused.
        // (This is the branch selected by the instance analysed in the article.)
        `UNUSED_VAR (grant_unlock)
        `UNUSED_VAR (reset)
        `UNUSED_VAR (clk)
        `UNUSED_PARAM (LOCK_ENABLE)

        VX_priority_arbiter #(
            .NUM_REQS (NUM_REQS)
        ) priority_arbiter (
            .grant_onehot (grant_onehot),
            .grant_index  (grant_index),
            .grant_valid  (grant_valid),
            .requests     (requests)
        );

    end else if (TYPE == "R") begin
        // Round-robin arbiter
        VX_rr_arbiter #(
            .LOCK_ENABLE (LOCK_ENABLE),
            .NUM_REQS    (NUM_REQS)
        ) rr_arbiter (
            .reset        (reset),
            .clk          (clk),
            .grant_unlock (grant_unlock),
            .grant_onehot (grant_onehot),
            .grant_index  (grant_index),
            .grant_valid  (grant_valid),
            .requests     (requests)
        );

    end else if (TYPE == "F") begin
        // Fair arbiter
        VX_fair_arbiter #(
            .LOCK_ENABLE (LOCK_ENABLE),
            .NUM_REQS    (NUM_REQS)
        ) fair_arbiter (
            .reset        (reset),
            .clk          (clk),
            .grant_unlock (grant_unlock),
            .grant_onehot (grant_onehot),
            .grant_index  (grant_index),
            .grant_valid  (grant_valid),
            .requests     (requests)
        );

    end else if (TYPE == "M") begin
        // Matrix arbiter
        VX_matrix_arbiter #(
            .LOCK_ENABLE (LOCK_ENABLE),
            .NUM_REQS    (NUM_REQS)
        ) matrix_arbiter (
            .reset        (reset),
            .clk          (clk),
            .grant_unlock (grant_unlock),
            .grant_onehot (grant_onehot),
            .grant_index  (grant_index),
            .grant_valid  (grant_valid),
            .requests     (requests)
        );

    end else if (TYPE == "C") begin
        // Cyclic arbiter
        VX_cyclic_arbiter #(
            .LOCK_ENABLE (LOCK_ENABLE),
            .NUM_REQS    (NUM_REQS)
        ) cyclic_arbiter (
            .reset        (reset),
            .clk          (clk),
            .grant_unlock (grant_unlock),
            .grant_onehot (grant_onehot),
            .grant_index  (grant_index),
            .grant_valid  (grant_valid),
            .requests     (requests)
        );

    end else begin
        // Unknown TYPE string
        `ERROR(("invalid parameter"));
    end
endgenerate

endmodule
`TRACING_ON
整理上述提到的各种各样的arbiter
:
TYPE 值 | 选择的仲裁器 | 连接的端口 |
---|---|---|
"P" | VX_priority_arbiter | requests , grant_valid , grant_index , grant_onehot |
"R" | VX_rr_arbiter | clk , reset , requests , grant_valid , grant_index , grant_onehot , grant_unlock |
"F" | VX_fair_arbiter | clk , reset , requests , grant_valid , grant_index , grant_onehot , grant_unlock |
"M" | VX_matrix_arbiter | clk , reset , requests , grant_valid , grant_index , grant_onehot , grant_unlock |
"C" | VX_cyclic_arbiter | clk , reset , requests , grant_valid , grant_index , grant_onehot , grant_unlock |
其他 | 报告错误 | - |
在例化该模块时使用"P"
,当然还是都拿来分析一下!
首先是VX_priority_arbiter,代码如下:
`include "VX_platform.vh"
`TRACING_OFF
// VX_priority_arbiter: combinational fixed-priority arbiter.
//   requests     - one request bit per requester
//   grant_index  - binary index of the granted requester
//   grant_onehot - one-hot form of the grant
//   grant_valid  - high when a grant is issued
// With more than one requester the decision is delegated to
// VX_priority_encoder; with exactly one requester the request is simply
// passed through.
module VX_priority_arbiter #(
    parameter NUM_REQS     = 1,
    parameter LOG_NUM_REQS = `LOG2UP(NUM_REQS)
) (
    input  wire [NUM_REQS-1:0]     requests,
    output wire [LOG_NUM_REQS-1:0] grant_index,
    output wire [NUM_REQS-1:0]     grant_onehot,
    output wire                    grant_valid
);

generate
    if (NUM_REQS != 1) begin
        // Several requesters: let the priority encoder pick the winner.
        VX_priority_encoder #(
            .N (NUM_REQS)
        ) priority_encoder (
            .valid_out (grant_valid),
            .onehot    (grant_onehot),
            .index     (grant_index),
            .data_in   (requests)
        );
    end else begin
        // Single requester: it is granted whenever it requests.
        assign grant_valid  = requests[0];
        assign grant_onehot = requests;
        assign grant_index  = '0;
    end
endgenerate

endmodule
`TRACING_ON
其中VX_priority_encoder
定义如下:
`include "VX_platform.vh"
`TRACING_OFF
// VX_priority_encoder: finds the highest-priority set bit of data_in.
//
// Parameters:
//   N       - input width
//   REVERSE - when non-zero the input bits are mirrored before encoding, so
//             the priority order is reversed; index/onehot then refer to bit
//             positions of the mirrored vector
//   MODEL   - implementation choice for N > 2 (1, 2, 3, anything else = loop)
//   LN      - width of the index output
//
// Ports:
//   data_in   - request vector
//   onehot    - one-hot vector with only the winning bit set
//   index     - binary position of the winning bit
//   valid_out - at least one input bit is set
//
// Bit 0 of the (optionally mirrored) vector has the highest priority.
// When no input bit is set, valid_out is 0 and index must be ignored.
module VX_priority_encoder #(
    parameter N       = 1,
    parameter REVERSE = 0,
    parameter MODEL   = 1,
    parameter LN      = `LOG2UP(N)
) (
    input  wire [N-1:0]  data_in,
    output wire [N-1:0]  onehot,
    output wire [LN-1:0] index,
    output wire          valid_out
);
    wire [N-1:0] reversed;

    // Optionally mirror the input so the same logic serves both directions.
    if (REVERSE != 0) begin
        for (genvar i = 0; i < N; ++i) begin
            assign reversed[N-i-1] = data_in[i];
        end
    end else begin
        assign reversed = data_in;
    end

    if (N == 1) begin

        // Single bit: it wins whenever it is set.
        assign onehot    = reversed;
        assign index     = '0;
        assign valid_out = reversed;

    end else if (N == 2) begin

        // Bit 0 wins when set; bit 1 wins only when it is set and bit 0 is not.
        // (Previously onehot[1] was ~reversed[0], which reported 2'b10 even
        // when no request was active; every other branch yields no grant.)
        assign onehot    = {reversed[1] & ~reversed[0], reversed[0]};
        assign index     = ~reversed[0];
        assign valid_out = (| reversed);

    end else if (MODEL == 1) begin

        // Scan-based implementation. VX_scan is instantiated with OP=2;
        // presumably scan_lo[i] is the OR of reversed[i:0] - TODO confirm
        // against VX_scan. The first position where scan_lo rises is the winner.
        wire [N-1:0] scan_lo;

        VX_scan #(
            .N  (N),
            .OP (2)
        ) scan (
            .data_in  (reversed),
            .data_out (scan_lo)
        );

        VX_lzc #(
            .N       (N),
            .REVERSE (1)
        ) lzc (
            .data_in  (reversed),
            .data_out (index),
            `UNUSED_PIN (valid_out)
        );

        // keep the bit where scan_lo is set but the bit below it is not
        assign onehot    = scan_lo & {(~scan_lo[N-2:0]), 1'b1};
        assign valid_out = scan_lo[N-1];

    end else if (MODEL == 2) begin

        // Ripple implementation: higher_pri_regs[i] is set when any bit below
        // position i is set, which masks position i.
    `IGNORE_WARNINGS_BEGIN
        wire [N-1:0] higher_pri_regs;
    `IGNORE_WARNINGS_END
        assign higher_pri_regs[N-1:1] = higher_pri_regs[N-2:0] | reversed[N-2:0];
        assign higher_pri_regs[0]     = 1'b0;
        assign onehot[N-1:0] = reversed[N-1:0] & ~higher_pri_regs[N-1:0];

        VX_lzc #(
            .N       (N),
            .REVERSE (1)
        ) lzc (
            .data_in   (reversed),
            .data_out  (index),
            .valid_out (valid_out)
        );

    end else if (MODEL == 3) begin

        // Two's-complement trick: x & -x isolates the lowest set bit.
        assign onehot = reversed & -reversed;

        VX_lzc #(
            .N       (N),
            .REVERSE (1)
        ) lzc (
            .data_in   (reversed),
            .data_out  (index),
            .valid_out (valid_out)
        );

    end else begin

        // Behavioural fallback: iterate from the top bit down so that the
        // lowest set bit is assigned last and therefore wins. Outputs are x
        // when no bit is set.
        reg [LN-1:0] index_r;
        reg [N-1:0]  onehot_r;

        always @(*) begin
            index_r  = 'x;
            onehot_r = 'x;
            for (integer i = N-1; i >= 0; --i) begin
                if (reversed[i]) begin
                    index_r     = LN'(i);
                    onehot_r    = '0;
                    onehot_r[i] = 1'b1;
                end
            end
        end

        assign index     = index_r;
        assign onehot    = onehot_r;
        assign valid_out = (| reversed);

    end

endmodule
`TRACING_ON
其中VX_lzc
模块的功能是用于计算数据中的前导零或尾随零的数量,取决于REVERSE
参数。
N | REVERSE | 操作 | 结果 |
---|---|---|---|
1 | 任意 | data_out 设为 0,valid_out 直接赋值为 data_in | data_out = 0, valid_out = data_in |
>1 | 0 | 计算前导零,通过 VX_find_first 模块计算 | data_out = 计算结果, valid_out = 有效性 |
>1 | 1 | 计算尾随零,通过 VX_find_first 模块计算 | data_out = 计算结果, valid_out = 有效性 |
其中VX_scan
模块的功能是用于进行扫描操作(例如前缀和、前缀逻辑运算等),并支持不同的运算操作和方向。
参数 | 取值 | 描述 |
---|---|---|
N | 任意 | 数据位宽 |
OP | 0 | XOR 扫描 |
OP | 1 | AND 扫描(特殊情况优化:2, 3, 4 位) |
OP | 2 | OR 扫描 |
REVERSE | 0 | 从低位到高位扫描 |
REVERSE | 1 | 从高位到低位扫描 |
综上分析可以得到VX_priority_arbiter
模块的功能:当NUM_REQS
为1时,直接把请求信号作为授权结果输出;否则,调用VX_priority_encoder
模块进行优先级编码,输出授权索引(grant_index)
、独热(one-hot)编码的授权信号(grant_onehot)
和有效信号 (grant_valid)
。
接下来是VX_rr_arbiter,代码不贴了,太长了。
该模块实现了一个轮询(round-robin)仲裁器,用于从多个请求中选择一个。该模块根据请求的数量 (NUM_REQS) 和是否启用锁定 (LOCK_ENABLE) 来调整其行为。
该模块用到VX_onehot_encoder
模块,其功能是将独热(one-hot)编码
的输入转换为其对应的二进制索引
。根据N
的值和MODEL
的设置,模块的实现方式有所不同:
条件 | 说明 |
---|---|
N == 1 |
直接将 data_in 赋值给 data_out ,valid_out 与 data_in 相同。 |
N == 2 |
选择 data_in 的一个比特作为 data_out ,valid_out 表示 data_in 中是否有有效位。 |
MODEL == 1 |
使用分层地址生成方法,将 one-hot 编码转换为二进制索引。处理非2的幂次的输入。 |
MODEL == 2 和 REVERSE == 0 |
使用逐位掩码方法找到设置位的索引。 |
其他情况 | 使用循环检测输入位并确定索引,支持 REVERSE 参数来决定索引计算的方向。 |
VX_fair_arbiter
模块是一个公平调度器,用于在多个请求之间进行优先级调度。模块的行为会根据参数和输入信号的不同而有所变化。
条件 | 功能说明 |
---|---|
NUM_REQS == 1 |
当请求数量为1时,模块直接将请求信号 requests 赋值给 grant_onehot ,并且 grant_valid 与 requests[0] 相同。grant_index 始终为0。 |
NUM_REQS > 1 |
当请求数量大于1时,模块使用一个内部缓冲寄存器 buffer 和请求信号 requests 来管理请求。具体行为如下: |
- buffer 用于存储当前被授予的请求状态。 |
|
- buffer_qual 是 buffer 和 requests 的按位与操作结果,用于限定已激活的请求。 |
|
- requests_qual 是当 buffer 不全为0时,用 buffer_qual 替代 requests ,否则直接使用 requests 。 |
|
- buffer_n 是未授予的请求信号,用于更新 buffer 。 |
|
- 在时钟上升沿,若 reset 信号有效,则 buffer 被重置为0;若 LOCK_ENABLE 关闭或 grant_unlock 为高,则 buffer 被更新为 buffer_n 。 |
|
- 调用 VX_priority_arbiter 模块进行优先级调度,得到 grant_index 、grant_onehot 和 grant_valid 。 |
VX_matrix_arbiter
模块实现了一种基于矩阵的优先级仲裁算法,用于在多个请求中选择一个有效的请求。
条件 | 功能说明 |
---|---|
NUM_REQS == 1 |
当只有一个请求时,模块直接将 requests 赋值给 grant_onehot ,并且 grant_valid 与 requests[0] 相同。grant_index 始终为0。 |
NUM_REQS > 1 |
当请求数量大于1时,模块使用矩阵状态和优先级判断来选择请求。具体行为如下: |
- 状态矩阵: state[i][j] 用于跟踪请求的优先级。 |
|
- 优先级矩阵: pri[j][i] 根据 requests 和 state 判断请求 i 是否优先于请求 j 。 |
|
- 未经锁定限定的授权(grant_unqual): grant_unqual[i] 表示在不考虑锁定机制的情况下,请求 i 是否获得授权。 |
|
- 状态更新: state 在每个时钟周期内根据 grant_unqual 更新,用于记录请求的状态。 |
|
- 锁定机制: 如果 LOCK_ENABLE 为0,则 grant_onehot 直接赋值为 grant_unqual ;否则,使用 grant_unlock 信号来决定是否更新 grant_onehot 。 |
|
- 优先级编码: 使用 VX_onehot_encoder 模块将 grant_unqual 编码为 grant_index 。 |
|
- 有效信号: grant_valid 取决于 requests 是否有有效请求。 |
VX_matrix_arbiter
模块通过矩阵状态和优先级算法,实现了在多个请求之间的公平且有序的调度。
VX_cyclic_arbiter
模块实现了一个循环优先级仲裁器,用于在多个请求中周期性地选择一个有效的请求。
条件 | 功能说明 |
---|---|
NUM_REQS == 1 |
当只有一个请求时,模块直接将 requests 赋值给 grant_onehot ,并且 grant_valid 与 requests[0] 相同。grant_index 始终为0。 |
NUM_REQS > 1 |
当请求数量大于1时,模块执行循环仲裁,具体行为如下: |
- 计算是否为2的幂: IS_POW2 用于确定请求数量是否为2的幂,这影响仲裁的循环行为。 |
|
- 轮询索引更新: grant_index_r 是当前的仲裁索引,按时钟周期递增,循环回到0。 |
|
- 索引更新规则: 在非2的幂请求数时,当索引达到最大值时会重置为0。否则,在仲裁器不锁定或 grant_unlock 信号激活时,索引递增。 |
|
- 输出编码: grant_onehot_r 根据 grant_index_r 生成独热(one-hot)编码的授权信号。 |
|
- 输出赋值: grant_index 赋值为当前的仲裁索引,grant_onehot 赋值为上述独热编码,grant_valid 表示当前索引处的请求是否有效。 |
VX_popcount
的定义见于hw/rtl/libs/VX_popcount.sv
,其代码如下:
`include "VX_platform.vh"
`TRACING_OFF
// VX_popcount63: counts the number of set bits in a 6-bit input and returns
// the result (0..6) on a 3-bit output. Purely combinational.
module VX_popcount63(
    input  wire [5:0] data_in,
    output wire [2:0] data_out
);
    reg [2:0] ones;

    // Add up the six input bits one by one; the maximum (6) fits in 3 bits.
    always @(*) begin
        ones = 3'd0;
        for (integer b = 0; b < 6; ++b) begin
            ones = ones + 3'(data_in[b]);
        end
    end

    assign data_out = ones;
endmodule
// VX_popcount32: counts the number of set bits in a 3-bit input and returns
// the result (0..3) on a 2-bit output. Purely combinational.
module VX_popcount32(
    input  wire [2:0] data_in,
    output wire [1:0] data_out
);
    // Zero-extend each bit to 2 bits and add; the maximum (3) fits in 2 bits.
    wire [1:0] bit0 = {1'b0, data_in[0]};
    wire [1:0] bit1 = {1'b0, data_in[1]};
    wire [1:0] bit2 = {1'b0, data_in[2]};

    assign data_out = bit0 + bit1 + bit2;
endmodule
// VX_sum33: adds two 3-bit unsigned operands and returns the 4-bit sum
// (0..14). Purely combinational.
module VX_sum33(
    input  wire [2:0] data_in1,
    input  wire [2:0] data_in2,
    output wire [3:0] data_out
);
    // Zero-extend both operands to the result width so the carry is kept.
    wire [3:0] lhs = {1'b0, data_in1};
    wire [3:0] rhs = {1'b0, data_in2};

    assign data_out = lhs + rhs;
endmodule
// Parameterized population count: data_out is the number of 1 bits in data_in.
//   MODEL : on the synthesis (non-QUARTUS) path with N > 18, selects the
//           generic implementation: 1 = pairwise adder tree, any other
//           value = serial accumulation loop
//   N     : width of data_in in bits
//   M     : width of data_out, defaults to `CLOG2(N+1)
// When SYNTHESIS is not defined, or when QUARTUS is defined, the built-in
// $countones is used instead of the hand-built structures below.
module VX_popcount #(
    parameter MODEL = 1,
    parameter N     = 1,
    parameter M     = `CLOG2(N+1)
) (
    input  wire [N-1:0] data_in,
    output wire [M-1:0] data_out
);
    `UNUSED_PARAM (MODEL)

`ifndef SYNTHESIS
    assign data_out = $countones(data_in);
`elsif QUARTUS
    assign data_out = $countones(data_in);
`else
    if (N == 1) begin
        // A single bit is its own population count.
        assign data_out = data_in;
    end else if (N <= 3) begin
        // Zero-extend to 3 bits and use one 3-input table.
        wire [2:0] padded = 3'(data_in);
        wire [1:0] count;
        VX_popcount32 pc32(padded, count);
        assign data_out = count[M-1:0];
    end else if (N <= 6) begin
        // Zero-extend to 6 bits and use one 6-input table.
        wire [5:0] padded = 6'(data_in);
        wire [2:0] count;
        VX_popcount63 pc63(padded, count);
        assign data_out = count[M-1:0];
    end else if (N <= 9) begin
        // 6-bit table for the low part, 3-bit table for the high part,
        // then add the two partial counts.
        wire [8:0] padded = 9'(data_in);
        wire [2:0] low_count;
        wire [1:0] high_count;
        wire [3:0] total;
        VX_popcount63 pc63(padded[5:0], low_count);
        VX_popcount32 pc32(padded[8:6], high_count);
        VX_sum33 sum33(low_count, {1'b0, high_count}, total);
        assign data_out = total[M-1:0];
    end else if (N <= 12) begin
        // Two 6-bit tables followed by a 3-bit + 3-bit adder table.
        wire [11:0] padded = 12'(data_in);
        wire [2:0] low_count;
        wire [2:0] high_count;
        wire [3:0] total;
        VX_popcount63 pc63a(padded[5:0], low_count);
        VX_popcount63 pc63b(padded[11:6], high_count);
        VX_sum33 sum33(low_count, high_count, total);
        assign data_out = total[M-1:0];
    end else if (N <= 18) begin
        // Three 6-bit tables produce three 3-bit partial counts. Bits of
        // equal weight across the three counts are then counted column by
        // column and the column counts are added with weights 1, 2 and 4.
        wire [17:0] padded = 18'(data_in);
        wire [2:0] count_a;
        wire [2:0] count_b;
        wire [2:0] count_c;
        wire [1:0] col0_ones;
        wire [1:0] col1_ones;
        wire [1:0] col2_ones;
        VX_popcount63 pc63a(padded[5:0], count_a);
        VX_popcount63 pc63b(padded[11:6], count_b);
        VX_popcount63 pc63c(padded[17:12], count_c);
        VX_popcount32 pc32a({count_a[0], count_b[0], count_c[0]}, col0_ones);
        VX_popcount32 pc32b({count_a[1], count_b[1], count_c[1]}, col1_ones);
        VX_popcount32 pc32c({count_a[2], count_b[2], count_c[2]}, col2_ones);
        assign data_out = {2'b0,col0_ones} + {1'b0,col1_ones,1'b0} + {col2_ones,2'b0};
    end else if (MODEL == 1) begin
        // Pairwise adder tree over the input padded to a power of two.
        localparam PADDED_N = 1 << `CLOG2(N);
        localparam LEVELS   = `CLOG2(PADDED_N);
    `IGNORE_UNOPTFLAT_BEGIN
        wire [M-1:0] level_sum [LEVELS-1:0][PADDED_N-1:0];
    `IGNORE_UNOPTFLAT_END
        for (genvar lvl = 0; lvl < LEVELS; ++lvl) begin
            // Operands at this level are IN_W bits wide; the sum gets one
            // extra bit, except at the last level where it is M bits.
            localparam IN_W  = lvl + 1;
            localparam OUT_W = (IN_W < LEVELS) ? (IN_W + 1) : M;
            for (genvar node = 0; node < (1 << (LEVELS-lvl-1)); ++node) begin
                localparam LHS = node * 2;
                localparam RHS = node * 2 + 1;
                wire [OUT_W-1:0] pair_sum;
                if (lvl == 0) begin
                    // Leaf level: add input bits, treating indices at or
                    // beyond N as zero.
                    if (RHS < N) begin
                        assign pair_sum = data_in[LHS] + data_in[RHS];
                    end else if (LHS < N) begin
                        assign pair_sum = 2'(data_in[LHS]);
                    end else begin
                        assign pair_sum = 2'b0;
                    end
                end else begin
                    assign pair_sum = IN_W'(level_sum[lvl-1][LHS]) + IN_W'(level_sum[lvl-1][RHS]);
                end
                assign level_sum[lvl][node] = M'(pair_sum);
            end
        end
        assign data_out = level_sum[LEVELS-1][0];
    end else begin
        // Serial accumulation of every input bit.
        reg [M-1:0] acc_r;
        always @(*) begin
            acc_r = '0;
            for (integer bit_idx = 0; bit_idx < N; ++bit_idx) begin
                acc_r = acc_r + M'(data_in[bit_idx]);
            end
        end
        assign data_out = acc_r;
    end
`endif
endmodule
`TRACING_ON
分别解释:
1、VX_popcount63计算6位输入数据中1的个数。
2、VX_popcount32计算3位输入数据中1的个数。
3、VX_sum33计算两个3位数据的和。
4、VX_popcount计算任意宽度的输入数据中1的个数,这是一个参数化模块,能够处理不同位宽的输入数据。在仿真(未定义SYNTHESIS)或QUARTUS环境下直接使用$countones;否则根据N的值选择合适的实现方式(N≤18时使用VX_popcount32、VX_popcount63、VX_sum33的组合,N更大时根据MODEL选择加法树或逐位累加),最终输出data_out为M位,表示输入数据中1的个数。
// Banks access
// Instantiates one VX_cache_bank per bank index i and connects it to the
// per-bank request/response signals and to the shared memory-response signals.
for (genvar i = 0; i < NUM_BANKS; ++i) begin
// Address output of this bank's memory request port; it is converted into
// per_bank_mem_req_addr[i] after the instance below.
wire [`CS_LINE_ADDR_WIDTH-1:0] curr_bank_mem_req_addr;
wire curr_bank_mem_rsp_valid;
// With a single bank every memory response goes to that bank. Otherwise the
// response is valid for this bank only when the value `CS_MEM_TAG_TO_BANK_ID
// derives from the response tag equals i.
if (NUM_BANKS == 1) begin
assign curr_bank_mem_rsp_valid = mem_rsp_valid_s;
end else begin
assign curr_bank_mem_rsp_valid = mem_rsp_valid_s && (`CS_MEM_TAG_TO_BANK_ID(mem_rsp_tag_s) == i);
end
// `RESET_RELAY is defined elsewhere; bank_reset is the signal connected to the
// bank's reset port below (presumably a relayed copy of reset -- confirm
// against the macro definition).
`RESET_RELAY (bank_reset, reset);
VX_cache_bank #(
.BANK_ID (i),
.INSTANCE_ID (INSTANCE_ID),
.CACHE_SIZE (CACHE_SIZE),
.LINE_SIZE (LINE_SIZE),
.NUM_BANKS (NUM_BANKS),
.NUM_WAYS (NUM_WAYS),
.WORD_SIZE (WORD_SIZE),
.NUM_REQS (NUM_REQS),
.CRSQ_SIZE (CRSQ_SIZE),
.MSHR_SIZE (MSHR_SIZE),
.MREQ_SIZE (MREQ_SIZE),
.WRITE_ENABLE (WRITE_ENABLE),
.UUID_WIDTH (UUID_WIDTH),
.TAG_WIDTH (TAG_WIDTH),
// The bank-level output buffer settings are forced to 0 when the matching
// *_REQ_BUF_ENABLE parameter is set; otherwise the cache-level value is
// passed through.
.CORE_OUT_BUF (CORE_REQ_BUF_ENABLE ? 0 : CORE_OUT_BUF),
.MEM_OUT_BUF (MEM_REQ_BUF_ENABLE ? 0 : MEM_OUT_BUF)
) bank (
.clk (clk),
.reset (bank_reset),
// Core request
.core_req_valid (per_bank_core_req_valid[i]),
.core_req_addr (per_bank_core_req_addr[i]),
.core_req_rw (per_bank_core_req_rw[i]),
.core_req_wsel (per_bank_core_req_wsel[i]),
.core_req_byteen (per_bank_core_req_byteen[i]),
.core_req_data (per_bank_core_req_data[i]),
.core_req_tag (per_bank_core_req_tag[i]),
.core_req_idx (per_bank_core_req_idx[i]),
.core_req_ready (per_bank_core_req_ready[i]),
// Core response
.core_rsp_valid (per_bank_core_rsp_valid[i]),
.core_rsp_data (per_bank_core_rsp_data[i]),
.core_rsp_tag (per_bank_core_rsp_tag[i]),
.core_rsp_idx (per_bank_core_rsp_idx[i]),
.core_rsp_ready (per_bank_core_rsp_ready[i]),
// Memory request
.mem_req_valid (per_bank_mem_req_valid[i]),
.mem_req_addr (curr_bank_mem_req_addr),
.mem_req_rw (per_bank_mem_req_rw[i]),
.mem_req_wsel (per_bank_mem_req_wsel[i]),
.mem_req_byteen (per_bank_mem_req_byteen[i]),
.mem_req_data (per_bank_mem_req_data[i]),
.mem_req_id (per_bank_mem_req_id[i]),
.mem_req_ready (per_bank_mem_req_ready[i]),
// Memory response
// The data and id inputs are the same expressions for every bank; only the
// valid input (computed above) and the ready output are per bank.
.mem_rsp_valid (curr_bank_mem_rsp_valid),
.mem_rsp_data (mem_rsp_data_s),
.mem_rsp_id (`CS_MEM_TAG_TO_REQ_ID(mem_rsp_tag_s)),
.mem_rsp_ready (per_bank_mem_rsp_ready[i]),
// initialization
.init_enable (init_enable),
.init_line_sel (init_line_sel)
);
// Single bank: the bank's address is used unchanged. Multiple banks: the
// address is passed through `CS_LINE_TO_MEM_ADDR together with the bank index
// (macro defined elsewhere; presumably it re-inserts the bank-select bits --
// confirm against the macro definition).
if (NUM_BANKS == 1) begin
assign per_bank_mem_req_addr[i] = curr_bank_mem_req_addr;
end else begin
assign per_bank_mem_req_addr[i] = `CS_LINE_TO_MEM_ADDR(curr_bank_mem_req_addr, i);
end
end
这里最重要的无疑是VX_cache_bank模块。由于bank的结构比较庞大,而本章的内容已经足够多,因此这部分放到后面的文章中分析!
// Bank responses gather
// Routes the core responses of the NUM_BANKS banks to the NUM_REQS core
// response ports through a VX_stream_xbar.
wire [NUM_BANKS-1:0][CORE_RSP_DATAW-1:0] core_rsp_data_in;
wire [NUM_REQS-1:0][CORE_RSP_DATAW-1:0] core_rsp_data_out;
// Pack each bank's response data and tag into one crossbar payload word.
for (genvar i = 0; i < NUM_BANKS; ++i) begin
assign core_rsp_data_in[i] = {per_bank_core_rsp_data[i], per_bank_core_rsp_tag[i]};
end
// `RESET_RELAY is defined elsewhere; rsp_xbar_reset is the signal connected to
// the crossbar's reset port below.
`RESET_RELAY (rsp_xbar_reset, reset);
VX_stream_xbar #(
.NUM_INPUTS (NUM_BANKS),
.NUM_OUTPUTS (NUM_REQS),
.DATAW (CORE_RSP_DATAW)
) rsp_xbar (
.clk (clk),
.reset (rsp_xbar_reset),
`UNUSED_PIN (collisions),
.valid_in (per_bank_core_rsp_valid),
.data_in (core_rsp_data_in),
// Each bank's response index is supplied as the crossbar's select input.
.sel_in (per_bank_core_rsp_idx),
.ready_in (per_bank_core_rsp_ready),
.valid_out (core_rsp_valid_s),
.data_out (core_rsp_data_out),
.ready_out (core_rsp_ready_s),
`UNUSED_PIN (sel_out)
);
// Unpack each crossbar output word back into response data and tag, using the
// same field order as the packing above.
for (genvar i = 0; i < NUM_REQS; ++i) begin
assign {core_rsp_data_s[i], core_rsp_tag_s[i]} = core_rsp_data_out[i];
end
关于这一段,核心模块已经在bank request dispatch中讲解。回过头来看,这一章的代码先收集对bank的访问请求,随后执行对bank的访问,最后将bank的响应收集起来。从代码细节可以看出,作者在设计访问请求通路的过程中考虑了多个请求在同一拍出现的情况,并给出了如何在其中进行选择的solution。
本文详细讲解了bank request dispatch模块的功能,并大致梳理了对bank访问的请求-应答流程。