算术逻辑单元是一种可对二进制整数执行算术运算或位运算的组合逻辑数字电路。ALU 与浮点数运算单元不同,后者仅对浮点数进行操作。ALU是许多类型的计算电路的基本部件,这些计算电路包括计算机的中央处理单元、浮点处理单元和图形处理单元。单个CPU、FPU 或 GPU可能包含多个ALU。
什么是Critical Path?
critical path一般是指我们设计中时序最关键的路径,通常也就是timing最差或者最难修的路径。
因为ALU中存在加法器,而普通加法器的高位(higher bit)必须等待低位(lower bit)算出进位(carry)后才能完成计算,由此可以推断包含加法器的路径为整个ALU的critical path。设计完成后的DC综合结果同样显示出了这一特性,因此对adder的优化显得尤为重要。
Adder的种类有很多种,包括
- Carry Ripple adder
- Carry Bypass Adder
- Carry select Adder
- Carry lookahead Adder
在此我们设计最基本的Carry Ripple adder与最快(low delay)的 Carry lookahead adder,代码如下:
// 32-bit ripple-carry adder.
//   A, B : 32-bit operands
//   S    : low 32 bits of A + B
//   C32  : carry out of the most significant bit
// The carry-in of bit 0 is tied to 0. Each bit is an explicit `adder`
// instance whose carry-out feeds the next bit's carry-in, so the carry
// chain is kept as a structural ripple rather than a behavioural `+`.
module adder32(A,B,S,C32);
input  [31:0] A;
input  [31:0] B;
output [31:0] S;
output        C32;

// carry[i] is the carry INTO bit i; carry[32] is the final carry-out.
wire [32:0] carry;
assign carry[0] = 1'b0;

genvar i;
generate
    // The generate-for block is labelled: IEEE 1364-2001 requires a name
    // here, and it gives every full adder a stable hierarchical path
    // (g_ripple[i].u_fa).
    for (i = 0; i < 32; i = i + 1)
    begin : g_ripple
        adder u_fa(.X(A[i]),.Y(B[i]),.Cin(carry[i]),.F(S[i]),.Cout(carry[i+1]));
    end
endgenerate

assign C32 = carry[32];
endmodule
注释:这里使用generate循环例化全加器。行波进位加法器不能简单地写成 assign {C32,S} = A + B; 这样的语句,原因是DC综合过程中会对这类HDL算术表达式做自动优化(以此来减少delay),综合出的结构不再是逐位传递进位的行波进位加法器;而使用generate逐位例化并把前后级的进位端口相连,才可以得到正常的、不受优化影响的延时信息。
// 32-bit carry-lookahead adder built from two 16-bit CLA_16 blocks.
//   A, B : 32-bit operands (bit 1 is the least significant bit)
//   S    : low 32 bits of A + B
//   C32  : carry out of the most significant bit
// The carry-in of the whole adder is tied to 0.
module adder32(A,B,S,C32);
input  [32:1] A;
input  [32:1] B;
output [32:1] S;
output        C32;

wire px1,gx1,px2,gx2;   // group propagate / generate of the low and high halves
wire c16;               // carry from the low half into the high half
wire c0 = 1'b0;         // sized constant carry-in (an unsized 0 is 32 bits wide)

// Low half: bits 16..1
CLA_16 CLA1(
    .A(A[16:1]),
    .B(B[16:1]),
    .c0(c0),
    .S(S[16:1]),
    .px(px1),
    .gx(gx1)
);

// High half: bits 32..17
CLA_16 CLA2(
    .A(A[32:17]),
    .B(B[32:17]),
    .c0(c16),
    .S(S[32:17]),
    .px(px2),
    .gx(gx2)
);

// Lookahead carries: c_out = G ^ (P & c_in).
// XOR is used where OR is conventional; the two agree as long as a block
// never asserts its group generate and group propagate together
// (NOTE(review): this relies on CLA_16's implementation - confirm).
assign c16 = gx1 ^ (px1 & c0);
assign C32 = gx2 ^ (px2 & c16);
endmodule
// 16-bit carry-lookahead block made of four adder_4 slices.
//   A, B : 16-bit operands (bit 1 is the least significant bit)
//   c0   : carry into bit 1
//   S    : 16-bit sum
//   px   : group propagate of the whole 16-bit block
//   gx   : group generate of the whole 16-bit block
// The carries into slices 2..4 are computed here from the per-slice
// Pm/Gm outputs; the c4 output of every adder_4 is left unconnected.
module CLA_16(A,B,c0,S,px,gx);
input  [16:1] A;
input  [16:1] B;
input         c0;
output        gx,px;
output [16:1] S;

wire [4:1] grp_p;                       // Pm of slices 1..4
wire [4:1] grp_g;                       // Gm of slices 1..4
wire [3:1] grp_c;                       // carries into slices 2, 3, 4
wire [4:1] slice_cin = {grp_c, c0};     // carry-in seen by each slice

genvar k;
generate
    for (k = 0; k < 4; k = k + 1)
    begin : g_slice
        adder_4 u_slice(
            .x (A[4*k+4:4*k+1]),
            .y (B[4*k+4:4*k+1]),
            .c0(slice_cin[k+1]),
            .c4(),
            .F (S[4*k+4:4*k+1]),
            .Gm(grp_g[k+1]),
            .Pm(grp_p[k+1])
        );
    end
endgenerate

// Second-level lookahead. Terms are combined with XOR exactly as in the
// flat form of these equations.
assign grp_c[1] = grp_g[1] ^ (grp_p[1] & c0);
assign grp_c[2] = grp_g[2] ^ (grp_p[2] & grp_g[1])
                           ^ (grp_p[2] & grp_p[1] & c0);
assign grp_c[3] = grp_g[3] ^ (grp_p[3] & grp_g[2])
                           ^ (grp_p[3] & grp_p[2] & grp_g[1])
                           ^ (grp_p[3] & grp_p[2] & grp_p[1] & c0);

// Block-level propagate / generate exported to the next lookahead level.
assign px = &grp_p;
assign gx = grp_g[4] ^ (grp_p[4] & grp_g[3])
                     ^ (grp_p[4] & grp_p[3] & grp_g[2])
                     ^ (grp_p[4] & grp_p[3] & grp_p[2] & grp_g[1]);
endmodule
// 4-bit adder slice with lookahead carries.
//   x, y : 4-bit operands (bit 1 is the least significant bit)
//   c0   : carry into bit 1
//   c4   : carry out, driven by the CLA instance
//   F    : 4-bit sum
//   Gm   : group generate of the slice
//   Pm   : group propagate of the slice
// Sum bits come from four `adder` cells whose Cout pins are left open;
// the internal carries c1..c3 and c4 are supplied by the CLA instance.
module adder_4(x,y,c0,c4,F,Gm,Pm);
input  [4:1] x;
input  [4:1] y;
input        c0;
output       c4,Gm,Pm;
output [4:1] F;

wire [4:1] p = x ^ y;               // per-bit propagate
wire [4:1] g = x & y;               // per-bit generate
wire [3:1] c;                       // carries into bits 2..4
wire [4:1] bit_cin = {c, c0};       // carry-in seen by each sum cell

genvar n;
generate
    for (n = 1; n <= 4; n = n + 1)
    begin : g_bit
        adder u_sum(
            .X   (x[n]),
            .Y   (y[n]),
            .Cin (bit_cin[n]),
            .F   (F[n]),
            .Cout()
        );
    end
endgenerate

// Carry lookahead unit. The CLA module is defined outside this block;
// presumably it derives c1..c4 from c0 and the p/g signals - confirm.
CLA u_cla(
    .c0(c0),
    .c1(c[1]),
    .c2(c[2]),
    .c3(c[3]),
    .c4(c4),
    .p1(p[1]),
    .p2(p[2]),
    .p3(p[3]),
    .p4(p[4]),
    .g1(g[1]),
    .g2(g[2]),
    .g3(g[3]),
    .g4(g[4])
);

// Slice-level propagate / generate for the next lookahead level.
assign Pm = &p;
assign Gm = g[4] ^ (p[4] & g[3])
                 ^ (p[4] & p[3] & g[2])
                 ^ (p[4] & p[3] & p[2] & g[1]);
endmodule
// 1-bit full adder.
//   X, Y : operand bits
//   Cin  : carry in
//   F    : sum bit
//   Cout : carry out
module adder(X,Y,Cin,F,Cout);
input  X,Y,Cin;
output F,Cout;

wire half_sum   = X ^ Y;    // sum of the two operand bits without carry-in
wire half_carry = X & Y;    // carry produced by the operand bits alone

assign F    = half_sum ^ Cin;
assign Cout = (half_sum & Cin) | half_carry;
endmodule
通过对DC综合工具生成的timing report进行分析,在AMS 0.35um工艺库的条件下,CLA形式的加法器相比行波进位加法器减少了 55.1% 的delay,但面积增加了2.12倍,这体现了电路设计中面积与速度互换的原则。通过对加法器的优化,critical path的时序有了较大程度的改善。
alu_op_i[3] | alu_op_i[2] | alu_op_i[1] | alu_op_i[0] | Function |
---|---|---|---|---|
0 | 0 | 0 | 0 | None |
0 | 0 | 0 | 1 | Shift left |
0 | 0 | 1 | 0 | Shift right an unsigned number |
0 | 0 | 1 | 1 | Shift right a signed number |
0 | 1 | 0 | 0 | Add |
0 | 1 | 0 | 1 | Sub |
0 | 1 | 1 | 0 | And |
0 | 1 | 1 | 1 | Or |
1 | 0 | 0 | 0 | Xor |
1 | 0 | 0 | 1 | Compare(less than unsigned numbers) |
1 | 0 | 1 | 0 | Compare(less than signed numbers ) |
`include "alu_define.v"
//-----------------------------------------------------------------
// 32-bit combinational ALU.
//
//   alu_op_i : operation select (`ALU_* macros, defined outside this module)
//   alu_a_i  : operand A
//   alu_b_i  : operand B (bits [4:0] are the shift amount for shifts)
//   alu_p_o  : result
//   overflow : signed overflow of the add / subtract result; 0 for every
//              other operation
//
// Add and subtract share one adder32 instance; subtraction is performed
// as A + (~B + 1). Shifts are built as a five-stage barrel shifter
// (1, 2, 4, 8, 16 positions) steered by alu_b_i[4:0].
//-----------------------------------------------------------------
module alu
(
    // Inputs
     input  [  3:0]  alu_op_i
    ,input  [ 31:0]  alu_a_i
    ,input  [ 31:0]  alu_b_i

    // Outputs
    ,output [ 31:0]  alu_p_o
    ,output          overflow
);

//-----------------------------------------------------------------
// Registers
//-----------------------------------------------------------------
reg [31:0]      result_r;

reg [31:16]     shift_right_fill_r;
reg [31:0]      shift_right_1_r;
reg [31:0]      shift_right_2_r;
reg [31:0]      shift_right_4_r;
reg [31:0]      shift_right_8_r;

reg [31:0]      shift_left_1_r;
reg [31:0]      shift_left_2_r;
reg [31:0]      shift_left_4_r;
reg [31:0]      shift_left_8_r;

reg [31:0]      add_a;      // adder operand A
reg [31:0]      add_b;      // adder operand B (two's complement of B for SUB)
wire [32:0]     add_32;     // {carry out, sum} of the shared adder

wire            is_add_w = (alu_op_i == `ALU_ADD);
wire            is_sub_w = (alu_op_i == `ALU_SUB);

//-----------------------------------------------------------------
// ALU
//-----------------------------------------------------------------
always @ (alu_op_i or alu_a_i or alu_b_i)
begin
    // Defaults for every signal written in this block, so that no branch
    // leaves a value unassigned (which would infer latches).
    result_r           = 32'b0;
    add_a              = 32'b0;
    add_b              = 32'b0;

    shift_right_fill_r = 16'b0;
    shift_right_1_r    = 32'b0;
    shift_right_2_r    = 32'b0;
    shift_right_4_r    = 32'b0;
    shift_right_8_r    = 32'b0;

    shift_left_1_r     = 32'b0;
    shift_left_2_r     = 32'b0;
    shift_left_4_r     = 32'b0;
    shift_left_8_r     = 32'b0;

    case (alu_op_i)
        //----------------------------------------------
        // Shift Left
        //----------------------------------------------
        `ALU_SHIFTL :
        begin
            if (alu_b_i[0] == 1'b1)
                shift_left_1_r = {alu_a_i[30:0],1'b0};
            else
                shift_left_1_r = alu_a_i;

            if (alu_b_i[1] == 1'b1)
                shift_left_2_r = {shift_left_1_r[29:0],2'b00};
            else
                shift_left_2_r = shift_left_1_r;

            if (alu_b_i[2] == 1'b1)
                shift_left_4_r = {shift_left_2_r[27:0],4'b0000};
            else
                shift_left_4_r = shift_left_2_r;

            if (alu_b_i[3] == 1'b1)
                shift_left_8_r = {shift_left_4_r[23:0],8'b00000000};
            else
                shift_left_8_r = shift_left_4_r;

            if (alu_b_i[4] == 1'b1)
                result_r = {shift_left_8_r[15:0],16'b0000000000000000};
            else
                result_r = shift_left_8_r;
        end

        //----------------------------------------------
        // Shift Right (logical and arithmetic)
        //----------------------------------------------
        `ALU_SHIFTR, `ALU_SHIFTR_ARITH:
        begin
            // Fill with ones only for an arithmetic shift of a negative
            // value; otherwise fill with zeros.
            if (alu_a_i[31] == 1'b1 && alu_op_i == `ALU_SHIFTR_ARITH)
                shift_right_fill_r = 16'b1111111111111111;
            else
                shift_right_fill_r = 16'b0000000000000000;

            if (alu_b_i[0] == 1'b1)
                shift_right_1_r = {shift_right_fill_r[31], alu_a_i[31:1]};
            else
                shift_right_1_r = alu_a_i;

            if (alu_b_i[1] == 1'b1)
                shift_right_2_r = {shift_right_fill_r[31:30], shift_right_1_r[31:2]};
            else
                shift_right_2_r = shift_right_1_r;

            if (alu_b_i[2] == 1'b1)
                shift_right_4_r = {shift_right_fill_r[31:28], shift_right_2_r[31:4]};
            else
                shift_right_4_r = shift_right_2_r;

            if (alu_b_i[3] == 1'b1)
                shift_right_8_r = {shift_right_fill_r[31:24], shift_right_4_r[31:8]};
            else
                shift_right_8_r = shift_right_4_r;

            if (alu_b_i[4] == 1'b1)
                result_r = {shift_right_fill_r[31:16], shift_right_8_r[31:16]};
            else
                result_r = shift_right_8_r;
        end

        //----------------------------------------------
        // Arithmetic (result is taken from the shared adder)
        //----------------------------------------------
        `ALU_ADD :
        begin
            add_a = alu_a_i;
            add_b = alu_b_i;
        end
        `ALU_SUB :
        begin
            // A - B = A + (~B + 1): negate all 32 bits of B.
            add_a = alu_a_i;
            add_b = ~alu_b_i + 32'd1;
        end

        //----------------------------------------------
        // Logical
        //----------------------------------------------
        `ALU_AND :
        begin
            result_r = (alu_a_i & alu_b_i);
        end
        `ALU_OR  :
        begin
            result_r = (alu_a_i | alu_b_i);
        end
        `ALU_XOR :
        begin
            result_r = (alu_a_i ^ alu_b_i);
        end

        //----------------------------------------------
        // Comparison
        //----------------------------------------------
        `ALU_LESS_THAN :
        begin
            // Ports are unsigned, so this is an unsigned comparison.
            result_r = (alu_a_i < alu_b_i) ? 32'h1 : 32'h0;
        end
        `ALU_LESS_THAN_SIGNED :
        begin
            // Both operands must be cast; a plain '<' on the unsigned
            // ports would compare them as unsigned numbers.
            result_r = ($signed(alu_a_i) < $signed(alu_b_i)) ? 32'h1 : 32'h0;
        end

        default  :
        begin
            result_r = alu_a_i;
        end
    endcase
end

// Shared adder for ADD and SUB; the carry out lands in add_32[32].
adder32 u1(.A(add_a),.B(add_b),.S(add_32[31:0]),.C32(add_32[32]));

assign alu_p_o = (is_add_w | is_sub_w) ? add_32[31:0] : result_r;

// Signed overflow, based on the original operands:
//   ADD : operands have the same sign and the result's sign differs.
//   SUB : operands have different signs and the result's sign differs
//         from A's.
wire add_ovf_w = (alu_a_i[31] == alu_b_i[31]) & (add_32[31] != alu_a_i[31]);
wire sub_ovf_w = (alu_a_i[31] != alu_b_i[31]) & (add_32[31] != alu_a_i[31]);

assign overflow = (is_add_w & add_ovf_w) | (is_sub_w & sub_ovf_w);

endmodule
经过综合与后仿真验证,ALU的功能符合预期要求。