代码均经过测试,请放心食用
这一章可能是最难的一章了,虽然难,但是还是挺有趣的
本章所需的网络旁注和模拟器请自行下载 http://csapp.cs.cmu.edu/3e/students.html
下面给出测试简要指南,详情请自行阅读网络旁注和 README 文件
从 HCL 的实现角度来看,HCL 实际上是用来生成一种非常格式化的 C 代码的语言。HCL 文件中的所有块定义都被 HCL2C 程序转换成C函数。然后这些函数和实现其他模拟器函数的源代码一起编译,来生成一个可执行的模拟程序,比如 ssim 和 psim。
我并没有使用 GUI 模式的模拟器,因为配置太过麻烦,要下载一些东西,况且在终端进行测试就足够了;如果使用终端模式的话,请将 Makefile 文件中关于 GUIMODE, TKLIBS, TKINC 的参数注释掉;
先下载压缩包,然后运行下列指令,如果出错可能是因为一些参数没有注释掉:
USER@NAME:~# tar xf sim.tar
USER@NAME:~# cd sim
USER@NAME:~/sim# make clean
USER@NAME:~/sim# make
可以直接使用 YIS 运行程序,它是指令集模拟器,不模拟任何具体处理器的实现:
USER@NAME:~/sim/y86-code# ../misc/yis asum.yo
ssim 和 psim 分别是 SEQ 和 PIPE 的模拟器;如果要生成模拟器(xxx代表不同版本):
USER@NAME:~/sim/seq# make clean; make ssim VERSION=xxx
也可以使用模拟器来运行某一个程序;可以在 /sim/y86-code 下复制一个 ys 文件比如 asum.ys,重命名为 bubble.ys,将里面的 sum 函数删掉,改成所要测试的函数,然后在 main 里调用;
然后 make bubble.yo 生成一个目标文件 bubble.yo,这个 yo 文件就可以用模拟器运行了:
USER@NAME:~/sim/y86-code# make bubble.yo
USER@NAME:~/sim/y86-code# ../seq/ssim -t < ./bubble.yo
如果需要测试修改后的 hcl 文件,需要先生成模拟器,在上面说过了;然后在 /sim/ptest 中测试(参数 SIM 指示需要测试的是 psim 还是 ssim,参数 TFLAGS=-i 表示测试时包含 iaddq 指令);如果测试失败,会在当前文件夹中留下导致错误的 ys 文件:
USER@NAME:~/sim/ptest# make SIM=../pipe/psim TFLAGS=-i
4.54 题中有完整的生成模拟器并测试的步骤
A.
错误,它压入了减去 8 的 %rsp 的值
B.
movq REG, -8(%rsp)
subq $8, %rsp
A.
错误,它将栈指针设置为了正确的值再加上 8(movq 先把 %rsp 设为从栈中读出的值,随后的 addq 又给它加了 8)
B.
addq $8, %rsp
movq -8(%rsp), REG
使用数组索引
/*
 * Reference implementation: bubble sort written with array indexing.
 * Sorts the `count` elements of `data` into ascending order, in place.
 */
void bubble_a(long *data, long count) {
    long end = count - 1;

    while (end > 0) {
        long k;
        for (k = 1; k <= end; k++) {
            long lo = data[k - 1];
            long hi = data[k];
            if (hi < lo) {
                /* Adjacent pair is out of order: exchange the two slots. */
                data[k - 1] = hi;
                data[k] = lo;
            }
        }
        end--;
    }
}
A.
使用指针引用数组元素
/*
 * Bubble sort that reaches every element through pointers instead of
 * array subscripts. Sorts the `count` longs at `data` ascending, in place.
 */
void bubble_b(long *data, long count) {
    long remaining;

    for (remaining = count - 1; remaining > 0; remaining--) {
        long *cur = data;
        long *limit = data + remaining;

        while (cur < limit) {
            long *next = cur + 1;
            if (*next < *cur) {
                /* Adjacent pair is out of order: exchange through the pointers. */
                long held = *next;
                *next = *cur;
                *cur = held;
            }
            cur = next;
        }
    }
}
B.
手写汇编真麻烦我都要吐了怪不得发明高级语言
这里给出完整的 ys 文件,以后的题目只给出函数定义
# Y86-64 test program: execution begins at address 0
.pos 0
irmovq stack, %rsp # Set up stack pointer
call main # Execute main program
halt # Terminate program
# Four-element input array, initially in descending order
.align 8
array:
.quad 0x000000000abc
.quad 0x0000000000bc
.quad 0x00000000000c
.quad 0x000000000001
# main: calls bubble_b(array, 4) and returns to the startup code above
main:
irmovq array,%rdi
irmovq $4,%rsi
call bubble_b # bubble_b(array, 4)
ret
# void bubble_b(long *data, long count)
# Bubble-sorts the count quad words starting at data into ascending order, in place.
# In:      %rdi = data, %rsi = count
# Uses:    %rax = last, %rcx = i, %rdx = address of the current element,
#          %r8 = 1, %r9 = 8, %r10 = data[i], %r11 = data[i + 1],
#          %rbx and %r12 = comparison scratch
# NOTE: %rbx and %r12 are overwritten without being saved or restored.
bubble_b:
    irmovq $1, %r8              # constant 1
    irmovq $8, %r9              # constant 8 (bytes per element)
    rrmovq %rsi, %rax
    subq   %r8, %rax            # last = count - 1
    jle    Done                 # last <= 0 (count <= 1): nothing to sort
Loop1:
    xorq   %rcx, %rcx           # i = 0
Loop2:
    rrmovq %rcx, %rdx           # %rdx = i
    addq   %rdx, %rdx           # %rdx = 2 * i
    addq   %rdx, %rdx           # %rdx = 4 * i
    addq   %rdx, %rdx           # %rdx = 8 * i
    addq   %rdi, %rdx           # %rdx = &data[i]
    mrmovq (%rdx), %r10         # %r10 = data[i]
    addq   %r9, %rdx            # %rdx = &data[i + 1]
    mrmovq (%rdx), %rbx         # %rbx = data[i + 1]
    rrmovq %rbx, %r11           # %r11 = data[i + 1] (copy that survives the subtract)
    subq   %r10, %rbx           # flags <- data[i + 1] - data[i]
    jge    Test1                # data[i + 1] >= data[i]: pair already in order
    rmmovq %r10, (%rdx)         # data[i + 1] = old data[i]
    subq   %r9, %rdx            # %rdx = &data[i]
    rmmovq %r11, (%rdx)         # data[i] = old data[i + 1]
Test1:
    addq   %r8, %rcx            # i++
    rrmovq %rcx, %r12
    subq   %rax, %r12           # flags <- i - last
    jl     Loop2                # i < last: compare the next pair
    subq   %r8, %rax            # last--
    jg     Loop1                # last > 0: start the next pass
Done:
    ret
# The stack starts at address 0x200 and grows toward lower addresses
.pos 0x200
stack:
可以看到输出,左边是运行前,右边是运行后:
Changed Memory State:
0x0018: 0x0000000000000abc 0x0000000000000001
0x0020: 0x00000000000000bc 0x000000000000000c
0x0028: 0x000000000000000c 0x00000000000000bc
0x0030: 0x0000000000000001 0x0000000000000abc
#号包围区域与 4.47 题做对比:
# void bubble_c(long *data, long count)
# Bubble sort whose swap is done with two conditional moves instead of a
# conditional jump. Sorts the count quad words at data ascending, in place.
# In:      %rdi = data, %rsi = count
# Uses:    %rax = last, %rcx = i, %rdx = address of the current element,
#          %r8 = 1, %r9 = 8, %r10 = data[i], %r11 = data[i + 1],
#          %rbx, %r12, %r13 = scratch
# NOTE: %rbx, %r12 and %r13 are overwritten without being saved or restored.
bubble_c:
    irmovq $1, %r8              # constant 1
    irmovq $8, %r9              # constant 8 (bytes per element)
    rrmovq %rsi, %rax
    subq   %r8, %rax            # last = count - 1
    jle    Done                 # last <= 0 (count <= 1): nothing to sort
Loop1:
    xorq   %rcx, %rcx           # i = 0
Loop2:
    rrmovq %rcx, %rdx           # %rdx = i
    addq   %rdx, %rdx           # %rdx = 2 * i
    addq   %rdx, %rdx           # %rdx = 4 * i
    addq   %rdx, %rdx           # %rdx = 8 * i
    addq   %rdi, %rdx           # %rdx = &data[i]
    mrmovq (%rdx), %r10         # %r10 = data[i]
    addq   %r9, %rdx            # %rdx = &data[i + 1]
    mrmovq (%rdx), %rbx         # %rbx = data[i + 1]
    rrmovq %rbx, %r11           # %r11 = data[i + 1]
    subq   %r10, %rbx           # flags <- data[i + 1] - data[i]
######################### removed (compare with 4.47) #########################
##  jge    Test1                # data[i + 1] >= data[i] -> do nothing
##  rmmovq %r10, (%rdx)         # data[i + 1] = data[i]
##  subq   %r9, %rdx            # %rdx = %rdx - 8
##  rmmovq %r11, (%rdx)         # data[i] = data[i + 1]
######################### added ###############################################
    rrmovq %r11, %r12           # %r12 = data[i + 1] (value kept if no swap)
    rrmovq %r10, %r13           # %r13 = data[i]     (value kept if no swap)
    cmovl  %r10, %r12           # data[i + 1] < data[i] -> %r12 = data[i]
    cmovl  %r11, %r13           # data[i + 1] < data[i] -> %r13 = data[i + 1]
    rmmovq %r12, (%rdx)         # data[i + 1] = %r12
    subq   %r9, %rdx            # %rdx = &data[i]
    rmmovq %r13, (%rdx)         # data[i] = %r13
###############################################################################
Test1:
    addq   %r8, %rcx            # i++
    rrmovq %rcx, %r12
    subq   %rax, %r12           # flags <- i - last
    jl     Loop2                # i < last: compare the next pair
    subq   %r8, %rax            # last--
    jg     Loop1                # last > 0: start the next pass
Done:
    ret
一个条件传送,比较巧妙,#号包围区域与 4.47 题做对比:
# void bubble_d(long *data, long count)
# Bubble sort whose swap uses a single conditional move: the difference
# data[i + 1] - data[i] is forced to 0 when the pair is already in order,
# then subtracted from / added to the two elements before storing them back.
# Sorts the count quad words at data ascending, in place.
# In:      %rdi = data, %rsi = count
# Uses:    %rax = last, %rcx = i, %rdx = address of the current element,
#          %r8 = 1, %r9 = 8, %r12 = 0 (must stay 0 for the whole function),
#          %r10 = data[i], %r11 = data[i + 1], %rbx = difference,
#          %rsi = loop-test scratch once count has been read
# NOTE: %rbx and %r12 are overwritten without being saved or restored.
bubble_d:
    irmovq $1, %r8              # constant 1
    irmovq $8, %r9              # constant 8 (bytes per element)
    irmovq $0, %r12             # constant 0, source operand of the cmovge below
    rrmovq %rsi, %rax
    subq   %r8, %rax            # last = count - 1
    jle    Done                 # last <= 0 (count <= 1): nothing to sort
Loop1:
    xorq   %rcx, %rcx           # i = 0
Loop2:
    rrmovq %rcx, %rdx           # %rdx = i
    addq   %rdx, %rdx           # %rdx = 2 * i
    addq   %rdx, %rdx           # %rdx = 4 * i
    addq   %rdx, %rdx           # %rdx = 8 * i
    addq   %rdi, %rdx           # %rdx = &data[i]
    mrmovq (%rdx), %r10         # %r10 = data[i]
    addq   %r9, %rdx            # %rdx = &data[i + 1]
    mrmovq (%rdx), %rbx         # %rbx = data[i + 1]
    rrmovq %rbx, %r11           # %r11 = data[i + 1]
    subq   %r10, %rbx           # %rbx = data[i + 1] - data[i], sets flags
######################### removed (compare with 4.47) #########################
##  jge    Test1                # data[i + 1] >= data[i] -> do nothing
##  rmmovq %r10, (%rdx)         # data[i + 1] = data[i]
##  subq   %r9, %rdx            # %rdx = %rdx - 8
##  rmmovq %r11, (%rdx)         # data[i] = data[i + 1]
######################### added ###############################################
    cmovge %r12, %rbx           # pair in order -> %rbx = 0; else keep the difference
    subq   %rbx, %r11           # %r11 = data[i + 1] < data[i] ? data[i] : data[i + 1]
    rmmovq %r11, (%rdx)         # data[i + 1] = %r11 (the larger value)
    subq   %r9, %rdx            # %rdx = &data[i]
    addq   %rbx, %r10           # %r10 = data[i + 1] < data[i] ? data[i + 1] : data[i]
    rmmovq %r10, (%rdx)         # data[i] = %r10 (the smaller value)
###############################################################################
Test1:
    addq   %r8, %rcx            # i++
    rrmovq %rcx, %rsi           # scratch copy of i; %r12 must keep holding 0
    subq   %rax, %rsi           # flags <- i - last
    jl     Loop2                # i < last: compare the next pair
    subq   %r8, %rax            # last--
    jg     Loop1                # last > 0: start the next pass
Done:
    ret
完整的 ys 文件:
# Y86-64 test program: execution begins at address 0
    .pos 0
    irmovq stack, %rsp          # Set up stack pointer
    call   main                 # Execute main program
    halt                        # Terminate program

# Jump table: entry idx holds the address of the code for case idx (0..5).
# Entries are labels, so the table stays correct if the code below moves.
    .align 8
table:
    .quad case_0                # idx 0 -> 0xaaa
    .quad default               # idx 1 -> 0xddd
    .quad case_2_5              # idx 2 -> 0xbbb
    .quad case_3                # idx 3 -> 0xccc
    .quad default               # idx 4 -> 0xddd
    .quad case_2_5              # idx 5 -> 0xbbb

# main: calls switchv(3) and returns to the startup code above
main:
    irmovq $3, %rdi
    call   switchv              # switchv(3)
    ret

    .pos 0x100
# long switchv(long idx)
# Returns 0xaaa for idx 0, 0xbbb for idx 2 or 5, 0xccc for idx 3,
# and 0xddd for every other idx (including negative values).
# In:      %rdi = idx (overwritten with 8 * idx on the table path)
# Out:     %rax = result
# Uses:    %r8..%r11 = the four result constants, %r12 = 5,
#          %r13 = table address / jump target, %rdx = scratch
# NOTE: %r12 and %r13 are overwritten without being saved or restored.
switchv:
    irmovq $0xaaa, %r8          # result for case 0
    irmovq $0xbbb, %r9          # result for cases 2 and 5
    irmovq $0xccc, %r10         # result for case 3
    irmovq $0xddd, %r11         # result for the default case
    irmovq $5, %r12             # largest idx covered by the table
    irmovq table, %r13          # %r13 = table
    andq   %rdi, %rdi           # set flags from idx
    jl     default              # idx < 0: would index before the table
    rrmovq %rdi, %rdx
    subq   %r12, %rdx           # flags <- idx - 5
    jg     default              # idx > 5: beyond the table
    addq   %rdi, %rdi           # idx = 2 * idx
    addq   %rdi, %rdi           # idx = 4 * idx
    addq   %rdi, %rdi           # idx = 8 * idx
    addq   %rdi, %r13           # %r13 = table + 8 * idx
    mrmovq (%r13), %r13         # %r13 = address of the selected case
    pushq  %r13
    ret                         # indirect jump: "return" to the pushed address
case_0:
    rrmovq %r8, %rax
    jmp    Done
case_2_5:
    rrmovq %r9, %rax
    jmp    Done
case_3:
    rrmovq %r10, %rax
    jmp    Done
default:
    rrmovq %r11, %rax
Done:
    ret

# The stack starts at address 0x200 and grows toward lower addresses
    .pos 0x200
stack:
输出:
Changed Register State:
%rax: 0x0000000000000000 0x0000000000000ccc
参照图 4-18:
阶段 | iaddq V, rB |
---|---|
取指 | icode:ifun <- M1[PC] rA:rB <- M1[PC+1] valC <- M8[PC+2] valP <- PC + 10 |
译码 | valB <- R[rB] |
执行 | valE <- valB + valC |
访存 | |
写回 | R[rB] <- valE |
更新PC | PC <- valP |
根据上题修改 /sim/seq-full.hcl,先复制一个备份文件 seq-full-backup.hcl
USER@NAME:~/sim/seq# diff -u seq-full-backup.hcl seq-full.hcl
--- seq-full-backup.hcl 2014-06-23 22:01:01.000000000 +0800
+++ seq-full.hcl 2018-08-22 23:04:46.906999999 +0800
@@ -106,16 +106,16 @@
bool instr_valid = icode in
{ INOP, IHALT, IRRMOVQ, IIRMOVQ, IRMMOVQ, IMRMOVQ,
- IOPQ, IJXX, ICALL, IRET, IPUSHQ, IPOPQ };
+ IOPQ, IJXX, ICALL, IRET, IPUSHQ, IPOPQ, IIADDQ };
# Does fetched instruction require a regid byte?
bool need_regids =
icode in { IRRMOVQ, IOPQ, IPUSHQ, IPOPQ,
- IIRMOVQ, IRMMOVQ, IMRMOVQ };
+ IIRMOVQ, IRMMOVQ, IMRMOVQ, IIADDQ };
# Does fetched instruction require a constant word?
bool need_valC =
- icode in { IIRMOVQ, IRMMOVQ, IMRMOVQ, IJXX, ICALL };
+ icode in { IIRMOVQ, IRMMOVQ, IMRMOVQ, IJXX, ICALL, IIADDQ };
################ Decode Stage ###################################
@@ -128,7 +128,7 @@
## What register should be used as the B source?
word srcB = [
- icode in { IOPQ, IRMMOVQ, IMRMOVQ } : rB;
+ icode in { IOPQ, IRMMOVQ, IMRMOVQ, IIADDQ } : rB;
icode in { IPUSHQ, IPOPQ, ICALL, IRET } : RRSP;
1 : RNONE; # Don't need register
];
@@ -136,7 +136,7 @@
## What register should be used as the E destination?
word dstE = [
icode in { IRRMOVQ } && Cnd : rB;
- icode in { IIRMOVQ, IOPQ} : rB;
+ icode in { IIRMOVQ, IOPQ, IIADDQ} : rB;
icode in { IPUSHQ, IPOPQ, ICALL, IRET } : RRSP;
1 : RNONE; # Don't write any register
];
@@ -152,7 +152,7 @@
## Select input A to ALU
word aluA = [
icode in { IRRMOVQ, IOPQ } : valA;
- icode in { IIRMOVQ, IRMMOVQ, IMRMOVQ } : valC;
+ icode in { IIRMOVQ, IRMMOVQ, IMRMOVQ, IIADDQ } : valC;
icode in { ICALL, IPUSHQ } : -8;
icode in { IRET, IPOPQ } : 8;
# Other instructions don't need ALU
@@ -161,7 +161,7 @@
## Select input B to ALU
word aluB = [
icode in { IRMMOVQ, IMRMOVQ, IOPQ, ICALL,
- IPUSHQ, IRET, IPOPQ } : valB;
+ IPUSHQ, IRET, IPOPQ, IIADDQ } : valB;
icode in { IRRMOVQ, IIRMOVQ } : 0;
# Other instructions don't need ALU
];
@@ -173,7 +173,7 @@
];
## Should the condition codes be updated?
-bool set_cc = icode in { IOPQ };
+bool set_cc = icode in { IOPQ, IIADDQ };
################ Memory Stage ###################################
在我下载的 sim 文件中,此题的文件为 pipe-nobypass.hcl,先创建一个备份文件 pipe-nobypass-backup.hcl
1. 数据冒险
当 d_srcA in { e_dstE, E_dstM, M_dstM, M_dstE, W_dstM, W_dstE } || d_srcB in { e_dstE, E_dstM, M_dstM, M_dstE, W_dstM, W_dstE } 时
发生数据冒险,需要在 E 插入气泡并暂停 F 和 D,这对于加载/使用数据冒险同样适用,由此得到数据冒险的条件:
bool s_data_hazard =
(
(
d_srcA != RNONE &&
d_srcA in { e_dstE, E_dstM, M_dstM, M_dstE, W_dstM, W_dstE }
) ||
(
d_srcB != RNONE &&
d_srcB in { e_dstE, E_dstM, M_dstM, M_dstE, W_dstM, W_dstE }
)
)
2. 处理ret
ret 时的条件与 pipe-full.hcl 中的一致,也与书上的触发条件一致:
bool s_ret = IRET in { D_icode, E_icode, M_icode }
3. 预测错误的分支
也与书上的触发条件、 pipe-full.hcl 中的一致,其条件为:
bool s_mispredicted = (E_icode == IJXX && !e_Cnd)
不考虑组合,得出流水线控制逻辑的动作:
条件 | F | D | E | M | W |
---|---|---|---|---|---|
(Ⅰ)数据冒险 | 暂停 | 暂停 | 气泡 | 正常 | 正常 |
(Ⅱ)处理 ret | 暂停 | 气泡 | 正常 | 正常 | 正常 |
(Ⅲ)预测错误的分支 | 正常 | 气泡 | 气泡 | 正常 | 正常 |
4. 上述条件的组合
让我们参照书上 图 4-67 来做:
数据冒险有三种基本情况(执行/使用、访存/使用、写回/使用),JXX 只有一种,ret 有三种
寄存器 | ①执行/使用 | ②访存/使用 | ③写回/使用 | ④JXX | ⑤ret | ⑥ret | ⑦ret |
---|---|---|---|---|---|---|---|
W | —— | —— | 写回 | —— | —— | —— | —— |
M | —— | 访存 | —— | —— | —— | —— | ret |
E | 执行 | —— | —— | JXX | —— | ret | 气泡 |
D | 使用 | 使用 | 使用 | —— | ret | 气泡 | 气泡 |
可行的组合:
①② 或 ①③:属于数据冒险基本情况的组合,同(Ⅰ)
①⑤:与书上组合B类似,组合结果也与其相同,即同(Ⅰ)
②③:同(Ⅰ)
②④:因为不选择分支,使用被取消,所以没有数据冒险,因此同(Ⅲ)
②⑤:与书上组合B类似,同(Ⅰ)
③④:同(Ⅲ)
③⑤:与书上组合B类似,同(Ⅰ)
④⑤:同书上组合A
① (或② 或③) ④⑤:与 ④⑤ 类似,同书上组合A
根据上述分析得出:
组合 | 数据冒险 | 处理ret | 预测错误的分支 | F | D | E | M | W |
---|---|---|---|---|---|---|---|---|
1 | 0 | 0 | 0 | 正常 | 正常 | 正常 | 正常 | 正常 |
2 | 0 | 0 | 1 | 正常 | 气泡 | 气泡 | 正常 | 正常 |
3 | 0 | 1 | 0 | 暂停 | 气泡 | 正常 | 正常 | 正常 |
4 | 1 | 0 | 0 | 暂停 | 暂停 | 气泡 | 正常 | 正常 |
5 | 0 | 1 | 1 | 暂停 | 气泡 | 气泡 | 正常 | 正常 |
6 | 1 | 0 | 1 | 正常 | 气泡 | 气泡 | 正常 | 正常 |
7 | 1 | 1 | 0 | 暂停 | 暂停 | 气泡 | 正常 | 正常 |
8 | 1 | 1 | 1 | 暂停 | 气泡 | 气泡 | 正常 | 正常 |
因此:
F:
bubble = 0
stall = (s_data_hazard || s_ret) && (!s_data_hazard || s_ret || !s_mispredicted)
D:
bubble = s_mispredicted || (s_ret && !s_data_hazard)
stall = s_data_hazard && !s_mispredicted
E:
bubble = s_data_hazard || s_mispredicted
stall = 0
M:
不变
W:
不变
修改 pipe-nobypass.hcl:
USER@NAME:~/sim/pipe# diff -u pipe-nobypass-backup.hcl pipe-nobypass.hcl
--- pipe-nobypass-backup.hcl 2014-12-29 23:08:40.000000000 +0800
+++ pipe-nobypass.hcl 2018-08-25 01:57:59.011000000 +0800
@@ -303,39 +303,43 @@
];
################ Pipeline Register Control #########################
# Should I stall or inject a bubble into Pipeline Register F?
# At most one of these can be true.
+#bool F_bubble = 0;
+#bool F_stall = (s_data_hazard || s_ret) && (!s_data_hazard || s_ret || !s_mispredicted);
bool F_bubble = 0;
-bool F_stall =
- # Modify the following to stall the update of pipeline register F
- 0 ||
- # Stalling at fetch while ret passes through pipeline
- IRET in { D_icode, E_icode, M_icode };
+bool F_stall =
+ (((d_srcA != RNONE && d_srcA in { e_dstE, E_dstM, M_dstM, M_dstE, W_dstM, W_dstE }) ||
+ (d_srcB != RNONE &&d_srcB in { e_dstE, E_dstM, M_dstM, M_dstE, W_dstM, W_dstE })) ||
+ IRET in { D_icode, E_icode, M_icode }) &&
+ (!((d_srcA != RNONE && d_srcA in { e_dstE, E_dstM, M_dstM, M_dstE, W_dstM, W_dstE }) ||
+ (d_srcB != RNONE &&d_srcB in { e_dstE, E_dstM, M_dstM, M_dstE, W_dstM, W_dstE })) ||
+ IRET in { D_icode, E_icode, M_icode } ||
+ !(E_icode == IJXX && !e_Cnd));
# Should I stall or inject a bubble into Pipeline Register D?
# At most one of these can be true.
+#bool D_stall = s_data_hazard && !s_mispredicted;
+#bool D_bubble = s_mispredicted || (s_ret && !s_data_hazard);
+bool D_bubble =
+ (E_icode == IJXX && !e_Cnd) ||
+ (IRET in { D_icode, E_icode, M_icode } &&
+ !((d_srcA != RNONE && d_srcA in { e_dstE, E_dstM, M_dstM, M_dstE, W_dstM, W_dstE }) ||
+ (d_srcB != RNONE &&d_srcB in { e_dstE, E_dstM, M_dstM, M_dstE, W_dstM, W_dstE })));
bool D_stall =
- # Modify the following to stall the instruction in decode
- 0;
-
-bool D_bubble =
- # Mispredicted branch
- (E_icode == IJXX && !e_Cnd) ||
- # Stalling at fetch while ret passes through pipeline
- !(E_icode in { IMRMOVQ, IPOPQ } && E_dstM in { d_srcA, d_srcB }) &&
- # but not condition for a generate/use hazard
- !0 &&
- IRET in { D_icode, E_icode, M_icode };
+ ((d_srcA != RNONE && d_srcA in { e_dstE, E_dstM, M_dstM, M_dstE, W_dstM, W_dstE }) ||
+ (d_srcB != RNONE &&d_srcB in { e_dstE, E_dstM, M_dstM, M_dstE, W_dstM, W_dstE })) &&
+ !(E_icode == IJXX && !e_Cnd);
# Should I stall or inject a bubble into Pipeline Register E?
# At most one of these can be true.
+#bool E_stall = 0;
+#bool E_bubble = s_data_hazard || s_mispredicted;
bool E_stall = 0;
-bool E_bubble =
- # Mispredicted branch
- (E_icode == IJXX && !e_Cnd) ||
- # Modify the following to inject bubble into the execute stage
- 0;
+bool E_bubble =
+ ((d_srcA != RNONE && d_srcA in { e_dstE, E_dstM, M_dstM, M_dstE, W_dstM, W_dstE }) ||
+ (d_srcB != RNONE &&d_srcB in { e_dstE, E_dstM, M_dstM, M_dstE, W_dstM, W_dstE })) ||
+ (E_icode == IJXX && !e_Cnd);
# Should I stall or inject a bubble into Pipeline Register M?
# At most one of these can be true.
参照本文开始处的指南,所有测试均通过
4.51 题的图:
阶段 | iaddq V, rB |
---|---|
取指 | icode:ifun <- M1[PC] rA:rB <- M1[PC+1] valC <- M8[PC+2] valP <- PC + 10 |
译码 | valB <- R[rB] |
执行 | valE <- valB + valC |
访存 | |
写回 | R[rB] <- valE |
更新PC | PC <- valP |
修改 pipe-full.hcl 文件并测试:
USER@NAME:~/sim/pipe# make clean; make psim VERSION=full
rm -f psim pipe-*.c *.o *.exe *~
# Building the pipe-full.hcl version of PIPE
../misc/hcl2c -n pipe-full.hcl < pipe-full.hcl > pipe-full.c
gcc -Wall -O2 -I../misc -o psim psim.c pipe-full.c \
../misc/isa.c -lm
USER@NAME:~/sim/pipe# cd ../ptest/
USER@NAME:~/sim/ptest# make SIM=../pipe/psim TFLAGS=-i
./optest.pl -s ../pipe/psim -i
Simulating with ../pipe/psim
All 58 ISA Checks Succeed
./jtest.pl -s ../pipe/psim -i
Simulating with ../pipe/psim
All 96 ISA Checks Succeed
./ctest.pl -s ../pipe/psim -i
Simulating with ../pipe/psim
All 22 ISA Checks Succeed
./htest.pl -s ../pipe/psim -i
Simulating with ../pipe/psim
All 756 ISA Checks Succeed
USER@NAME:~/sim/ptest# cd ../pipe
USER@NAME:~/sim/pipe# diff -u pipe-full-backup.hcl pipe-full.hcl
--- pipe-full-backup.hcl 2014-12-29 23:08:40.000000000 +0800
+++ pipe-full.hcl 2018-08-26 02:45:01.228184132 +0800
@@ -158,7 +158,7 @@
# Is instruction valid?
bool instr_valid = f_icode in
{ INOP, IHALT, IRRMOVQ, IIRMOVQ, IRMMOVQ, IMRMOVQ,
- IOPQ, IJXX, ICALL, IRET, IPUSHQ, IPOPQ };
+ IOPQ, IJXX, ICALL, IRET, IPUSHQ, IPOPQ, IIADDQ };
# Determine status code for fetched instruction
word f_stat = [
@@ -171,11 +171,11 @@
# Does fetched instruction require a regid byte?
bool need_regids =
f_icode in { IRRMOVQ, IOPQ, IPUSHQ, IPOPQ,
- IIRMOVQ, IRMMOVQ, IMRMOVQ };
+ IIRMOVQ, IRMMOVQ, IMRMOVQ, IIADDQ };
# Does fetched instruction require a constant word?
bool need_valC =
- f_icode in { IIRMOVQ, IRMMOVQ, IMRMOVQ, IJXX, ICALL };
+ f_icode in { IIRMOVQ, IRMMOVQ, IMRMOVQ, IJXX, ICALL, IIADDQ };
# Predict next value of PC
word f_predPC = [
@@ -195,14 +195,14 @@
## What register should be used as the B source?
word d_srcB = [
- D_icode in { IOPQ, IRMMOVQ, IMRMOVQ } : D_rB;
+ D_icode in { IOPQ, IRMMOVQ, IMRMOVQ, IIADDQ } : D_rB;
D_icode in { IPUSHQ, IPOPQ, ICALL, IRET } : RRSP;
1 : RNONE; # Don't need register
];
## What register should be used as the E destination?
word d_dstE = [
- D_icode in { IRRMOVQ, IIRMOVQ, IOPQ} : D_rB;
+ D_icode in { IRRMOVQ, IIRMOVQ, IOPQ, IIADDQ} : D_rB;
D_icode in { IPUSHQ, IPOPQ, ICALL, IRET } : RRSP;
1 : RNONE; # Don't write any register
];
@@ -239,7 +239,7 @@
## Select input A to ALU
word aluA = [
E_icode in { IRRMOVQ, IOPQ } : E_valA;
- E_icode in { IIRMOVQ, IRMMOVQ, IMRMOVQ } : E_valC;
+ E_icode in { IIRMOVQ, IRMMOVQ, IMRMOVQ, IIADDQ } : E_valC;
E_icode in { ICALL, IPUSHQ } : -8;
E_icode in { IRET, IPOPQ } : 8;
# Other instructions don't need ALU
@@ -248,7 +248,7 @@
## Select input B to ALU
word aluB = [
E_icode in { IRMMOVQ, IMRMOVQ, IOPQ, ICALL,
- IPUSHQ, IRET, IPOPQ } : E_valB;
+ IPUSHQ, IRET, IPOPQ, IIADDQ } : E_valB;
E_icode in { IRRMOVQ, IIRMOVQ } : 0;
# Other instructions don't need ALU
];
@@ -260,7 +260,7 @@
];
## Should the condition codes be updated?
-bool set_cc = E_icode == IOPQ &&
+bool set_cc = E_icode in { IOPQ, IIADDQ } &&
# State changes only during normal operation
!m_stat in { SADR, SINS, SHLT } && !W_stat in { SADR, SINS, SHLT };
做这个题时可以边看 图4-52 流水线化的最终实现
需注意 Cnd 表示的是选择分支(take branch),可以从 hcl 文件中的定义看出:boolsig M_Cnd 'ex_mem_curr->takebranch' # Condition flag
主要的思路就是让 “Sel+Fwd A” 在无条件转移时选择 D_valP,在条件转移时选择 D_valC;这样的话 valC 就会传递到 M_valA,从而使得“Select PC”可以(通过M_valA)接收到 ValC,以便当预测失败时使用 valC 更新 PC;
USER@NAME:~/sim/pipe# diff -u pipe-nt-backup.hcl pipe-nt.hcl
--- pipe-nt-backup.hcl 2014-12-29 23:08:40.000000000 +0800
+++ pipe-nt.hcl 2018-08-26 01:35:40.452673831 +0800
@@ -80,9 +80,11 @@
##### Pipeline Register D ##########################################
wordsig D_icode 'if_id_curr->icode' # Instruction code
+wordsig D_ifun 'if_id_curr->ifun' # 译码阶段的 ifun
wordsig D_rA 'if_id_curr->ra' # rA field from instruction
wordsig D_rB 'if_id_curr->rb' # rB field from instruction
wordsig D_valP 'if_id_curr->valp' # Incremented PC
+wordsig D_valC 'if_id_curr->valc' # 译码阶段的 valC
##### Intermediate Values in Decode Stage #########################
@@ -139,7 +141,7 @@
## What address should instruction be fetched at
word f_pc = [
# Mispredicted branch. Fetch at incremented PC
- M_icode == IJXX && !M_Cnd : M_valA;
+ M_icode == IJXX && M_ifun != UNCOND && M_Cnd : M_valA;
# Completion of RET instruction
W_icode == IRET : W_valM;
# Default: Use predicted value of PC
@@ -183,7 +185,8 @@
# Predict next value of PC
word f_predPC = [
# BNT: This is where you'll change the branch prediction rule
- f_icode in { IJXX, ICALL } : f_valC;
+ # 无条件转移
+ f_icode in { IJXX, ICALL } && f_ifun == UNCOND : f_valC;
1 : f_valP;
];
@@ -220,7 +223,11 @@
## What should be the A value?
## Forward into decode stage for valA
word d_valA = [
- D_icode in { ICALL, IJXX } : D_valP; # Use incremented PC
+ # 无条件转移
+ D_icode in { IJXX, ICALL } && D_ifun == UNCOND : D_valP;
+ # 条件转移
+ D_icode == IJXX : D_valC;
+ #D_icode in { ICALL, IJXX } : D_valP; # Use incremented PC
d_srcA == e_dstE : e_valE; # Forward valE from execute
d_srcA == M_dstM : m_valM; # Forward valM from memory
d_srcA == M_dstE : M_valE; # Forward valE from memory
@@ -343,7 +350,7 @@
bool D_bubble =
# Mispredicted branch
- (E_icode == IJXX && !e_Cnd) ||
+ (E_icode == IJXX && E_ifun != UNCOND && e_Cnd) ||
# Stalling at fetch while ret passes through pipeline
# but not condition for a load/use hazard
!(E_icode in { IMRMOVQ, IPOPQ } && E_dstM in { d_srcA, d_srcB }) &&
@@ -354,7 +361,7 @@
bool E_stall = 0;
bool E_bubble =
# Mispredicted branch
- (E_icode == IJXX && !e_Cnd) ||
+ (E_icode == IJXX && E_ifun != UNCOND && e_Cnd) ||
# Conditions for a load/use hazard
E_icode in { IMRMOVQ, IPOPQ } &&
E_dstM in { d_srcA, d_srcB};
思路和上题有异曲同工之处;valP 是送往“Sel+Fwd A”,之后会到达 M_valA;而 valC 送往 ALUA ,之后会到达 M_valE;只需比较 M_valA 和 M_valE 就知道是前向分支还是后向;然后根据 Cnd 的值修改 pc;实际上是增加了一条转发路径;
USER@NAME:~/sim/pipe# diff -u pipe-btfnt-backup.hcl pipe-btfnt.hcl
--- pipe-btfnt-backup.hcl 2014-12-29 23:08:40.000000000 +0800
+++ pipe-btfnt.hcl 2018-08-26 22:31:56.452823164 +0800
@@ -83,6 +83,8 @@
wordsig D_rA 'if_id_curr->ra' # rA field from instruction
wordsig D_rB 'if_id_curr->rb' # rB field from instruction
wordsig D_valP 'if_id_curr->valp' # Incremented PC
+wordsig D_valC 'if_id_curr->valc' # 译码阶段的 valC
+wordsig D_ifun 'if_id_curr->ifun' # 译码阶段的 ifun
##### Intermediate Values in Decode Stage #########################
@@ -138,8 +140,11 @@
## What address should instruction be fetched at
word f_pc = [
- # Mispredicted branch. Fetch at incremented PC
- M_icode == IJXX && !M_Cnd : M_valA;
+ # 后向分支预测错误
+ M_icode == IJXX && M_ifun != UNCOND && M_valE < M_valA && !M_Cnd : M_valA;
+ # 前向分支预测错误
+ M_icode == IJXX && M_ifun != UNCOND && M_valE > M_valA && M_Cnd : M_valE;
# Completion of RET instruction
W_icode == IRET : W_valM;
# Default: Use predicted value of PC
@@ -183,13 +188,14 @@
# Predict next value of PC
word f_predPC = [
# BBTFNT: This is where you'll change the branch prediction rule
+ f_icode == IJXX && f_ifun != UNCOND && f_valC < f_valP : f_valC;
+ f_icode == IJXX && f_ifun != UNCOND && f_valC > f_valP : f_valP;
f_icode in { IJXX, ICALL } : f_valC;
1 : f_valP;
];
################ Decode Stage ######################################
## What register should be used as the A source?
word d_srcA = [
D_icode in { IRRMOVQ, IRMMOVQ, IOPQ, IPUSHQ } : D_rA;
@@ -247,7 +253,8 @@
## Select input A to ALU
word aluA = [
E_icode in { IRRMOVQ, IOPQ } : E_valA;
- E_icode in { IIRMOVQ, IRMMOVQ, IMRMOVQ } : E_valC;
+ E_icode in { IIRMOVQ, IRMMOVQ, IMRMOVQ } ||
+ E_icode == IJXX : E_valC;
E_icode in { ICALL, IPUSHQ } : -8;
E_icode in { IRET, IPOPQ } : 8;
# Other instructions don't need ALU
@@ -257,7 +264,8 @@
word aluB = [
E_icode in { IRMMOVQ, IMRMOVQ, IOPQ, ICALL,
IPUSHQ, IRET, IPOPQ } : E_valB;
- E_icode in { IRRMOVQ, IIRMOVQ } : 0;
+ E_icode in { IRRMOVQ, IIRMOVQ } ||
+ E_icode == IJXX : 0;
# Other instructions don't need ALU
];
@@ -343,7 +351,8 @@
bool D_bubble =
# Mispredicted branch
- (E_icode == IJXX && !e_Cnd) ||
+ ((E_icode == IJXX && E_ifun != UNCOND && E_valC < E_valA && !e_Cnd) ||
+ (E_icode == IJXX && E_ifun != UNCOND && E_valC > E_valA && e_Cnd)) ||
# BBTFNT: This condition will change
# Stalling at fetch while ret passes through pipeline
# but not condition for a load/use hazard
@@ -355,7 +364,8 @@
bool E_stall = 0;
bool E_bubble =
# Mispredicted branch
- (E_icode == IJXX && !e_Cnd) ||
+ ((E_icode == IJXX && E_ifun != UNCOND && E_valC < E_valA && !e_Cnd) ||
+ (E_icode == IJXX && E_ifun != UNCOND && E_valC > E_valA && e_Cnd)) ||
# BBTFNT: This condition will change
# Conditions for a load/use hazard
E_icode in { IMRMOVQ, IPOPQ } &&
A.
书上加载使用冒险的条件:
E_icode in { IMRMOVQ, IPOPQ } && E_dstM in { d_srcA, d_srcB };
情况 | 1 | 2 | 3 | 4 |
---|---|---|---|---|
E_dstM == d_srcA | 1 | 1 | 0 | 0 |
E_dstM == d_srcB | 1 | 0 | 1 | 0 |
情况 1 2 3 会发生加载使用冒险,因为 rB 会在执行阶段访问,所以情况 1 3 是不能通过加载转发解决的;
只需考虑情况 2 —— E_dstM == d_srcA,对于所有指令,在访存阶段才使用 valA 的只有 rmmovq 和 pushq,因此得出下列条件:
E_icode in { IMRMOVQ, IPOPQ } &&
(
E_dstM == d_srcB ||
(
E_dstM == d_srcA && !(D_icode in { IRMMOVQ, IPUSHQ })
)
);
B.
修改一下 e_valA 的值,再修改暂停和气泡的条件就行了:
USER@NAME:~/sim/pipe# diff -u pipe-lf-backup.hcl pipe-lf.hcl
--- pipe-lf-backup.hcl 2014-12-29 23:08:40.000000000 +0800
+++ pipe-lf.hcl 2018-08-30 02:05:59.609000000 +0800
@@ -271,6 +271,7 @@
## from memory stage when appropriate
## Here it is set to the default used in the normal pipeline
word e_valA = [
+ E_icode in { IRMMOVQ, IPUSHQ } && E_srcA == M_dstM : m_valM;
1 : E_valA; # Use valA from stage pipe register
];
@@ -329,7 +330,13 @@
bool F_stall =
# Conditions for a load/use hazard
## Set this to the new load/use condition
- 0 ||
+ E_icode in { IMRMOVQ, IPOPQ } &&
+ (
+ E_dstM == d_srcB ||
+ (
+ E_dstM == d_srcA && !(D_icode in { IRMMOVQ, IPUSHQ })
+ )
+ ) ||
# Stalling at fetch while ret passes through pipeline
IRET in { D_icode, E_icode, M_icode };
@@ -338,14 +345,26 @@
bool D_stall =
# Conditions for a load/use hazard
## Set this to the new load/use condition
- 0;
+ E_icode in { IMRMOVQ, IPOPQ } &&
+ (
+ E_dstM == d_srcB ||
+ (
+ E_dstM == d_srcA && !(D_icode in { IRMMOVQ, IPUSHQ })
+ )
+ );
bool D_bubble =
# Mispredicted branch
(E_icode == IJXX && !e_Cnd) ||
# Stalling at fetch while ret passes through pipeline
# but not condition for a load/use hazard
- !(E_icode in { IMRMOVQ, IPOPQ } && E_dstM in { d_srcA, d_srcB }) &&
+ !(E_icode in { IMRMOVQ, IPOPQ } &&
+ (
+ E_dstM == d_srcB ||
+ (
+ E_dstM == d_srcA && !(D_icode in { IRMMOVQ, IPUSHQ })
+ )
+ )) &&
IRET in { D_icode, E_icode, M_icode };
# Should I stall or inject a bubble into Pipeline Register E?
@@ -356,7 +375,13 @@
(E_icode == IJXX && !e_Cnd) ||
# Conditions for a load/use hazard
## Set this to the new load/use condition
- 0;
+ E_icode in { IMRMOVQ, IPOPQ } &&
+ (
+ E_dstM == d_srcB ||
+ (
+ E_dstM == d_srcA && !(D_icode in { IRMMOVQ, IPUSHQ })
+ )
+ );
# Should I stall or inject a bubble into Pipeline Register M?
# At most one of these can be true.
很简单
USER@NAME:~/sim/pipe# diff -u pipe-1w-backup.hcl pipe-1w.hcl
--- pipe-1w-backup.hcl 2014-12-29 23:08:40.000000000 +0800
+++ pipe-1w.hcl 2018-08-30 04:02:30.477000000 +0800
@@ -157,6 +157,7 @@
## so that it will be IPOP2 when fetched for second time.
word f_icode = [
imem_error : INOP;
+ D_icode == IPOPQ : IPOP2;
1: imem_icode;
];
@@ -169,7 +170,7 @@
# Is instruction valid?
bool instr_valid = f_icode in
{ INOP, IHALT, IRRMOVQ, IIRMOVQ, IRMMOVQ, IMRMOVQ,
- IOPQ, IJXX, ICALL, IRET, IPUSHQ, IPOPQ };
+ IOPQ, IJXX, ICALL, IRET, IPUSHQ, IPOPQ, IPOP2 };
# Determine status code for fetched instruction
word f_stat = [
@@ -182,7 +183,7 @@
# Does fetched instruction require a regid byte?
bool need_regids =
f_icode in { IRRMOVQ, IOPQ, IPUSHQ, IPOPQ,
- IIRMOVQ, IRMMOVQ, IMRMOVQ };
+ IIRMOVQ, IRMMOVQ, IMRMOVQ, IPOP2 };
# Does fetched instruction require a constant word?
bool need_valC =
@@ -192,6 +193,7 @@
word f_predPC = [
f_icode in { IJXX, ICALL } : f_valC;
## 1W: Want to refetch popq one time
+ f_icode == IPOPQ : f_pc;
1 : f_valP;
];
@@ -204,14 +206,14 @@
## What register should be used as the A source?
word d_srcA = [
D_icode in { IRRMOVQ, IRMMOVQ, IOPQ, IPUSHQ } : D_rA;
- D_icode in { IPOPQ, IRET } : RRSP;
+ D_icode in { IRET } : RRSP;
1 : RNONE; # Don't need register
];
## What register should be used as the B source?
word d_srcB = [
D_icode in { IOPQ, IRMMOVQ, IMRMOVQ } : D_rB;
- D_icode in { IPUSHQ, IPOPQ, ICALL, IRET } : RRSP;
+ D_icode in { IPUSHQ, IPOPQ, ICALL, IRET, IPOP2 } : RRSP;
1 : RNONE; # Don't need register
];
@@ -224,7 +226,7 @@
## What register should be used as the M destination?
word d_dstM = [
- D_icode in { IMRMOVQ, IPOPQ } : D_rA;
+ D_icode in { IMRMOVQ, IPOP2 } : D_rA;
1 : RNONE; # Don't write any register
];
@@ -255,7 +257,7 @@
word aluA = [
E_icode in { IRRMOVQ, IOPQ } : E_valA;
E_icode in { IIRMOVQ, IRMMOVQ, IMRMOVQ } : E_valC;
- E_icode in { ICALL, IPUSHQ } : -8;
+ E_icode in { ICALL, IPUSHQ, IPOP2 } : -8;
E_icode in { IRET, IPOPQ } : 8;
# Other instructions don't need ALU
];
@@ -263,7 +265,7 @@
## Select input B to ALU
word aluB = [
E_icode in { IRMMOVQ, IMRMOVQ, IOPQ, ICALL,
- IPUSHQ, IRET, IPOPQ } : E_valB;
+ IPUSHQ, IRET, IPOPQ, IPOP2 } : E_valB;
E_icode in { IRRMOVQ, IIRMOVQ } : 0;
# Other instructions don't need ALU
];
@@ -292,13 +294,13 @@
## Select memory address
word mem_addr = [
- M_icode in { IRMMOVQ, IPUSHQ, ICALL, IMRMOVQ } : M_valE;
- M_icode in { IPOPQ, IRET } : M_valA;
+ M_icode in { IRMMOVQ, IPUSHQ, ICALL, IMRMOVQ, IPOP2 } : M_valE;
+ M_icode in { IRET } : M_valA;
# Other instructions don't need address
];
## Set read control signal
-bool mem_read = M_icode in { IMRMOVQ, IPOPQ, IRET };
+bool mem_read = M_icode in { IMRMOVQ, IPOP2, IRET };
## Set write control signal
bool mem_write = M_icode in { IRMMOVQ, IPUSHQ, ICALL };
@@ -350,7 +352,7 @@
bool F_bubble = 0;
bool F_stall =
# Conditions for a load/use hazard
- E_icode in { IMRMOVQ, IPOPQ } &&
+ E_icode in { IMRMOVQ, IPOP2 } &&
E_dstM in { d_srcA, d_srcB } ||
# Stalling at fetch while ret passes through pipeline
IRET in { D_icode, E_icode, M_icode };
@@ -359,7 +361,7 @@
# At most one of these can be true.
bool D_stall =
# Conditions for a load/use hazard
- E_icode in { IMRMOVQ, IPOPQ } &&
+ E_icode in { IMRMOVQ, IPOP2 } &&
E_dstM in { d_srcA, d_srcB };
bool D_bubble =
@@ -367,7 +369,7 @@
(E_icode == IJXX && !e_Cnd) ||
# Stalling at fetch while ret passes through pipeline
# but not condition for a load/use hazard
- !(E_icode in { IMRMOVQ, IPOPQ } && E_dstM in { d_srcA, d_srcB }) &&
+ !(E_icode in { IMRMOVQ, IPOP2 } && E_dstM in { d_srcA, d_srcB }) &&
# 1W: This condition will change
IRET in { D_icode, E_icode, M_icode };
@@ -378,7 +380,7 @@
# Mispredicted branch
(E_icode == IJXX && !e_Cnd) ||
# Conditions for a load/use hazard
- E_icode in { IMRMOVQ, IPOPQ } &&
+ E_icode in { IMRMOVQ, IPOP2 } &&
E_dstM in { d_srcA, d_srcB};
# Should I stall or inject a bubble into Pipeline Register M?
# 4.47
jge Test1 # data[i + 1] - data[i] > 0 -> do nothing
rmmovq %r10, (%rdx) # data[i + 1] = data[i]
subq %r9, %rdx # %rdx = %rdx - 8
rmmovq %r11, (%rdx) # data[i] = data[i + 1]
#######################################################################
# 4.48
rrmovq %r11, %r12 # %r12 = data[i + 1]
rrmovq %r10, %r13 # %r13 = data[i]
cmovl %r10, %r12 # data[i + 1] < data[i] -> %r12 = data[i]
cmovl %r11, %r13 # data[i + 1] < data[i] -> %r13 = data[i + 1]
rmmovq %r12, (%rdx) # data[i + 1] = %r12
subq %r9, %rdx # %rdx = %rdx - 8
rmmovq %r13, (%rdx) # data[i] = %r13
#######################################################################
# 4.49
cmovge %r12, %rbx # data[i + 1] < data[i] -> %rbx = data[i + 1] - data[i] 否则 %rbx = 0
subq %rbx, %r11 # %r11 = data[i+1] < data[i] : data[i] : data[i+1]
rmmovq %r11, (%rdx) # data[i+1] = %r11
subq %r9, %rdx # %rdx = %rdx - 8
addq %rbx, %r10 # %r10 = data[i+1] < data[i] : data[i] : data[i+1]
rmmovq %r10, (%rdx) # data[i] = %r10
设 data[i + 1] > data[i] 的几率为 50%
平均:
三者分别执行的指令数 2.5, 7, 6
三者分别插入的气泡 1, 0, 0
三者分别需要的时钟周期 3.5, 7, 6
显然 4.47 性能更好