这个实验无疑是目前已做三个实验里花费我最多精力的,很可能也是所有实验里最花费精力的一个。如某人所言,本实验实验环境的搭建甚至比实验内容本身还要困难。搭建环境的过程不知掉了多少头发,可谓是十分痛苦了,令我一度想要放弃本实验。现在磕磕绊绊把实验做完,回顾全过程,记录下实验中遇到的一些难点,既是反思回味,也希望能帮助同我当初一样面对这般那般的问题而一筹莫展的朋友。
我的实验环境为WSL版的Ubuntu,由于没有安装相关库,模拟器不支持GUI界面,因此在解压sim包后,将sim目录下的Makefile文件中涉及GUI的三行注释掉:
#GUIMODE=-DHAS_GUI
#TKLIBS=-L/usr/lib -ltk -ltcl
#TKINC=-isystem /usr/include
注释GUI相关东西是我从别人博客学到的,在整个实验过程中我也没有使用过GUI。我自己也尝试过安装tk与tcl,但是如果安装版本是8.6及以上,仍不能正常使用,若想要在本实验中使用GUI,tk与tcl必须安装为8.5版本及以下,接着尝试
unix> make clean
unix> make
如果报错信息包含关键词flex和bison,可能是没有安装相关依赖软件,尝试安装
unix> sudo apt-get install flex
unix> sudo apt-get install bison
当在sim目录及其子目录中任意一个进行
unix> make
时显示某.c文件报以下错误
build fails with tcl 8.6: error: tcl_interp has no member named result
请参考这篇文章查看解决方案(常见做法是在Makefile的编译选项中加入 -DUSE_INTERP_RESULT,或者把源码中的 interp->result 改为 Tcl_GetStringResult(interp))
这一部分内容较为简单,正如lab介绍文档中所言,part A部分是为后续实验准备的热身运动
# Execution begins at address 0
# Startup code: set up the stack pointer, run main, then halt.
.pos 0
irmovq stack, %rsp
call main
halt
# Sample linked list
# Each element is two quad words: a value followed by the address of the
# next element. The last element stores 0 in its next field.
.align 8
list:
ele1:
.quad 0x00a
.quad ele2
ele2:
.quad 0x0b0
.quad ele3
ele3:
.quad 0xc00
.quad 0
main:
        irmovq  list, %rdi              # first argument: address of the head element
        call    sum_list                # sum is returned in %rax
        ret
# long sum_list(list_ptr ls)
# Iteratively adds up the value field of every element of a linked list.
# In:    %rdi = ls (address of the first element, 0 for an empty list)
# Out:   %rax = sum of all values
# Clobb: %rdi, %r8, %r9, condition codes
sum_list:
        irmovq  $8, %r8                 # %r8 = byte offset of the next field
        xorq    %rax, %rax              # sum = 0
sum_next:
        andq    %rdi, %rdi              # ls == 0 ?
        je      sum_exit                # yes: end of list
        mrmovq  (%rdi), %r9             # %r9 = ls->val
        addq    %r9, %rax               # sum += ls->val
        addq    %r8, %rdi               # %rdi = &ls->next
        mrmovq  (%rdi), %rdi            # ls = ls->next
        jmp     sum_next
sum_exit:
        ret
# Initial stack pointer: the startup code loads this address into %rsp,
# and pushes/calls store below it.
.pos 0x200
stack:
ArchitectureLab/sim/misc> make sum.yo
./yas sum.ys
ArchitectureLab/sim/misc> ./yis sum.yo
Stopped in 32 steps at PC = 0x13. Status 'HLT', CC Z=1 S=0 O=0
Changes to registers:
%rax: 0x0000000000000000 0x0000000000000cba
%rsp: 0x0000000000000000 0x0000000000000200
%r8: 0x0000000000000000 0x0000000000000008
%r9: 0x0000000000000000 0x0000000000000c00
Changes to memory:
0x01f0: 0x0000000000000000 0x000000000000005b
0x01f8: 0x0000000000000000 0x0000000000000013
# Execution begins at address 0
# Startup code: set up the stack pointer, run main, then halt.
.pos 0
irmovq stack, %rsp
call main
halt
# Sample linked list
# Each element is two quad words: a value followed by the address of the
# next element. The last element stores 0 in its next field.
.align 8
list:
ele1:
.quad 0x00a
.quad ele2
ele2:
.quad 0x0b0
.quad ele3
ele3:
.quad 0xc00
.quad 0
main:
        irmovq  list, %rdi              # first argument: address of the head element
        call    rsum_list               # sum is returned in %rax
        ret
# long rsum_list(list_ptr ls)
# Recursive list sum: this element's value plus rsum_list(ls->next);
# returns 0 when ls is 0.
# In:    %rdi = ls
# Out:   %rax = sum of all values
# Clobb: %rdi, %r8, %rbx, condition codes
# NOTE(review): %rbx is overwritten without preserving the caller's copy.
# It is a callee-saved register in the x86-64 convention; the main in this
# listing does not keep anything in it, so the program is unaffected.
rsum_list:
        irmovq  $8, %r8                 # %r8 = byte offset of the next field
        xorq    %rax, %rax              # result for the empty list
        andq    %rdi, %rdi              # ls == 0 ?
        je      rsum_ret
        mrmovq  (%rdi), %rbx            # %rbx = ls->val
        pushq   %rbx                    # keep this value across the recursive call
        addq    %r8, %rdi               # %rdi = &ls->next
        mrmovq  (%rdi), %rdi            # ls = ls->next
        call    rsum_list               # %rax = sum of the rest of the list
        popq    %rbx                    # restore this element's value
        addq    %rbx, %rax              # add it to the sum of the rest
rsum_ret:
        ret
# Initial stack pointer: the startup code loads this address into %rsp,
# and pushes/calls store below it.
.pos 0x200
stack:
ArchitectureLab/sim/misc> make rsum.yo
./yas rsum.ys
ArchitectureLab/sim/misc> ./yis rsum.yo
Stopped in 47 steps at PC = 0x13. Status 'HLT', CC Z=0 S=0 O=0
Changes to registers:
%rax: 0x0000000000000000 0x0000000000000cba
%rbx: 0x0000000000000000 0x000000000000000a
%rsp: 0x0000000000000000 0x0000000000000200
%r8: 0x0000000000000000 0x0000000000000008
Changes to memory:
0x01c0: 0x0000000000000000 0x0000000000000094
0x01c8: 0x0000000000000000 0x0000000000000c00
0x01d0: 0x0000000000000000 0x0000000000000094
0x01d8: 0x0000000000000000 0x00000000000000b0
0x01e0: 0x0000000000000000 0x0000000000000094
0x01e8: 0x0000000000000000 0x000000000000000a
0x01f0: 0x0000000000000000 0x000000000000005b
0x01f8: 0x0000000000000000 0x0000000000000013
# Execution begins at address 0
# Startup code: set up the stack pointer, run main, then halt.
.pos 0
irmovq stack, %rsp
call main
halt
.align 8
# Source block
# Three quad words that main passes to copy_block as the source.
src:
.quad 0x00a
.quad 0x0b0
.quad 0xc00
# Destination block
# Three quad words, pre-filled with marker values, that receive the copy.
dest:
.quad 0x111
.quad 0x222
.quad 0x333
main:
        irmovq  src, %rdi               # arg 1: source block
        irmovq  dest, %rsi              # arg 2: destination block
        irmovq  $3, %rdx                # arg 3: number of quad words
        call    copy_block              # checksum is returned in %rax
        ret
# long copy_block(long* src, long* dest, long len)
# Copies len quad words from src to dest and returns the XOR of all the
# words copied.
# In:    %rdi = src, %rsi = dest, %rdx = len
# Out:   %rax = XOR checksum (0 when nothing is copied)
# Clobb: %rdi, %rsi, %rdx, %r8, %r9, %r10, condition codes
copy_block:
        irmovq  $1, %r10                # constant 1 for len--
        irmovq  $8, %r8                 # constant 8: size of one quad word
        xorq    %rax, %rax              # result = 0
loop:
        andq    %rdx, %rdx              # set CC from len
        jle     done                    # stop when len <= 0, so a negative len
                                        # copies nothing instead of looping
        mrmovq  (%rdi), %r9             # val = *src
        addq    %r8, %rdi               # src++
        rmmovq  %r9, (%rsi)             # *dest = val
        addq    %r8, %rsi               # dest++
        xorq    %r9, %rax               # result ^= val
        subq    %r10, %rdx              # len--
        jmp     loop
done:
        ret
# Initial stack pointer: the startup code loads this address into %rsp,
# and pushes/calls store below it.
.pos 0x200
stack:
ArchitectureLab/sim/misc> make copy.yo
./yas copy.ys
ArchitectureLab/sim/misc> ./yis copy.yo
Stopped in 41 steps at PC = 0x13. Status 'HLT', CC Z=1 S=0 O=0
Changes to registers:
%rax: 0x0000000000000000 0x0000000000000cba
%rsp: 0x0000000000000000 0x0000000000000200
%rsi: 0x0000000000000000 0x0000000000000048
%rdi: 0x0000000000000000 0x0000000000000030
%r8: 0x0000000000000000 0x0000000000000008
%r9: 0x0000000000000000 0x0000000000000c00
%r10: 0x0000000000000000 0x0000000000000001
Changes to memory:
0x0030: 0x0000000000000111 0x000000000000000a
0x0038: 0x0000000000000222 0x00000000000000b0
0x0040: 0x0000000000000333 0x0000000000000c00
0x01f0: 0x0000000000000000 0x000000000000006f
0x01f8: 0x0000000000000000 0x0000000000000013
实验的第二部分也不困难,只需实现一个iaddq立即数加功能即可,所需修改部分如下
# Is the fetched icode a legal instruction? IIADDQ is accepted as well.
bool instr_valid = icode in
	{ INOP, IHALT, IRRMOVQ, IIRMOVQ, IRMMOVQ, IMRMOVQ,
	  IOPQ, IIADDQ, IJXX, ICALL, IRET, IPUSHQ, IPOPQ };
# Does fetched instruction require a regid byte?
# iaddq names its destination in rB, so it carries a register byte.
bool need_regids =
	icode in { IRRMOVQ, IOPQ, IIADDQ, IPUSHQ, IPOPQ,
		   IIRMOVQ, IRMMOVQ, IMRMOVQ };
# Does fetched instruction require a constant word?
# iaddq carries its immediate operand in valC.
bool need_valC =
	icode in { IIRMOVQ, IIADDQ, IRMMOVQ, IMRMOVQ, IJXX, ICALL };
## What register should be used as the B source?
word srcB = [
	# iaddq reads rB as an operand, like OPq and the memory moves
	icode in { IOPQ, IIADDQ, IRMMOVQ, IMRMOVQ } : rB;
	icode in { IPUSHQ, IPOPQ, ICALL, IRET } : RRSP;
	1 : RNONE;  # Don't need register
];
## What register should be used as the E destination?
word dstE = [
	icode in { IRRMOVQ } && Cnd : rB;
	# iaddq writes the ALU result back to rB
	icode in { IIRMOVQ, IOPQ, IIADDQ } : rB;
	icode in { IPUSHQ, IPOPQ, ICALL, IRET } : RRSP;
	1 : RNONE;  # Don't write any register
];
## Select input A to ALU
word aluA = [
	icode in { IRRMOVQ, IOPQ } : valA;
	# iaddq feeds its immediate (valC) to the ALU
	icode in { IIRMOVQ, IIADDQ, IRMMOVQ, IMRMOVQ } : valC;
	icode in { ICALL, IPUSHQ } : -8;
	icode in { IRET, IPOPQ } : 8;
	# Other instructions don't need ALU
];
## Select input B to ALU
word aluB = [
	# iaddq adds the immediate to the current value of rB (valB)
	icode in { IOPQ, IIADDQ, IRMMOVQ, IMRMOVQ,
		   ICALL, IPUSHQ, IRET, IPOPQ } : valB;
	icode in { IRRMOVQ, IIRMOVQ } : 0;
	# Other instructions don't need ALU
];
此外还需要把 set_cc 改为 icode in { IOPQ, IIADDQ },使iaddq也会更新条件码(Part C中的ncopy.ys依赖iaddq之后直接做条件跳转)。后续按照实验手册测试步骤照做即可
最后部分稍难一些,需要对流水线有较好理解后自行优化硬件逻辑及汇编代码,尽可能高效地实现函数功能。先说下我对pipe-full.hcl文件的修改。首先是听从了实验手册的建议,实现了iaddq立即数加法指令,代码修改同Part B部分。因为函数含循环涉及大量条件跳转,考虑过将跳转预测逻辑修改为后向分支跳转,前向分支不跳转(见习题4.56),但稍加分析认为在本题中修改分支预测逻辑并无提升,遂沿用默认预测逻辑
对于ncopy.ys代码的优化,首先是利用实现的iaddq将原本的诸如
# Baseline idiom: load the constant into a scratch register, then add it
# to the counter in %rax.
irmovq $1, %r10
addq %r10, %rax
两条指令合为一条
# The pair being replaced increments %rax, so the immediate add must
# target %rax as well; %r10 was only the scratch register for the constant.
iaddq $1, %rax
其次注意到
# Baseline order: the store reads %r10 in the instruction immediately
# after the load that produces it.
mrmovq (%rdi), %r10
rmmovq %r10, (%rsi)
andq %r10, %r10
产生加载/使用冲突,浪费一个时钟周期,通过重排指令顺序将该周期利用起来
# Swapping only the store and the test would still read %r10 right behind
# the load. An instruction that does not touch %r10 has to fill that slot,
# so the independent src++ is hoisted here (and must then be dropped from
# its later position in the loop body).
mrmovq (%rdi), %r10     # val = *src
iaddq $8, %rdi          # src++ : independent of %r10, fills the slot
andq %r10, %r10         # val <= 0?  sets CC for the following jle
rmmovq %r10, (%rsi)     # *dst = val
再根据实验手册建议,参考第五章循环展开部分对CPE进行优化
# Four-element unrolled step: issue all four loads first, then test, store
# and count each value in turn.
        mrmovq  (%rdi), %r10            # val0 = src[0]
        mrmovq  8(%rdi), %r11           # val1 = src[1]
        mrmovq  16(%rdi), %r12          # val2 = src[2]
        mrmovq  24(%rdi), %r13          # val3 = src[3]
        andq    %r10, %r10              # val0 <= 0?
        rmmovq  %r10, (%rsi)            # dst[0] = val0
        jle     Npos1                   # not positive: skip the count
        iaddq   $1, %rax                # count++
Npos1:
        andq    %r11, %r11              # val1 <= 0?
        rmmovq  %r11, 8(%rsi)           # dst[1] = val1
        jle     Npos2
        iaddq   $1, %rax                # count++
Npos2:
        andq    %r12, %r12              # val2 <= 0?
        rmmovq  %r12, 16(%rsi)          # dst[2] = val2
        jle     Npos3
        iaddq   $1, %rax                # count++
Npos3:
        andq    %r13, %r13              # val3 <= 0?
        rmmovq  %r13, 24(%rsi)          # dst[3] = val3
        jle     Npos4
        iaddq   $1, %rax                # count++
Npos4:
        iaddq   $32, %rdi               # src += 4 words (32 bytes)
        iaddq   $32, %rsi               # dst += 4 words (32 bytes)
完整代码见下
#/* $begin ncopy-ys */
##################################################################
# ncopy.ys - Copy a src block of len words to dst.
# Return the number of positive words (>0) contained in src.
#
# Include your name and ID here.
#
# Describe how and why you modified the baseline code.
#
# Modifications:
#  - iaddq replaces every irmovq/addq pair.
#  - The main loop handles 8 words per iteration as two groups of four.
#    Each group issues its four loads before the first use, so no value
#    is read in the instruction directly after its load.
#  - The second group addresses src/dst with displacements 32..56, so the
#    pointers are bumped once per iteration (by 64) instead of twice.
#  - The length tests branch directly on the condition codes set by
#    iaddq; no separate andq on %rdx is needed.
#  - The tail loop advances src between the load and the test of the
#    loaded value, filling the slot behind the load with useful work.
#  - A len <= 0 (including negative values) copies nothing.
# Clobbers: %rdi, %rsi, %rdx, %r10-%r13, condition codes.
# NOTE: instruction counts differ from the version whose CPE is quoted in
# the surrounding write-up; re-run ./benchmark.pl for current numbers.
#
##################################################################
# Do not modify this portion
# Function prologue.
# %rdi = src, %rsi = dst, %rdx = len
ncopy:
##################################################################
# You can modify this portion
# Loop header
        xorq    %rax, %rax              # count = 0
        iaddq   $-8, %rdx               # len -= 8 (sets CC)
        jl      Remain                  # fewer than 8 words: tail only
Loop:
        # ---- words 0..3 ----
        mrmovq  (%rdi), %r10            # read four values from src...
        mrmovq  8(%rdi), %r11
        mrmovq  16(%rdi), %r12
        mrmovq  24(%rdi), %r13
        andq    %r10, %r10              # val <= 0?
        rmmovq  %r10, (%rsi)            # ...and store it to dst
        jle     Npos1                   # if so, skip the count
        iaddq   $1, %rax                # count++
Npos1:
        andq    %r11, %r11
        rmmovq  %r11, 8(%rsi)
        jle     Npos2
        iaddq   $1, %rax
Npos2:
        andq    %r12, %r12
        rmmovq  %r12, 16(%rsi)
        jle     Npos3
        iaddq   $1, %rax
Npos3:
        andq    %r13, %r13
        rmmovq  %r13, 24(%rsi)
        jle     Npos4
        iaddq   $1, %rax
Npos4:
        # ---- words 4..7, addressed by displacement ----
        mrmovq  32(%rdi), %r10
        mrmovq  40(%rdi), %r11
        mrmovq  48(%rdi), %r12
        mrmovq  56(%rdi), %r13
        andq    %r10, %r10
        rmmovq  %r10, 32(%rsi)
        jle     Npos5
        iaddq   $1, %rax
Npos5:
        andq    %r11, %r11
        rmmovq  %r11, 40(%rsi)
        jle     Npos6
        iaddq   $1, %rax
Npos6:
        andq    %r12, %r12
        rmmovq  %r12, 48(%rsi)
        jle     Npos7
        iaddq   $1, %rax
Npos7:
        andq    %r13, %r13
        rmmovq  %r13, 56(%rsi)
        jle     Npos8
        iaddq   $1, %rax
Npos8:
        iaddq   $64, %rdi               # src += 8 words
        iaddq   $64, %rsi               # dst += 8 words
        iaddq   $-8, %rdx               # len -= 8
        jge     Loop                    # at least 8 words left: go again
Remain:
        iaddq   $8, %rdx                # undo the bias: %rdx = words left
        jle     Done                    # nothing left (or len was negative)
Final_loop:
        mrmovq  (%rdi), %r10            # read val from src...
        iaddq   $8, %rdi                # src++ (fills the slot behind the load)
        andq    %r10, %r10              # val <= 0?
        rmmovq  %r10, (%rsi)            # ...and store it to dst
        jle     Npos9                   # if so, skip the count
        iaddq   $1, %rax                # count++
Npos9:
        iaddq   $8, %rsi                # dst++
        iaddq   $-1, %rdx               # len--
        jg      Final_loop              # more words left
##################################################################
# Do not modify the following section of code
# Function epilogue.
Done:
ret
##################################################################
# Keep the following label at the end of your function
End:
#/* $end ncopy-ys */
最终测试
unix> make psim VERSION=full
unix> ./psim -t sdriver.yo
unix> ./psim -t ldriver.yo
unix> ./correctness.pl
unix> ./benchmark.pl
最终结果Average CPE为8.70,分数为35.9。满分需要Average CPE在7.50及以下,暂时先到这里,以后有机会再考虑继续优化,完成这个实验花费了大量时间与精力。