CSAPP ArchitectureLab

欢迎前往我的github的CSAPP仓库,含各章家庭作业解答,各实验原始数据及题解和CSAPP电子书等,如果对您的学习有所帮助,能点个star就更好不过了

准备

这个实验无疑是目前已做三个实验里花费我最多精力的,很可能也是所有实验里最花费精力的一个。如某人所言,本实验实验环境的搭建甚至比实验内容本身还要困难。搭建环境的过程不知掉了多少头发,可谓是十分痛苦了,令我一度想要放弃本实验。现在磕磕绊绊把实验做完,回顾全过程,记录下实验中遇到的一些难点,既是反思回味,也希望能帮助同我当初一样面对这般那般的问题而一筹莫展的朋友。

我的实验环境为WSL版的Ubuntu,由于没有安装相关库,模拟器不支持GUI界面,因此在解压sim包后,将sim目录下的Makefile文件中涉及GUI的三行注释掉

#GUIMODE=-DHAS_GUI
#TKLIBS=-L/usr/lib -ltk -ltcl
#TKINC=-isystem /usr/include

注释GUI相关东西是我从别人博客学到的,在整个实验过程中我也没有使用过GUI。我自己也尝试过安装tk与tcl,但是如果安装版本是8.6及以上,仍不能正常使用,若想要在本实验中使用GUI,tk与tcl必须安装为8.5版本及以下,接着尝试

unix> make clean
unix> make

如果报错信息包含关键词flex或bison,可能是没有安装相关依赖软件,尝试安装

unix> sudo apt-get install flex
unix> sudo apt-get install bison

当在sim目录及其子目录中任意一个进行

unix> make

时显示某.c文件报以下错误

build fails with tcl 8.6: error: tcl_interp has no member named result

请参考这篇文章查看解决方案

Part A

这一部分内容较为简单,正如lab介绍文档中所言,part A部分是为后续实验准备的热身运动

sum.ys

# Program entry: code is placed at address 0
	.pos 0
	irmovq stack, %rsp	# %rsp = address of the label `stack`
	call main		# run the test driver
	halt			# stop once main returns

# Sample linked list: three nodes, each a value word followed by a next-pointer word
	.align 8
list:
ele1:	.quad 0x00a		# node 1: value
	.quad ele2		# node 1: next
ele2:	.quad 0x0b0
	.quad ele3
ele3:	.quad 0xc00
	.quad 0			# next == 0 marks the end of the list

# Test driver: hand the list head to sum_list; its result stays in %rax
main:	irmovq list, %rdi	# %rdi = address of the first node
	call sum_list
	ret

# long sum_list(list_ptr ls)
# Iteratively adds up the value word of every node reachable from ls.
# In:   %rdi = ls (0 terminates the list)
# Out:  %rax = sum of the node values
# Overwrites: %rdi, %r8, %r9, condition codes
sum_list:
	xorq %rax, %rax		# sum = 0
	irmovq $8, %r8		# byte offset of the next pointer inside a node
sum_loop:
	andq %rdi, %rdi		# ls == 0?
	je sum_done		# yes: every node has been visited
	mrmovq (%rdi), %r9	# %r9 = ls->val
	addq %r9, %rax		# sum += ls->val
	addq %r8, %rdi		# %rdi = &ls->next
	mrmovq (%rdi), %rdi	# ls = ls->next
	jmp sum_loop
sum_done:
	ret

	.pos 0x200		# `stack` sits at 0x200; the startup code loads this address into %rsp
stack:
ArchitectureLab/sim/misc> make sum.yo
./yas sum.ys
ArchitectureLab/sim/misc> ./yis sum.yo
Stopped in 32 steps at PC = 0x13.  Status 'HLT', CC Z=1 S=0 O=0
Changes to registers:
%rax:	0x0000000000000000	0x0000000000000cba
%rsp:	0x0000000000000000	0x0000000000000200
%r8:	0x0000000000000000	0x0000000000000008
%r9:	0x0000000000000000	0x0000000000000c00

Changes to memory:
0x01f0:	0x0000000000000000	0x000000000000005b
0x01f8:	0x0000000000000000	0x0000000000000013

rsum.ys

# Program entry: code is placed at address 0
	.pos 0
	irmovq stack, %rsp	# %rsp = address of the label `stack`
	call main		# run the test driver
	halt			# stop once main returns

# Sample linked list: three nodes, each a value word followed by a next-pointer word
	.align 8
list:
ele1:	.quad 0x00a		# node 1: value
	.quad ele2		# node 1: next
ele2:	.quad 0x0b0
	.quad ele3
ele3:	.quad 0xc00
	.quad 0			# next == 0 marks the end of the list

# Test driver: hand the list head to rsum_list; its result stays in %rax
main:	irmovq list, %rdi	# %rdi = address of the first node
	call rsum_list
	ret

# long rsum_list(list_ptr ls)
# Recursively sums the value words of the list starting at ls.
# In:   %rdi = ls (0 means empty list)
# Out:  %rax = sum (0 for an empty list)
# Overwrites: %rdi, %r8, %rbx, condition codes
# Stack: one pushed word plus one return address per node.
# NOTE(review): %rbx is overwritten and the caller's value is not restored;
# under the x86-64 register convention %rbx is callee-saved. The driver in
# this file does not depend on it - confirm before reusing this routine.
rsum_list:
	xorq %rax, %rax		# result = 0, returned as-is for an empty list
	irmovq $8, %r8		# byte offset of the next pointer inside a node
	andq %rdi, %rdi		# ls == 0?
	je rsum_ret		# yes: return 0
	mrmovq (%rdi), %rbx	# %rbx = ls->val
	pushq %rbx		# keep ls->val across the recursive call
	addq %r8, %rdi		# %rdi = &ls->next
	mrmovq (%rdi), %rdi	# ls = ls->next
	call rsum_list		# %rax = sum of the rest of the list
	popq %rbx		# recover this node's value
	addq %rbx, %rax		# result = val + rest
rsum_ret:
	ret

	.pos 0x200		# `stack` sits at 0x200; the startup code loads this address into %rsp
stack:
ArchitectureLab/sim/misc> make rsum.yo
./yas rsum.ys
ArchitectureLab/sim/misc> ./yis rsum.yo
Stopped in 47 steps at PC = 0x13.  Status 'HLT', CC Z=0 S=0 O=0
Changes to registers:
%rax:	0x0000000000000000	0x0000000000000cba
%rbx:	0x0000000000000000	0x000000000000000a
%rsp:	0x0000000000000000	0x0000000000000200
%r8:	0x0000000000000000	0x0000000000000008

Changes to memory:
0x01c0:	0x0000000000000000	0x0000000000000094
0x01c8:	0x0000000000000000	0x0000000000000c00
0x01d0:	0x0000000000000000	0x0000000000000094
0x01d8:	0x0000000000000000	0x00000000000000b0
0x01e0:	0x0000000000000000	0x0000000000000094
0x01e8:	0x0000000000000000	0x000000000000000a
0x01f0:	0x0000000000000000	0x000000000000005b
0x01f8:	0x0000000000000000	0x0000000000000013

copy.ys

# Program entry: code is placed at address 0
	.pos 0
	irmovq stack, %rsp	# %rsp = address of the label `stack`
	call main		# run the test driver
	halt			# stop once main returns

	.align 8
# Source block: the three words to be copied
src:	.quad 0x00a
	.quad 0x0b0
	.quad 0xc00

# Destination block: three placeholder words for the copy to overwrite
dest:	.quad 0x111
	.quad 0x222
	.quad 0x333

# Test driver: copy_block(src, dest, 3); the return value stays in %rax
main:	irmovq src, %rdi	# arg 1: source address
	irmovq dest, %rsi	# arg 2: destination address
	irmovq $3, %rdx		# arg 3: number of words
	call copy_block
	ret

# long copy_block(long* src, long* dest, long len)
# Copies len words from src to dest and returns the XOR of the words copied.
# In:   %rdi = src, %rsi = dest, %rdx = len
# Out:  %rax = XOR checksum (0 when len <= 0, in which case nothing is copied)
# Overwrites: %rdi, %rsi, %rdx, %r8, %r9, %r10, condition codes
copy_block:
	irmovq $1, %r10		# constant 1: loop-counter decrement
	irmovq $8, %r8		# constant 8: word size, used as pointer stride
	xorq %rax, %rax		# checksum = 0
loop:
	andq %rdx, %rdx		# test len
	jle done		# stop when len <= 0 (a plain je would keep running for negative len)
	mrmovq (%rdi), %r9	# word = *src
	addq %r8, %rdi		# src++
	rmmovq %r9, (%rsi)	# *dest = word
	addq %r8, %rsi		# dest++
	xorq %r9, %rax		# checksum ^= word
	subq %r10, %rdx		# len--
	jmp loop
done:
	ret

	.pos 0x200		# `stack` sits at 0x200; the startup code loads this address into %rsp
stack:
ArchitectureLab/sim/misc> make copy.yo
./yas copy.ys
ArchitectureLab/sim/misc> ./yis copy.yo
Stopped in 41 steps at PC = 0x13.  Status 'HLT', CC Z=1 S=0 O=0
Changes to registers:
%rax:	0x0000000000000000	0x0000000000000cba
%rsp:	0x0000000000000000	0x0000000000000200
%rsi:	0x0000000000000000	0x0000000000000048
%rdi:	0x0000000000000000	0x0000000000000030
%r8:	0x0000000000000000	0x0000000000000008
%r9:	0x0000000000000000	0x0000000000000c00
%r10:	0x0000000000000000	0x0000000000000001

Changes to memory:
0x0030:	0x0000000000000111	0x000000000000000a
0x0038:	0x0000000000000222	0x00000000000000b0
0x0040:	0x0000000000000333	0x0000000000000c00
0x01f0:	0x0000000000000000	0x000000000000006f
0x01f8:	0x0000000000000000	0x0000000000000013

Part B

实验的第二部分也不困难,只需实现一个iaddq立即数加功能即可,所需修改部分如下

# Is the fetched icode one of the supported instructions? (IIADDQ is now accepted)
bool instr_valid = icode in {
	IHALT, INOP,
	IRRMOVQ, IIRMOVQ, IRMMOVQ, IMRMOVQ,
	IOPQ, IIADDQ,
	IJXX, ICALL, IRET,
	IPUSHQ, IPOPQ
};

# Is a register-specifier byte part of the fetched instruction?
# IIADDQ needs one: its rB field names the register it reads and writes.
bool need_regids = icode in {
	IRRMOVQ, IIRMOVQ, IRMMOVQ, IMRMOVQ,
	IOPQ, IIADDQ,
	IPUSHQ, IPOPQ
};

# Is a constant word (valC) part of the fetched instruction?
# IIADDQ needs one: it carries the immediate operand.
bool need_valC = icode in {
	IIRMOVQ, IRMMOVQ, IMRMOVQ,
	IIADDQ,
	IJXX, ICALL
};

## Register id used as the B source (IIADDQ reads the register named by rB)
word srcB = [
	icode in { IIADDQ, IOPQ, IMRMOVQ, IRMMOVQ } : rB;
	icode in { ICALL, IRET, IPUSHQ, IPOPQ } : RRSP;
	1 : RNONE;  # no B source register
];

## Register id used as the E destination (IIADDQ writes back to rB)
word dstE = [
	icode == IRRMOVQ && Cnd : rB;	# register move: written only when Cnd holds
	icode in { IOPQ, IIADDQ, IIRMOVQ } : rB;
	icode in { ICALL, IRET, IPUSHQ, IPOPQ } : RRSP;
	1 : RNONE;  # no register write
];

## ALU input A (IIADDQ supplies its constant word, valC)
word aluA = [
	icode in { IOPQ, IRRMOVQ } : valA;
	icode in { IIADDQ, IIRMOVQ, IMRMOVQ, IRMMOVQ } : valC;
	icode in { IPUSHQ, ICALL } : -8;
	icode in { IPOPQ, IRET } : 8;
	# All remaining icodes leave the ALU unused
];

## Select input B to ALU (IIADDQ adds its constant to valB)
word aluB = [
	icode in { IRMMOVQ, IMRMOVQ, IOPQ, ICALL, 
		      IPUSHQ, IRET, IPOPQ, IIADDQ } : valB;
	icode in { IRRMOVQ, IIRMOVQ } : 0;
	# Other instructions don't need ALU
];

## Should the condition codes be updated?
## IIADDQ must be listed here as well: without it iaddq leaves the condition
## codes untouched, and a conditional jump placed right after an iaddq would
## test stale flags.
bool set_cc = icode in { IOPQ, IIADDQ };

后续按照实验手册测试步骤照做即可

Part C

最后部分稍难一些,需要对流水线有较好理解后自行优化硬件逻辑及汇编代码,尽可能高效地实现函数功能。先说下我对pipe-full.hcl文件的修改。首先是听从了实验手册的建议,实现了iaddq立即数加法指令,代码修改同Part B部分。因为函数含循环涉及大量条件跳转,考虑过将跳转预测逻辑修改为后向分支跳转,前向分支不跳转(见习题4.56),但稍加分析认为在本题中修改分支预测逻辑并无提升,遂沿用默认预测逻辑

对于ncopy.ys代码的优化,首先是利用实现的iaddq将原本的诸如

	irmovq $1, %r10		# load the constant 1 into a scratch register
	addq %r10, %rax		# %rax += 1

两条指令合为一条

	iaddq $1, %rax		# %rax += 1 in one instruction; no scratch register needed

其次注意到

 	mrmovq (%rdi), %r10	# load a word from src into %r10
	rmmovq %r10, (%rsi)	# store it to dst: reads %r10 right after the load
	andq %r10, %r10		# set the condition codes from the value

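产生加载/使用冲突(load/use hazard),浪费一个时钟周期,通过重排指令顺序将该周期利用起来(需要注意:在标准PIPE实现中,只要紧跟mrmovq的下一条指令读取%r10就会插入气泡,因此仅把andq提前仍会暂停一个周期;必须在mrmovq与第一次使用%r10之间放入一条不依赖%r10的指令才能真正消除气泡,后文循环展开中连续的多条mrmovq正是起到了这一作用)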

 	mrmovq (%rdi), %r10	# load a word from src into %r10
	andq %r10, %r10		# set the condition codes from the value
	rmmovq %r10, (%rsi)	# store it to dst; rmmovq leaves the condition codes unchanged

再根据实验手册建议,参考第五章循环展开部分对CPE进行优化

	mrmovq (%rdi), %r10	# load four consecutive src words up front...
	mrmovq 8(%rdi), %r11
	mrmovq 16(%rdi), %r12
	mrmovq 24(%rdi), %r13	# ...so each value is first used several instructions after its load
	andq %r10, %r10		# word 0: val <= 0?
	rmmovq %r10, (%rsi)	# store word 0 to dst (does not change the condition codes)
	jle Npos1		# not positive: skip the increment
	iaddq $1, %rax		# count++
Npos1:
	andq %r11, %r11		# word 1: same test / store / count sequence
	rmmovq %r11, 8(%rsi)
	jle Npos2
	iaddq $1, %rax
Npos2:
	andq %r12, %r12		# word 2
	rmmovq %r12, 16(%rsi)
	jle Npos3
	iaddq $1, %rax
Npos3:
	andq %r13, %r13		# word 3
	rmmovq %r13, 24(%rsi)
	jle Npos4
	iaddq $1, %rax
Npos4:
	iaddq $32, %rdi		# src += 4 words (32 bytes)
	iaddq $32, %rsi		# dst += 4 words (32 bytes)

完整代码见下

#/* $begin ncopy-ys */
##################################################################
# ncopy.ys - Copy a src block of len words to dst.
# Return the number of positive words (>0) contained in src.
#
# Include your name and ID here.
#
# Describe how and why you modified the baseline code.
#   - Constant additions use iaddq (one instruction, no scratch register).
#   - The main loop handles 8 words per iteration as two groups of four;
#     each group issues its four loads before the first loaded value is
#     used, so no load is consumed by the instruction right after it.
#   - The remaining (len mod 8) words are copied one at a time.
#   - len <= 0 copies nothing and returns 0.
#
##################################################################
# Do not modify this portion
# Function prologue.
# %rdi = src, %rsi = dst, %rdx = len
ncopy:

##################################################################
# You can modify this portion
	# Registers: %rax = count of positive words, %r10-%r13 = words in flight.
	# Every conditional jump placed directly after an iaddq relies on iaddq
	# updating the condition codes.

	# Loop header
	xorq %rax, %rax		# count = 0;
	iaddq $-8, %rdx		# len -= 8 (bias: the sign tells whether 8 words remain)
	andq %rdx, %rdx		# len < 0?  NOTE(review): presumably redundant, since the
				# iaddq above already set the condition codes; kept so the
				# timing of this version is unchanged
	jl Remain		# fewer than 8 words: skip the unrolled loop

Loop:
	# ---- words 0..3 of this iteration ----
	mrmovq (%rdi), %r10	# load four src words before using any of them
	mrmovq 8(%rdi), %r11
	mrmovq 16(%rdi), %r12
	mrmovq 24(%rdi), %r13
	andq %r10, %r10		# val <= 0?
	rmmovq %r10, (%rsi)	# store to dst (rmmovq leaves the condition codes alone)
	jle Npos1		# not positive: skip the increment
	iaddq $1, %rax		# count++
Npos1:
	andq %r11, %r11
	rmmovq %r11, 8(%rsi)
	jle Npos2
	iaddq $1, %rax
Npos2:
	andq %r12, %r12
	rmmovq %r12, 16(%rsi)
	jle Npos3
	iaddq $1, %rax
Npos3:
	andq %r13, %r13
	rmmovq %r13, 24(%rsi)
	jle Npos4
	iaddq $1, %rax
Npos4:
	iaddq $32, %rdi		# src += 4 words
	iaddq $32, %rsi		# dst += 4 words

	# ---- words 4..7 of this iteration ----
	mrmovq (%rdi), %r10
	mrmovq 8(%rdi), %r11
	mrmovq 16(%rdi), %r12
	mrmovq 24(%rdi), %r13
	andq %r10, %r10		# val <= 0?
	rmmovq %r10, (%rsi)
	jle Npos5
	iaddq $1, %rax		# count++
Npos5:
	andq %r11, %r11
	rmmovq %r11, 8(%rsi)
	jle Npos6
	iaddq $1, %rax
Npos6:
	andq %r12, %r12
	rmmovq %r12, 16(%rsi)
	jle Npos7
	iaddq $1, %rax
Npos7:
	andq %r13, %r13
	rmmovq %r13, 24(%rsi)
	jle Npos8
	iaddq $1, %rax
Npos8:
	iaddq $32, %rdi		# src += 4 words
	iaddq $32, %rsi		# dst += 4 words

	iaddq $-8, %rdx		# len -= 8
	jge Loop		# at least 8 more words: run another unrolled iteration
Remain:
	iaddq $8, %rdx		# undo the bias: %rdx = words left (0..7), or the original len if it was < 8
	jle Done		# nothing left, or len was zero/negative: copy nothing more
				# (a plain je would let a negative len fall into Final_loop)
Final_loop:
	# NOTE(review): andq uses %r10 immediately after its load; on the standard
	# PIPE design this looks like a one-cycle load/use stall per word - confirm,
	# and consider moving `iaddq $8, %rdi` between the two instructions.
	mrmovq (%rdi), %r10	# read val from src...
	andq %r10, %r10		# val <= 0?
	rmmovq %r10, (%rsi)	# ...and store it to dst
	jle Npos9		# if so, goto Npos9:
	iaddq $1, %rax		# count++
Npos9:
	iaddq $8, %rdi		# src++
	iaddq $8, %rsi		# dst++
	iaddq $-1, %rdx		# len--
	jg Final_loop		# more leftover words to copy

##################################################################
# Do not modify the following section of code
# Function epilogue.
Done:
	ret
##################################################################
# Keep the following label at the end of your function
End:
#/* $end ncopy-ys */

最终测试

unix> make psim VERSION=full
unix> ./psim -t sdriver.yo
unix> ./psim -t ldriver.yo
unix> ./correctness.pl
unix> ./benchmark.pl

最终结果Average CPE为8.70,分数为35.9。满分需要Average CPE在7.50及以下,暂时先到这里,以后有机会再考虑继续优化,完成这个实验花费了大量时间与精力

你可能感兴趣的:(CSAPP,c++)