【深入理解计算机系统】CSAPP-实验四：ArchLab全网最详细

前言

对应书本第四章内容。通过实现Y86-64的ISA，加深对x86-64指令集的理解。

实验分为三个部分，

第一部分是熟悉C到ys汇编。

第二部分是修改顺序执行的SEQ处理器，增加iaddq这条指令。

第三部分是最核心，也是最困难的开放部分，优化代码以及体系结构。

本机使用win10 +wsl2.0 + ubuntu18.04完成实验。

点击查看我的全部代码

reference

CSAPP LAB

关于CSAPP cannot find -ltcl -ltk的问题

[读书笔记]CSAPP：ArchLab

通俗解说CSAPP的archlab partC

csapp-Archlab

用到的指令

ys文件翻译成yo

在sim/misc 文件夹中运行./yas sum.ys

拟真运行程序

在sim/misc 文件夹中运行./yis sum.yo

ys文件结构

根据y86-code/asum.ys的例子中可以总结出ys文件的通常结构。

# Execution begins at address 0 
	.pos 0
	irmovq stack, %rsp  	# Set up stack pointer
	call main		# Execute main program
	halt			# Terminate program 
	
# 内存区域，存放数据/链表之类
...
# END

main:
	irmovq $4,%rdi		# first argument goes in %rdi
	call myFunc		# myFunc(4)
	ret

# long myFunc(long i)
# i in %rdi, result returned in %rax
myFunc:
	...			# function body goes here
	ret			# Return

# Stack starts here and grows to lower addresses.
# 这里自定义栈开始地址
	.pos 0x200
stack:

PART A

任务：模拟examples.c完成三个函数的翻译：从c语言到Y86-64的汇编语言。

数据结构

examples.c中，定义了一个链表。

/* linked list element */
typedef struct ELE {
    long val;
    struct ELE *next;
} *list_ptr;

同时给了测试数据：

# Sample linked list
.align 8
ele1:
    .quad 0x00a
    .quad ele2
ele2:
    .quad 0x0b0
    .quad ele3
ele3:
    .quad 0xc00
    .quad 0

sum_list

/* sum_list - Sum the elements of a linked list */
long sum_list(list_ptr ls)
{
    long val = 0;
    while (ls) {
        val += ls->val;
        ls = ls->next;
    }
    return val;
}

对链表进行迭代求和。

做一个出色的copycat。

sum_list.ys

# Execution begins at address 0 
	.pos 0
	irmovq stack, %rsp  	# Set up stack pointer
	call main		# Execute main program
	halt			# Terminate program 
	
# 内存区域，存放数据/链表之类
# Sample linked list
.align 8
ele1:
    .quad 0x00a
    .quad ele2
ele2:
    .quad 0x0b0
    .quad ele3
ele3:
    .quad 0xc00
    .quad 0
# END

main:	
	irmovq ele1,%rdi  #参数准备
	call sum_list		# sum_list(ele1)
	ret

# long sum_list(list_ptr ls)
# In:       %rdi = ls, pointer to the first list element (may be NULL)
# Out:      %rax = sum of the val fields of all elements
# Clobbers: %rdi, %r8, condition codes
sum_list:
	xorq %rax,%rax		# val = 0
	jmp test		# test ls before the first dereference
loop:
	mrmovq (%rdi),%r8	# %r8 = ls->val
	addq %r8,%rax		# val += ls->val
	mrmovq 8(%rdi),%rdi	# ls = ls->next
test:
	andq %rdi,%rdi		# x & x leaves x unchanged and sets ZF iff ls == NULL
	jne loop		# while (ls)
	ret			# return val

# Stack starts here and grows to lower addresses.
# 这里自定义栈开始地址
	.pos 0x200
stack:

进行测试：

./yas sum_list.ys和./yis sum_list.yo

可以查看结果%rax: 0x0000000000000000 0x0000000000000cba ，没有问题。

rsum_list

/* rsum_list - Recursive version of sum_list */
long rsum_list(list_ptr ls)
{
    if (!ls)
	    return 0;
    else {
        long val = ls->val;
        long rest = rsum_list(ls->next);
        return val + rest;
    }
}

是一个递归的版本。

总体结构和sum_list一致，只需修改rsum_list函数。每一层递归把当前的node.val放在%rbx中（进入函数时先pushq %rbx保存上一层的值），递归到达链表末尾之后，才在逐层返回的过程中执行addq %rbx, %rax相加。

rsum_list.ys

# Execution begins at address 0 
	.pos 0
	irmovq stack, %rsp  	# Set up stack pointer
	call main		# Execute main program
	halt			# Terminate program 
	
# 内存区域，存放数据/链表之类
# Sample linked list
.align 8
ele1:
    .quad 0x00a
    .quad ele2
ele2:
    .quad 0x0b0
    .quad ele3
ele3:
    .quad 0xc00
    .quad 0
# END

main:	
	irmovq ele1,%rdi  #参数准备
	call rsum_list		# rsum_list(ele1)
	ret

# long rsum_list(list_ptr ls)
# ls in %rdi, result returned in %rax
rsum_list:	
    pushq %rbx          # save the caller's %rbx; in a recursive call it holds the caller's node value
	xorq %rax,%rax           # return value = 0
    andq %rdi, %rdi      # set condition codes from ls
    je finish            # ls == NULL: return 0
    mrmovq (%rdi), %rbx  # %rbx = ls->val
    mrmovq 8(%rdi), %rdi # ls = ls->next
    call rsum_list       # %rax = rsum_list(ls->next)
    addq %rbx, %rax # val + rest; execution then falls through into finish

finish:
    popq %rbx            # restore the caller's %rbx
	ret                  # Return

# Stack starts here and grows to lower addresses.
# 这里自定义栈开始地址
	.pos 0x200
stack:

测试可行。

copy_block

/* copy_block - Copy src to dest and return xor checksum of src */
long copy_block(long *src, long *dest, long len)
{
    long result = 0;
    while (len > 0) {
        long val = *src++; //两个语句：long val = *src;src++;
        *dest++ = val;//两个语句：*dest = val;dest++
        result ^= val;//update checksum
        len--;
    }
    return result;
}

.align 8
# Source block
src:
.quad 0x00a
.quad 0x0b0
.quad 0xc00
# Destination block
dest:
.quad 0x111
.quad 0x222
.quad 0x333

慢慢做。

copy_block.ys

# Execution begins at address 0 
	.pos 0
	irmovq stack, %rsp  	# Set up stack pointer
	call main		# Execute main program
	halt			# Terminate program 
	
# 内存区域，存放数据/链表之类
.align 8
# Source block
src:
.quad 0x00a
.quad 0x0b0
.quad 0xc00
# Destination block
dest:
.quad 0x111
.quad 0x222
.quad 0x333

#END
main:	
	irmovq src,%rdi     # argument 1: address of the source block
    irmovq dest,%rsi    # argument 2: address of the destination block
    irmovq $3,%rdx     # argument 3: len = 3
	call copy_block		# copy_block(src, dest, 3)
	ret

# long copy_block(long *src, long *dest, long len)
# In:  %rdi = src, %rsi = dest, %rdx = len (count of 8-byte words)
# Out: %rax = XOR checksum of the words read from src
# %rbx, %r9 and %r10 are saved on entry and restored before returning;
# %rdi, %rsi, %rdx and the condition codes are modified.
copy_block:
	pushq %rbx
	pushq %r9
	pushq %r10
	xorq %rax,%rax		# result = 0
	irmovq $8,%r9		# constant 8: size of one word (OPq takes registers only)
	irmovq $1,%r10		# constant 1: loop decrement
	jmp test		# test len before the first copy
loop:
	mrmovq (%rdi),%rbx	# val = *src
	addq %r9,%rdi		# src++
	rmmovq %rbx,(%rsi)	# *dest = val
	addq %r9,%rsi		# dest++
	xorq %rbx,%rax		# result ^= val
	subq %r10,%rdx		# len--
test:
	andq %rdx,%rdx		# set condition codes from len
	jg loop			# while (len > 0); jne would keep looping for a negative len
	popq %r10
	popq %r9
	popq %rbx
	ret

# Stack starts here and grows to lower addresses.
	.pos 0x200
stack:

值得注意的点：

Y86-64的OPq指令（addq、subq、andq、xorq）的两个操作数都必须是寄存器，不能是立即数，所以常量要先用irmovq装入寄存器。

PART B

在sim/seq文件夹。

任务：拓展SEQ处理器，即修改seq-full.hcl文件以支持iaddq。

iaddq description

指令格式

顺序实现

阶段	iaddq V,rB
取指Fetch	icode:ifun <- M1[PC] rA:rB <- M1[PC+1] valC <- M8[PC+2] valP <- PC+10
译码Decode	valB <- R[rB]
执行Execute	valE <- valB + valC Set CC
访存Memory
写回	R[rB] <- valE
更新PC Program Counter Update	PC <- valP

书本的P280详细介绍了五个阶段的过程。

修改seq-full.hcl

符号表已经增加了iaddq

# Instruction code for iaddq instruction
wordsig IIADDQ	'I_IADDQ'

所以我们按照顺序修改五个阶段即可。

Fetch

这个阶段和irmovq V,rB是一模一样的。所以它怎样，iaddq就怎样。

# Is the fetched icode a legal instruction? IIADDQ is added to the set.
bool instr_valid = icode in {
	INOP, IHALT,
	IRRMOVQ, IIRMOVQ, IRMMOVQ, IMRMOVQ,
	IOPQ, IIADDQ,
	IJXX, ICALL, IRET,
	IPUSHQ, IPOPQ
};
	       
# Does the instruction have a register-specifier byte?
# iaddq V,rB carries rB there, like irmovq V,rB.
bool need_regids = icode in {
	IRRMOVQ, IIRMOVQ, IRMMOVQ, IMRMOVQ,
	IOPQ, IIADDQ,
	IPUSHQ, IPOPQ
};

# Does the instruction have a constant word (valC)?
# iaddq carries its immediate V there.
bool need_valC = icode in {
	IIRMOVQ, IIADDQ,
	IRMMOVQ, IMRMOVQ,
	IJXX, ICALL
};

Decode

要读寄存器rB的值。

指定srcB

## Register ID read on the B port
word srcB = [
	# Instructions that read rB; iaddq uses the old value of rB as an operand
	icode in { IOPQ, IIADDQ, IRMMOVQ, IMRMOVQ } : rB;
	# Stack instructions read the stack pointer
	icode in { IPUSHQ, IPOPQ, ICALL, IRET } : RRSP;
	# Everything else reads no register on this port
	1 : RNONE;
];

Execute

valE <- valB + valC

即aluA = valC；aluB = valB ；

## ALU input A
word aluA = [
	# Register operand
	icode in { IRRMOVQ, IOPQ } : valA;
	# Constant operand; for iaddq this is the immediate V
	icode in { IIRMOVQ, IIADDQ, IRMMOVQ, IMRMOVQ } : valC;
	# Stack pointer adjustment: -8 for call/push, +8 for ret/pop
	icode in { ICALL, IPUSHQ } : -8;
	icode in { IRET, IPOPQ } : 8;
	# Remaining instructions do not use the ALU
];

## ALU input B
word aluB = [
	# Value read from the B port; for iaddq this is the old value of rB
	icode in { IOPQ, IIADDQ, IRMMOVQ, IMRMOVQ,
		   ICALL, IRET, IPUSHQ, IPOPQ } : valB;
	# Moves add their A input to zero
	icode in { IRRMOVQ, IIRMOVQ } : 0;
	# Remaining instructions do not use the ALU
];

这里我们是默认使用了add，所以不需要修改alufun

但我们希望iaddq可以更新CC

## Update the condition codes for OPq and for iaddq
bool set_cc = icode in { IIADDQ, IOPQ };

访存Memory

没有。

写回

R[rB] <- valE

目标寄存器是rB。这里和irmovq也是一样的。

## Destination register for the ALU result (valE)
word dstE = [
	# Conditional move writes rB only when the condition holds
	icode in { IRRMOVQ } && Cnd : rB;
	# irmovq, OPq and iaddq always write rB
	icode in { IIRMOVQ, IIADDQ, IOPQ } : rB;
	# Stack instructions write back the updated stack pointer
	icode in { IPUSHQ, IPOPQ, ICALL, IRET } : RRSP;
	# Everything else writes no register through this port
	1 : RNONE;
];

更新PC

默认就会更新为valP，此处不用修改。

构造ssim并且测试

根据seq-full.hcl文件构建新的仿真器。注意指定一下tcl和tk.
make VERSION=full
在小的Y86-64程序中测试你的方法
./ssim -t ../y86-code/asumi.yo
如果失败了，还要重新修改你的实现

使用基准程序来测试你的方法
(cd ../y86-code; make testssim)
一旦可以正确执行基准测试程序，则应在../ptest中运行大量的回归测试

测试除了iaddq以外的所有指令
(cd ../ptest; make SIM=../seq/ssim) 
测试我们实现的iaddq指令
 (cd ../ptest; make SIM=../seq/ssim TFLAGS=-i)

至此PART B完成。

但需要承认的是，这里可参考元素过多，其实对五个阶段还并不是很熟悉就可以完成。

PART C

在sim/pipe。

任务：修改ncopy.ys and pipe-full.hcl两个文件，使得ncopy.ys跑得越快越好。

ncopy

ncopy.c

/*
 * ncopy - copy src to dst, returning number of positive ints
 * contained in src array.
 */
word_t ncopy(word_t *src, word_t *dst, word_t len)
{
    word_t count = 0;
    word_t val;

    while (len > 0) {
        val = *src++; //val = *src; src++;
        *dst++ = val; //*dst = val; dst++;
        if (val > 0)
            count++;
        len--;
    }
    return count;
}

ncopy.ys

##################################################################
# Do not modify this portion
# Function prologue.
# %rdi = src, %rsi = dst, %rdx = len
ncopy:

##################################################################
# You can modify this portion
	# Loop header
	xorq %rax,%rax		# count = 0;
	andq %rdx,%rdx		# len <= 0?
	jle Done		# if so, goto Done:

Loop:	
	mrmovq (%rdi), %r10	# read val from src
	rmmovq %r10, (%rsi)	# and store it to dst
	andq %r10, %r10		# val <= 0?
	jle Npos		# if so, goto Npos:
	irmovq $1, %r10
	addq %r10, %rax		# count++
Npos:	
	irmovq $1, %r10
	subq %r10, %rdx		# len--
	irmovq $8, %r10
	addq %r10, %rdi		# src++
	addq %r10, %rsi		# dst++
	andq %rdx,%rdx		# len > 0?
	jg Loop			# if so, goto Loop:
##################################################################
# Do not modify the following section of code
# Function epilogue.
Done:
	ret
##################################################################
# Keep the following label at the end of your function
End:
#/* $end ncopy-ys */

测试以及评分

../misc/yas ncopy.ys 生成ncopy.yo

./check-len.pl < ncopy.yo 检测长度

make drivers 生成ncopy.ys的测试程序

make psim VERSION=full 生成新的psim

./psim -t sdriver.yo 测试small 4-element array

./psim -t ldriver.yo测试larger 63-element array

./correctness.pl测试不同Block length下code range是否符合

./benchmark.pl 评分

可以用./correctness.pl测试ncopy函数的正确性，然后使用./benchmark.pl来测试函数的性能，希望CPE越小越好。初始CPE为15.18，大于10.5为0分，小于7.5为满分60。

If the simulated code requires C cycles（时钟周期） to copy a block of N（数组大小） elements, then the CPE is C/N.

We will therefore evaluate the performance of your function by computing the average of the CPEs for blocks ranging from 1 to 64 elements.

思路

增加使用iaddq。改pipe-full.hcl是和PART B是一样的。然后将ncopy.ys的所有不合适addq改成iaddq。

##################################################################
# You can modify this portion
	# Loop header
	xorq %rax,%rax		# count = 0;
	andq %rdx,%rdx		# len <= 0?
	jle Done		# if so, goto Done:

Loop:	
	mrmovq (%rdi), %r10	# read val from src...
	rmmovq %r10, (%rsi)	# ...and store it to dst
	andq %r10, %r10		# val <= 0?
	jle Npos		# if so, goto Npos:
	iaddq $1, %rax		# count++
Npos:	
	iaddq $-1, %rdx		# len--
	iaddq $8, %rdi		# src++
	iaddq $8, %rsi		# dst++
	andq %rdx,%rdx		# len > 0?
	jg Loop			# if so, goto Loop:
##################################################################

此时是12.70，仍然没有分。

偷工减料

xorq %rax,%rax # count = 0; 删掉，因为测试程序只调用一次ncopy，而rax的初始值就是0。注意这依赖于调用环境：如果调用ncopy之前rax不为0，返回值就会出错。
我们设计iaddq的时候，让它顺便设置条件码（CC）。所以可以省去专门用andq %rdx,%rdx设置条件码的这一步。但是要调整一下顺序，使得iaddq $-1, %rdx和jg Loop相邻。

##################################################################
# You can modify this portion
	# NOTE: count (%rax) is not zeroed in this version; the result is
	# only correct if %rax is 0 when ncopy is entered.
	# Loop header
	andq %rdx,%rdx		# len <= 0?
	jle Done		# if so, goto Done:
Loop:	
	mrmovq (%rdi), %r10	# read val from src...
	rmmovq %r10, (%rsi)	# ...and store it to dst
	andq %r10, %r10		# val <= 0?
	jle Npos		# if so, goto Npos:
	iaddq $1, %rax		# count++
Npos:	
	iaddq $8, %rdi		# src++
	iaddq $8, %rsi		# dst++
	iaddq $-1, %rdx		# len--; must come last so the jg below sees its condition codes
	jg Loop			# len > 0? if so, goto Loop (requires iaddq to set the condition codes)
##################################################################

到这里了就要开始看书本第五章了。先去看。

好的看完回来了。

循环展开

通常展开k小于16，但现在有代码长度限制（9是极限）。看了几篇博文都说展开到7 8 9 都是差不多的。

我们就挑一个吉利的数字8，做8x8展开。

//原版本
word_t ncopy1(word_t *src, word_t *dst, word_t len)
{
    word_t count = 0;
    word_t val;

    while (len > 0) {
        val = *src; 
        src++;
        *dst = val; 
        dst++;
        if (val > 0)
            count++;
        len--;
    }
    return count;
}

/*
 * New version: 8-way loop unrolling.
 * Copies len words from src to dst and returns how many of them are
 * strictly positive. Eight words are handled per iteration while at
 * least eight remain; the leftover 0..7 words are copied one at a time.
 */
word_t ncopy1(word_t *src, word_t *dst, word_t len)
{
    word_t count = 0;
    word_t val;
    word_t acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;

    /* len >= 8, not len - 8 > 0: a block of exactly 8 must take this path */
    while (len >= 8) {
        acc0 = *(src);
        acc1 = *(src + 1);
        acc2 = *(src + 2);
        acc3 = *(src + 3);
        acc4 = *(src + 4);
        acc5 = *(src + 5);
        acc6 = *(src + 6);
        acc7 = *(src + 7);

        /* count the positive values among the eight */
        if (acc0 > 0) count++;
        if (acc1 > 0) count++;
        if (acc2 > 0) count++;
        if (acc3 > 0) count++;
        if (acc4 > 0) count++;
        if (acc5 > 0) count++;
        if (acc6 > 0) count++;
        if (acc7 > 0) count++;

        /* store all eight */
        *(dst)     = acc0;
        *(dst + 1) = acc1;
        *(dst + 2) = acc2;
        *(dst + 3) = acc3;
        *(dst + 4) = acc4;
        *(dst + 5) = acc5;
        *(dst + 6) = acc6;
        *(dst + 7) = acc7;

        len -= 8;
        src += 8;
        dst += 8;
    }

    /* handle the remaining elements */
    while (len > 0) {
        val = *src;
        src++;
        *dst = val;
        dst++;
        if (val > 0)
            count++;
        len--;
    }
    return count;
}

所以，