// f = g + (h - 5);
// f, g and h have been placed in registers x5, x6, x7
// Compute the temporary (h - 5) in the destination register x5 so that
// h (x7) keeps its value; the C statement only assigns to f.
addi x5, x7, -5 // x5 = h - 5
add x5, x6, x5 // f = g + (h - 5)
/*
add f, g, h
add f, i, f
*/
f = i + (g + h);
// B[8] = A[i - j];
//i in x28, j in x29. A in x10, B in x11
sub x30, x28, x29 // x30 = i - j (element index into A)
slli x30, x30, 3 // scale the index by 8 to get a byte offset (8-byte elements, matching ld/sd)
add x31, x10, x30 // x31 = address of A[i - j]
ld x9, 0(x31) // x9 = A[i - j]
sd x9, 64(x11) // B[8] = x9; byte offset 64 = 8 elements * 8 bytes
B[g * 8] = 2 * A[f * 8] + 8;
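little-endian (address 0 holds the least-significant byte):
address 0-3: 12 ef cd ab
big-endian (address 0 holds the most-significant byte):
address 0-3: ab cd ef 12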
0xabcdef12 = a * (16^7) + b * (16^6) + c * (16^5) + d *(16^4) + e * (16^3) + f * (16^2) + 1 * 16 + 2
= 2882400018
//B[8] = A[i] + A[j];
//i in x28, j in x29, A in x10, B in x11
// Scale each index by 8 BEFORE adding the base address, load both
// elements, add the loaded values, then store the sum into B[8].
slli x30, x28, 3 // x30 = i * 8 (byte offset of A[i])
add x30, x10, x30 // x30 = &A[i]
ld x30, 0(x30) // x30 = A[i]
slli x31, x29, 3 // x31 = j * 8 (byte offset of A[j])
add x31, x10, x31 // x31 = &A[j]
ld x31, 0(x31) // x31 = A[j]
add x30, x30, x31 // x30 = A[i] + A[j]
sd x30, 64(x11) // B[8] = A[i] + A[j]; offset 64 = 8 * 8 bytes
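A[1] = &A[0]; // x30 = x10 + 8 = &A[1], x31 = x10 = &A[0]; sd stores the address x31 at 0(x30)
f = &A[0] + &A[0]; // ld reloads A[1] (= &A[0]) into x30, so f = 2 * &A[0]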
//addi x30, x10, 8
immediate: 000000001000
rs1: 01010
funct3: 000
rd: 11110
opcode: 0010011
//addi x31, x10, 0
immediate: 000000000000
rs1: 01010
funct3: 000
rd: 11111
opcode: 0010011
//sd x31, 0(x30)
immediate: 0000000
rs2: 11111
rs1: 11110
funct3: 011
immediate: 00000
opcode: 0100011
//ld x30, 0(x30)
immediate: 000000000000
rs1: 11110
funct3: 011
rd: 11110
opcode: 0000011
//add x5, x30, x31
funct7: 0000000
rs2: 11111
rs1: 11110
funct3: 000
rd: 00101
opcode: 0110011
/*
x5 = 0x8000000000000000, x6 = 0xD000000000000000
*/
//add x30, x5, x6
x30 = 0x5000000000000000
overflow
//sub x30, x5, x6
x30 = 0xB000000000000000
no overflow (x5 = -2^63, x6 = -3 * 2^60, so x5 - x6 = -5 * 2^60, which fits in 64 bits)
//add x30, x5, x6
//add x30, x30, x5
x30 = 0xD000000000000000
overflow
// x5 = 128
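add x30, x5, x6 overflows when 0x7FFFFFFFFFFFFF80 <= x6 <= 0x7FFFFFFFFFFFFFFF (x6 >= 2^63 - 128)
sub x30, x5, x6 overflows when 0x8000000000000000 <= x6 <= 0x8000000000000080 (x6 <= -2^63 + 128)
sub x30, x6, x5 overflows when 0x8000000000000000 <= x6 <= 0x800000000000007F (x6 < -2^63 + 128)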
/*
0000 0000 0001 0000 1000 0000 1011 0011
*/
R-type instruction
funct7: 0000000
rs2: 00001
rs1: 00001
funct3: 000
rd: 00001
opcode: 0110011
add x1, x1, x1
//sd x5, 32(x30)
S-type instruction
immediate: 0000001
rs2: 00101
rs1: 11110
funct3: 011
immediate: 00000
opcode: 0100011
0000 0010 0101 1111 0011 0000 0010 0011
/*
opcode = 0x33, funct3 = 0x0, funct7 = 0x20, rs2 = 5, rs1 = 7, rd = 6
*/
R-type instruction
sub x6, x7, x5
0100 0000 0101 0011 1000 0011 0011 0011
/*
opcode = 0x3, funct3 = 0x3, rs1 = 27, rd = 3, imm = 0x4
*/
I-type instruction
ld x3, 4(x27)
0000 0000 0100 1101 1011 0001 1000 0011
/*
x5 = 0x00000000AAAAAAAA, x6 = 0x1234567812345678
*/
/*
slli x7, x5, 4
or x7, x7, x6
*/
x7 = 0x1234567ABABEFEF8
//slli x7, x6, 4
x7 = 0x2345678123456780
/*
srli x7, x5, 3
andi x7, x7, 0xFEF
*/
x7 = 0x545 (x5 >> 3 = 0x15555555; 0x555 & 0xFEF = 0x545, treating 0xFEF as a plain 12-bit mask)
// NOTE(review): presumably this is meant to copy the field x5[16:11] into x6[31:26]
// without changing any other bits — confirm against the exercise statement.
slli x5, x5, 15 // moves bit 11 of x5 up to bit 26; note that this overwrites the original x5
ori x5, x5, 0xFFC0FFFFFFFFFFFF // NOTE(review): ori only encodes a 12-bit sign-extended immediate, so this constant cannot be assembled as written
and x6, x6, x5 // AND can only clear bits of x6; it cannot set a bit that is currently 0 in x6
// not x5, x6
// xori takes a 12-bit immediate that is sign-extended to 64 bits, so -1
// supplies the all-ones mask; a 64-bit literal cannot be encoded.
xori x5, x6, -1 // x5 = ~x6
/*
x6 = A, x17 is the base address of C
A = C[0] << 4
*/
// Read C[0] from memory (a load, not a store) straight into A's register,
// then shift it in place; no extra temporary is needed.
ld x6, 0(x17) // x6 = C[0]
slli x6, x6, 4 // A = C[0] << 4
/*
x5 = 0x00000000001010000
bge x5, x0, ELSE
jal x0, DONE
ELSE: ori x6, x0, 2
DONE:
*/
x6 = 2
//PC = 0x20000000
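jal: 0x1FF00000 to 0x200FFFFE (PC - 2^20 to PC + 2^20 - 2; 20-bit immediate in multiples of 2 bytes)
branch: 0x1FFFF000 to 0x20000FFE (PC - 2^12 to PC + 2^12 - 2; 12-bit immediate in multiples of 2 bytes)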
//a proposed new instruction named rpt
// rpt x29, loop
if (x29 > 0){
x29 = x29 - 1;
goto loop
}
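I think the most appropriate instruction format is UJ-type (the format used by jal).
rpt needs only one register, x29, which goes in the rd field because it is both read and written; the 20-bit immediate holds the offset to loop, giving the largest branch range. The decrement by 1 is implied by the opcode, so it needs no immediate field.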
// Expansion of "rpt x29, loop": if (x29 > 0) { x29 = x29 - 1; goto loop; }
// Branches compare two registers, so the constant 0 is supplied by x0.
bge x0, x29, SKIP // x29 <= 0: do nothing and fall out
addi x29, x29, -1 // x29 > 0: decrement before jumping
jal x0, loop // goto loop
SKIP:
LOOP: beq x6, x0, DONE
addi x6, x6, -1
addi x5, x5, 2
jal x0, LOOP
DONE:
The final value in register x5 is 20
assuming the x5 is initially zero and the x6 is initially 10.
while (i != 0) {
i = i - 1 ;
acc = acc + 2;
}
4 * N + 1
LOOP: blt x6, x0, DONE
addi x6, x6, -1
addi x5, x5, 2
jal x0, LOOP
DONE:
while (i >= 0) {
i = i - 1;
acc = acc + 2;
}
/* a in x5, b in x6, i in x7, j in x29
x10 holds the base address of the array D */
//translate the following C code to RISC-V assembly code
for (i = 0; i < a; i++)
for (j = 0; j < b; j++)
D[4 * j] = i + j;
// for (i = 0; i < a; i++) for (j = 0; j < b; j++) D[4 * j] = i + j;
// a in x5, b in x6, i in x7, j in x29, base of D in x10 (never modified).
// x30 and x31 are used as temporaries.
addi x7, x0, 0 // i = 0
LOOPI: bge x7, x5, DONE // leave the outer loop once i >= a
addi x29, x0, 0 // j = 0 at the start of every outer iteration
LOOPJ: bge x29, x6, NEXTI // leave the inner loop once j >= b
add x30, x7, x29 // x30 = i + j
slli x31, x29, 5 // byte offset of D[4 * j]: 4 * j * 8 = j << 5
add x31, x10, x31 // x31 = &D[4 * j]
sd x30, 0(x31) // D[4 * j] = i + j
addi x29, x29, 1 // j++
jal x0, LOOPJ
NEXTI: addi x7, x7, 1 // i++
jal x0, LOOPI
DONE:
2 + a * (7 * b + 5)
if a = 10, b = 1, all elements of D are initially 0,
the total number of RISC-V instructions is 122.
//translate the following loop into C.
// i in x6, result in x5, MemArray in x10
addi x6, x0, 0
addi x29, x0, 100
LOOP: ld x7, 0(x10)
add x5, x5, x7
addi x10, x10, 8
addi x6, x6, 1
blt x6, x29, LOOP
i = 0;
do {
result = result + *MemArray;
MemArray++;
i = i + 1;
} while (i < 100);
// while (i < 100) { result += *MemArray; MemArray++; i++; }
// i in x6, result in x5, MemArray in x10, loop bound in x29.
addi x6, x0, 0 // i = 0
addi x29, x0, 100 // bge compares two registers, so the bound 100 must live in a register
LOOP: bge x6, x29, DONE // exit once i >= 100
ld x7, 0(x10) // x7 = *MemArray
add x5, x5, x7 // result += x7
addi x10, x10, 8 // MemArray++ (8-byte elements)
addi x6, x6, 1 // i++
jal x0, LOOP // labels are case-sensitive: jump back to LOOP
DONE:
//implement the following C code in RISC-V assembly
//the stack pointer must remain aligned on a multiple of 16
int fib(int n) {
if (n == 0)
return 0;
else if (n == 1)
return 1;
else
return fib(n - 1) + fib(n - 2);
}
// n in x10
// int fib(int n): n arrives in x10, the result is returned in x10.
// Frame (16 bytes, keeps sp 16-byte aligned):
//   8(sp) = return address
//   0(sp) = n, later reused to hold fib(n - 1) across the second call
// Temporaries x5 and x6 are NOT preserved across the recursive calls, so
// every value needed after a call is kept on the stack.
fib:
addi sp, sp, -16 //adjust stack for 2 items
sd x1, 8(sp) //save the return address
sd x10, 0(sp) //save the argument n
bne x10, x0, L1 //if n != 0, go to L1
addi x10, x0, 0 //return 0
addi sp, sp, 16 //pop 2 items off stack
jalr x0, 0(x1) //return to caller
L1: addi x6, x10, -2 //x6 = n - 2
bge x6, x0, L2 //if (n - 2) >= 0, go to L2
addi x10, x0, 1 //return 1
addi sp, sp, 16 //pop 2 items off stack
jalr x0, 0(x1) //return to caller
L2: addi x10, x10, -1 //n >= 2: argument gets (n - 1)
jal x1, fib //call fib with (n - 1)
ld x5, 0(sp) //reload n: x10 now holds fib(n - 1), not n
sd x10, 0(sp) //keep fib(n - 1) on the stack; the next call clobbers temporaries
addi x10, x5, -2 //argument gets (n - 2)
jal x1, fib //call fib with (n - 2)
ld x5, 0(sp) //x5 = fib(n - 1)
add x10, x5, x10 //return fib(n - 1) + fib(n - 2)
ld x1, 8(sp) //restore the return address
addi sp, sp, 16 //adjust stack pointer to pop 2 items
jalr x0, 0(x1) //return to the caller
// x7 contains the address 0x10000000 and the data at address is 0x1122334455667788
lb x6, 0(x7) // load the byte at 0x10000000, sign-extended to 64 bits (offset is the digit 0)
sd x6, 8(x7) // store the full 64-bit register at 0x10000008
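big-endian (lb at offset 0): 0x0000000000000011, since the byte at 0x10000000 is the most-significant byte 0x11
little-endian (lb at offset 0): 0xFFFFFFFFFFFFFF88, since the byte at 0x10000000 is 0x88 and lb sign-extends it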
// Build 0x1122334455667788 in x10. lui carries only a 20-bit immediate and
// addi a 12-bit one, so the constant is assembled as two 32-bit halves.
// Both 12-bit pieces (0x344, 0x788) are below 0x800 and both lui values have
// bit 31 clear, so no sign-extension correction is needed. x5 is a temporary.
lui x10, 0x11223 // x10 = 0x0000000011223000
addi x10, x10, 0x344 // x10 = 0x0000000011223344
slli x10, x10, 32 // x10 = 0x1122334400000000
lui x5, 0x55667 // x5 = 0x0000000055667000
addi x5, x5, 0x788 // x5 = 0x0000000055667788
add x10, x10, x5 // x10 = 0x1122334455667788
old execution time = (1 * 500 + 10 * 300 + 3 * 100) / clock rate = 3800 / rate
new execution time = (0.75 * 500 + 10 * 300 + 3 * 100) * 1.1 / clock rate = 4042.5 / rate
The new execution time is longer (4042.5 > 3800), so this is not a good design choice.
//double the performance of arithmetic instructions
new execution time = (0.5 * 500 + 10 * 300 + 3 * 100) / clock rate = 3550 / rate
the overall speedup = 3800 / 3550 = 1.07
//improve the performance of arithmetic instructions by 10 times
new execution time = (0.1 * 500 + 10 * 300 + 3 * 100) / clock rate = 3350 / rate
the overall speedup = 3800 / 3350 = 1.13
assume the number of instruction is I
the average CPI = (2 * 0.7 * I + 6 * 0.1 * I + 3 * 0.2 * I) / I = 2.6
assume arithmetic instructions requires x cycles
0.7 * x + 1.2 = 0.75 * 2.6
x = 1.07
assume arithmetic instructions requires x cycles
0.7 * x + 1.2 = 0.5 * 2.6
x = 0.14