依然是 CSAPP bomblab phase_1 的小白视角的理解。先前已经根据汇编代码,手工翻译到了C代码,但是步骤上显得很吃力。为什么呢?汇编代码比较短的时候(比如10行之内),能够记住每个寄存器的值; 汇编代码超过10行时,很容易忘记每个寄存器里的值。虽然可以用注释的方式,把变量和寄存器名字绑定起来,但是每翻译一小段汇编代码后,就需要更新对应的C代码,过于繁琐了。例如 strings_not_equal 大概分成了10小段汇编代码。而且这个划分未必是合理的,还容易漏掉内容。
本篇则是前面两篇的尝试手工反汇编的重构版本: 使用了标准的4个步骤,让反汇编变得容易:
第0步: 打开 gdb, 查看函数汇编代码:
gdb bomb
(gdb) disas string_length
第1步: 复制汇编代码到文本文件,在汇编代码右侧,逐句写出对应的C代码
Dump of assembler code for function string_length: // int string_length(char* str)
0x000000000040131b <+0>: cmp BYTE PTR [rdi],0x0 // if (*str==0)
0x000000000040131e <+3>: je 0x401332 <string_length+23> // goto label23
0x0000000000401320 <+5>: mov rdx,rdi // char* p = str;
0x0000000000401323 <+8>: add rdx,0x1 // p += 1;
0x0000000000401327 <+12>: mov eax,edx // int ret = p;
0x0000000000401329 <+14>: sub eax,edi // ret = ret - str;
0x000000000040132b <+16>: cmp BYTE PTR [rdx],0x0 // if (!(*p == 0))
0x000000000040132e <+19>: jne 0x401323 <string_length+8> // goto label8
0x0000000000401330 <+21>: repz ret // return ret
0x0000000000401332 <+23>: mov eax,0x0 // ret = 0
0x0000000000401337 <+28>: ret // return ret
第2步:抄写C代码,并且添加用到的label,这些 label 是 goto 的目的位置
int string_length(char* str)
{
if (*str == 0)
{
goto label23;
}
char* p = str;
label8:
p += 1;
int ret = p - str;
if (*p != 0)
{
goto label8;
}
return ret;
label23:
ret = 0;
return ret;
}
第3步:消除 goto 和 label:
int string_length(char* str)
{
if (*str == 0)
{
return 0;
}
char* p = str;
while (true)
{
p += 1;
int ret = p - str;
if (*p == 0)
{
return ret;
}
}
}
第4步: 消除冗余C代码
int string_length(char* str)
{
if (*str == 0)
{
return 0;
}
char* p = str;
int ret = 1;
while (true)
{
p += 1;
ret = p - str;
if (*p == 0)
{
break;
}
}
return ret;
}
第5步: 消除冗余C代码
int string_length(char* str)
{
for (char* p = str; ; p++)
{
int ret = p - str;
if (*p == 0)
{
return ret;
}
}
}
第0步: gdb 加载可执行文件, 使用 disas 命令查看反汇编代码。
gdb bomb
(gdb) disas strings_not_equal:
第1步: 在汇编代码右侧,逐句写出对应的C代码:
Dump of assembler code for function strings_not_equal: // int strings_not_equal(const char* s1, const char* s2) {
0x0000000000401338 <+0>: push r12 //
0x000000000040133a <+2>: push rbp //
0x000000000040133b <+3>: push rbx //
0x000000000040133c <+4>: mov rbx,rdi // char* p1 = s1; 根据后面 BYTE PTR 能确定类型是 char*
0x000000000040133f <+7>: mov rbp,rsi // char* p2 = s2;
0x0000000000401342 <+10>: call 0x40131b <string_length> // int len1 = string_length(s1);
0x0000000000401347 <+15>: mov r12d,eax //
0x000000000040134a <+18>: mov rdi,rbp // char* p = s2;
0x000000000040134d <+21>: call 0x40131b <string_length> // int len2 = string_length(p);
0x0000000000401352 <+26>: mov edx,0x1 // int ret = 1;
0x0000000000401357 <+31>: cmp r12d,eax // if (len1 != len2)
0x000000000040135a <+34>: jne 0x40139b <strings_not_equal+99> // goto label99
0x000000000040135c <+36>: movzx eax,BYTE PTR [rbx] // char c1 = *p1;
0x000000000040135f <+39>: test al,al // if (c1 == '\0')
0x0000000000401361 <+41>: je 0x401388 <strings_not_equal+80> // goto label80
0x0000000000401363 <+43>: cmp al,BYTE PTR [rbp+0x0] // if (c1 == *p2)
0x0000000000401366 <+46>: je 0x401372 <strings_not_equal+58> // goto label58
0x0000000000401368 <+48>: jmp 0x40138f <strings_not_equal+87> // goto label87
0x000000000040136a <+50>: cmp al,BYTE PTR [rbp+0x0] // if (c1 != *p2)
0x000000000040136d <+53>: nop DWORD PTR [rax] //
0x0000000000401370 <+56>: jne 0x401396 <strings_not_equal+94> // goto label94
0x0000000000401372 <+58>: add rbx,0x1 // p1++;
0x0000000000401376 <+62>: add rbp,0x1 // p2++;
0x000000000040137a <+66>: movzx eax,BYTE PTR [rbx] // c1 = *p1;
0x000000000040137d <+69>: test al,al // if (c1 != '\0')
0x000000000040137f <+71>: jne 0x40136a <strings_not_equal+50> // goto label50
0x0000000000401381 <+73>: mov edx,0x0 // ret = 0
0x0000000000401386 <+78>: jmp 0x40139b <strings_not_equal+99> // goto label99
0x0000000000401388 <+80>: mov edx,0x0 // ret = 0
0x000000000040138d <+85>: jmp 0x40139b <strings_not_equal+99> // goto label99
0x000000000040138f <+87>: mov edx,0x1 // ret = 1
0x0000000000401394 <+92>: jmp 0x40139b <strings_not_equal+99> // goto label99
0x0000000000401396 <+94>: mov edx,0x1 // ret = 1
0x000000000040139b <+99>: mov eax,edx // return ret
0x000000000040139d <+101>: pop rbx
0x000000000040139e <+102>: pop rbp
0x000000000040139f <+103>: pop r12
0x00000000004013a1 <+105>: ret
第2步: 逐句抄写 C 代码, 并且添加对应的 label, 用于 goto:
int strings_not_equal(const char* s1, const char* s2)
{
char* p1 = s1;
char* p2 = s2;
int len1 = string_length(s1);
int len2 = string_length(s2);
int ret = 1;
if (len1 != len2)
{
goto label99
}
char c1 = *p1;
if (c1 =='\0')
{
goto label80;
}
if (c1 == *p2)
{
goto label58;
}
else
{
goto label87;
}
label50:
if (c1 != *p2)
{
goto label94;
}
label58:
p1++;
p2++;
c1 = *p1;
if (c1 != '\0')
goto label50
ret = 0;
goto label99;
label80:
ret = 0;
goto label99;
label87:
ret = 1;
goto label99;
label94:
ret = 1;
goto label99;
label99:
return ret;
}
第3步: 消除用于返回返回值的 goto:
int strings_not_equal(const char* s1, const char* s2)
{
char* p1 = s1;
char* p2 = s2;
int len1 = string_length(s1);
int len2 = string_length(s2);
if (len1 != len2)
{
return 1;
}
char c1 = *p1;
if (c1 =='\0')
{
return 0;
}
if (c1 == *p2)
{
goto label58;
}
else
{
return 1;
}
label50:
if (c1 != *p2)
{
return 1;
}
label58:
p1++;
p2++;
c1 = *p1;
if (c1 != '\0')
goto label50
return 0;
}
第4步: 消除最后的 goto, 消除重复代码, 转为 while 循环:
int strings_not_equal(const char* s1, const char* s2)
{
char* p1 = s1;
char* p2 = s2;
int len1 = string_length(s1);
int len2 = string_length(s2);
if (len1 != len2)
{
return 1;
}
while (true)
{
char c1 = *p1;
if (c1 =='\0')
{
return 0;
}
if (c1 != *p2)
{
return 1;
}
p1++;
p2++;
}
}
围棋里有定式。反汇编也有定式。
cmp + je: if( A == B) goto C
0x000000000040131b <+0>: cmp BYTE PTR [rdi],0x0 // if (*str==0)
0x000000000040131e <+3>: je 0x401332 <string_length+23> // goto label23
cmp + jne: if (A != B) goto C
0x000000000040132b <+16>: cmp BYTE PTR [rdx],0x0 // if (!(*p == 0))
0x000000000040132e <+19>: jne 0x401323 <string_length+8> // goto label8
更多例子:
0x0000000000401357 <+31>: cmp r12d,eax // if (len1 != len2)
0x000000000040135a <+34>: jne 0x40139b <strings_not_equal+99> // goto label99
0x0000000000401363 <+43>: cmp al,BYTE PTR [rbp+0x0] // if (c1 == *p2)
0x0000000000401366 <+46>: je 0x401372 <strings_not_equal+58> // goto label58
0x0000000000401368 <+48>: jmp 0x40138f <strings_not_equal+87> // goto label87
0x000000000040136a <+50>: cmp al,BYTE PTR [rbp+0x0] // if (c1 != *p2)
0x000000000040136a <+50>: cmp al,BYTE PTR [rbp+0x0] // if (c1 != *p2)
0x000000000040136d <+53>: nop DWORD PTR [rax] //
0x0000000000401370 <+56>: jne 0x401396 <strings_not_equal+94> // goto label94
test A,A + je: if (A == 0) goto B
0x000000000040135f <+39>: test al,al // if (c1 == '\0')
0x0000000000401361 <+41>: je 0x401388 <strings_not_equal+80> // goto label80
test A, A + jne: if (A != 0) goto B
0x000000000040137d <+69>: test al,al // if (c1 != '\0')
0x000000000040137f <+71>: jne 0x40136a <strings_not_equal+50> // goto label50