看了下 liangbch 兄打包的 Uint32sqrt 测试程序，发现要进一步改进 FPU 版本很难，不过我还是勉强做了些细微的改动，给出四个版本（fast_sqrt1 ～ fast_sqrt4），并加上 iSqrt_FPU2_yaos 和 iSqrt_FPU1_lbc 一起对比。
代码: #include <stdio.h> #include <time.h> typedef unsigned int DWORD; double b32[] = {0.0,4294967296.0}; __declspec(naked) DWORD __fastcall iSqrt_FPU2_yaos(DWORD n) { __asm { push ecx mov eax, ecx and eax, 0x80000000 shr eax, 31 fld qword ptr [b32 + eax * 8] fild dword ptr [esp] faddp st(1), st fsqrt sub esp, 8 fstp qword ptr [esp] mov edx, dword ptr [esp + 4] mov eax, edx and edx,0x7ff00000 and eax,0xfffff shr edx, 20 or eax, 0x100000 xchg ecx, edx sub ecx, 1043 neg ecx shr eax, cl xchg edx, ecx add esp, 12 or ecx, ecx cmove eax, ecx ret } } _declspec(naked) DWORD fast_sqrt1(DWORD x) { _asm { sub esp,4 mov dword ptr [esp+12],0 fild qword ptr [esp+8] fsqrt fisttp dword ptr[esp] mov eax,[esp] add esp,4 ret } } _declspec(naked) DWORD fast_sqrt2(DWORD x) { _asm { add esp,12 mov dword ptr [esp-4],0 fild qword ptr [esp-8] fsqrt fstp qword ptr [esp] mov ecx,[esp+4] ;//指数处理 mov eax,ecx ;//尾数处理 shr ecx,20 and eax,0xfffff sub esp,12 or eax,0x100000 test ecx,ecx ;//对0的处理 cmove eax,ecx sub ecx,1075 neg ecx shr eax,cl ret } } _declspec(naked) DWORD fast_sqrt3(DWORD x) { _asm { add esp,4 mov eax,dword ptr [esp] and eax, 0x80000000 shr eax, 28 fld qword ptr [b32 + eax] fild dword ptr [esp] faddp st(1), st fsqrt fstp qword ptr [esp+4] mov ecx,[esp+8] ;//指数处理 mov eax,ecx ;//尾数处理 shr ecx,20 and eax,0xfffff sub esp,4 or eax,0x100000 test ecx,ecx ;//对0的处理 cmove eax,ecx sub ecx,1075 neg ecx shr eax,cl ret } } __declspec(naked) DWORD __fastcall fast_sqrt4(DWORD n) { __asm { push ecx shr ecx, 31 fld qword ptr [b32 + ecx*8] fild dword ptr [esp] faddp st(1),st fsqrt fisttp dword ptr[esp] pop eax ret } } double zero5= 0.49999999999636;; __declspec(naked) DWORD __fastcall iSqrt_FPU1_lbc(DWORD n) { __asm { push ecx shr ecx, 31 fld qword ptr [b32 + ecx * 8] fild dword ptr [esp] faddp st(1),st fsqrt fsub qword ptr [zero5] fistp dword ptr [esp] pop eax ret } } int main() { double t0,t1; DWORD i; printf("Elapsed time: /n"); //============================= fast_sqrt1(0); t0=clock(); 
for(i=0;i<=0xfffffff;i++) //test { fast_sqrt1(i); } printf("fast_sqrt1 : %f s/n",(clock()-t0)/CLOCKS_PER_SEC); //============================= fast_sqrt2(0); t0=clock(); for(i=0;i<=0xfffffff;i++) //test { fast_sqrt2(i); } printf("fast_sqrt2 : %f s/n",(clock()-t0)/CLOCKS_PER_SEC); //============================= iSqrt_FPU2_yaos(0); t0=clock(); for(i=0;i<=0xfffffff;i++) //test { iSqrt_FPU2_yaos(i); } printf("iSqrt_FPU2_yaos: %f s/n",(clock()-t0)/CLOCKS_PER_SEC); //============================= fast_sqrt3(0); t0=clock(); for(i=0;i<=0xfffffff;i++) //test { fast_sqrt3(i); } printf("fast_sqrt3 : %f s/n",(clock()-t0)/CLOCKS_PER_SEC); //============================= fast_sqrt4(0); t0=clock(); for(i=0;i<=0xfffffff;i++) //test { fast_sqrt4(i); } printf("fast_sqrt4 : %f s/n",(clock()-t0)/CLOCKS_PER_SEC); //============================= iSqrt_FPU1_lbc(0); t0=clock(); for(i=0;i<=0xfffffff;i++) //test { iSqrt_FPU1_lbc(i); } printf("iSqrt_FPU1_lbc : %f s/n",(clock()-t0)/CLOCKS_PER_SEC); //============================= printf("/n/nBoundary Test./n"); printf("fast_sqrt1(0)=%10u /n",fast_sqrt1(0)); printf("fast_sqrt1(0xffffffff)=%u /n/n",fast_sqrt1(0xffffffff)); printf("fast_sqrt2(0)=%10u /n",fast_sqrt2(0)); printf("fast_sqrt2(0xffffffff)=%u /n/n",fast_sqrt2(0xffffffff)); printf("fast_sqrt3(0)=%10u /n",fast_sqrt3(0)); printf("fast_sqrt3(0xffffffff)=%u /n/n",fast_sqrt3(0xffffffff)); printf("fast_sqrt4(0)=%10u /n",fast_sqrt4(0)); printf("fast_sqrt4(0xffffffff)=%u /n",fast_sqrt4(0xffffffff)); printf("iSqrt_FPU1_lbc(0)=%10u /n",iSqrt_FPU1_lbc(0)); printf("iSqrt_FPU1_lbc(0xffffffff)=%u /n",iSqrt_FPU1_lbc(0xffffffff)); return 0; }