c 内联汇编 crc 32 算法 ...

__declspec(naked) static __cdecl getCrc32(unsigned int size, unsigned char* buffer) {

__asm {
mov edx, 4[esp] ; - U ecx <- loop count
mov esi, 8[esp] ; - V esi <- source buffer

lea edi, [crc32_table] ; - U crc32 table
xor ecx, ecx ; - V

mov eax, -1 ; - U
add esi, edx ; - V

neg edx ; - N

mov cl, al ; - U
align 16 ; - V

main_loop:

shr eax, 8 ; - U
xor cl, [edx+esi] ; - V 2 cycle ...

xor eax, [edi + ecx*4] ; - U
inc edx ; - V 2 cycle ..

mov cl, al ; - U
jne main_loop ; - V 1 cycle ...
imt_ret:
xor eax, -1
ret
}
}


最初写的时候忘记保存寄存器了（ESI/EDI 属于被调用者需要保存的寄存器）。这段代码是参照原始的查表 CRC32 算法做的汇编优化，不排除会比算法层面优化过的 CRC32 实现更慢 -_- 。它一次只处理一个字节；本来想一次多处理几个字节，但发现每次迭代都依赖上一次迭代的结果。优化过程中还可能遇到无法避免的 AGI（Address Generation Interlock，地址生成互锁）冲突。整个循环每次迭代都要读两次内存（读 CRC 表、读缓冲区数据），除此之外全部用寄存器完成。值得一提的是，SSE4.2 已经在硬件指令层面支持 CRC 计算（crc32 指令；注意它使用的是 CRC-32C 多项式，与本文的查找表不同）：http://softpixel.com/~cwright/programming/simd/sse4.php

d 语言版本
import std.digest.crc;
import std.stdio;
import std.c.stdio;

/**
 * Table-driven CRC-32 of `size` bytes starting at `pBuffer`
 * (32-bit x86, D inline assembly, C calling convention).
 *
 * The function is naked, so the arguments are read relative to ESP and the
 * body ends with its own `ret`. ESI and EDI are saved on entry and restored
 * before returning.
 *
 * The running CRC starts at 0xFFFFFFFF and is inverted before returning.
 * Each byte is folded in as
 *     crc = (crc >> 8) ^ crc32_table[(crc ^ b) & 0xFF]
 *
 * Returns: the CRC in EAX; an empty buffer (size == 0) yields 0. The original
 * version had no such guard, so a zero size made the counter wrap and the
 * loop ran far past the buffer.
 */
extern(C)
uint getCrc32(uint size, ubyte* pBuffer) {
    asm {
        naked ;                         // no compiler-generated prologue/epilogue

        push EDI ;
        push ESI ;

        mov EDX, 12[ESP] ;              // EDX <- size (return address + two saved registers = 12)
        mov ESI, 16[ESP] ;              // ESI <- pBuffer

        lea EDI, [crc32_table] ;        // EDI <- base of the lookup table
        xor ECX, ECX ;                  // clear ECX so CL can serve as a zero-extended index

        mov EAX, -1 ;                   // EAX <- running CRC, initial value 0xFFFFFFFF
        add ESI, EDX ;                  // ESI <- one past the last byte

        neg EDX ;                       // EDX <- -size, counts up to zero; sets ZF when size is 0
        jz crc_done ;                   // empty input: skip the loop instead of wrapping around

        mov CL, AL ;                    // CL <- low byte of the CRC (mov leaves the flags alone)
        align 16 ;

    main_loop:
        shr EAX, 8 ;                    // crc >>= 8
        xor CL, [EDX+ESI] ;             // CL <- (old crc low byte) ^ next input byte

        xor EAX, [EDI + ECX*4] ;        // crc ^= crc32_table[CL]
        inc EDX ;                       // advance; ZF is set after the last byte

        mov CL, AL ;                    // prepare the index for the next byte (flags untouched)
        jne main_loop ;

    crc_done:
        pop ESI ;
        pop EDI ;

        xor EAX, -1 ;                   // final inversion
        ret ;
    }
}
// F7D18982
/**
 * Executes the RDTSC instruction and returns immediately.
 *
 * The function is naked, so the body consists of exactly the two
 * instructions below. RDTSC leaves the 64-bit time-stamp counter in EDX:EAX,
 * which is handed back unchanged as the `ulong` result.
 * NOTE(review): this relies on the 32-bit x86 C convention of returning
 * 64-bit integers in EDX:EAX - confirm before building for another target.
 */
extern(C)
ulong RDTSC () {
asm {
naked;
rdtsc;
ret;
}
}
// crc32Of
/**
 * Benchmark driver: hashes the same 64000-byte buffer (all bytes 0xFF) with
 * std.digest.crc's crc32Of and with the inline-assembly getCrc32, printing
 * the elapsed RDTSC ticks and the resulting hash for each, 15 times over.
 *
 * Each hash is computed into a local before the closing RDTSC() is read.
 * The original passed `RDTSC() - tStart` and the getCrc32 call as arguments
 * of the same printf, so whether the call fell inside the timed window
 * depended on the order in which the arguments were evaluated.
 */
void main () {

    uint[16000] p;

    p[0..$] = 0xFFFFFFFF;

    // Byte length of the buffer, derived from the array instead of a literal.
    immutable uint byteCount = cast(uint)(p.length * uint.sizeof);

    int index = 15;
    while (index--) {
        ulong tStart = RDTSC();
        ubyte[4] digest = crc32Of(p);
        uint stdTicks = cast(uint)(RDTSC() - tStart);

        // Reinterpret the 4 digest bytes as one native-endian uint, as the
        // original cast(uint[]) did, so both hashes print in the same form.
        uint stdHash = *cast(uint*)digest.ptr;

        printf ("std time : %u hash:%x\n", stdTicks, stdHash);

        tStart = RDTSC();
        uint asmHash = getCrc32(byteCount, cast(ubyte*)p.ptr);
        uint asmTicks = cast(uint)(RDTSC() - tStart);

        printf ("asm time : %u hash:%x\n\n", asmTicks, asmHash);
    }
}


Microsoft Windows XP [版本 5.1.2600]
(C) 版权所有 1985-2001 Microsoft Corp.

D:\Downloads\dmd.2.067.0.windows\dmd2\windows\bin>dmd -O -release -inline -bound
scheck=off main.d

D:\Downloads\dmd.2.067.0.windows\dmd2\windows\bin>main
std time : 944818 hash:2d732d7c
asm time : 476119 hash:2d732d7c

std time : 807100 hash:2d732d7c
asm time : 483308 hash:2d732d7c

std time : 828443 hash:2d732d7c
asm time : 440832 hash:2d732d7c

std time : 787787 hash:2d732d7c
asm time : 466522 hash:2d732d7c

std time : 1089501 hash:2d732d7c
asm time : 467411 hash:2d732d7c

std time : 777931 hash:2d732d7c
asm time : 480207 hash:2d732d7c

std time : 816949 hash:2d732d7c
asm time : 463925 hash:2d732d7c

std time : 782110 hash:2d732d7c
asm time : 440370 hash:2d732d7c

std time : 802452 hash:2d732d7c
asm time : 494844 hash:2d732d7c

std time : 823711 hash:2d732d7c
asm time : 483938 hash:2d732d7c

std time : 807352 hash:2d732d7c
asm time : 440496 hash:2d732d7c

std time : 810950 hash:2d732d7c
asm time : 1092294 hash:2d732d7c

std time : 824159 hash:2d732d7c
asm time : 557011 hash:2d732d7c

std time : 983766 hash:2d732d7c
asm time : 484351 hash:2d732d7c

std time : 786254 hash:2d732d7c
asm time : 440832 hash:2d732d7c


D:\Downloads\dmd.2.067.0.windows\dmd2\windows\bin>
貌似也快不了多少 ... -_- ..

后来又在 VC 下测试，发现还是慢了一点……（横向对比一下，发现 dmd 的优化确实不怎么样。）于是改了一下：按理说寻址模式（addressing mode）的切换应该会带来额外开销，U/V 流水线的指令配对也全被打乱了，结果发现反而更快…… @#($(#Q&@(*@ ... ):

#include <stdio.h>
#include <windows.h>

/*
 * 256-entry lookup table for the byte-at-a-time CRC-32 routines in this file.
 * Entry i is the value XORed into the shifted CRC when the index byte is i;
 * entry 0x80 is 0xedb88320, the reflected CRC-32 polynomial constant.
 * Rows hold eight entries each; the leading comment is the index of the
 * first entry in the row.
 */
unsigned int crc32_table[256] = {
    /* 0x00 */ 0x00000000, 0x77073096, 0xee0e612c, 0x990951ba, 0x076dc419, 0x706af48f, 0xe963a535, 0x9e6495a3,
    /* 0x08 */ 0x0edb8832, 0x79dcb8a4, 0xe0d5e91e, 0x97d2d988, 0x09b64c2b, 0x7eb17cbd, 0xe7b82d07, 0x90bf1d91,
    /* 0x10 */ 0x1db71064, 0x6ab020f2, 0xf3b97148, 0x84be41de, 0x1adad47d, 0x6ddde4eb, 0xf4d4b551, 0x83d385c7,
    /* 0x18 */ 0x136c9856, 0x646ba8c0, 0xfd62f97a, 0x8a65c9ec, 0x14015c4f, 0x63066cd9, 0xfa0f3d63, 0x8d080df5,
    /* 0x20 */ 0x3b6e20c8, 0x4c69105e, 0xd56041e4, 0xa2677172, 0x3c03e4d1, 0x4b04d447, 0xd20d85fd, 0xa50ab56b,
    /* 0x28 */ 0x35b5a8fa, 0x42b2986c, 0xdbbbc9d6, 0xacbcf940, 0x32d86ce3, 0x45df5c75, 0xdcd60dcf, 0xabd13d59,
    /* 0x30 */ 0x26d930ac, 0x51de003a, 0xc8d75180, 0xbfd06116, 0x21b4f4b5, 0x56b3c423, 0xcfba9599, 0xb8bda50f,
    /* 0x38 */ 0x2802b89e, 0x5f058808, 0xc60cd9b2, 0xb10be924, 0x2f6f7c87, 0x58684c11, 0xc1611dab, 0xb6662d3d,
    /* 0x40 */ 0x76dc4190, 0x01db7106, 0x98d220bc, 0xefd5102a, 0x71b18589, 0x06b6b51f, 0x9fbfe4a5, 0xe8b8d433,
    /* 0x48 */ 0x7807c9a2, 0x0f00f934, 0x9609a88e, 0xe10e9818, 0x7f6a0dbb, 0x086d3d2d, 0x91646c97, 0xe6635c01,
    /* 0x50 */ 0x6b6b51f4, 0x1c6c6162, 0x856530d8, 0xf262004e, 0x6c0695ed, 0x1b01a57b, 0x8208f4c1, 0xf50fc457,
    /* 0x58 */ 0x65b0d9c6, 0x12b7e950, 0x8bbeb8ea, 0xfcb9887c, 0x62dd1ddf, 0x15da2d49, 0x8cd37cf3, 0xfbd44c65,
    /* 0x60 */ 0x4db26158, 0x3ab551ce, 0xa3bc0074, 0xd4bb30e2, 0x4adfa541, 0x3dd895d7, 0xa4d1c46d, 0xd3d6f4fb,
    /* 0x68 */ 0x4369e96a, 0x346ed9fc, 0xad678846, 0xda60b8d0, 0x44042d73, 0x33031de5, 0xaa0a4c5f, 0xdd0d7cc9,
    /* 0x70 */ 0x5005713c, 0x270241aa, 0xbe0b1010, 0xc90c2086, 0x5768b525, 0x206f85b3, 0xb966d409, 0xce61e49f,
    /* 0x78 */ 0x5edef90e, 0x29d9c998, 0xb0d09822, 0xc7d7a8b4, 0x59b33d17, 0x2eb40d81, 0xb7bd5c3b, 0xc0ba6cad,
    /* 0x80 */ 0xedb88320, 0x9abfb3b6, 0x03b6e20c, 0x74b1d29a, 0xead54739, 0x9dd277af, 0x04db2615, 0x73dc1683,
    /* 0x88 */ 0xe3630b12, 0x94643b84, 0x0d6d6a3e, 0x7a6a5aa8, 0xe40ecf0b, 0x9309ff9d, 0x0a00ae27, 0x7d079eb1,
    /* 0x90 */ 0xf00f9344, 0x8708a3d2, 0x1e01f268, 0x6906c2fe, 0xf762575d, 0x806567cb, 0x196c3671, 0x6e6b06e7,
    /* 0x98 */ 0xfed41b76, 0x89d32be0, 0x10da7a5a, 0x67dd4acc, 0xf9b9df6f, 0x8ebeeff9, 0x17b7be43, 0x60b08ed5,
    /* 0xa0 */ 0xd6d6a3e8, 0xa1d1937e, 0x38d8c2c4, 0x4fdff252, 0xd1bb67f1, 0xa6bc5767, 0x3fb506dd, 0x48b2364b,
    /* 0xa8 */ 0xd80d2bda, 0xaf0a1b4c, 0x36034af6, 0x41047a60, 0xdf60efc3, 0xa867df55, 0x316e8eef, 0x4669be79,
    /* 0xb0 */ 0xcb61b38c, 0xbc66831a, 0x256fd2a0, 0x5268e236, 0xcc0c7795, 0xbb0b4703, 0x220216b9, 0x5505262f,
    /* 0xb8 */ 0xc5ba3bbe, 0xb2bd0b28, 0x2bb45a92, 0x5cb36a04, 0xc2d7ffa7, 0xb5d0cf31, 0x2cd99e8b, 0x5bdeae1d,
    /* 0xc0 */ 0x9b64c2b0, 0xec63f226, 0x756aa39c, 0x026d930a, 0x9c0906a9, 0xeb0e363f, 0x72076785, 0x05005713,
    /* 0xc8 */ 0x95bf4a82, 0xe2b87a14, 0x7bb12bae, 0x0cb61b38, 0x92d28e9b, 0xe5d5be0d, 0x7cdcefb7, 0x0bdbdf21,
    /* 0xd0 */ 0x86d3d2d4, 0xf1d4e242, 0x68ddb3f8, 0x1fda836e, 0x81be16cd, 0xf6b9265b, 0x6fb077e1, 0x18b74777,
    /* 0xd8 */ 0x88085ae6, 0xff0f6a70, 0x66063bca, 0x11010b5c, 0x8f659eff, 0xf862ae69, 0x616bffd3, 0x166ccf45,
    /* 0xe0 */ 0xa00ae278, 0xd70dd2ee, 0x4e048354, 0x3903b3c2, 0xa7672661, 0xd06016f7, 0x4969474d, 0x3e6e77db,
    /* 0xe8 */ 0xaed16a4a, 0xd9d65adc, 0x40df0b66, 0x37d83bf0, 0xa9bcae53, 0xdebb9ec5, 0x47b2cf7f, 0x30b5ffe9,
    /* 0xf0 */ 0xbdbdf21c, 0xcabac28a, 0x53b39330, 0x24b4a3a6, 0xbad03605, 0xcdd70693, 0x54de5729, 0x23d967bf,
    /* 0xf8 */ 0xb3667a2e, 0xc4614ab8, 0x5d681b02, 0x2a6f2b94, 0xb40bbe37, 0xc30c8ea1, 0x5a05df1b, 0x2d02ef8d
};

/*
 * Table-driven CRC-32 of `size` bytes starting at `buffer`
 * (32-bit x86, MSVC inline assembly, cdecl).
 *
 * This variant walks the buffer with an incrementing pointer (ESI) and a
 * separate down-counter (EDI), and addresses crc32_table directly instead of
 * through a base register.
 *
 * The function is naked, so the arguments are read relative to ESP and the
 * body ends with its own `ret`. ESI and EDI are saved on entry and restored
 * before returning.
 *
 * Returns the CRC in EAX; an empty buffer (size == 0) yields 0. Without the
 * guard below, `dec edi` on a zero count wraps around and the loop reads far
 * past the buffer.
 */
__declspec(naked) static unsigned int __cdecl asm_getCrc32(unsigned int size, unsigned char* buffer) {
    __asm {
        push edi
        push esi

        mov edi, 12[esp]            ; edi <- size (remaining byte count)
        mov esi, 16[esp]            ; esi <- buffer (read cursor)

        xor ecx, ecx                ; clear ecx so it can hold a zero-extended table index
        mov eax, -1                 ; eax <- running CRC, initial value 0xFFFFFFFF

        test edi, edi
        jz crc_done                 ; empty input: nothing to fold in

        mov cl, al                  ; ecx <- low byte of the initial CRC (0xFF)
        align 16

    main_loop:
        shr eax, 8                  ; crc >>= 8
        movzx edx, byte ptr [esi]   ; edx <- next input byte, zero-extended

        inc esi
        xor ecx, edx                ; ecx <- (old crc low byte) ^ input byte

        xor eax, [crc32_table + ecx*4]  ; crc ^= crc32_table[ecx]
        movzx ecx, al               ; index seed for the next byte

        dec edi                     ; ZF is set after the last byte
        jne main_loop

    crc_done:
        pop esi
        pop edi

        xor eax, -1                 ; final inversion
        ret
    }
}

/*
 * Executes the RDTSC instruction and returns immediately.
 *
 * The function is naked, so the body consists of exactly the two
 * instructions below. RDTSC leaves the 64-bit time-stamp counter in EDX:EAX,
 * which is handed back unchanged as the unsigned __int64 result.
 * NOTE(review): this relies on the 32-bit x86 convention of returning 64-bit
 * integers in EDX:EAX - confirm before building for another target.
 */
__declspec(naked) static unsigned __int64 __cdecl RDTSC(){
__asm {
rdtsc
ret
}
}

DWORD getCrc32(int size, unsigned char* c)
{
DWORD r = 0xFFFFFFFFUL;
for (int i = 0; i < size; i++)
r = (r >> 8) ^ crc32_table[(BYTE)r ^ *c++];

return r ^ 0xFFFFFFFFUL;
}

void main(){
static unsigned int m_array[320000000];
// memset (m_array, -1, 16000000);

unsigned int i = 20;
while(i--){
unsigned __int32 tStart = timeGetTime();
__asm {
lea eax,[m_array]
push eax
push 128000000
call dword ptr [getCrc32]
add esp, 8
}

printf ("nor time : %d \n", (unsigned int)(timeGetTime() - tStart));
tStart= timeGetTime();
UINT hi = asm_getCrc32(128000000, (unsigned char*)&m_array);
printf ("asm time : %d \n\n", (unsigned int)(timeGetTime() - tStart));
}
}

vc++ 2010 release 版本结果

nor time : 551
asm time : 461

nor time : 468
asm time : 456

nor time : 501
asm time : 492

nor time : 492
asm time : 462

nor time : 513
asm time : 690

nor time : 633
asm time : 516

nor time : 475
asm time : 448

nor time : 464
asm time : 472

nor time : 460
asm time : 441

nor time : 485
asm time : 468

nor time : 483
asm time : 450

nor time : 449
asm time : 471

nor time : 512
asm time : 463

nor time : 477
asm time : 487

nor time : 478
asm time : 463

nor time : 468
asm time : 481

nor time : 462
asm time : 463

nor time : 459
asm time : 451

nor time : 484
asm time : 501

nor time : 467
asm time : 487

/*
 * Table-driven CRC-32 of `size` bytes starting at `buffer`
 * (32-bit x86, MSVC inline assembly, cdecl).
 *
 * Same algorithm as the pointer-walking variant, with the initial table
 * index loaded as a constant and the counter decrement moved ahead of the
 * index reload inside the loop.
 *
 * The function is naked, so the arguments are read relative to ESP and the
 * body ends with its own `ret`. ESI and EDI are saved on entry and restored
 * before returning.
 *
 * Returns the CRC in EAX; an empty buffer (size == 0) yields 0. Without the
 * guard below, `dec edi` on a zero count wraps around and the loop reads far
 * past the buffer.
 */
__declspec(naked) static unsigned int __cdecl asm_getCrc32(unsigned int size, unsigned char* buffer) {
    __asm {
        push edi
        push esi

        mov eax, -1                 ; eax <- running CRC, initial value 0xFFFFFFFF
        mov ecx, 0xFF               ; ecx <- low byte of the initial CRC

        mov edi, 12[esp]            ; edi <- size (remaining byte count)
        mov esi, 16[esp]            ; esi <- buffer (read cursor)

        test edi, edi
        jz crc_done                 ; empty input: nothing to fold in

        align 16

    main_loop:
        shr eax, 8                  ; crc >>= 8
        movzx edx, byte ptr [esi]   ; edx <- next input byte, zero-extended

        inc esi
        xor ecx, edx                ; ecx <- (old crc low byte) ^ input byte

        xor eax, [crc32_table + ecx*4]  ; crc ^= crc32_table[ecx]
        dec edi                     ; ZF is set after the last byte

        movzx ecx, al               ; index seed for the next byte (movzx leaves the flags alone)
        jne main_loop

    crc_done:
        pop esi
        pop edi

        xor eax, -1                 ; final inversion
        ret
    }
}

你可能感兴趣的:(c 内联汇编 crc 32 算法 ...)