快速内存拷贝 fast_memcpy

拷贝长度精确到字节,提供三种拷贝方式:1. movsd;2. SSE 系列指令 + 软预取;3. SSE 系列指令 + 硬预取。要注意的是,并非所有拷贝用 SSE 都更优——正如只为行进 100 米,是开飞机还是步行咧?嘿嘿。所以,对于小字节量的拷贝,用 movsd、movsb 过渡。

 

测试平台(CPU-Z):

Intel(R) Celeron(R) CPU 2.66GHz

支持的指令集:MMX,SSE(1,2,3) ,EM64T

一级数据缓存:16KB

二级数据缓存:256KB

 

测试 32.1 MB 文件的内存拷贝:

 _fast_memcpy1 (movsd) 33 ms

_fast_memcpy9 (SSE 系列+软预取) 23 ms

_block_prefetch (硬预取 block_size 8KB) 22 ms

 

代码:
;************************************************************
;-==-: fast_memcpyTest By G-Spider @2010
;-==-: ml /c /coff memcpyTest.asm      (note: use ML 6.15 or later)
;-==-: link /subsystem:console memcpyTest.obj
;************************************************************
; Syntax: MASM (Intel operand order). Target: 32-bit x86, flat model,
; stdcall as the default calling convention.
        .686p
        .XMM
        .model  flat, stdcall
        option  casemap:none

        include     windows.inc
        include     user32.inc
        include     kernel32.inc
        include     msvcrt.inc
        includelib  user32.lib
        includelib  kernel32.lib
        includelib  msvcrt.lib

; Bytes handled per outer iteration of _block_prefetch.
BLOCK_SIZE      equ     8192

        .data
; Multiplier applied to the tick delta before dividing by the counter
; frequency: 1000 reports milliseconds, 1000000 would report microseconds.
dwlm            DWORD   1000
fmt             BYTE    '计算用时:',0dh,0ah,0
fmt1            BYTE    '%6lld ms',0dh,0ah,0
szFileName      BYTE    'xinyu.avi',0           ; input file (32,954 KB)
szOutName       BYTE    'output.avi',0          ; output file
; Alternative small test pair (63 KB input; switch dwlm to microseconds):
;szFileName     BYTE    'test.png',0            ; input file
;szOutName      BYTE    'output.png',0          ; output file
szPause         BYTE    'Pause',0

        .data?
hHandle         DWORD   ?                       ; input file handle
hHandle1        DWORD   ?                       ; output file handle
lpInputBuf      DWORD   ?                       ; 16-byte aligned source buffer
lpOutputBuf     DWORD   ?                       ; 16-byte aligned destination buffer
dwStrlen        DWORD   ?                       ; input file size in bytes
lpNumberOfBytes DWORD   ?                       ; byte count written back by ReadFile/WriteFile
dwOldProcessP   DWORD   ?                       ; saved process priority class
dwOldThreadP    DWORD   ?                       ; saved thread priority
;-------------------------------------
dqTickCounter1  QWORD   ?                       ; performance counter before the copy
dqTickCounter2  QWORD   ?                       ; counter after the copy, then the delta
dqFreq          QWORD   ?                       ; performance counter frequency
dqTime          QWORD   ?                       ; elapsed time, in units selected by dwlm
.code ;************************************* _fast_memcpy1 proc lpdst,lpsrc,dwlen ;%define param esp+8+4 ;%define src param+0 ;%define dst param+4 ;%define len param+8 mov esi, lpsrc ; source array mov edi, lpdst ; destination array mov ecx, dwlen mov eax,ecx and eax,3 shr ecx, 2 ; convert to DWORD count test ecx,ecx jz A000 rep movsd A000: test eax,eax jz A001 mov ecx,eax rep movsb A001: xor eax,eax ret _fast_memcpy1 endp ;*************************************** _fast_memcpy9 proc lpdst,lpsrc,dwlen mov esi, lpsrc ;src pointer mov edi, lpdst ;dest pointer mov ebx, dwlen ;ebx is our counter mov ecx, ebx and ecx, 07fh ;剩余的<128字节 shr ebx, 7 ;divide by 128 (8 * 128bit registers) test ebx,ebx jz A000 ALIGN 16 loop_copy: prefetchnta 128[ESI]; SSE2 prefetch prefetchnta 160[ESI]; prefetchnta 192[ESI]; prefetchnta 224[ESI]; movdqa xmm0, 0[ESI] ; move data from src to registers movdqa xmm1, 16[ESI]; movdqa xmm2, 32[ESI]; movdqa xmm3, 48[ESI]; movdqa xmm4, 64[ESI]; movdqa xmm5, 80[ESI]; movdqa xmm6, 96[ESI]; movdqa xmm7, 112[ESI]; movntdq 0[EDI], xmm0 ; move data from registers to dest movntdq 16[EDI], xmm1; movntdq 32[EDI], xmm2; movntdq 48[EDI], xmm3; movntdq 64[EDI], xmm4; movntdq 80[EDI], xmm5; movntdq 96[EDI], xmm6; movntdq 112[EDI], xmm7; add esi, 128; add edi, 128; dec ebx; jnz loop_copy; //loop please sfence align 16 A000: mov eax, ecx and eax, 3 shr ecx, 2 ; convert to DWORD count test ecx,ecx jz short A001 rep movsd A001: test eax,eax jz A002 mov ecx,eax rep movsb A002: xor eax,eax ret _fast_memcpy9 endp ;***************************************************** _block_prefetch proc lpdst,lpsrc,dwlen mov edi, lpdst mov esi, lpsrc mov eax, dwlen mov edx, eax and eax, (BLOCK_SIZE-1) ;4096-1=0fffh ;8192-1=1fffh;16*1024-1=3fffh and edx, 0ffffe000h ;与 BLOCK_SIZE有关 test edx,edx jz A000 align 16 main_loop: xor ecx,ecx align 16 prefetch_loop: movaps xmm0, [esi+ecx] movaps xmm0, [esi+ecx+64] add ecx,128 cmp ecx,BLOCK_SIZE jne prefetch_loop xor ecx,ecx align 16 cpy_loop: movdqa 
xmm0,[esi+ecx] movdqa xmm1,[esi+ecx+16] movdqa xmm2,[esi+ecx+32] movdqa xmm3,[esi+ecx+48] movdqa xmm4,[esi+ecx+64] movdqa xmm5,[esi+ecx+16+64] movdqa xmm6,[esi+ecx+32+64] movdqa xmm7,[esi+ecx+48+64] movntdq [edi+ecx],xmm0 movntdq [edi+ecx+16],xmm1 movntdq [edi+ecx+32],xmm2 movntdq [edi+ecx+48],xmm3 movntdq [edi+ecx+64],xmm4 movntdq [edi+ecx+80],xmm5 movntdq [edi+ecx+96],xmm6 movntdq [edi+ecx+112],xmm7 add ecx,128 cmp ecx,BLOCK_SIZE jne cpy_loop add esi,ecx add edi,ecx sub edx,ecx jnz main_loop sfence align 16 A000: mov ecx, eax and eax, 3 shr ecx, 2 ; convert to DWORD count test ecx,ecx jz short A001 rep movsd A001: test eax,eax jz A002 mov ecx,eax rep movsb A002: xor eax,eax ret _block_prefetch endp ;***************************************************** start: invoke CreateFile,offset szFileName,GENERIC_READ,FILE_SHARE_READ,/ NULL,OPEN_EXISTING,FILE_ATTRIBUTE_NORMAL,NULL .if eax == INVALID_HANDLE_VALUE invoke MessageBox,NULL,0,0,0 .endif mov hHandle,eax invoke GetFileSize,eax,NULL mov dwStrlen,eax add eax,16 invoke crt_malloc,eax mov lpInputBuf,eax mov edx,lpInputBuf and eax,0fh jz Good1 xor eax,edx add eax,10h mov lpInputBuf,eax Good1: invoke RtlZeroMemory,lpInputBuf,dwStrlen invoke ReadFile,hHandle,lpInputBuf,dwStrlen,offset lpNumberOfBytes,NULL mov eax,dwStrlen add eax,16 invoke crt_malloc,eax mov lpOutputBuf,eax mov edx,lpOutputBuf and eax,0fh jz Good2 xor eax,edx add eax,10h mov lpOutputBuf,eax Good2: invoke RtlZeroMemory,lpOutputBuf,dwStrlen ;---------------------------------------------------- invoke crt_printf,offset fmt mov ecx,5 ;测试5次 .while ecx!=0 push ecx invoke GetCurrentProcess invoke GetPriorityClass,eax mov dwOldProcessP,eax invoke GetCurrentThread invoke GetThreadPriority,eax mov dwOldThreadP,eax invoke GetCurrentProcess invoke SetPriorityClass,eax,REALTIME_PRIORITY_CLASS invoke GetCurrentThread invoke SetThreadPriority,eax,THREAD_PRIORITY_TIME_CRITICAL ;-------------------------------------------------- invoke QueryPerformanceCounter,addr 
dqTickCounter1 ;时间测试 ;invoke _fast_memcpy1,lpOutputBuf,lpInputBuf,dwStrlen ;invoke _fast_memcpy9,lpOutputBuf,lpInputBuf,dwStrlen invoke _block_prefetch,lpOutputBuf,lpInputBuf,dwStrlen ;测试结束 invoke QueryPerformanceCounter,addr dqTickCounter2 invoke QueryPerformanceFrequency,addr dqFreq mov eax,dword ptr dqTickCounter1 mov edx,dword ptr dqTickCounter1[4] sub dword ptr dqTickCounter2,eax sub dword ptr dqTickCounter2[4],edx ;---------------------------------------------------- ;优先级还原 invoke GetCurrentThread invoke SetThreadPriority,eax,dwOldThreadP invoke GetCurrentProcess invoke SetPriorityClass,eax, dwOldProcessP finit fild dqFreq fild dqTickCounter2 fimul dwlm fdivr fistp dqTime ;dqTime中的64位值就是时间间隔(以微秒为单位) ;--------------------------------------------------- invoke crt_printf,offset fmt1,dqTime pop ecx dec ecx .endw ;输出copy文件 invoke CreateFile,offset szOutName,GENERIC_WRITE,FILE_SHARE_READ,/ NULL,CREATE_ALWAYS,FILE_ATTRIBUTE_NORMAL,NULL .if eax == INVALID_HANDLE_VALUE invoke MessageBox,NULL,0,0,0 .endif mov hHandle1,eax invoke WriteFile,eax,lpOutputBuf,dwStrlen,offset lpNumberOfBytes,NULL invoke CloseHandle,hHandle invoke CloseHandle,hHandle1 invoke crt_system,offset szPause invoke ExitProcess,0 end start

你可能感兴趣的:(thread,File,user,测试,null,System)