在openmp中有个比较典型的测试例子cpp_compiler_options_openmp.cpp,展示了
for循环中的归约操作, #pragma omp parallel for reduction(+:sum) private(x)
自己也写了个多线程的版本,针对Intel Core 2 Duo CPU。
计算pi的方法很多,这个方法用于测试最好了,原理:http://wenku.baidu.com/view/3287baacdd3383c4bb4cd2ed.html
c代码:
/*
 * Approximate pi with the midpoint rule: integrate 4 / (1 + x^2) over
 * [0, 1] using num_steps equal-width intervals and return the estimate.
 */
double test1(int num_steps)
{
    const double width = 1.0 / (double) num_steps;  /* width of one interval */
    double acc = 0.0;                               /* running sum of heights */
    int k = 1;

    while (k <= num_steps) {
        const double mid = (k - 0.5) * width;       /* midpoint of interval k */

        acc = acc + 4.0 / (1.0 + mid * mid);
        ++k;
    }

    return width * acc;
}
使用MS vc++6.0 编译,得到FPU单线程版本。
用Intel C++编译,得到SSE单线程版本。
用Intel C++结合openmp生成SSE的3线程版本(两个计算线程,一个主线程)。
我将Intel C++编译得到的SSE单线程版本,手工改写成了与上面相同的3线程版本(即下面的mtTest)。
处理器:Intel Core(TM)2 Duo CPU E8500 @3.16GHz 3.16GHz win7 32位.
性能如下:
MS VC For 1000000000 steps, pi = 3.141592653589971, 6506 milliseconds 单线程
Intel C++ For 1000000000 steps, pi = 3.141592653589763, 3307 milliseconds 单线程
Openmp+Intel C++ For 1000000000 steps, pi = 3.141592653589738, 1684 milliseconds 3线程
my mtTest.exe For 1000000000 steps, pi = 3.141592653589738 ,1606 milliseconds 3线程
很容易扩展成更多的线程。
;>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
;*--==--* fasm multiple threads.
;*--==--* By G-Spider
;*--==--* fasm mtTest.asm mtTest.exe
;>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
;
; Syntax : flat assembler (fasm), Intel operand order.
; Target : 32-bit Windows PE console program, SSE2 required.
; ABI    : Win32 stdcall (ebx/esi/edi/ebp callee-saved, xmm registers
;          volatile, floating-point results returned in st0).
;
; Computes pi = step * sum(4 / (1 + ((i - 0.5) * step)^2)), i = 1..N,
; by splitting the index range between NUM_THREADS worker threads and
; adding the partial results once every worker has finished.

format PE console
entry start

include 'win32a.inc'

THREAD_PRIORITY_TIME_CRITICAL = 0fh
THREAD_PRIORITY_HIGHEST       = 02h
CREATE_SUSPENDED              = 04h
INFINITE                      = -1

; Loop is  i = 1; i <= N; i++  split into one sub-range per worker.
; NOTE: pd_step in the data section holds 1/N and must be changed with N.
N           = 1000000000

; Number of workers. Must match the number of records in `tasks`.
; The affinity mask built in `start` is a 32-bit value (1 shl index),
; so keep this <= 32.
NUM_THREADS = 2

; Layout of one per-worker record in `tasks`.
TASK_LOWER  = 0                         ; dd  first index (inclusive)
TASK_UPPER  = 4                         ; dd  last index (inclusive)
TASK_RESULT = 8                         ; dq  partial result, written by the worker
TASK_SIZE   = 16

;---------------------------------------------
section '.text' code readable executable

;-----------------------------------------------------------------------
; start - program entry point.
; Register roles: ebx = start tick, later elapsed milliseconds
;                 esi = worker index
;                 edi = handle of the thread currently being configured
;-----------------------------------------------------------------------
start:
        invoke  GetCurrentThread
        mov     edi, eax
        invoke  SetThreadPriority, edi, THREAD_PRIORITY_TIME_CRITICAL
        invoke  SetThreadAffinityMask, edi, 1
        ;==============================================================
        invoke  GetTickCount
        mov     ebx, eax                        ; ebx = start tick

        xor     esi, esi                        ; esi = worker index
.spawn:
        imul    edx, esi, TASK_SIZE
        add     edx, tasks                      ; edx -> record of worker esi
        invoke  CreateThread, 0, 0, ThreadProc, edx, CREATE_SUSPENDED, 0
        test    eax, eax
        jz      .fail
        mov     [hTrd+esi*4], eax
        mov     edi, eax
        invoke  SetThreadPriority, edi, THREAD_PRIORITY_HIGHEST
        mov     eax, 1
        mov     ecx, esi
        shl     eax, cl                         ; affinity mask = 1 shl index
        invoke  SetThreadAffinityMask, edi, eax
        invoke  ResumeThread, edi               ; created suspended; start it now
        inc     esi
        cmp     esi, NUM_THREADS
        jb      .spawn

        invoke  WaitForMultipleObjects, NUM_THREADS, hTrd, TRUE, INFINITE
        invoke  GetTickCount
        sub     eax, ebx
        mov     ebx, eax                        ; ebx = elapsed milliseconds
        ;==============================================================

        ; st0 = sum of all partial results
        fldz
        xor     esi, esi
.sum:
        imul    edx, esi, TASK_SIZE
        fadd    qword [tasks+edx+TASK_RESULT]
        inc     esi
        cmp     esi, NUM_THREADS
        jb      .sum

        ; printf(szFmt, (double)pi, (int)elapsed) - cdecl, 16 bytes of args
        push    ebx
        sub     esp, 8
        fstp    qword [esp]
        push    szFmt
        call    [printf]
        add     esp, 16

        ; release the worker thread handles
        xor     esi, esi
.close:
        invoke  CloseHandle, [hTrd+esi*4]
        inc     esi
        cmp     esi, NUM_THREADS
        jb      .close

        cinvoke system, szPause
        invoke  ExitProcess, 0

.fail:
        ; CreateThread returned NULL: report it and exit with a non-zero code.
        cinvoke printf, szThreadErr
        invoke  ExitProcess, 1

;-----------------------------------------------------------------------
; DWORD WINAPI ThreadProc(LPVOID lpParam)
; In:   lpParam -> one record of `tasks`
; Out:  eax = 0 (thread exit code)
; Runs _testPiSSE over [TASK_LOWER, TASK_UPPER] of the record and stores
; the returned double in its TASK_RESULT field.
;-----------------------------------------------------------------------
proc ThreadProc uses esi, lpParam
        mov     esi, [lpParam]
        stdcall _testPiSSE, [esi+TASK_LOWER], [esi+TASK_UPPER]
        fstp    qword [esi+TASK_RESULT]
        xor     eax, eax                        ; defined exit code
        ret
endp

;-----------------------------------------------------------------------
; double __stdcall _testPiSSE(int lower, int upper)
; In:    [ebp+8]  = lower (first index, inclusive)
;        [ebp+12] = upper (last index, inclusive)
; Out:   st0 = step * sum(4 / (1 + ((i - 0.5) * step)^2)), i = lower..upper
;        (0 when lower > upper)
; Clobb: eax, ecx, edx, xmm0-xmm7, flags; ebx is saved and restored.
; Stack: esp is realigned to 16 so [esp] can be used with movaps.
; The instruction order is the compiler's schedule and is kept as is.
;-----------------------------------------------------------------------
align 16
_testPiSSE:
        push    ebp
        mov     ebp, esp
        and     esp, -16
        push    ebx
        sub     esp, 28                         ; 4 + 28 = 32: esp stays 16-aligned
        mov     eax, dword [ebp+8]              ; eax = lower
        mov     ecx, dword [ebp+12]             ; ecx = upper
        cmp     eax, ecx
        jg      .empty                          ; lower > upper: nothing to sum

        sub     ecx, eax
        inc     ecx                             ; ecx = element count
        cmp     ecx, 8
        jl      .scalar_only                    ; fewer than 8: skip the vector loop

        mov     ebx, 2
        mov     edx, ecx
        and     edx, 7
        neg     edx
        pxor    xmm1, xmm1
        pxor    xmm6, xmm6                      ; xmm6 = accumulator A = 0
        add     edx, ecx                        ; edx = count rounded down to a multiple of 8
        movd    xmm3, ebx
        lea     ebx, [eax+1]
        pshufd  xmm4, xmm3, 0                   ; xmm4 = {2, 2, 2, 2} index increment
        movd    xmm3, eax
        movaps  xmm5, dqword [pd_step]          ; xmm5 = {step, step}
        movaps  xmm2, dqword [pd_half]          ; xmm2 = {0.5, 0.5}
        movd    xmm0, ebx
        xor     ebx, ebx                        ; ebx = elements processed so far
        punpckldq xmm3, xmm0
        movaps  xmm0, xmm6                      ; xmm0 = accumulator B = 0
        punpcklqdq xmm3, xmm1                   ; xmm3 = {lower, lower+1, 0, 0}

        ; 8 elements per iteration (4 pairs). Pairs 1 and 3 add into xmm6,
        ; pairs 2 and 4 add into accumulator B, which is spilled to [esp].
.vec_loop:
        cvtdq2pd xmm7, xmm3                     ; pair 1: (i, i+1) as doubles
        subpd   xmm7, xmm2
        mulpd   xmm7, xmm5                      ; x = (i - 0.5) * step
        mulpd   xmm7, xmm7
        movaps  xmm1, dqword [pd_one]
        paddd   xmm3, xmm4                      ; advance indices by 2
        addpd   xmm7, xmm1                      ; 1 + x*x
        movaps  dqword [esp], xmm0              ; spill accumulator B
        add     ebx, 8
        movaps  xmm0, dqword [pd_four]
        cmp     ebx, edx                        ; flags consumed by the jb at loop end
                                                ; (nothing below writes EFLAGS)
        divpd   xmm0, xmm7                      ; 4 / (1 + x*x)
        cvtdq2pd xmm7, xmm3                     ; pair 2
        addpd   xmm6, xmm0
        subpd   xmm7, xmm2
        mulpd   xmm7, xmm5
        mulpd   xmm7, xmm7
        addpd   xmm7, xmm1
        movaps  xmm0, dqword [pd_four]
        paddd   xmm3, xmm4
        divpd   xmm0, xmm7
        movaps  xmm7, dqword [esp]
        addpd   xmm7, xmm0
        cvtdq2pd xmm0, xmm3                     ; pair 3
        subpd   xmm0, xmm2
        mulpd   xmm0, xmm5
        mulpd   xmm0, xmm0
        addpd   xmm0, xmm1
        movaps  dqword [esp], xmm7
        paddd   xmm3, xmm4
        movaps  xmm7, dqword [pd_four]
        divpd   xmm7, xmm0
        cvtdq2pd xmm0, xmm3                     ; pair 4
        addpd   xmm6, xmm7
        subpd   xmm0, xmm2
        mulpd   xmm0, xmm5
        mulpd   xmm0, xmm0
        addpd   xmm0, xmm1
        movaps  xmm1, dqword [pd_four]
        paddd   xmm3, xmm4
        divpd   xmm1, xmm0
        movaps  xmm0, dqword [esp]
        addpd   xmm0, xmm1                      ; xmm0 = accumulator B
        jb      .vec_loop

        ; fold both accumulators and both lanes into the low lane of xmm6
        addpd   xmm6, xmm0
        movaps  xmm0, xmm6
        unpckhpd xmm0, xmm6
        addsd   xmm6, xmm0

.tail:
        ; edx = elements already done, eax = lower; handle the remainder
        movsd   xmm2, qword [pd_step]           ; low lane = step
        add     eax, edx                        ; eax = next index to process
        cmp     edx, ecx
        jae     .finish

        movsd   xmm1, qword [pd_half]           ; 0.5
        movsd   xmm0, qword [pd_one]            ; 1.0
.tail_loop:
        pxor    xmm3, xmm3                      ; break the dependency on old xmm3
        inc     edx
        cvtsi2sd xmm3, eax
        movsd   xmm4, qword [pd_four]           ; 4.0
        inc     eax
        cmp     edx, ecx                        ; flags consumed by the jb below
        subsd   xmm3, xmm1
        mulsd   xmm3, xmm2
        mulsd   xmm3, xmm3
        addsd   xmm3, xmm0
        divsd   xmm4, xmm3
        addsd   xmm6, xmm4
        jb      .tail_loop
        jmp     .finish

.empty:
        movsd   xmm2, qword [pd_step]
        pxor    xmm6, xmm6

.finish:
        mulsd   xmm2, xmm6                      ; step * sum
        movsd   qword [esp], xmm2
        fld     qword [esp]                     ; return value in st0
        add     esp, 28
        pop     ebx
        mov     esp, ebp
        pop     ebp
        retn    8

.scalar_only:
        xor     edx, edx                        ; nothing processed by the vector loop
        pxor    xmm6, xmm6
        jmp     .tail

;---------------------------------------------
section '.data' data readable writeable

; Packed double constants, loaded with movaps, so they must be 16-byte
; aligned. The scalar code reads the low lane of the same constants.
align 16
pd_step dd 0e826d695H,03e112e0bH,0e826d695H,03e112e0bH  ; step for N=1000000000
pd_half dd 000000000H,03fe00000H,000000000H,03fe00000H  ; 0.5
pd_four dd 000000000H,040100000H,000000000H,040100000H  ; 4.0
pd_one  dd 000000000H,03ff00000H,000000000H,03ff00000H  ; 1.0

; One record per worker: lower, upper, 8-byte result slot (see TASK_*).
tasks:
        dd      1, N/2-1000                     ; worker 0: [1, N/2-1000]
        dd      0, 0
        dd      N/2+1-1000, N                   ; worker 1: [N/2-999, N]
        dd      0, 0

szFmt       db '%.15lf ,%d ms', 0aH, 00H
szPause     db 'pause',0
szThreadErr db 'CreateThread failed', 0aH, 00H

hTrd    rd NUM_THREADS                          ; worker thread handles

;---------------------------------------------
section '.idata' import data readable writeable

library kernel32,'KERNEL32.DLL',\
        msvcrt,'msvcrt.dll'

include 'api\kernel32.inc'

import  msvcrt,\
        printf,'printf',\
        system,'system'