// Micro-benchmark comparing
//   * sin(i)+cos(i) computed with the C math library vs. the x87 FSINCOS
//     instruction, and
//   * sqrt(a*a+b*b)+0.5 computed with scalar C++ vs. SSE intrinsics,
// over arrays of ARRAY_SIZE floats. Each routine prints its own elapsed time
// in milliseconds.
#include <xmmintrin.h>  // SSE intrinsics; also declares _mm_malloc/_mm_free
#include <math.h>
#include <stdlib.h>     // size_t, system
#include <time.h>
#include <iostream>
using namespace std;

// MSVC accepts __asm blocks only when targeting 32-bit x86. Everywhere else
// the x87 routine falls back to the C math library so the file still builds.
#if defined(_MSC_VER) && defined(_M_IX86)
#define asm __asm
#define HAVE_X87_INLINE_ASM 1
#else
#define HAVE_X87_INLINE_ASM 0
#endif

#define ARRAY_SIZE 3000000

// Timestamps shared by all benchmark routines; each routine overwrites both.
clock_t start,finish;

// Prints `label` immediately followed by the elapsed time between the global
// `start` and `finish` timestamps, in milliseconds.
static void report_elapsed(const char* label)
{
    cout<<label<<(finish-start)*1000.0/CLOCKS_PER_SEC<<endl;
}

// Fills pSource[0..nCount) with sin(i)+cos(i) using the C math library and
// prints the time taken.
void init_CPLUSPLUS(float* pSource ,int nCount)
{
    start = clock();
    for(int i=0;i<nCount;i++)
    {
        const float x = (float)i;
        pSource[i] = (float)(sin(x))+(float)cos(x);
    }
    finish = clock();
    report_elapsed("init CPLUSPLUS");
}

// Fills pSource[0..nCount) with sin(i)+cos(i) and prints the time taken.
// On 32-bit x86 MSVC builds the value is produced by the x87 FSINCOS
// instruction; on every other target the C math library is used instead, so
// the timing is then not an x87 measurement.
void init_CPLUSPLUSFPU(float* pSource ,int nCount)
{
    start = clock();
    for(int i=0;i<nCount;i++)
    {
#if HAVE_X87_INLINE_ASM
        asm
        {
            fild  i                 // push the loop index as a floating-point value
            fsincos                 // st(0) = cos, st(1) = sin
            fadd                    // no-operand form: add and pop, leaving sin+cos in st(0)
            mov   eax, pSource
            fstp  dword ptr [eax]   // store as float and pop, leaving the x87 stack balanced
        }
#else
        *pSource = (float)(sin((float)i))+(float)cos((float)i);
#endif
        pSource++;
    }
    finish = clock();
    report_elapsed("init CPLUSPLUSFPU");
}

// Scalar reference: pResult[i] = sqrt(pSource1[i]^2 + pSource2[i]^2) + 0.5
// for i in [0, nCount). Prints the time taken.
void sqrt_CPLUSPLUS(float* pSource1,float* pSource2,float* pResult,int nCount)
{
    start = clock();
    for(int i=0;i<nCount;i++)
    {
        const float a = pSource1[i];
        const float b = pSource2[i];
        // The sum is formed in double (0.5 is a double literal) and then
        // narrowed, exactly as the implicit conversion did before.
        pResult[i] = (float)(sqrt(a*a+b*b)+0.5);
    }
    finish = clock();
    report_elapsed("sqrt_CPLUSPLUS");
}

// SSE version of sqrt_CPLUSPLUS, processing four floats per iteration.
// All three pointers must be 16-byte aligned because the packed loop uses
// aligned loads/stores. Elements left over when nCount is not a multiple of
// four are handled one at a time with the scalar SSE instructions, so the
// whole range [0, nCount) is written.
void sqrt_CPLUSPLUSSSE(float* pSource1,float* pSource2,float* pResult,int nCount)
{
    start = clock();
    const int nBlocks = nCount/4;
    const __m128 half = _mm_set_ps1(0.5f);

    for(int blk=0;blk<nBlocks;blk++)
    {
        const int offset = blk<<2;
        __m128 a = _mm_load_ps(pSource1+offset);
        __m128 b = _mm_load_ps(pSource2+offset);
        a = _mm_mul_ps(a,a);
        b = _mm_mul_ps(b,b);
        __m128 r = _mm_sqrt_ps(_mm_add_ps(a,b));
        _mm_store_ps(pResult+offset,_mm_add_ps(r,half));
    }

    // Tail: up to three remaining elements, same arithmetic in scalar SSE.
    for(int i=nBlocks*4;i<nCount;i++)
    {
        __m128 a = _mm_load_ss(pSource1+i);
        __m128 b = _mm_load_ss(pSource2+i);
        a = _mm_mul_ss(a,a);
        b = _mm_mul_ss(b,b);
        __m128 r = _mm_sqrt_ss(_mm_add_ss(a,b));
        _mm_store_ss(pResult+i,_mm_add_ss(r,half));
    }

    finish = clock();
    report_elapsed("sqrt_CPLUSPLUSSSE");
}

// Runs both initialisers and both sqrt kernels on 16-byte-aligned buffers and
// cross-checks each pair of results element by element with a 1e-5 tolerance.
// Returns 0 normally and 1 if a buffer could not be allocated.
int main()
{
    const size_t nBytes = ARRAY_SIZE*sizeof(float);
    float* pSource1 = (float*)_mm_malloc(nBytes,16);
    float* pSource2 = (float*)_mm_malloc(nBytes,16);
    float* pResult1 = (float*)_mm_malloc(nBytes,16);
    float* pResult2 = (float*)_mm_malloc(nBytes,16);

    int status = 0;
    if(!pSource1 || !pSource2 || !pResult1 || !pResult2)
    {
        cerr<<"aligned allocation failed"<<endl;
        status = 1;
    }
    else
    {
        init_CPLUSPLUS(pSource1,ARRAY_SIZE);
        init_CPLUSPLUSFPU(pSource2,ARRAY_SIZE);
        for(int i=0;i<ARRAY_SIZE;i++)
        {
            if(fabs(pSource1[i]-pSource2[i])>0.00001f)
            {
                cout<<"init error";
                break;
            }
        }

        sqrt_CPLUSPLUS(pSource1,pSource2,pResult1,ARRAY_SIZE);
        sqrt_CPLUSPLUSSSE(pSource1,pSource2,pResult2,ARRAY_SIZE);
        for(int i=0;i<ARRAY_SIZE;i++)
        {
            if(fabs(pResult1[i]-pResult2[i])>0.00001f)
            {
                cout<<"sqrt error";
                break;
            }
        }
    }

    // Release in reverse order of allocation; skip buffers that were never obtained.
    if(pResult2) _mm_free(pResult2);
    if(pResult1) _mm_free(pResult1);
    if(pSource2) _mm_free(pSource2);
    if(pSource1) _mm_free(pSource1);

#ifdef _WIN32
    // Keep the console window open when launched from the IDE.
    system("pause");
#endif
    return status;
}
用x87指令和SSE指令对三角函数的计算和开方计算进行优化。
平台:软件 VS2010,CPU Intel Celeron E3400 2.6GHz
Release版本运行结果如下:
init CPLUSPLUS:359ms
init CPLUSPLUSFPU:141ms
sqrt_CPLUSPLUS:47ms
sqrt_CPLUSPLUSSSE:15ms