
                   The Optimization Process of the Intel Multi-Core Platform Coding Optimization Contest -- Addendum
                       [email protected]  2007.01.20

tags: multi-core programming, sse2, Newton iteration, code optimization, optimization contest, invsqrt, square root

   For the main write-up, see my article "The Optimization Process of the Intel Multi-Core Platform Coding Optimization Contest": http://blog.csdn.net/housisong/archive/2007/01/20/1488465.aspx
This article is a supplement to it. It provides a complete float implementation, a hand-written double-to-float conversion, a hand-crafted rough starting value for the invSqrt iteration, and a few other attempts that did not really pay off.

    CPU used for testing and optimization: AMD64x2 3600+ (dual-core CPU)
                         Operating system: Windows XP 32bit
                                 Compiler: Visual Studio 2005

Execution time of the original contest code: 3.97 seconds


A complete float implementation (it trades some computation precision for speed); the source code follows.
(An assembly implementation should be able to squeeze out a bit more speed, as should the ICC compiler.)

/*  compute the potential energy of a collection of  */
/*  particles interacting via pairwise potential     */
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <windows.h>
#include <xmmintrin.h>   // uses SSE1
#include <process.h>
#include <vector>


////////////////////////////////////////////////////////////////////////////////

// Author: Hou Sisong  [email protected]
// Version whose computed results have float (single-precision) accuracy

#define _IS_FAST
// trade precision for speed; if not defined, the computation reaches full float (single) precision

////////////////////////////////////////////////////////////////////////////////

//#define _NEW_TIME_CLOCK

#ifdef _NEW_TIME_CLOCK
    #define clock_t double
    double CLOCKS_PER_SEC=0.0;
    inline double clock() {
        __int64 result;
        if (CLOCKS_PER_SEC==0)
        {
            QueryPerformanceFrequency((LARGE_INTEGER*)&result);
            CLOCKS_PER_SEC=(double)result;
        }
        QueryPerformanceCounter((LARGE_INTEGER*)&result);
        return (double)result;
    }
#else
    #include <time.h>
#endif

#define _IS_USES_MY_RAND
// rand() is only ever called from a single thread, so using a custom rand is safe

const long DefthreadCount=2;  // 1,2,4,8,16,..   split the computation into multiple tasks executed in parallel


float& m128_value(__m128& x,const long index) { return ((float*)(&x))[index]; }

#define NPARTS 1000
#define NITER 201
#define DIMS 3

#ifdef _IS_USES_MY_RAND
    class CMyRand
    {
    private:
        unsigned long _my_holdrand;
    public:
        CMyRand():_my_holdrand(1){}
        inline int _my_rand (void)
        {
            unsigned long result=_my_holdrand * 214013 + 2531011;
            _my_holdrand=result;
            return ( (result>>16) & RAND_MAX );
        }
    };
    CMyRand _MyRand;
    inline int _my_rand (void){ return _MyRand._my_rand(); }
#else
    #define _my_rand rand
#endif

int  rand( void );
int  computePot();
void initPositions(void);
void updatePositions(void);

__declspec(align(16)) float r[DIMS][(NPARTS+3)/4*4];   // 16-byte aligned
double pot;

int main() {
   int i;
   clock_t start, stop;
   //char ctmp; std::cin>>ctmp;

   initPositions();
   updatePositions();

   start=clock();
   for( i=0; i<NITER; i++ ) {
      pot = 0.0;
      computePot();
      if (i%10 == 0)
      {
         printf("%5d: Potential: %20.7f\n", i, pot);
      }
      updatePositions();
   }
   pot = 0.0;
   stop=clock();
   printf ("Seconds = %10.9f\n",(double)(stop-start)/CLOCKS_PER_SEC);

   return 0;
}


void initPositions() {
   int i, j;
   for( i=0; i<DIMS; i++ ){
       for( j=0; j<NPARTS; j++ )
            r[i][j]= (float)( 0.5 + _my_rand()*(1.0/RAND_MAX) );
   }
}

void updatePositions() {
   int i,j;
   for (i=0;i<DIMS;++i)
   {
       for( j=0; j<NPARTS; ++j )
       {
            r[i][j] -= (float)( 0.5 + _my_rand()*(1.0/RAND_MAX) );
       }
   }
}

struct TWorkData
{
    long      iBegin;
    long      iEnd;
    double    fResult;
};

const __m128  xmms1_5 ={ ( 1.5),( 1.5),( 1.5),( 1.5) };
const __m128  xmms_0_5={ (-0.5),(-0.5),(-0.5),(-0.5) };

void computePotPart_forj(int i,__m128* pResult)
{
      __m128 lcpot=_mm_setzero_ps();
      __m128 _mmi0=_mm_set1_ps(-r[0][i]);
      __m128 _mmi1=_mm_set1_ps(-r[1][i]);
      __m128 _mmi2=_mm_set1_ps(-r[2][i]);
      int j=0;
      //for( j=0; j<i-1; j++ ) {       // "j<i-1" looks odd here; it was probably meant to be "j<i" !
      //* unroll this loop 4 times
      for(;j+4<i;j+=4)
      {
          __m128 sm0=_mm_add_ps(*(__m128*)&r[0][j],_mmi0);
          sm0=_mm_mul_ps(sm0,sm0);
          __m128 sm1=_mm_add_ps(*(__m128*)&r[1][j],_mmi1);
          sm1=_mm_mul_ps(sm1,sm1);
          __m128 sm2=_mm_add_ps(*(__m128*)&r[2][j],_mmi2);
          sm2=_mm_mul_ps(sm2,sm2);
          sm0=_mm_add_ps(sm0,sm1);
          sm0=_mm_add_ps(sm0,sm2);

          sm1=_mm_rsqrt_ps(sm0);   // 1/sqrt(...)

          #ifndef _IS_FAST
            // Newton iteration to improve the precision of the square root
            // Newton iteration for 1/sqrt(a): x_next=(3-a*x*x)*x*0.5 = ( 1.5 + (a*(-0.5))*(x*x) ) * x
            sm0=_mm_mul_ps(sm0,xmms_0_5);  // a*(-0.5)
            sm0=_mm_mul_ps(sm0,sm1);
            sm0=_mm_mul_ps(sm0,sm1);
            sm0=_mm_add_ps(sm0,xmms1_5);
            sm0=_mm_mul_ps(sm0,sm1);
          #else
            sm0=sm1;
          #endif

          lcpot=_mm_add_ps(lcpot,sm0);
      }
      //*/
      for(;j<i-1;++j)
      {
          __m128 sm0=_mm_set_ss(r[0][j]);
          sm0=_mm_add_ss(sm0,_mmi0);
          sm0=_mm_mul_ss(sm0,sm0);
          __m128 sm1=_mm_set_ss(r[1][j]);
          sm1=_mm_add_ss(sm1,_mmi1);
          sm1=_mm_mul_ss(sm1,sm1);
          __m128 sm2=_mm_set_ss(r[2][j]);
          sm2=_mm_add_ss(sm2,_mmi2);
          sm2=_mm_mul_ss(sm2,sm2);
          sm0=_mm_add_ss(sm0,sm1);
          sm0=_mm_add_ss(sm0,sm2);

          sm1=_mm_rsqrt_ss(sm0);
          #ifndef _IS_FAST
            // Newton iteration to improve the precision of the square root
            // Newton iteration for 1/sqrt(a): x_next=(3-a*x*x)*x*0.5 = ( 1.5 + (a*(-0.5))*(x*x) ) * x
            sm0=_mm_mul_ps(sm0,xmms_0_5);  // a*(-0.5)
            sm0=_mm_mul_ps(sm0,sm1);
            sm0=_mm_mul_ps(sm0,sm1);
            sm0=_mm_add_ps(sm0,xmms1_5);
            sm0=_mm_mul_ps(sm0,sm1);
          #else
            sm0=sm1;
          #endif

          lcpot=_mm_add_ss(lcpot,sm0);
      }
      *pResult=_mm_add_ps(*pResult,lcpot);
}

void computePotPart(TWorkData* work_data) {
   int i;

   __m128 lcpot=_mm_setzero_ps();

   //#pragma omp parallel for schedule(static)
   for( i=work_data->iBegin; i<work_data->iEnd; i++ ) {
      computePotPart_forj(i,&lcpot);
   }

   __m128 dt0;
   dt0=_mm_movehl_ps(lcpot,lcpot);
   lcpot=_mm_add_ps(lcpot,dt0);
   dt0=_mm_shuffle_ps(lcpot,lcpot,1);
   lcpot=_mm_add_ss(lcpot,dt0);

   work_data->fResult=m128_value(lcpot,0);
}

/////////////////////////////////////////////////////////////
// Worker thread pool: TWorkThreadPool
// Splits one task into several thread tasks.
// It assumes the small tasks have roughly equal workloads.
// todo: change to a model where threads claim work items themselves
class TWorkThreadPool;

typedef void (*TThreadCallBack)(void * pData);
enum TThreadState{ thrStartup=0, thrReady, thrBusy, thrTerminate, thrDeath };

class TWorkThread
{
public:
    volatile HANDLE             thread_handle;
    volatile enum TThreadState  state;
    volatile TThreadCallBack    func;
    volatile void *             pdata;   // work data
    volatile HANDLE             waitfor_event;
    TWorkThreadPool*            pool;
    volatile DWORD              thread_ThreadAffinityMask;

    TWorkThread() { memset(this,0,sizeof(TWorkThread)); }
};

void do_work_end(TWorkThread* thread_data);

void __cdecl thread_dowork(TWorkThread* thread_data) //void __stdcall thread_dowork(TWorkThread* thread_data)
{
    volatile TThreadState& state=thread_data->state;
    SetThreadAffinityMask(GetCurrentThread(),thread_data->thread_ThreadAffinityMask);
    state = thrStartup;

    while(true)
    {
        WaitForSingleObject(thread_data->waitfor_event, -1);
        if(state == thrTerminate)
            break;

        state = thrBusy;
        volatile TThreadCallBack& func=thread_data->func;
        if (func!=0)
            func((void *)thread_data->pdata);
        do_work_end(thread_data);
    }
    state = thrDeath;
    _endthread();//ExitThread(0);
}

class TWorkThreadPool
{
private:
    volatile HANDLE             thread_event;
    volatile HANDLE             new_thread_event;
    std::vector<TWorkThread>    work_threads;
    inline int passel_count() const { return (int)work_threads.size()+1; }
    void inti_threads(long work_count) {
        SYSTEM_INFO SystemInfo;
        GetSystemInfo(&SystemInfo);
        long cpu_count =SystemInfo.dwNumberOfProcessors;
        long best_count=cpu_count;
        if (cpu_count>work_count) best_count=work_count;

        long newthrcount=best_count-1;
        work_threads.resize(newthrcount);
        thread_event    =CreateSemaphore(0, 0,newthrcount, 0);
        new_thread_event=CreateSemaphore(0, 0,newthrcount, 0);
        for(long i=0; i<newthrcount; ++i)
        {
            work_threads[i].waitfor_event=thread_event;
            work_threads[i].state=thrTerminate;
            work_threads[i].pool=this;
            work_threads[i].thread_ThreadAffinityMask=1<<(i+1);
            //DWORD thr_id;
            work_threads[i].thread_handle=(HANDLE)_beginthread((void (__cdecl *)(void *))thread_dowork, 0, (void*)&work_threads[i]);
                //CreateThread(0, 0, (LPTHREAD_START_ROUTINE)thread_dowork,(void*) &work_threads[i], 0, &thr_id);
        }
        SetThreadAffinityMask(GetCurrentThread(),0x01);
        for(long i=0; i<newthrcount; ++i)
        {
            while(true) {
                if (work_threads[i].state == thrStartup) break;
                else Sleep(0);
            }
            work_threads[i].state=thrReady;
        }
    }
    void free_threads(void)
    {
        long thr_count=(long)work_threads.size();
        long i;
        for(i=0; i<thr_count; ++i)
        {
            while(true) {
                if (work_threads[i].state == thrReady) break;
                else Sleep(0);
            }
            work_threads[i].state=thrTerminate;
        }
        if (thr_count>0)
            ReleaseSemaphore(thread_event,thr_count, 0);
        for(i=0; i<thr_count; ++i)
        {
            while(true) {
                if (work_threads[i].state == thrDeath) break;
                else Sleep(0);
            }
        }
        CloseHandle(thread_event);
        CloseHandle(new_thread_event);
        work_threads.clear();
    }
    void passel_work(TThreadCallBack work_proc,void** word_data_list,int work_count)    {
        //assert(work_count>=1);
        //assert(work_count<=passel_count());
        if (work_count==1)
        {
            work_proc(word_data_list[0]);
        }
        else
        {
            long i;
            long thr_count=(long)work_threads.size();
            for(i=0; i<work_count-1; ++i)
            {
                work_threads[i].func =work_proc;
                work_threads[i].pdata=word_data_list[i];
                work_threads[i].state=thrBusy;
            }
            for(i=work_count-1; i<thr_count; ++i)
            {
                work_threads[i].func =0;
                work_threads[i].pdata=0;
                work_threads[i].state=thrBusy;
            }
            if (thr_count>0)
                ReleaseSemaphore(thread_event,thr_count, 0);

            //the current thread does one work item as well
            work_proc(word_data_list[work_count-1]);

            //wait for the work items to finish
            for(i=0; i<thr_count; ++i)
            {
                while(true) {
                    if (work_threads[i].state == thrReady) break;
                    else Sleep(0);
                }
            }
            std::swap(thread_event,new_thread_event);
        }
    }
public:
    explicit TWorkThreadPool(unsigned long work_count):thread_event(0),work_threads() {
        //assert(work_count>=1);
        inti_threads(work_count);            }
    ~TWorkThreadPool() {  free_threads(); }
    long best_work_count() const { return passel_count(); }
    void work_execute(TThreadCallBack work_proc,void** word_data_list,int work_count)    {
        while (work_count>0)
        {
            long passel_work_count;
            if (work_count>=passel_count())
                passel_work_count=passel_count();
            else
                passel_work_count=work_count;

            passel_work(work_proc,word_data_list,passel_work_count);

            work_count-=passel_work_count;
            word_data_list=&word_data_list[passel_work_count];
        }
    }
    inline void DoWorkEnd(TWorkThread* thread_data){
        thread_data->waitfor_event=new_thread_event;
        thread_data->func=0;
        thread_data->state=thrReady;
    }
};
void do_work_end(TWorkThread* thread_data)
{
    thread_data->pool->DoWorkEnd(thread_data);
}
//TWorkThreadPool end;
/////////////////////////////////////////

static TWorkThreadPool g_work_thread_pool(DefthreadCount);  // the thread pool

int computePot() {
    static bool is_inti_work=false;
    static TWorkData   work_list[DefthreadCount];
    static TWorkData*  pwork_list[DefthreadCount];

    int i;
    if (!is_inti_work)
    {
        long fi=0;
        for (int i=0;i<DefthreadCount;++i)
        {
            if (0==i)
                work_list[i].iBegin=0;
            else
                work_list[i].iBegin=work_list[i-1].iEnd;
            if (i==DefthreadCount-1)
                work_list[i].iEnd=(long)NPARTS;
            else
                work_list[i].iEnd=(long)( (double)(NPARTS-1)*sqrt((double)(i+1)/DefthreadCount)+1+0.5 );
            pwork_list[i]=&work_list[i];
        }
        is_inti_work=true;
    }

    g_work_thread_pool.work_execute((TThreadCallBack)computePotPart,(void **)(&pwork_list[0]),DefthreadCount);

    for (i=0;i<DefthreadCount;++i)
       pot+=work_list[i].fResult;

    return 0;
}

Execution time of this code: 0.125 seconds    speed-up over the original code: 3176.0%
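
A side note on how computePot splits the index range: the inner loop of computePotPart_forj runs roughly i iterations for row i, so the total work up to row i grows roughly like i*i/2, and cutting the range at i = (NPARTS-1)*sqrt(k/threadCount) gives each task about the same number of pairs. The standalone sketch below is my own illustration (not part of the contest code; NPARTS and threadCount simply mirror the listing above) and just prints the boundaries and per-task pair counts for two tasks.

// Illustration of the sqrt-based work split used in computePot (assumes NPARTS=1000, 2 tasks).
#include <stdio.h>
#include <math.h>

int main() {
    const long NPARTS=1000;
    const long threadCount=2;
    long begin=0;
    for (long k=0; k<threadCount; ++k) {
        long end = (k==threadCount-1)
            ? NPARTS
            : (long)( (double)(NPARTS-1)*sqrt((double)(k+1)/threadCount)+1+0.5 );
        long pairs = (begin+end-1)*(end-begin)/2;   // ~i inner iterations for row i
        printf("task %ld: i in [%ld,%ld), ~%ld pairs\n", k, begin, end, pairs);
        begin = end;
    }
    return 0;
}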


Note one fact: the float version is much faster than the double version. Why?
It turns out that in the double version, converting between float and double just so SSE could be used costs a lot of time!
I do not know whether this is a peculiarity of the AMD64x2 CPU or whether Core 2 behaves the same way.
To optimize this conversion in the double version, I keep a pre-computed float copy of the r array, which saves the double-to-float conversions.
(For the complete double-version source, see "The Optimization Process of the Intel Multi-Core Platform Coding Optimization Contest".)
Define a temporary array rf:
  __declspec(align(16)) float  rf[DIMS][(NPARTS+3)/4*4];
Then modify the updatePositions function:

void updatePositions() {
   int i,j;
   for (i=0;i<DIMS;++i)
   {
       for( j=0; j<NPARTS; ++j )
       {
           r[i][j] -= ( 0.5 + _my_rand()*(1.0/RAND_MAX) );
           rf[i][j]=(float)r[i][j];
       }
   }
}


Then re-implement computePotPart_forj. (The constants xmmd1_5 and xmmd_0_5 used below are the packed-double 1.5 and -0.5 constants from the double version.)

void computePotPart_forj_double_float(int i,__m128d* pResult)
{
      __m128d lcpot=_mm_setzero_pd();
      __m128d _mmi0=_mm_set1_pd(-r[0][i]);
      __m128d _mmi1=_mm_set1_pd(-r[1][i]);
      __m128d _mmi2=_mm_set1_pd(-r[2][i]);
      __m128 _mmi0f=_mm_set1_ps(-rf[0][i]);
      __m128 _mmi1f=_mm_set1_ps(-rf[1][i]);
      __m128 _mmi2f=_mm_set1_ps(-rf[2][i]);
      int j=0;
      //*
      for(;j+4<i;j+=4)
      {
          __m128d dm0=_mm_add_pd(*(__m128d*)&r[0][j],_mmi0);
          dm0=_mm_mul_pd(dm0,dm0);
          __m128d dm1=_mm_add_pd(*(__m128d*)&r[1][j],_mmi1);
          dm1=_mm_mul_pd(dm1,dm1);
          __m128d dm2=_mm_add_pd(*(__m128d*)&r[2][j],_mmi2);
          dm2=_mm_mul_pd(dm2,dm2);
          dm0=_mm_add_pd(dm0,dm1);
          dm0=_mm_add_pd(dm0,dm2);

          __m128d dm5=_mm_add_pd(*(__m128d*)&r[0][j+2],_mmi0);
          dm5=_mm_mul_pd(dm5,dm5);
          __m128d dm6=_mm_add_pd(*(__m128d*)&r[1][j+2],_mmi1);
          dm6=_mm_mul_pd(dm6,dm6);
          dm2=_mm_add_pd(*(__m128d*)&r[2][j+2],_mmi2);
          dm2=_mm_mul_pd(dm2,dm2);
          dm5=_mm_add_pd(dm5,dm6);
          dm5=_mm_add_pd(dm5,dm2);

          // approximate 1/sqrt(a) with SSE's rsqrt, then use Newton iteration to improve the precision
          // Newton iteration for 1/sqrt(a): x_next=(3-a*x*x)*x*0.5 = ( 1.5 + (a*(-0.5))*(x*x) ) * x
          {
             /* __m128 sm0=_mm_cvtpd_ps(dm0);
                __m128 sm1=_mm_cvtpd_ps(dm5);
                sm0=_mm_movelh_ps(sm0,sm1); */

              __m128 sm0=_mm_add_ps(*(__m128*)&rf[0][j],_mmi0f);
              sm0=_mm_mul_ps(sm0,sm0);
              __m128 sm1=_mm_add_ps(*(__m128*)&rf[1][j],_mmi1f);
              sm1=_mm_mul_ps(sm1,sm1);
              __m128 sm2=_mm_add_ps(*(__m128*)&rf[2][j],_mmi2f);
              sm2=_mm_mul_ps(sm2,sm2);
              sm0=_mm_add_ps(sm0,sm1);
              sm0=_mm_add_ps(sm0,sm2);

              __m128 sma=_mm_mul_ps(sm0,xmms_0_5); // a*(-0.5)
              sm0=_mm_rsqrt_ps(sm0);  // approximate 1/sqrt(a)
              // Newton iteration to improve the precision
              sma=_mm_mul_ps(sma,sm0);
              sma=_mm_mul_ps(sma,sm0);
              sma=_mm_add_ps(sma,xmms1_5);
              sm0=_mm_mul_ps(sm0,sma);

                  __m128d dma=_mm_mul_pd(dm0,xmmd_0_5); // a*(-0.5)
                  __m128d dmb=_mm_mul_pd(dm5,xmmd_0_5);

              sm1=_mm_movehl_ps(sm1,sm0);
              dm0=_mm_cvtps_pd(sm0);
              dm5=_mm_cvtps_pd(sm1);

              // iterate once more in double, doubling the precision
                  dma=_mm_mul_pd(dma,dm0);
                  dmb=_mm_mul_pd(dmb,dm5);
                  dma=_mm_mul_pd(dma,dm0);
                  dmb=_mm_mul_pd(dmb,dm5);
                  dma=_mm_add_pd(dma,xmmd1_5);
                  dmb=_mm_add_pd(dmb,xmmd1_5);
                  dm0=_mm_mul_pd(dm0,dma);
                  dm5=_mm_mul_pd(dm5,dmb);
          }

          lcpot=_mm_add_pd(lcpot,dm0);
          lcpot=_mm_add_pd(lcpot,dm5);
      }

      for (;j+1<i;++j)
      {
          __m128d dm0=_mm_set_sd(r[0][j]);
          dm0=_mm_add_pd(dm0,_mmi0);
          dm0=_mm_mul_pd(dm0,dm0);
          __m128d dm1=_mm_set_sd(r[1][j]);
          dm1=_mm_add_sd(dm1,_mmi1);
          dm1=_mm_mul_sd(dm1,dm1);
          __m128d dm2=_mm_set_sd(r[2][j]);
          dm2=_mm_add_sd(dm2,_mmi2);
          dm2=_mm_mul_sd(dm2,dm2);
          dm0=_mm_add_sd(dm0,dm1);
          dm0=_mm_add_sd(dm0,dm2);

         {
              __m128 sm0=_mm_cvtpd_ps(dm0);
              __m128d dma=_mm_mul_sd(dm0,xmmd_0_5); // a*(-0.5)
              sm0=_mm_rsqrt_ss(sm0);  // approximate 1/sqrt(a)
              dm0=_mm_cvtps_pd(sm0);
              // Newton iteration to improve the precision
              dm1=_mm_mul_sd(dm0,dm0);
              dm1=_mm_mul_sd(dm1,dma);
              dm1=_mm_add_sd(dm1,xmmd1_5);
              dm0=_mm_mul_sd(dm0,dm1);

              // iterate again
                  dma=_mm_mul_sd(dma,dm0);
                  dma=_mm_mul_sd(dma,dm0);
                  dma=_mm_add_sd(dma,xmmd1_5);
                  dm0=_mm_mul_sd(dm0,dma);
          }

          lcpot=_mm_add_sd(lcpot,dm0);
      }

      *pResult=_mm_add_pd(*pResult,lcpot);
}

This code gives a small speed improvement, but the result is only accurate to about 5 or 6 decimal places (the precision is lowered too early, and the later computations amplify the error).
Adding one more Newton iteration would make up for the lost precision, but then the speed advantage is gone, so this approach had to be abandoned.
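
To make this trade-off concrete, here is a small scalar sketch (my own test harness, not contest code): it starts from the roughly 12-bit estimate of _mm_rsqrt_ss and applies the Newton step x_next = (1.5 + (a*(-0.5))*x*x)*x once and then twice in double precision. Each step roughly doubles the number of correct bits, which is why a single extra iteration would recover the digits lost by rounding through float too early.

// Relative error of the rsqrt estimate after 0, 1 and 2 Newton steps (illustration only).
#include <stdio.h>
#include <math.h>
#include <xmmintrin.h>

static double newton_step(double x,double a) {
    return (1.5 + (a*(-0.5))*x*x)*x;   // x_next = (3 - a*x*x)*x*0.5
}

int main() {
    double a=0.37;                     // arbitrary positive test value
    float est;
    _mm_store_ss(&est,_mm_rsqrt_ss(_mm_set_ss((float)a)));
    double x0=est;                     // ~12 correct bits from the hardware estimate
    double x1=newton_step(x0,a);       // ~24 correct bits
    double x2=newton_step(x1,a);       // close to double precision
    double exact=1.0/sqrt(a);
    printf("rel.err: est=%.2e  1 step=%.2e  2 steps=%.2e\n",
           fabs(x0-exact)/exact,fabs(x1-exact)/exact,fabs(x2-exact)/exact);
    return 0;
}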


Since the hardware conversion between float and double is slow, would a hand-written conversion do better? (A rather odd experiment:)
See the code below.
(This function exploits the IEEE floating-point encoding of double/float.)

const __m64   _mmd2f_esub =_mm_set_pi32( (1023-127)<<(52-32), 0 );
const __m128i _xmmd2f_esub=_mm_set_epi64(_mmd2f_esub,_mmd2f_esub);
void computePotPart_forj_d2f(int i,__m128d* pResult)
{
      __m128d lcpot=_mm_setzero_pd();
      __m128d _mmi0=_mm_set1_pd(-r[0][i]);
      __m128d _mmi1=_mm_set1_pd(-r[1][i]);
      __m128d _mmi2=_mm_set1_pd(-r[2][i]);
      int j=0;
      //*
      for(;j+4<i;j+=4)
      {
          __m128d dm0=_mm_add_pd(*(__m128d*)&r[0][j],_mmi0);
          dm0=_mm_mul_pd(dm0,dm0);
          __m128d dm1=_mm_add_pd(*(__m128d*)&r[1][j],_mmi1);
          dm1=_mm_mul_pd(dm1,dm1);
          __m128d dm2=_mm_add_pd(*(__m128d*)&r[2][j],_mmi2);
          dm2=_mm_mul_pd(dm2,dm2);
          dm0=_mm_add_pd(dm0,dm1);
          dm0=_mm_add_pd(dm0,dm2);

          __m128d dm5=_mm_add_pd(*(__m128d*)&r[0][j+2],_mmi0);
          dm5=_mm_mul_pd(dm5,dm5);
          __m128d dm6=_mm_add_pd(*(__m128d*)&r[1][j+2],_mmi1);
          dm6=_mm_mul_pd(dm6,dm6);
          dm2=_mm_add_pd(*(__m128d*)&r[2][j+2],_mmi2);
          dm2=_mm_mul_pd(dm2,dm2);
          dm5=_mm_add_pd(dm5,dm6);
          dm5=_mm_add_pd(dm5,dm2);

          // approximate 1/sqrt(a) with SSE's rsqrt, then use Newton iteration to improve the precision
          // Newton iteration for 1/sqrt(a): x_next=(3-a*x*x)*x*0.5 = ( 1.5 + (a*(-0.5))*(x*x) ) * x
          {
            __m128d dma=_mm_mul_pd(dm0,xmmd_0_5); // a*(-0.5)
            __m128d dmb=_mm_mul_pd(dm5,xmmd_0_5);

            // hand-made double->float conversion through the IEEE bit layout
            (*(__m128i*)&dm0)=_mm_sub_epi64((*(__m128i*)&dm0),_xmmd2f_esub);
            (*(__m128i*)&dm5)=_mm_sub_epi64((*(__m128i*)&dm5),_xmmd2f_esub);

            (*(__m128i*)&dm0)=_mm_srli_epi64(*(__m128i*)&dm0,32-3);
            (*(__m128i*)&dm5)=_mm_srli_epi64(*(__m128i*)&dm5,32-3);
            (*(__m128i*)&dm0)=_mm_slli_epi64(*(__m128i*)&dm0,32);
            __m128 sm0;
            (*(__m128i*)&sm0)=_mm_xor_si128(*(__m128i*)&dm0,*(__m128i*)&dm5);

            __m128 sma=_mm_mul_ps(sm0,xmms_0_5); // a*(-0.5)
            sm0=_mm_rsqrt_ps(sm0);  // approximate 1/sqrt(a)
            // Newton iteration to improve the precision
            sma=_mm_mul_ps(sma,sm0);
            sma=_mm_mul_ps(sma,sm0);
            sma=_mm_add_ps(sma,xmms1_5);
            sm0=_mm_mul_ps(sm0,sma);

            // hand-made float->double conversion back
            (*(__m128i*)&dm0)=_mm_srli_epi64(*(__m128i*)&sm0,3);
            (*(__m128i*)&dm5)=_mm_slli_epi64(*(__m128i*)&sm0,32);
            (*(__m128i*)&dm5)=_mm_srli_epi64(*(__m128i*)&dm5,3);
            (*(__m128i*)&dm0)=_mm_add_epi64((*(__m128i*)&dm0),_xmmd2f_esub);
            (*(__m128i*)&dm5)=_mm_add_epi64((*(__m128i*)&dm5),_xmmd2f_esub);

            // iterate once more in double, doubling the precision
            dma=_mm_mul_pd(dma,dm0);
            dmb=_mm_mul_pd(dmb,dm5);
            dma=_mm_mul_pd(dma,dm0);
            dmb=_mm_mul_pd(dmb,dm5);
            dma=_mm_add_pd(dma,xmmd1_5);
            dmb=_mm_add_pd(dmb,xmmd1_5);
            dm0=_mm_mul_pd(dm0,dma);
            dm5=_mm_mul_pd(dm5,dmb);
          }

          lcpot=_mm_add_pd(lcpot,dm0);
          lcpot=_mm_add_pd(lcpot,dm5);
      }

      for (;j+1<i;++j)
      {
          __m128d dm0=_mm_set_sd(r[0][j]);
          dm0=_mm_add_pd(dm0,_mmi0);
          dm0=_mm_mul_pd(dm0,dm0);
          __m128d dm1=_mm_set_sd(r[1][j]);
          dm1=_mm_add_sd(dm1,_mmi1);
          dm1=_mm_mul_sd(dm1,dm1);
          __m128d dm2=_mm_set_sd(r[2][j]);
          dm2=_mm_add_sd(dm2,_mmi2);
          dm2=_mm_mul_sd(dm2,dm2);
          dm0=_mm_add_sd(dm0,dm1);
          dm0=_mm_add_sd(dm0,dm2);

        {
          __m128 sm0=_mm_cvtpd_ps(dm0);
          __m128d dma=_mm_mul_sd(dm0,xmmd_0_5); // a*(-0.5)
          sm0=_mm_rsqrt_ss(sm0);  // approximate 1/sqrt(a)
          dm0=_mm_cvtps_pd(sm0);
          // Newton iteration to improve the precision
          dm1=_mm_mul_sd(dm0,dm0);
          dm1=_mm_mul_sd(dm1,dma);
          dm1=_mm_add_sd(dm1,xmmd1_5);
          dm0=_mm_mul_sd(dm0,dm1);

          // iterate again
          dma=_mm_mul_sd(dma,dm0);
          dma=_mm_mul_sd(dma,dm0);
          dma=_mm_add_sd(dma,xmmd1_5);
          dm0=_mm_mul_sd(dm0,dma);
          }

          lcpot=_mm_add_sd(lcpot,dm0);
      }

      *pResult=_mm_add_pd(*pResult,lcpot);
}

This function turned out slightly slower than the original code :)  so I dropped it.
(I also tried another variant, whose code I did not keep :( . Its idea was very close to the above: use the IEEE floating-point format to force the doubles into floats (by rebalancing the exponent and shifting),
then run _mm_rsqrt_ps and the Newton iterations on two lanes; that one was also slow, so it was abandoned as well.)


Since the hardware conversion between float and double is slow, the next idea was to avoid converting at all:
do not use SSE's _mm_rsqrt_ps instruction; instead use the IEEE floating-point format to produce a rough initial approximation, then iterate (three Newton iterations).
(A lookup table could also provide the initial guess, but inside the SSE pipeline that would very likely not pay off, so I did not implement it.) (This function exploits the IEEE encoding of double.) With this, the speed improved a little, but the precision was still slightly short (about 6 decimal digits), and one more iteration would lose the speed advantage.
I wonder whether there is a better magic number than 0x5fe6ec85,0xe7de30da :)
Abandoned.
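
For reference, the scalar form of the bit trick that the function below vectorizes looks roughly like this (my own illustration, not the contest code): reinterpret the double as a 64-bit integer, shift it right by one to halve the exponent, subtract it from the magic constant 0x5fe6ec85e7de30da to get a crude 1/sqrt(a), then refine with Newton steps.

// Scalar fast inverse square root for double, using the same magic constant as below (illustration only).
#include <stdio.h>
#include <string.h>
#include <math.h>

static double rough_rsqrt(double a) {
    unsigned long long bits;
    memcpy(&bits,&a,sizeof(bits));                // reinterpret the IEEE-754 bit pattern
    bits = 0x5fe6ec85e7de30daULL - (bits>>1);     // crude 1/sqrt(a) straight from the bits
    double x;
    memcpy(&x,&bits,sizeof(x));
    return x;
}

int main() {
    double a=2.0;
    double x=rough_rsqrt(a);
    for (int k=0; k<3; ++k)                       // three Newton iterations, as in the SSE2 code below
        x = (1.5 + (a*(-0.5))*x*x)*x;
    printf("approx=%.17g  exact=%.17g\n", x, 1.0/sqrt(a));
    return 0;
}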

const __m64   _mmi_mn  =_mm_set_pi32(0x5fe6ec85,0xe7de30da);
const __m128i xmmi64_mn=_mm_set1_epi64(_mmi_mn);
void computePotPart_forj_int(int i,__m128d* pResult)
{
      __m128d lcpot=_mm_setzero_pd();
      __m128d _mmi0=_mm_set1_pd(-r[0][i]);
      __m128d _mmi1=_mm_set1_pd(-r[1][i]);
      __m128d _mmi2=_mm_set1_pd(-r[2][i]);
      int j=0;
      //*
      for(;j+4<i;j+=4)
      {
          __m128d dm0=_mm_add_pd(*(__m128d*)&r[0][j],_mmi0);
          dm0=_mm_mul_pd(dm0,dm0);
          __m128d dm1=_mm_add_pd(*(__m128d*)&r[1][j],_mmi1);
          dm1=_mm_mul_pd(dm1,dm1);
          __m128d dm2=_mm_add_pd(*(__m128d*)&r[2][j],_mmi2);
          dm2=_mm_mul_pd(dm2,dm2);
          dm0=_mm_add_pd(dm0,dm1);
          dm0=_mm_add_pd(dm0,dm2);

          __m128d dm5=_mm_add_pd(*(__m128d*)&r[0][j+2],_mmi0);
          dm5=_mm_mul_pd(dm5,dm5);
          __m128d dm6=_mm_add_pd(*(__m128d*)&r[1][j+2],_mmi1);
          dm6=_mm_mul_pd(dm6,dm6);
          dm2=_mm_add_pd(*(__m128d*)&r[2][j+2],_mmi2);
          dm2=_mm_mul_pd(dm2,dm2);
          dm5=_mm_add_pd(dm5,dm6);
          dm5=_mm_add_pd(dm5,dm2);

          // use the IEEE double encoding to generate a rough approximation of 1/sqrt(a),
          // then use Newton iteration to improve the precision
          // Newton iteration for 1/sqrt(a): x_next=(3-a*x*x)*x*0.5 = ( 1.5 + (a*(-0.5))*(x*x) ) * x
          {
              __m128i xmmi0=xmmi64_mn;
              __m128d dma=_mm_mul_pd(dm0,xmmd_0_5); // a*(-0.5)
              __m128d dmb=_mm_mul_pd(dm5,xmmd_0_5);
              *(__m128i*)&dm0=_mm_srli_epi64(*(__m128i*)&dm0,1);
              *(__m128i*)&dm5=_mm_srli_epi64(*(__m128i*)&dm5,1);
              *(__m128i*)&dm0=_mm_sub_epi64(xmmi0,*(__m128i*)&dm0);
              *(__m128i*)&dm5=_mm_sub_epi64(xmmi0,*(__m128i*)&dm5);

              // Newton iterations; each one roughly doubles the precision

              dm1=_mm_mul_pd(dma,dm0);
              dm2=_mm_mul_pd(dmb,dm5);
              dm1=_mm_mul_pd(dm1,dm0);
              dm2=_mm_mul_pd(dm2,dm5);
              dm1=_mm_add_pd(dm1,xmmd1_5);
              dm2=_mm_add_pd(dm2,xmmd1_5);
              dm0=_mm_mul_pd(dm0,dm1);
              dm5=_mm_mul_pd(dm5,dm2);

              dm1=_mm_mul_pd(dma,dm0);
              dm2=_mm_mul_pd(dmb,dm5);
              dm1=_mm_mul_pd(dm1,dm0);
              dm2=_mm_mul_pd(dm2,dm5);
              dm1=_mm_add_pd(dm1,xmmd1_5);
              dm2=_mm_add_pd(dm2,xmmd1_5);
              dm0=_mm_mul_pd(dm0,dm1);
              dm5=_mm_mul_pd(dm5,dm2);

              dma=_mm_mul_pd(dma,dm0);
              dmb=_mm_mul_pd(dmb,dm5);
              dma=_mm_mul_pd(dma,dm0);
              dmb=_mm_mul_pd(dmb,dm5);
              dma=_mm_add_pd(dma,xmmd1_5);
              dmb=_mm_add_pd(dmb,xmmd1_5);
              dm0=_mm_mul_pd(dm0,dma);
              dm5=_mm_mul_pd(dm5,dmb);
              // the precision is still a bit short here, but one more iteration would lose the speed advantage
          }

          lcpot=_mm_add_pd(lcpot,dm0);
          lcpot=_mm_add_pd(lcpot,dm5);
      }

      for(;j+1<i;++j)
      {
          __m128d dm0=_mm_set_sd(r[0][j]);
          dm0=_mm_add_pd(dm0,_mmi0);
          dm0=_mm_mul_pd(dm0,dm0);
          __m128d dm1=_mm_set_sd(r[1][j]);
          dm1=_mm_add_sd(dm1,_mmi1);
          dm1=_mm_mul_sd(dm1,dm1);
          __m128d dm2=_mm_set_sd(r[2][j]);
          dm2=_mm_add_sd(dm2,_mmi2);
          dm2=_mm_mul_sd(dm2,dm2);
          dm0=_mm_add_sd(dm0,dm1);
          dm0=_mm_add_sd(dm0,dm2);
          {
              __m128 sm0=_mm_cvtpd_ps(dm0);
              __m128d dma=_mm_mul_sd(dm0,xmmd_0_5); // a*(-0.5)
              sm0=_mm_rsqrt_ss(sm0);  // approximate 1/sqrt(a)
              dm0=_mm_cvtps_pd(sm0);
              // Newton iteration to improve the precision
              dm1=_mm_mul_sd(dm0,dm0);
              dm1=_mm_mul_sd(dm1,dma);
              dm1=_mm_add_sd(dm1,xmmd1_5);
              dm0=_mm_mul_sd(dm0,dm1);
              // iterate again
              dma=_mm_mul_sd(dma,dm0);
              dma=_mm_mul_sd(dma,dm0);
              dma=_mm_add_sd(dma,xmmd1_5);
              dm0=_mm_mul_sd(dm0,dma);
          }

          lcpot=_mm_add_sd(lcpot,dm0);
      }

      *pResult=_mm_add_pd(*pResult,lcpot);
}

 

 I also thought of a scheme that I did not have time to implement: since the SSE unit and the x87 FPU are two independent pieces of hardware, they could be made to run in parallel.
The SSE code would stay unchanged, but the loop would be unrolled further, with the x87 unit taking over one or two of the lanes.

 
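A minimal sketch of what that interleaving might look like, under several assumptions: the function name and the rx/ry/rz parameters are my own (they stand for the rows of the double-version r array), the SSE lanes use a plain _mm_sqrt_pd instead of the rsqrt-plus-Newton sequence to keep the sketch short, and whether the scalar lane really runs on the x87 FPU depends on the build settings (a 32-bit VC2005 build whose scalar code is not compiled with /arch:SSE2 would use x87 for it).

// Hypothetical sketch of the unimplemented SSE + x87 idea (not the author's code):
// per unrolled step, two pair distances go through SSE2 packed-double intrinsics
// while a third is computed with plain scalar double math, so the two units can overlap.
#include <emmintrin.h>
#include <math.h>

double potential_row_sse_x87(const double* rx,const double* ry,const double* rz,int i) {
    __m128d acc  = _mm_setzero_pd();
    double  acc87= 0.0;                        // accumulated on the scalar (x87) path
    __m128d mi0=_mm_set1_pd(-rx[i]);
    __m128d mi1=_mm_set1_pd(-ry[i]);
    __m128d mi2=_mm_set1_pd(-rz[i]);
    int j=0;
    for (; j+3<i-1; j+=3) {                    // 2 lanes on SSE + 1 lane on x87
        __m128d d0=_mm_add_pd(_mm_loadu_pd(&rx[j]),mi0);
        __m128d d1=_mm_add_pd(_mm_loadu_pd(&ry[j]),mi1);
        __m128d d2=_mm_add_pd(_mm_loadu_pd(&rz[j]),mi2);
        __m128d s =_mm_add_pd(_mm_add_pd(_mm_mul_pd(d0,d0),_mm_mul_pd(d1,d1)),
                              _mm_mul_pd(d2,d2));
        acc=_mm_add_pd(acc,_mm_div_pd(_mm_set1_pd(1.0),_mm_sqrt_pd(s)));
        // scalar lane, intended for the x87 unit
        double x0=rx[j+2]-rx[i], x1=ry[j+2]-ry[i], x2=rz[j+2]-rz[i];
        acc87 += 1.0/sqrt(x0*x0 + x1*x1 + x2*x2);
    }
    for (; j<i-1; ++j) {                       // scalar tail
        double x0=rx[j]-rx[i], x1=ry[j]-ry[i], x2=rz[j]-rz[i];
        acc87 += 1.0/sqrt(x0*x0 + x1*x1 + x2*x2);
    }
    double tmp[2];
    _mm_storeu_pd(tmp,acc);
    return tmp[0]+tmp[1]+acc87;
}

Whether the two units actually overlap well enough to pay off would have to be measured; the extra scalar divide and sqrt per step could just as easily become the bottleneck.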
