The Optimization Process in the Intel Multi-Core Platform Coding Optimization Contest -- Addendum
[email protected] 2007.01.20
tags: multi-core programming, SSE2, Newton iteration, code optimization, optimization contest, invsqrt, square root
For the main article, see my post 《英特尔多核平台编码优化大赛的优化过程》 (The Optimization Process in the Intel Multi-Core Platform Coding Optimization Contest): http://blog.csdn.net/housisong/archive/2007/01/20/1488465.aspx
This article is a supplement to it. It provides a complete float implementation, a hand-written double-to-float conversion, a hand-derived rough starting value for the invSqrt iteration, and several other attempts that did not quite pan out.
CPU used for testing and optimization: AMD64x2 3600+ (dual-core)
Operating system: Windows XP 32-bit
Compiler: Visual Studio 2005
Execution time of the original code released by the contest: 3.97 seconds

A complete float implementation (it sacrifices computational precision); the source code follows:
(An assembly implementation, or the ICC compiler, should be able to squeeze out a bit more speed :)
/* compute the potential energy of a collection of */
/* particles interacting via pairwise potential    */
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <windows.h>
#include <xmmintrin.h>  // use SSE1
#include <process.h>
#include <vector>
//////////////////////////////////////////////////////////////////////////////////////////////////
// Author: 侯思松  [email protected]
// Version whose results are accurate to float (single) precision
#define _IS_FAST   // trade precision for speed; without it the iteration reaches full float precision
//////////////////////////////////////////////////////////////////////////////////////////////////
//#define _NEW_TIME_CLOCK
#ifdef _NEW_TIME_CLOCK
    #define clock_t double
    double CLOCKS_PER_SEC=0.0;
    inline double clock() {
        __int64 result;
        if (CLOCKS_PER_SEC==0)
        {
            QueryPerformanceFrequency((LARGE_INTEGER*)&result);
            CLOCKS_PER_SEC=(double)result;
        }
        QueryPerformanceCounter((LARGE_INTEGER*)&result);
        return (double)result;
    }
#else
    #include <time.h>
#endif

#define _IS_USES_MY_RAND   // rand() is only called from a single thread, so a custom rand is safe

const long DefthreadCount=2;   // 1,2,4,8,16,..  split the computation into this many parallel tasks

float& m128_value(__m128& x,const long index) { return ((float*)(&x))[index]; }

#define NPARTS 1000
#define NITER  201
#define DIMS   3

#ifdef _IS_USES_MY_RAND
class CMyRand
{
private:
    unsigned long _my_holdrand;
public:
    CMyRand():_my_holdrand(1){}
    inline int _my_rand (void)
    {
        unsigned long result=_my_holdrand*214013+2531011;
        _my_holdrand=result;
        return ( (result>>16) & RAND_MAX );
    }
};
CMyRand _MyRand;
inline int _my_rand (void){ return _MyRand._my_rand(); }
#else
    #define _my_rand rand
#endif

int  rand(void);
int  computePot();
void initPositions(void);
void updatePositions(void);

__declspec(align(16)) float r[DIMS][(NPARTS+3)/4*4];   // 16-byte aligned
double pot;

int main() {
    int i;
    clock_t start, stop;
    //char ctmp; std::cin>>ctmp;

    initPositions();
    updatePositions();

    start=clock();
    for( i=0; i<NITER; i++ ) {
        pot = 0.0;
        computePot();
        if (i%10 == 0) { printf("%5d: Potential: %20.7f\n", i, pot); }
        updatePositions();
    }
    pot=0.0;
    stop=clock();
    printf("Seconds = %10.9f\n",(double)(stop-start)/CLOCKS_PER_SEC);
    return 0;
}

void initPositions() {
    int i, j;
    for( i=0; i<DIMS; i++ ){
        for( j=0; j<NPARTS; j++ )
            r[i][j]=(float)( 0.5 + _my_rand()*(1.0/RAND_MAX) );
    }
}

void updatePositions() {
    int i,j;
    for (i=0;i<DIMS;++i)
    {
        for( j=0; j<NPARTS; ++j )
        {
            r[i][j] -= (float)( 0.5 + _my_rand()*(1.0/RAND_MAX) );
        }
    }
}

struct TWorkData
{
    long   iBegin;
    long   iEnd;
    double fResult;
};

const __m128 xmms1_5 ={ ( 1.5),( 1.5),( 1.5),( 1.5) };
const __m128 xmms_0_5={ (-0.5),(-0.5),(-0.5),(-0.5) };

void computePotPart_forj(int i,__m128* pResult)
{
    __m128 lcpot=_mm_setzero_ps();
    __m128 _mmi0=_mm_set1_ps(-r[0][i]);
    __m128 _mmi1=_mm_set1_ps(-r[1][i]);
    __m128 _mmi2=_mm_set1_ps(-r[2][i]);
    int j=0;
    //for( j=0; j<i-1; j++ ) {   // "j<i-1" looks odd; it is probably meant to be "j<i" !
    //* unroll this loop by 4
    for(;j+4<i;j+=4)
    {
        __m128 sm0=_mm_add_ps(*(__m128*)&r[0][j],_mmi0);  sm0=_mm_mul_ps(sm0,sm0);
        __m128 sm1=_mm_add_ps(*(__m128*)&r[1][j],_mmi1);  sm1=_mm_mul_ps(sm1,sm1);
        __m128 sm2=_mm_add_ps(*(__m128*)&r[2][j],_mmi2);  sm2=_mm_mul_ps(sm2,sm2);
        sm0=_mm_add_ps(sm0,sm1);
        sm0=_mm_add_ps(sm0,sm2);
        sm1=_mm_rsqrt_ps(sm0);            //1/sqrt(...)
    #ifndef _IS_FAST
        // Newton iteration to improve the rsqrt precision
        // Newton step for 1/sqrt(a): x_next=(3-a*x*x)*x*0.5 = ( 1.5 + (a*(-0.5))*x*x ) * x
        sm0=_mm_mul_ps(sm0,xmms_0_5);     // a*(-0.5)
        sm0=_mm_mul_ps(sm0,sm1);
        sm0=_mm_mul_ps(sm0,sm1);
        sm0=_mm_add_ps(sm0,xmms1_5);
        sm0=_mm_mul_ps(sm0,sm1);
    #else
        sm0=sm1;
    #endif
        lcpot=_mm_add_ps(lcpot,sm0);
    }
    //*/
    for(;j<i-1;++j)
    {
        __m128 sm0=_mm_set_ss(r[0][j]);  sm0=_mm_add_ss(sm0,_mmi0);  sm0=_mm_mul_ss(sm0,sm0);
        __m128 sm1=_mm_set_ss(r[1][j]);  sm1=_mm_add_ss(sm1,_mmi1);  sm1=_mm_mul_ss(sm1,sm1);
        __m128 sm2=_mm_set_ss(r[2][j]);  sm2=_mm_add_ss(sm2,_mmi2);  sm2=_mm_mul_ss(sm2,sm2);
        sm0=_mm_add_ss(sm0,sm1);
        sm0=_mm_add_ss(sm0,sm2);
        sm1=_mm_rsqrt_ss(sm0);
    #ifndef _IS_FAST
        // Newton iteration to improve the rsqrt precision
        // Newton step for 1/sqrt(a): x_next=(3-a*x*x)*x*0.5 = ( 1.5 + (a*(-0.5))*x*x ) * x
        sm0=_mm_mul_ps(sm0,xmms_0_5);     // a*(-0.5)
        sm0=_mm_mul_ps(sm0,sm1);
        sm0=_mm_mul_ps(sm0,sm1);
        sm0=_mm_add_ps(sm0,xmms1_5);
        sm0=_mm_mul_ps(sm0,sm1);
    #else
        sm0=sm1;
    #endif
        lcpot=_mm_add_ss(lcpot,sm0);
    }
    *pResult=_mm_add_ps(*pResult,lcpot);
}

void computePotPart(TWorkData* work_data) {
    int i;
    __m128 lcpot=_mm_setzero_ps();
    //#pragma omp parallel for schedule(static)
    for( i=work_data->iBegin; i<work_data->iEnd; i++ ) {
        computePotPart_forj(i,&lcpot);
    }
    // horizontal sum of the four float lanes
    __m128 dt0;
    dt0=_mm_movehl_ps(lcpot,lcpot);
    lcpot=_mm_add_ps(lcpot,dt0);
    dt0=_mm_shuffle_ps(lcpot,lcpot,1);
    lcpot=_mm_add_ss(lcpot,dt0);
    work_data->fResult=m128_value(lcpot,0);
}

//////////////////////////////////////////////////////////////////////////////////////////////////
// worker thread pool TWorkThreadPool
// splits one task into several per-thread tasks
// assumes the sub-tasks have roughly equal workloads
// todo: switch to a task-claiming (work-pulling) model
class TWorkThreadPool;

typedef void (*TThreadCallBack)(void* pData);
enum TThreadState{ thrStartup=0, thrReady, thrBusy, thrTerminate, thrDeath };

class TWorkThread
{
public:
    volatile HANDLE            thread_handle;
    volatile enum TThreadState state;
    volatile TThreadCallBack   func;
    volatile void*             pdata;          // work data
    volatile HANDLE            waitfor_event;
    TWorkThreadPool*           pool;
    volatile DWORD             thread_ThreadAffinityMask;

    TWorkThread() { memset(this,0,sizeof(TWorkThread)); }
};

void do_work_end(TWorkThread* thread_data);

void __cdecl thread_dowork(TWorkThread* thread_data)   //void __stdcall thread_dowork(TWorkThread* thread_data)
{
    volatile TThreadState& state=thread_data->state;
    SetThreadAffinityMask(GetCurrentThread(),thread_data->thread_ThreadAffinityMask);
    state=thrStartup;

    while(true)
    {
        WaitForSingleObject(thread_data->waitfor_event,-1);
        if (state==thrTerminate)
            break;

        state=thrBusy;
        volatile TThreadCallBack& func=thread_data->func;
        if (func!=0)
            func((void*)thread_data->pdata);
        do_work_end(thread_data);
    }
    state=thrDeath;
    _endthread();   //ExitThread(0);
}

class TWorkThreadPool
{
private:
    volatile HANDLE          thread_event;
    volatile HANDLE          new_thread_event;
    std::vector<TWorkThread> work_threads;
    inline int passel_count() const { return (int)work_threads.size()+1; }
    void inti_threads(long work_count) {
        SYSTEM_INFO SystemInfo;
        GetSystemInfo(&SystemInfo);
        long cpu_count=SystemInfo.dwNumberOfProcessors;
        long best_count=cpu_count;
        if (cpu_count>work_count) best_count=work_count;

        long newthrcount=best_count-1;
        work_threads.resize(newthrcount);
        thread_event=CreateSemaphore(0,0,newthrcount,0);
        new_thread_event=CreateSemaphore(0,0,newthrcount,0);
        for(long i=0; i<newthrcount; ++i)
        {
            work_threads[i].waitfor_event=thread_event;
            work_threads[i].state=thrTerminate;
            work_threads[i].pool=this;
            work_threads[i].thread_ThreadAffinityMask=1<<(i+1);
            //DWORD thr_id;
            work_threads[i].thread_handle=(HANDLE)_beginthread((void (__cdecl *)(void *))thread_dowork,0,(void*)&work_threads[i]);
            //CreateThread(0, 0, (LPTHREAD_START_ROUTINE)thread_dowork,(void*) &work_threads[i], 0, &thr_id);
        }
        SetThreadAffinityMask(GetCurrentThread(),0x01);
        for(long i=0; i<newthrcount; ++i)
        {
            while(true) {
                if (work_threads[i].state==thrStartup) break;
                else Sleep(0);
            }
            work_threads[i].state=thrReady;
        }
    }
    void free_threads(void)
    {
        long thr_count=(long)work_threads.size();
        long i;
        for(i=0; i<thr_count; ++i)
        {
            while(true) {
                if (work_threads[i].state==thrReady) break;
                else Sleep(0);
            }
            work_threads[i].state=thrTerminate;
        }
        if (thr_count>0)
            ReleaseSemaphore(thread_event,thr_count,0);
        for(i=0; i<thr_count; ++i)
        {
            while(true) {
                if (work_threads[i].state==thrDeath) break;
                else Sleep(0);
            }
        }
        CloseHandle(thread_event);
        CloseHandle(new_thread_event);
        work_threads.clear();
    }
    void passel_work(TThreadCallBack work_proc,void** word_data_list,int work_count) {
        //assert(work_count>=1);
        //assert(work_count<=passel_count());
        if (work_count==1)
        {
            work_proc(word_data_list[0]);
        }
        else
        {
            long i;
            long thr_count=(long)work_threads.size();
            for(i=0; i<work_count-1; ++i)
            {
                work_threads[i].func =work_proc;
                work_threads[i].pdata=word_data_list[i];
                work_threads[i].state=thrBusy;
            }
            for(i=work_count-1; i<thr_count; ++i)
            {
                work_threads[i].func =0;
                work_threads[i].pdata=0;
                work_threads[i].state=thrBusy;
            }
            if (thr_count>0)
                ReleaseSemaphore(thread_event,thr_count,0);

            // the current thread does one piece of the work itself
            work_proc(word_data_list[work_count-1]);

            // wait for the work to finish
            for(i=0; i<thr_count; ++i)
            {
                while(true) {
                    if (work_threads[i].state==thrReady) break;
                    else Sleep(0);
                }
            }
            std::swap(thread_event,new_thread_event);
        }
    }
public:
    explicit TWorkThreadPool(unsigned long work_count):thread_event(0),work_threads() {
        //assert(work_count>=1);
        inti_threads(work_count); }
    ~TWorkThreadPool() { free_threads(); }
    long best_work_count() const { return passel_count(); }
    void work_execute(TThreadCallBack work_proc,void** word_data_list,int work_count) {
        while (work_count>0)
        {
            long passel_work_count;
            if (work_count>=passel_count())
                passel_work_count=passel_count();
            else
                passel_work_count=work_count;

            passel_work(work_proc,word_data_list,passel_work_count);

            work_count-=passel_work_count;
            word_data_list=&word_data_list[passel_work_count];
        }
    }
    inline void DoWorkEnd(TWorkThread* thread_data){
        thread_data->waitfor_event=new_thread_event;
        thread_data->func=0;
        thread_data->state=thrReady;
    }
};
void do_work_end(TWorkThread* thread_data)
{
    thread_data->pool->DoWorkEnd(thread_data);
}
//TWorkThreadPool end;
//////////////////////////////////////////////////////////////////////////////////////////////////

static TWorkThreadPool g_work_thread_pool(DefthreadCount);   // the thread pool

int computePot() {
    static bool       is_inti_work=false;
    static TWorkData  work_list[DefthreadCount];
    static TWorkData* pwork_list[DefthreadCount];
    int i;

    if (!is_inti_work)
    {
        long fi=0;
        // the inner loop for index i runs about i times, so the cut points are placed at
        // NPARTS*sqrt(k/DefthreadCount) to give every thread roughly the same amount of work
        for (int i=0;i<DefthreadCount;++i)
        {
            if (0==i)
                work_list[i].iBegin=0;
            else
                work_list[i].iBegin=work_list[i-1].iEnd;
            if (i==DefthreadCount-1)
                work_list[i].iEnd=(long)NPARTS;
            else
                work_list[i].iEnd=(long)( (double)(NPARTS-1)*sqrt((double)(i+1)/DefthreadCount)+1+0.5 );
            pwork_list[i]=&work_list[i];
        }
        is_inti_work=true;
    }
    g_work_thread_pool.work_execute((TThreadCallBack)computePotPart,(void**)(&pwork_list[0]),DefthreadCount);

    for (i=0;i<DefthreadCount;++i)
        pot+=work_list[i].fResult;

    return 0;
}
Execution time of this code: 0.125 seconds, a speedup of 3176.0% over the original code.

Note one fact: the float version is much faster than the double version. Why? It turns out that in the double version, the float/double conversions needed to feed SSE's rsqrt cost a great deal of time! I do not know whether this is specific to the AMD64x2 CPU or whether a Core 2 behaves the same way.
To optimize away this conversion in the double version, I keep a pre-converted float copy of the r array, which saves the double-to-float conversions.
(For the complete double-version source code, see 《英特尔多核平台编码优化大赛的优化过程》.)
Define a temporary array rf:
__declspec(align(16)) float rf[DIMS][(NPARTS+3)/4*4];
Modify the updatePositions function:
void updatePositions() {
    int i,j;
    for (i=0;i<DIMS;++i)
    {
        for( j=0; j<NPARTS; ++j )
        {
            r[i][j] -= ( 0.5 + _my_rand()*(1.0/RAND_MAX) );
            rf[i][j] = (float)r[i][j];
        }
    }
}
Then reimplement computePotPart_forj:
// (xmmd1_5 and xmmd_0_5 are the __m128d constants {1.5,1.5} and {-0.5,-0.5} defined in the double version)
void computePotPart_forj_double_float(int i,__m128d* pResult)
{
    __m128d lcpot=_mm_setzero_pd();
    __m128d _mmi0=_mm_set1_pd(-r[0][i]);
    __m128d _mmi1=_mm_set1_pd(-r[1][i]);
    __m128d _mmi2=_mm_set1_pd(-r[2][i]);
    __m128  _mmi0f=_mm_set1_ps(-rf[0][i]);
    __m128  _mmi1f=_mm_set1_ps(-rf[1][i]);
    __m128  _mmi2f=_mm_set1_ps(-rf[2][i]);
    int j=0;
    //*
    for(;j+4<i;j+=4)
    {
        __m128d dm0=_mm_add_pd(*(__m128d*)&r[0][j],_mmi0);  dm0=_mm_mul_pd(dm0,dm0);
        __m128d dm1=_mm_add_pd(*(__m128d*)&r[1][j],_mmi1);  dm1=_mm_mul_pd(dm1,dm1);
        __m128d dm2=_mm_add_pd(*(__m128d*)&r[2][j],_mmi2);  dm2=_mm_mul_pd(dm2,dm2);
        dm0=_mm_add_pd(dm0,dm1);
        dm0=_mm_add_pd(dm0,dm2);
        __m128d dm5=_mm_add_pd(*(__m128d*)&r[0][j+2],_mmi0);  dm5=_mm_mul_pd(dm5,dm5);
        __m128d dm6=_mm_add_pd(*(__m128d*)&r[1][j+2],_mmi1);  dm6=_mm_mul_pd(dm6,dm6);
        dm2=_mm_add_pd(*(__m128d*)&r[2][j+2],_mmi2);  dm2=_mm_mul_pd(dm2,dm2);
        dm5=_mm_add_pd(dm5,dm6);
        dm5=_mm_add_pd(dm5,dm2);
        // approximate 1/sqrt(a) with SSE's rsqrt, then use Newton iteration to improve the precision
        // Newton step for 1/sqrt(a): x_next=(3-a*x*x)*x*0.5 = ( 1.5 + (a*(-0.5))*x*x ) * x
        {
            /*
            __m128 sm0=_mm_cvtpd_ps(dm0);
            __m128 sm1=_mm_cvtpd_ps(dm5);
            sm0=_mm_movelh_ps(sm0,sm1);
            */
            // recompute the squared distances in float from rf instead of converting the doubles
            __m128 sm0=_mm_add_ps(*(__m128*)&rf[0][j],_mmi0f);  sm0=_mm_mul_ps(sm0,sm0);
            __m128 sm1=_mm_add_ps(*(__m128*)&rf[1][j],_mmi1f);  sm1=_mm_mul_ps(sm1,sm1);
            __m128 sm2=_mm_add_ps(*(__m128*)&rf[2][j],_mmi2f);  sm2=_mm_mul_ps(sm2,sm2);
            sm0=_mm_add_ps(sm0,sm1);
            sm0=_mm_add_ps(sm0,sm2);
            __m128 sma=_mm_mul_ps(sm0,xmms_0_5);   // a*(-0.5)
            sm0=_mm_rsqrt_ps(sm0);                 // compute 1/sqrt(a)
            // Newton iteration to improve the precision
            sma=_mm_mul_ps(sma,sm0);
            sma=_mm_mul_ps(sma,sm0);
            sma=_mm_add_ps(sma,xmms1_5);
            sm0=_mm_mul_ps(sm0,sma);
            __m128d dma=_mm_mul_pd(dm0,xmmd_0_5);  // a*(-0.5)
            __m128d dmb=_mm_mul_pd(dm5,xmmd_0_5);
            sm1=_mm_movehl_ps(sm1,sm0);
            dm0=_mm_cvtps_pd(sm0);
            dm5=_mm_cvtps_pd(sm1);
            // iterate again in double, doubling the precision
            dma=_mm_mul_pd(dma,dm0);
            dmb=_mm_mul_pd(dmb,dm5);
            dma=_mm_mul_pd(dma,dm0);
            dmb=_mm_mul_pd(dmb,dm5);
            dma=_mm_add_pd(dma,xmmd1_5);
            dmb=_mm_add_pd(dmb,xmmd1_5);
            dm0=_mm_mul_pd(dm0,dma);
            dm5=_mm_mul_pd(dm5,dmb);
        }
        lcpot=_mm_add_pd(lcpot,dm0);
        lcpot=_mm_add_pd(lcpot,dm5);
    }
    for (;j+1<i;++j)
    {
        __m128d dm0=_mm_set_sd(r[0][j]);  dm0=_mm_add_pd(dm0,_mmi0);  dm0=_mm_mul_pd(dm0,dm0);
        __m128d dm1=_mm_set_sd(r[1][j]);  dm1=_mm_add_sd(dm1,_mmi1);  dm1=_mm_mul_sd(dm1,dm1);
        __m128d dm2=_mm_set_sd(r[2][j]);  dm2=_mm_add_sd(dm2,_mmi2);  dm2=_mm_mul_sd(dm2,dm2);
        dm0=_mm_add_sd(dm0,dm1);
        dm0=_mm_add_sd(dm0,dm2);
        {
            __m128 sm0=_mm_cvtpd_ps(dm0);
            __m128d dma=_mm_mul_sd(dm0,xmmd_0_5);  // a*(-0.5)
            sm0=_mm_rsqrt_ss(sm0);                 // compute 1/sqrt(a)
            dm0=_mm_cvtps_pd(sm0);
            // Newton iteration to improve the precision
            dm1=_mm_mul_sd(dm0,dm0);
            dm1=_mm_mul_sd(dm1,dma);
            dm1=_mm_add_sd(dm1,xmmd1_5);
            dm0=_mm_mul_sd(dm0,dm1);
            // iterate again
            dma=_mm_mul_sd(dma,dm0);
            dma=_mm_mul_sd(dma,dm0);
            dma=_mm_add_sd(dma,xmmd1_5);
            dm0=_mm_mul_sd(dm0,dma);
        }
        lcpot=_mm_add_sd(lcpot,dm0);
    }
    *pResult=_mm_add_pd(*pResult,lcpot);
}
This code gives a small speed improvement, but the result is only accurate to 5 or 6 decimal places (the precision is lowered too early, and the later computations amplify the error). Adding one more Newton iteration would make up for the lost precision, but then the speed advantage is gone, so this approach had to be abandoned.
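(A side note of my own, not from the contest code, on why a single extra Newton step really would recover the lost digits: the step x_next = x*(1.5 + (a*(-0.5))*x*x) squares the relative error each time it is applied. Writing the current approximation as $x_n = (1+\varepsilon_n)/\sqrt{a}$,

$$x_{n+1} = x_n\Bigl(\tfrac{3}{2} - \tfrac{1}{2}a x_n^2\Bigr) = \frac{(1+\varepsilon_n)\bigl(1-\varepsilon_n-\tfrac{1}{2}\varepsilon_n^2\bigr)}{\sqrt{a}} = \frac{1-\tfrac{3}{2}\varepsilon_n^2-\tfrac{1}{2}\varepsilon_n^3}{\sqrt{a}},$$

so $|\varepsilon_{n+1}| \approx \tfrac{3}{2}\varepsilon_n^2$ and the number of correct bits roughly doubles per step: _mm_rsqrt_ps starts at about 12 bits, one step reaches about 23 bits, two steps about 46. This is consistent with the observation above: one more iteration restores the accuracy, but also erases the speed gain.)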
Since the hardware conversion between float and double is slow, would a hand-written conversion do any better? (A rather odd experiment :)
See the code below (the function exploits the IEEE floating-point encoding of double and float):
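To make the bit trick easier to follow, here is a minimal scalar sketch of the same conversion, written only for illustration (d2f_bits/f2d_bits are my names, not part of the contest code). It assumes positive, normal values that fit into float range, which is all the squared distances here ever are:

#include <cstdint>
#include <cstring>

// double -> float by hand: re-bias the exponent (1023 -> 127) and drop the low
// 52-23=29 mantissa bits (truncation instead of rounding); this is what the
// _mm_sub_epi64(...,_xmmd2f_esub) / _mm_srli_epi64(...,32-3) pair does in the SSE code below.
inline float d2f_bits(double d)
{
    uint64_t b;   std::memcpy(&b,&d,sizeof(b));
    b -= (uint64_t)(1023-127)<<52;          // exponent re-bias
    uint32_t fb = (uint32_t)(b>>29);        // keep sign + exponent + top 23 mantissa bits
    float f;      std::memcpy(&f,&fb,sizeof(f));
    return f;
}

// float -> double by hand: widen the mantissa and re-bias the exponent back.
inline double f2d_bits(float f)
{
    uint32_t fb;  std::memcpy(&fb,&f,sizeof(fb));
    uint64_t b = ((uint64_t)fb<<29) + ((uint64_t)(1023-127)<<52);
    double d;     std::memcpy(&d,&b,sizeof(d));
    return d;
}

The SSE version below does the same thing two doubles per register, and merges the four resulting float bit patterns into one __m128 with an XOR before calling _mm_rsqrt_ps.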
const __m64   _mmd2f_esub =_mm_set_pi32((1023-127)<<(52-32),0);
const __m128i _xmmd2f_esub=_mm_set_epi64(_mmd2f_esub,_mmd2f_esub);

void computePotPart_forj_d2f(int i,__m128d* pResult)
{
    __m128d lcpot=_mm_setzero_pd();
    __m128d _mmi0=_mm_set1_pd(-r[0][i]);
    __m128d _mmi1=_mm_set1_pd(-r[1][i]);
    __m128d _mmi2=_mm_set1_pd(-r[2][i]);
    int j=0;
    //*
    for(;j+4<i;j+=4)
    {
        __m128d dm0=_mm_add_pd(*(__m128d*)&r[0][j],_mmi0);  dm0=_mm_mul_pd(dm0,dm0);
        __m128d dm1=_mm_add_pd(*(__m128d*)&r[1][j],_mmi1);  dm1=_mm_mul_pd(dm1,dm1);
        __m128d dm2=_mm_add_pd(*(__m128d*)&r[2][j],_mmi2);  dm2=_mm_mul_pd(dm2,dm2);
        dm0=_mm_add_pd(dm0,dm1);
        dm0=_mm_add_pd(dm0,dm2);
        __m128d dm5=_mm_add_pd(*(__m128d*)&r[0][j+2],_mmi0);  dm5=_mm_mul_pd(dm5,dm5);
        __m128d dm6=_mm_add_pd(*(__m128d*)&r[1][j+2],_mmi1);  dm6=_mm_mul_pd(dm6,dm6);
        dm2=_mm_add_pd(*(__m128d*)&r[2][j+2],_mmi2);  dm2=_mm_mul_pd(dm2,dm2);
        dm5=_mm_add_pd(dm5,dm6);
        dm5=_mm_add_pd(dm5,dm2);
        // approximate 1/sqrt(a) with SSE's rsqrt, then use Newton iteration to improve the precision
        // Newton step for 1/sqrt(a): x_next=(3-a*x*x)*x*0.5 = ( 1.5 + (a*(-0.5))*x*x ) * x
        {
            __m128d dma=_mm_mul_pd(dm0,xmmd_0_5);   // a*(-0.5)
            __m128d dmb=_mm_mul_pd(dm5,xmmd_0_5);
            // hand-made double->float conversion: re-bias the exponents and shift the bit patterns
            (*(__m128i*)&dm0)=_mm_sub_epi64((*(__m128i*)&dm0),_xmmd2f_esub);
            (*(__m128i*)&dm5)=_mm_sub_epi64((*(__m128i*)&dm5),_xmmd2f_esub);
            (*(__m128i*)&dm0)=_mm_srli_epi64(*(__m128i*)&dm0,32-3);
            (*(__m128i*)&dm5)=_mm_srli_epi64(*(__m128i*)&dm5,32-3);
            (*(__m128i*)&dm0)=_mm_slli_epi64(*(__m128i*)&dm0,32);
            __m128 sm0;
            (*(__m128i*)&sm0)=_mm_xor_si128(*(__m128i*)&dm0,*(__m128i*)&dm5);   // merge the four floats into one register
            __m128 sma=_mm_mul_ps(sm0,xmms_0_5);    // a*(-0.5)
            sm0=_mm_rsqrt_ps(sm0);                  // compute 1/sqrt(a)
            // Newton iteration to improve the precision
            sma=_mm_mul_ps(sma,sm0);
            sma=_mm_mul_ps(sma,sm0);
            sma=_mm_add_ps(sma,xmms1_5);
            sm0=_mm_mul_ps(sm0,sma);
            // hand-made float->double conversion back
            (*(__m128i*)&dm0)=_mm_srli_epi64(*(__m128i*)&sm0,3);
            (*(__m128i*)&dm5)=_mm_slli_epi64(*(__m128i*)&sm0,32);
            (*(__m128i*)&dm5)=_mm_srli_epi64(*(__m128i*)&dm5,3);
            (*(__m128i*)&dm0)=_mm_add_epi64((*(__m128i*)&dm0),_xmmd2f_esub);
            (*(__m128i*)&dm5)=_mm_add_epi64((*(__m128i*)&dm5),_xmmd2f_esub);
            // iterate again, doubling the precision
            dma=_mm_mul_pd(dma,dm0);
            dmb=_mm_mul_pd(dmb,dm5);
            dma=_mm_mul_pd(dma,dm0);
            dmb=_mm_mul_pd(dmb,dm5);
            dma=_mm_add_pd(dma,xmmd1_5);
            dmb=_mm_add_pd(dmb,xmmd1_5);
            dm0=_mm_mul_pd(dm0,dma);
            dm5=_mm_mul_pd(dm5,dmb);
        }
        lcpot=_mm_add_pd(lcpot,dm0);
        lcpot=_mm_add_pd(lcpot,dm5);
    }
    for (;j+1<i;++j)
    {
        __m128d dm0=_mm_set_sd(r[0][j]);  dm0=_mm_add_pd(dm0,_mmi0);  dm0=_mm_mul_pd(dm0,dm0);
        __m128d dm1=_mm_set_sd(r[1][j]);  dm1=_mm_add_sd(dm1,_mmi1);  dm1=_mm_mul_sd(dm1,dm1);
        __m128d dm2=_mm_set_sd(r[2][j]);  dm2=_mm_add_sd(dm2,_mmi2);  dm2=_mm_mul_sd(dm2,dm2);
        dm0=_mm_add_sd(dm0,dm1);
        dm0=_mm_add_sd(dm0,dm2);
        {
            __m128 sm0=_mm_cvtpd_ps(dm0);
            __m128d dma=_mm_mul_sd(dm0,xmmd_0_5);  // a*(-0.5)
            sm0=_mm_rsqrt_ss(sm0);                 // compute 1/sqrt(a)
            dm0=_mm_cvtps_pd(sm0);
            // Newton iteration to improve the precision
            dm1=_mm_mul_sd(dm0,dm0);
            dm1=_mm_mul_sd(dm1,dma);
            dm1=_mm_add_sd(dm1,xmmd1_5);
            dm0=_mm_mul_sd(dm0,dm1);
            // iterate again
            dma=_mm_mul_sd(dma,dm0);
            dma=_mm_mul_sd(dma,dm0);
            dma=_mm_add_sd(dma,xmmd1_5);
            dm0=_mm_mul_sd(dm0,dma);
        }
        lcpot=_mm_add_sd(lcpot,dm0);
    }
    *pResult=_mm_add_pd(*pResult,lcpot);
}
This function turned out slightly slower than the original code :) Abandoned.
(I also tried code along these lines (that code was not saved :( ). The idea is very close to the above: using the IEEE floating-point format, force the doubles into floats (by re-biasing the exponents and shifting), then run two lanes of _mm_rsqrt_ps, Newton iteration, and so on; but it was slow as well, so it was also abandoned.)
Since the hardware float/double conversion is slow, the next attempt avoids converting at all: instead of SSE's _mm_rsqrt_ps instruction, use the IEEE floating-point format to generate a rough initial approximation and then iterate (three Newton iterations). (An initial guess could also come from a lookup table, but inside the SSE pipeline such an implementation would very likely not pay off, so I did not try it.) (This function exploits the IEEE encoding of double.) With this change the speed improved somewhat, but the precision is still a bit short (about 6 decimal digits), and yet another iteration would lose the speed advantage.
I do not know whether there is a better magic number than 0x5fe6ec85,0xe7de30da :)
Abandoned.
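For reference, here is a minimal scalar sketch of the same idea (my own illustration; rsqrt_magic and the tiny test are not part of the contest code). It uses the same magic number and the same Newton step, iterated three times as in the SSE version below:

#include <cstdint>
#include <cstring>
#include <cmath>
#include <cstdio>

// rough 1/sqrt(a) from the IEEE-754 bit pattern, refined by the Newton step
// x_next = x*(1.5 + (a*(-0.5))*x*x)
static double rsqrt_magic(double a)
{
    uint64_t b;  std::memcpy(&b,&a,sizeof(b));
    b = 0x5fe6ec85e7de30daULL - (b>>1);      // magic-number initial guess
    double x;    std::memcpy(&x,&b,sizeof(x));
    const double ha = a*(-0.5);
    x = x*(1.5 + ha*x*x);                    // 1st Newton iteration
    x = x*(1.5 + ha*x*x);                    // 2nd
    x = x*(1.5 + ha*x*x);                    // 3rd (as in the SSE code below)
    return x;
}

int main()
{
    double a=3.0;
    std::printf("approx=%.15f  exact=%.15f\n", rsqrt_magic(a), 1.0/std::sqrt(a));
    return 0;
}

Even after three iterations the result falls a little short of full double precision; a fourth iteration would essentially reach it but, as noted above, would also give up the speed advantage.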
const __m64   _mmi_mn  =_mm_set_pi32(0x5fe6ec85,0xe7de30da);
const __m128i xmmi64_mn=_mm_set1_epi64(_mmi_mn);

void computePotPart_forj_int(int i,__m128d* pResult)
{
    __m128d lcpot=_mm_setzero_pd();
    __m128d _mmi0=_mm_set1_pd(-r[0][i]);
    __m128d _mmi1=_mm_set1_pd(-r[1][i]);
    __m128d _mmi2=_mm_set1_pd(-r[2][i]);
    int j=0;
    //*
    for(;j+4<i;j+=4)
    {
        __m128d dm0=_mm_add_pd(*(__m128d*)&r[0][j],_mmi0);  dm0=_mm_mul_pd(dm0,dm0);
        __m128d dm1=_mm_add_pd(*(__m128d*)&r[1][j],_mmi1);  dm1=_mm_mul_pd(dm1,dm1);
        __m128d dm2=_mm_add_pd(*(__m128d*)&r[2][j],_mmi2);  dm2=_mm_mul_pd(dm2,dm2);
        dm0=_mm_add_pd(dm0,dm1);
        dm0=_mm_add_pd(dm0,dm2);
        __m128d dm5=_mm_add_pd(*(__m128d*)&r[0][j+2],_mmi0);  dm5=_mm_mul_pd(dm5,dm5);
        __m128d dm6=_mm_add_pd(*(__m128d*)&r[1][j+2],_mmi1);  dm6=_mm_mul_pd(dm6,dm6);
        dm2=_mm_add_pd(*(__m128d*)&r[2][j+2],_mmi2);  dm2=_mm_mul_pd(dm2,dm2);
        dm5=_mm_add_pd(dm5,dm6);
        dm5=_mm_add_pd(dm5,dm2);
        // use the IEEE double encoding to generate an approximation of 1/sqrt(a), then Newton iterations to improve the precision
        // Newton step for 1/sqrt(a): x_next=(3-a*x*x)*x*0.5 = ( 1.5 + (a*(-0.5))*x*x ) * x
        {
            __m128i xmmi0=xmmi64_mn;
            __m128d dma=_mm_mul_pd(dm0,xmmd_0_5);   // a*(-0.5)
            __m128d dmb=_mm_mul_pd(dm5,xmmd_0_5);
            *(__m128i*)&dm0=_mm_srli_epi64(*(__m128i*)&dm0,1);
            *(__m128i*)&dm5=_mm_srli_epi64(*(__m128i*)&dm5,1);
            *(__m128i*)&dm0=_mm_sub_epi64(xmmi0,*(__m128i*)&dm0);   // magic - (bits>>1): rough initial guess
            *(__m128i*)&dm5=_mm_sub_epi64(xmmi0,*(__m128i*)&dm5);
            // Newton iterations, doubling the precision each time (three in total)
            dm1=_mm_mul_pd(dma,dm0);
            dm2=_mm_mul_pd(dmb,dm5);
            dm1=_mm_mul_pd(dm1,dm0);
            dm2=_mm_mul_pd(dm2,dm5);
            dm1=_mm_add_pd(dm1,xmmd1_5);
            dm2=_mm_add_pd(dm2,xmmd1_5);
            dm0=_mm_mul_pd(dm0,dm1);
            dm5=_mm_mul_pd(dm5,dm2);
            dm1=_mm_mul_pd(dma,dm0);
            dm2=_mm_mul_pd(dmb,dm5);
            dm1=_mm_mul_pd(dm1,dm0);
            dm2=_mm_mul_pd(dm2,dm5);
            dm1=_mm_add_pd(dm1,xmmd1_5);
            dm2=_mm_add_pd(dm2,xmmd1_5);
            dm0=_mm_mul_pd(dm0,dm1);
            dm5=_mm_mul_pd(dm5,dm2);
            dma=_mm_mul_pd(dma,dm0);
            dmb=_mm_mul_pd(dmb,dm5);
            dma=_mm_mul_pd(dma,dm0);
            dmb=_mm_mul_pd(dmb,dm5);
            dma=_mm_add_pd(dma,xmmd1_5);
            dmb=_mm_add_pd(dmb,xmmd1_5);
            dm0=_mm_mul_pd(dm0,dma);
            dm5=_mm_mul_pd(dm5,dmb);
            // the precision is still a bit short, but one more iteration would lose the speed advantage
        }
        lcpot=_mm_add_pd(lcpot,dm0);
        lcpot=_mm_add_pd(lcpot,dm5);
    }
    for (;j+1<i;++j)
    {
        __m128d dm0=_mm_set_sd(r[0][j]);  dm0=_mm_add_pd(dm0,_mmi0);  dm0=_mm_mul_pd(dm0,dm0);
        __m128d dm1=_mm_set_sd(r[1][j]);  dm1=_mm_add_sd(dm1,_mmi1);  dm1=_mm_mul_sd(dm1,dm1);
        __m128d dm2=_mm_set_sd(r[2][j]);  dm2=_mm_add_sd(dm2,_mmi2);  dm2=_mm_mul_sd(dm2,dm2);
        dm0=_mm_add_sd(dm0,dm1);
        dm0=_mm_add_sd(dm0,dm2);
        {
            __m128 sm0=_mm_cvtpd_ps(dm0);
            __m128d dma=_mm_mul_sd(dm0,xmmd_0_5);  // a*(-0.5)
            sm0=_mm_rsqrt_ss(sm0);                 // compute 1/sqrt(a)
            dm0=_mm_cvtps_pd(sm0);
            // Newton iteration to improve the precision
            dm1=_mm_mul_sd(dm0,dm0);
            dm1=_mm_mul_sd(dm1,dma);
            dm1=_mm_add_sd(dm1,xmmd1_5);
            dm0=_mm_mul_sd(dm0,dm1);
            // iterate again
            dma=_mm_mul_sd(dma,dm0);
            dma=_mm_mul_sd(dma,dm0);
            dma=_mm_add_sd(dma,xmmd1_5);
            dm0=_mm_mul_sd(dm0,dma);
        }
        lcpot=_mm_add_sd(lcpot,dm0);
    }
    *pResult=_mm_add_pd(*pResult,lcpot);
}
One more scheme I did not have time to implement: since SSE and the x87 FPU are two independent execution units, they could be made to run in parallel: keep the SSE part of the code unchanged, unroll the loop even further, and hand one or two of the lanes over to x87.
(End)