SSE指令简单示例

#include 
#include 
#include 
#include 
#include 
#include "timer.cpp"

using namespace std;

// Computes pResult[i] = sqrt(pArray1[i]^2 + pArray2[i]^2) + 0.5f for every
// i in [0, nSize) using SSE intrinsics.
//
// Answer to the original question ("must the array length be a multiple of
// 4?"): no. Full groups of four floats are handled by the packed loop and the
// remaining nSize % 4 elements by a scalar SSE tail loop.
//
// pArray1, pArray2: input arrays holding at least nSize floats each.
// pResult:          output array with room for at least nSize floats.
// nSize:            number of elements to process; values <= 0 do nothing.
//
// The pointers need not be 16-byte aligned: unaligned load/store intrinsics
// are used instead of dereferencing a __m128*.
void ComputerArrayCPlusPlusSSE(float* pArray1,float* pArray2,float* pResult,int nSize){
    if (nSize <= 0) {
        return;
    }

    const __m128 half = _mm_set_ps1(0.5f);
    const int vectorEnd = nSize - (nSize % 4);  // largest multiple of 4 <= nSize

    int i = 0;

    // Packed path: four elements per iteration.
    for (; i < vectorEnd; i += 4) {
        const __m128 a = _mm_loadu_ps(pArray1 + i);
        const __m128 b = _mm_loadu_ps(pArray2 + i);
        const __m128 sumOfSquares = _mm_add_ps(_mm_mul_ps(a, a), _mm_mul_ps(b, b));
        _mm_storeu_ps(pResult + i, _mm_add_ps(_mm_sqrt_ps(sumOfSquares), half));
    }

    // Tail path: the leftover 0-3 elements, one at a time. The scalar SSE
    // forms of the same operations are used so the tail is computed with the
    // same instruction family as the packed lanes.
    for (; i < nSize; ++i) {
        const __m128 a = _mm_set_ss(pArray1[i]);
        const __m128 b = _mm_set_ss(pArray2[i]);
        const __m128 sumOfSquares = _mm_add_ss(_mm_mul_ss(a, a), _mm_mul_ss(b, b));
        _mm_store_ss(pResult + i, _mm_add_ss(_mm_sqrt_ss(sumOfSquares), half));
    }
}
// Scalar reference implementation of the same formula as the SSE routine:
// for each k in [0, nSize), writes
//     pResult[k] = sqrt(pArray1[k]^2 + pArray2[k]^2) + 0.5f
//
// pArray1, pArray2: input arrays read at indices 0..nSize-1.
// pResult:          output array written at indices 0..nSize-1.
// nSize:            element count; nothing is written when it is <= 0.
void Other(float* pArray1,float* pArray2,float* pResult,int nSize)
{
    for (int idx = 0; idx < nSize; ++idx)
    {
        // Read both inputs before the store, as the pointer-walking form did.
        const float lhs = pArray1[idx];
        const float rhs = pArray2[idx];

        pResult[idx] = static_cast<float>(sqrt(lhs * lhs + rhs * rhs)) + 0.5f;
    }
}

int main(){
    float p1[5] = {1.0,1.0,1.0,1.0,1.0};
    float p2[5] = {1.0,1.0,1.0,1.0,1.0};
    float result[5] = {0.0,0.0,0.0,0.0,0.0};
    ComputerArrayCPlusPlusSSE(p1,p2,result,5);
    for(int i=0;i<5;i++){
        cout<

 

你可能感兴趣的:(计算机中的伟大设计以及优化思想)