SSE intrinsic函数_优化

    编写一个基于SSE多媒体指令集的快速矩阵加法运算函数,输入参数为两个单精度浮点型数组srcA、srcB,长度为N,输出结果保存在一个单精度浮点型数组dest中,假设srcA、srcB以及dest内存空间的首地址均按照16-byte对齐。请利用多媒体指令集获得最大的程序性能(可以使用Visual Studio中的SSE intrinsic函数)。

 

推荐函数定义:void SSE_Add(float* srcA, float* srcB, float* dest, int N) {}

 

编程实现:

// SSE.cpp : 定义控制台应用程序的入口点。

//

#include "stdafx.h"

#include <windows.h>
#include <xmmintrin.h>

#include <iostream>

using namespace std;

 

// Element-wise single-precision addition using SSE:
//     dest[i] = srcA[i] + srcB[i]   for every i in [0, N).
//
// Parameters:
//   srcA, srcB - input arrays holding at least N floats each.
//   dest       - output array with room for at least N floats.
//   N          - number of elements to add; values <= 0 leave dest untouched.
//
// Whole groups of four floats go through one packed add; the remaining
// 0-3 elements are handled by a scalar loop. Unaligned load/store
// intrinsics are used so the function imposes no alignment requirement on
// its arguments.
void SSE_Add(float* srcA, float* srcB, float* dest, int N)
{
     // Index one past the last element covered by a full 4-float vector.
     const int vecEnd = (N / 4) * 4;

     int i = 0;
     for (; i < vecEnd; i += 4)
     {
         // Load four consecutive floats directly from memory instead of
         // assembling the register one lane at a time.
         const __m128 a = _mm_loadu_ps(srcA + i);
         const __m128 b = _mm_loadu_ps(srcB + i);

         // Write all four sums back with a single store; this also avoids
         // the compiler-specific __m128 member access.
         _mm_storeu_ps(dest + i, _mm_add_ps(a, b));
     }

     // Scalar tail for the last N % 4 elements.
     for (; i < N; ++i)
         dest[i] = srcA[i] + srcB[i];
}

 

// Scalar reference implementation used as the benchmark baseline:
//     dest[i] = srcA[i] + srcB[i]   for every i in [0, N).
//
// Parameters:
//   srcA, srcB - input arrays holding at least N floats each.
//   dest       - output array with room for at least N floats.
//   N          - number of elements to add; values <= 0 do nothing.
void normal_Add(float* srcA, float* srcB, float* dest, int N)
{
     for (int i = 0; i < N; i++)
         dest[i] = srcA[i] + srcB[i];
}

 

int main()

{

     double len=100009;//len=100010;

     double run_time;

     double  duration,duration1;

 

     float *srcA = new float[len];

     float *srcB = new float[len];

     float *dest = new float[len];

 

     int i;

     for( i=0;i

     {

         srcA[i] = (float)i;

         srcB[i] = (float)i;

     }

    

     SYSTEMTIME sys;

     SYSTEMTIME sys_end;

     double calcRunTime;

 

/*

     SSE_Add(srcA,srcB,dest,len);

     for(int i=0;i

         cout<

*/

    

     for(int m =0 ;m<3;m++)

     {

         cout<<""<"次测试:"<

         run_time = 10;

         for(;run_time <1000000;run_time = run_time*10)

         {

              calcRunTime = len * run_time;

              cout<<"运行"<"次加法:";

              //优化前

              GetLocalTime( &sys );

              for(i=0;i

                   normal_Add(srcA,srcB,dest,len);

              GetLocalTime( &sys_end );

              duration = sys_end.wHour*3600000+sys_end.wMinute*60000 + sys_end.wSecond*1000+sys_end.wMilliseconds

                   -(sys.wHour*3600000+sys.wMinute*60000 + sys.wSecond*1000+sys.wMilliseconds);

             

              cout<<"优化前"<<"用时"<"ms    ";

 

 

              //优化后

              GetLocalTime( &sys );

              for(i=0;i

                   SSE_Add(srcA,srcB,dest,10000);

 

              GetLocalTime( &sys_end );

              duration1 = sys_end.wHour*3600000+sys_end.wMinute*60000 + sys_end.wSecond*1000+sys_end.wMilliseconds

                   -(sys.wHour*3600000+sys.wMinute*60000 + sys.wSecond*1000+sys.wMilliseconds);

 

              float speedup;

              if(duration1 == 0)

                   speedup = 0;

              else

                   speedup = duration/duration1;

 

              cout<<"优化后"<<"用时"<"ms"<<"速度提高"<""<

         }

     }

     return 0;

}

 

 

运行环境:

CPU:T7250,内存:1G,操作系统:XP

 

优化结果截图

 

 

 

 

你可能感兴趣的:(C/C++/VC)