编写一个基于 SSE 多媒体指令集的快速矩阵加法运算函数。输入参数为两个单精度浮点型数组 srcA 与 srcB,长度为 N;输出结果保存在一个单精度浮点型数组 dest 中。假设 srcA、srcB 以及 dest 内存空间的首地址均按照 16 字节(16-byte)对齐。请利用多媒体指令集获得最大的程序性能(可以使用 Visual Studio 中的 SSE intrinsic 函数)。
推荐函数定义:void SSE_Add(float* srcA, float* srcB, float* dest, int N) {}
编程实现:
// SSE.cpp : 定义控制台应用程序的入口点。
//
#include "stdafx.h"
#include <xmmintrin.h>
#include <iomanip>
#include <stdlib.h>
#include <time.h>
#include <windows.h>
#include<iostream>
using namespace std;
// Element-wise single-precision addition: dest[i] = srcA[i] + srcB[i] for
// every i in [0, N).
//
// Parameters:
//   srcA, srcB - input arrays holding at least N floats each.
//   dest       - output array with room for at least N floats.
//   N          - number of elements to add; values <= 0 make the call a no-op.
//
// The bulk of the work is done four floats at a time with packed SSE loads,
// one packed add and one packed store. When all three pointers are 16-byte
// aligned the aligned load/store intrinsics are used; otherwise the unaligned
// variants are used so the function never faults on a misaligned buffer.
// The up-to-three elements left after the last full group of four are added
// with plain scalar code.
void SSE_Add(float* srcA, float* srcB, float* dest, int N)
{
    if (N <= 0)
        return;

    // Index one past the last element covered by a full group of four.
    const int vecEnd = N - (N % 4);

    // OR-ing the addresses lets a single mask test cover all three pointers.
    const size_t addrBits = reinterpret_cast<size_t>(srcA)
                          | reinterpret_cast<size_t>(srcB)
                          | reinterpret_cast<size_t>(dest);

    int i = 0;
    if ((addrBits & 15) == 0)
    {
        // Fast path: every 4-float group starts on a 16-byte boundary.
        for (; i < vecEnd; i += 4)
        {
            const __m128 a = _mm_load_ps(srcA + i);
            const __m128 b = _mm_load_ps(srcB + i);
            _mm_store_ps(dest + i, _mm_add_ps(a, b));
        }
    }
    else
    {
        // Fallback for buffers that do not meet the alignment assumption.
        for (; i < vecEnd; i += 4)
        {
            const __m128 a = _mm_loadu_ps(srcA + i);
            const __m128 b = _mm_loadu_ps(srcB + i);
            _mm_storeu_ps(dest + i, _mm_add_ps(a, b));
        }
    }

    // Scalar tail: handles the remaining N % 4 elements without touching
    // memory past index N - 1.
    for (; i < N; ++i)
        dest[i] = srcA[i] + srcB[i];
}
// Scalar reference implementation: writes srcA[k] + srcB[k] into dest[k]
// for every k in [0, N). Does nothing when N <= 0.
void normal_Add(float* srcA, float* srcB, float* dest, int N)
{
    int k = 0;
    while (k < N)
    {
        const float sum = srcA[k] + srcB[k];
        dest[k] = sum;
        ++k;
    }
}
int main()
{
double len=100009;//len=100010;
double run_time;
double duration,duration1;
float *srcA = new float[len];
float *srcB = new float[len];
float *dest = new float[len];
int i;
for( i=0;i<len;i++)
{
srcA[i] = (float)i;
srcB[i] = (float)i;
}
SYSTEMTIME sys;
SYSTEMTIME sys_end;
double calcRunTime;
/*
SSE_Add(srcA,srcB,dest,len);
for(int i=0;i<len;i++)
cout<<setw(7)<<dest[i]<<endl;
*/
for(int m =0 ;m<3;m++)
{
cout<<"第"<<m<<"次测试:"<<endl;
run_time = 10;
for(;run_time <1000000;run_time = run_time*10)
{
calcRunTime = len * run_time;
cout<<"运行"<<calcRunTime<<"次加法:";
//优化前
GetLocalTime( &sys );
for(i=0;i<run_time;i++)
normal_Add(srcA,srcB,dest,len);
GetLocalTime( &sys_end );
duration = sys_end.wHour*3600000+sys_end.wMinute*60000 + sys_end.wSecond*1000+sys_end.wMilliseconds
-(sys.wHour*3600000+sys.wMinute*60000 + sys.wSecond*1000+sys.wMilliseconds);
cout<<"优化前"<<"用时"<<duration<<"ms ";
//优化后
GetLocalTime( &sys );
for(i=0;i<run_time;i++)
SSE_Add(srcA,srcB,dest,10000);
GetLocalTime( &sys_end );
duration1 = sys_end.wHour*3600000+sys_end.wMinute*60000 + sys_end.wSecond*1000+sys_end.wMilliseconds
-(sys.wHour*3600000+sys.wMinute*60000 + sys.wSecond*1000+sys.wMilliseconds);
float speedup;
if(duration1 == 0)
speedup = 0;
else
speedup = duration/duration1;
cout<<"优化后"<<"用时"<<duration1<<"ms"<<"速度提高"<<speedup<<"倍"<<endl;
}
}
return 0;
}
运行环境:
CPU:T7250,内存:1 GB,操作系统:Windows XP
优化结果截图