转自 http://blog.csdn.net/housisong/article/details/6318890
图形图像处理-之-高质量的快速的图像缩放 补充 使用SSE2优化
HouSisong@GMail.com 2011.04.12
tag: 图像缩放,速度优化,线性插值,三次卷积插值,SSE2,scale,bilinear,bicubic,StretchBlt
摘要:
本文章对线性插值和三次卷积插值(bicubic)的实现做了一些新的优化尝试;
使用了SSE2的128bit寄存器及相关指令;并预先建立SSE2用到的缩放系数表;
实现的结果在我的i7电脑上比以前的版本分别快出145%和75%!
线性插值的速度是StretchBlt的13倍!
正文:
(请先看看我的blog里<高质量的快速的图像缩放>的前3篇文章!)
支持SSE2指令集的CPU越来越多,CPU的SSE2实现性能也好了很多(以前不比MMX好多少),
而且软件在64位模式的时候不再支持MMX,所以尝试了SSE2的缩放优化,效果不错!
速度测试说明:
只测试内存数据到内存数据的缩放
测试图片都是800*600缩放到1024*768,单线程;fps表示每秒钟的帧数,值越大表示函数越快.
速度测试对比: (CPU:i7 920 内存:DDR3 1333 3通道)
(windows)
StretchBlt 近邻取样 869.09 fps
StretchBlt 线性插值 44.46 fps //SetStretchBltMode(dc,4);?
PicZoom0: 95.69 fps
PicZoom1: 158.35 fps
PicZoom2: 332.78 fps
PicZoom3: 1172.79 fps
PicZoom3_float: 874.13 fps
PicZoom3_Table: 1158.30 fps
PicZoom3_SSE: 1908.40 fps
PicZoom_Bilinear0: 28.80 fps
PicZoom_Bilinear1: 56.09 fps
PicZoom_Bilinear2: 97.09 fps
PicZoom_Bilinear_Common: 119.83 fps
PicZoom_Bilinear_MMX: 180.12 fps
PicZoom_Bilinear_MMX_Ex: 237.34 fps
PicZoom_ftBilinear_Common: 118.67 fps
PicZoom_ftBilinear_MMX: 213.68 fps
PicZoom_ThreeOrder0: 6.11 fps
PicZoom_ThreeOrder_Common: 25.38 fps
PicZoom_ThreeOrder_MMX: 52.32 fps
(SSE2的实现)
PicZoom_ftBilinearTable_SSE2: 588.24 fps
PicZoom_ThreeOrderTable_SSE2: 93.24 fps
PicZoom_ftBilinearTable_SSE2实现代码如下:
- typedef UInt64 TMMXData64;
-
-
- #define ftBilinearTable_SSE2() /
- asm mov eax,[edx+ebx] /
- asm movq xmm0,qword ptr[esi+eax*4] /
- asm movq xmm1,qword ptr[ecx+eax*4] /
- asm punpcklbw xmm0,xmm7 /
- asm punpcklbw xmm1,xmm7 /
- asm pmullw xmm0,mm5 /
- asm pmullw xmm1,mm6 /
- asm paddw xmm0,xmm1 /
- asm pmulhw xmm0,xmmword ptr [ebp+ebx*4] /
- asm movdqa xmm1,xmm0 /
- asm punpckhqdq xmm0,xmm0 /
- asm paddw xmm0,xmm1 /
- asm packuswb xmm0,xmm7 /
- asm movd dword ptr [edi+ebx],xmm0
-
-
-
-
// Bilinear-interpolates TWO consecutive destination pixels per expansion
// (MSVC-style inline assembly); the two-pixel variant of
// ftBilinearTable_SSE2 with the same register contract:
//   esi / ecx      upper / lower source row
//   [edx+ebx]      Int32 source x index of the first pixel, [edx+ebx+4] of
//                  the second
//   [ebp+ebx*4]    16 bytes of horizontal weights for the first pixel,
//                  [ebp+ebx*4+16] for the second
//   [edi+ebx]      destination (8 bytes are written)
//   xmm5 / xmm6    vertical weight of the upper / lower row in all 8 words
//   xmm7           all zero
// Clobbers eax and xmm0..xmm3.
//
// After the horizontal pmulhw, xmm0 and xmm2 each hold {left, right}
// contributions of one pixel; punpcklqdq/punpckhqdq regroup them into
// {left0,left1} and {right0,right1} so a single paddw finishes both pixels.
//
// Fixes relative to the garbled listing: line continuations are '\' (they
// had been turned into '/'), the vertical weights are read from xmm5/xmm6
// (the listing said mm5/mm6, which cannot be combined with an XMM
// destination), and the dangling continuation after the last instruction is
// removed.
// No comments may be placed inside the macro body: it expands to one line.
#define ftBilinearTable_SSE2_expand2()                          \
    asm mov         eax,[edx+ebx]                               \
    asm movq        xmm0,qword ptr[esi+eax*4]                   \
    asm movq        xmm1,qword ptr[ecx+eax*4]                   \
    asm mov         eax,[edx+ebx+4]                             \
    asm movq        xmm2,qword ptr[esi+eax*4]                   \
    asm movq        xmm3,qword ptr[ecx+eax*4]                   \
    asm punpcklbw   xmm0,xmm7                                   \
    asm punpcklbw   xmm1,xmm7                                   \
    asm punpcklbw   xmm2,xmm7                                   \
    asm punpcklbw   xmm3,xmm7                                   \
    asm pmullw      xmm0,xmm5                                   \
    asm pmullw      xmm1,xmm6                                   \
    asm pmullw      xmm2,xmm5                                   \
    asm pmullw      xmm3,xmm6                                   \
    asm paddw       xmm0,xmm1                                   \
    asm paddw       xmm2,xmm3                                   \
    asm pmulhw      xmm0,xmmword ptr [ebp+ebx*4]                \
    asm pmulhw      xmm2,xmmword ptr [ebp+ebx*4+16]             \
    asm movdqa      xmm1,xmm0                                   \
    asm punpcklqdq  xmm0,xmm2                                   \
    asm punpckhqdq  xmm1,xmm2                                   \
    asm paddw       xmm0,xmm1                                   \
    asm packuswb    xmm0,xmm7                                   \
    asm movq        qword ptr [edi+ebx],xmm0
-
-
-
-
- void PicZoom_ftBilinearTable_SSE2(const TPixels32Ref& Dst,const TPixels32Ref& Src)
- {
- if ( (0==Dst.width)||(0==Dst.height)
- ||(2>Src.width)||(2>Src.height)) return;
- long xrIntFloat_16=((Src.width-1)<<16)/Dst.width;
- long yrIntFloat_16=((Src.height-1)<<16)/Dst.height;
- long dst_width=Dst.width;
- UInt8* _bufMem=new UInt8[(dst_width*2*sizeof(TMMXData64)+15)+dst_width*sizeof(Int32)];
- TMMXData64* uList=(TMMXData64*)((((ptrdiff_t)_bufMem)+15)>>4<<4);
- Int32* xList=(Int32*)(uList+dst_width*2);
- {
- long srcx_16=0;
- for (long x=0;x<dst_width*2;x+=2){
- xList[x>>1]=(srcx_16>>16);
- unsigned long u=(srcx_16>>8)&0xFF;
- unsigned long ur=(256-u)<<1;
- u=u<<1;
- uList[x+0]=(ur|(ur<<16));
- uList[x+0]|=uList[x+0]<<32;
- uList[x+1]=u|(u<<16);
- uList[x+1]|=uList[x+1]<<32;
- srcx_16+=xrIntFloat_16;
- }
- }
- Color32* pDstLine=Dst.pdata;
- long srcy_16=0;
- asm pxor xmm7,xmm7
- for (long y=0;y<Dst.height;++y){
- unsigned long v=(srcy_16>>8) & 0xFF;
- unsigned long vr=(256-v)>>1;
- v>>=1;
- Color32* PSrcLineColor= (Color32*)((UInt8*)(Src.pdata)+Src.byte_width*(srcy_16>>16)) ;
- Color32* PSrcLineColorNext= (Color32*)((UInt8*)(PSrcLineColor)+Src.byte_width) ;
- asm{
- movd xmm5,vr
- movd xmm6,v
- punpcklwd xmm5,xmm5
- punpcklwd xmm6,xmm6
- punpckldq xmm5,xmm5
- punpckldq xmm6,xmm6
- punpcklqdq xmm5,xmm5
- punpcklqdq xmm6,xmm6
-
- mov esi,PSrcLineColor
- mov ecx,PSrcLineColorNext
- mov edx,xList
- mov ebx,dst_width
- mov edi,pDstLine
- push ebp
- mov ebp,uList
- push ebx
-
- and ebx,(not 1)
- test ebx,ebx
- jle end_loop2
-
- lea ebx,[ebx*4]
- lea edi,[edi+ebx]
- lea edx,[edx+ebx]
- lea ebp,[ebp+ebx*4]
- neg ebx
- loop2_start:
-
- ftBilinearTable_SSE2_expand2()
- add ebx,8
- jnz loop2_start
- end_loop2:
- pop ebx
- and ebx,1
- test ebx,ebx
- jle end_write
- lea ebx,[ebx*4]
- lea edi,[edi+ebx]
- lea edx,[edx+ebx]
- lea ebp,[ebp+ebx*4]
- neg ebx
- loop1_start:
-
- ftBilinearTable_SSE2()
- add ebx,4
- jnz loop1_start
- end_write:
- pop ebp
- }
- srcy_16+=yrIntFloat_16;
- ((UInt8*&)pDstLine)+=Dst.byte_width;
- }
- delete []_bufMem;
- }
PicZoom_ThreeOrderTable_SSE2实现代码如下:
- static TMMXData64 SinXDivX_Table64_MMX[(2<<8)+1];
- class _CAutoInti_SinXDivX_Table64_MMX {
- private:
- void _Inti_SinXDivX_Table64_MMX()
- {
- for (long i=0;i<=(2<<8);++i)
- {
- unsigned short t=(unsigned short)(0.5+(1<<14)*SinXDivX(i*(1.0/(256))));
- unsigned long tl=t|(((unsigned long)t)<<16);
- TMMXData64 tll=tl|(((TMMXData64)tl)<<32);
- SinXDivX_Table64_MMX[i]=tll;
- }
- };
- public:
- _CAutoInti_SinXDivX_Table64_MMX() { _Inti_SinXDivX_Table64_MMX(); }
- };
- static _CAutoInti_SinXDivX_Table64_MMX __tmp_CAutoInti_SinXDivX_Table64_MMX;
-
// Accumulates TWO source rows of the 4x4 bicubic neighbourhood
// (MSVC-style inline assembly).
//
// Register contract, as set up by ThreeOrderTable_Fast_SSE2:
//   eax   first of four consecutive pixels in the first row
//   edx   byte distance to the next row
//   ecx   horizontal weights: 16 bytes for pixels 0-1 at [ecx], 16 bytes
//         for pixels 2-3 at [ecx+16]
//   ebx   vertical weights: 16 bytes for the first row at [ebx], 16 bytes
//         for the second row at [ebx+16]
//   xmm7  all zero
// Result: xmm0 holds the weighted sum of both rows as eight 16-bit words
// (two partial sums per colour channel, still to be folded by the caller).
// Clobbers xmm1..xmm3.
//
// Fixes relative to the garbled listing: line continuations are '\' (they
// had been turned into '/'), and the dangling continuation after the last
// instruction is removed.
// No comments may be placed inside the macro body: it expands to one line.
#define _private_ThreeOrderTable_Fast_SSE2_2()                  \
    asm movq        xmm0,qword ptr [eax]                        \
    asm movq        xmm1,qword ptr [eax+8]                      \
    asm movq        xmm2,qword ptr [eax+edx]                    \
    asm movq        xmm3,qword ptr [eax+edx+8]                  \
    asm punpcklbw   xmm0,xmm7                                   \
    asm punpcklbw   xmm1,xmm7                                   \
    asm punpcklbw   xmm2,xmm7                                   \
    asm punpcklbw   xmm3,xmm7                                   \
    asm psllw       xmm0,7                                      \
    asm psllw       xmm1,7                                      \
    asm psllw       xmm2,7                                      \
    asm psllw       xmm3,7                                      \
    asm pmulhw      xmm0,xmmword ptr [ecx]                      \
    asm pmulhw      xmm1,xmmword ptr [ecx+16]                   \
    asm pmulhw      xmm2,xmmword ptr [ecx]                      \
    asm pmulhw      xmm3,xmmword ptr [ecx+16]                   \
    asm paddsw      xmm0,xmm1                                   \
    asm paddsw      xmm2,xmm3                                   \
    asm pmulhw      xmm0,xmmword ptr [ebx]                      \
    asm pmulhw      xmm2,xmmword ptr [ebx+16]                   \
    asm paddsw      xmm0,xmm2
-
-
- must_inline UInt32 ThreeOrderTable_Fast_SSE2(const Color32* pixel,long byte_width,const TMMXData64* v4,const TMMXData64* u4){
- asm mov eax,pixel
- asm mov edx,byte_width
- asm mov ebx,v4
- asm mov ecx,u4
-
- _private_ThreeOrderTable_Fast_SSE2_2();
- asm movdqa xmm6,xmm0
- asm lea eax,[eax+edx*2]
- asm lea ebx,[ebx+32]
-
- _private_ThreeOrderTable_Fast_SSE2_2();
- asm paddsw xmm6,xmm0
- asm movdqa xmm5,xmm6
- asm psrldq xmm6,8
- asm paddsw xmm5,xmm6
- asm psraw xmm5,3
- asm packuswb xmm5,xmm7
- asm movd eax,xmm5
- }
- must_inline long getSizeBorder(long x,long maxx){
- if (x<=0)
- return 0;
- else if (x>=maxx)
- return maxx;
- else
- return x;
- }
- must_inline UInt32 ThreeOrderTable_Border_SSE2(const TPixels32Ref& pic,const long x0_sub1,const long y0_sub1,const TMMXData64* v4,const TMMXData64* u4){
- Color32 pixel[16];
- long height_sub_1=pic.height-1;
- long width_sub_1=pic.width-1;
- Color32* pbuf=pixel;
- for (long i=0;i<4;++i,pbuf+=4){
- long y=getSizeBorder(y0_sub1+i,height_sub_1);
- Color32* pLine=pic.getLinePixels(y);
- pbuf[0]=pLine[getSizeBorder(x0_sub1+0,width_sub_1)];
- pbuf[1]=pLine[getSizeBorder(x0_sub1+1,width_sub_1)];
- pbuf[2]=pLine[getSizeBorder(x0_sub1+2,width_sub_1)];
- pbuf[3]=pLine[getSizeBorder(x0_sub1+3,width_sub_1)];
- }
- return ThreeOrderTable_Fast_SSE2(pixel,4*sizeof(Color32),v4,u4);
- }
- void PicZoom_ThreeOrderTable_SSE2(const TPixels32Ref& Dst,const TPixels32Ref& Src)
- {
- if ( (0==Dst.width)||(0==Dst.height)
- ||(0==Src.width)||(0==Src.height)) return;
- long dst_width=Dst.width;
- long dst_height=Dst.height;
- long xrIntFloat_16=((Src.width)<<16)/dst_width+1;
- long yrIntFloat_16=((Src.height)<<16)/dst_height+1;
- const long csDErrorX=-(1<<15)+(xrIntFloat_16>>1);
- const long csDErrorY=-(1<<15)+(yrIntFloat_16>>1);
-
- long border_y0=((1<<16)-csDErrorY)/yrIntFloat_16+1;
- if (border_y0>=dst_height) border_y0=dst_height;
- long border_x0=((1<<16)-csDErrorX)/xrIntFloat_16+1;
- if (border_x0>=dst_width ) border_x0=dst_width;
- long border_y1=(((Src.height-3)<<16)-csDErrorY)/yrIntFloat_16+1;
- if (border_y1<border_y0) border_y1=border_y0;
- long border_x1=(((Src.width-3)<<16)-csDErrorX)/xrIntFloat_16+1;;
- if (border_x1<border_x0) border_x1=border_x0;
- UInt8* _bufMem=new UInt8[(dst_width*4*sizeof(TMMXData64)+15)+dst_width*sizeof(Int32)];
- TMMXData64* uList=(TMMXData64*)((((ptrdiff_t)_bufMem)+15)>>4<<4);
- Int32* xList=(Int32*)(uList+dst_width*4);
- {
- long srcx_16=csDErrorX;
- for (long x=0;x<dst_width*4;x+=4){
- xList[x>>2]=(srcx_16>>16)-1;
- long u=(srcx_16>>8)&0xFF;
- uList[x+0]=SinXDivX_Table64_MMX[256+u];
- uList[x+1]=SinXDivX_Table64_MMX[u];
- uList[x+2]=SinXDivX_Table64_MMX[256-u];
- uList[x+3]=SinXDivX_Table64_MMX[512-u];
- srcx_16+=xrIntFloat_16;
- }
- }
- TMMXData64 _v4[8+2];
- TMMXData64* v4=(&_v4[0]); v4=(TMMXData64*)( (((ptrdiff_t)v4)+15)>>4<<4);
- asm pxor xmm7,xmm7
- Color32* pDstLine=Dst.pdata;
- long srcy_16=csDErrorY;
- for (long y=0;y<dst_height;++y){
-
- const long srcy_sub1=(srcy_16>>16)-1;
- const long v=(srcy_16>>8)&0xFF;
- v4[0]=SinXDivX_Table64_MMX[256+v];
- v4[1]=v4[0];
- v4[2]=SinXDivX_Table64_MMX[v];
- v4[3]=v4[2];
- v4[4]=SinXDivX_Table64_MMX[256-v];
- v4[5]=v4[4];
- v4[6]=SinXDivX_Table64_MMX[512-v];
- v4[7]=v4[6];
- if ((y<border_y0)||(y>=border_y1)){
- for (long x=0;x<dst_width;++x)
- pDstLine[x].argb=ThreeOrderTable_Border_SSE2(Src,xList[x],srcy_sub1,v4,&uList[x*4]);
- }else{
- for (long x=0;x<border_x0;++x)
- pDstLine[x].argb=ThreeOrderTable_Border_SSE2(Src,xList[x],srcy_sub1,v4,&uList[x*4]);
- const Color32* pixelLine=Src.getLinePixels(srcy_sub1);
- long byte_width=Src.byte_width;
- for (long x=border_x0;x<border_x1;++x)
- pDstLine[x].argb=ThreeOrderTable_Fast_SSE2(&pixelLine[xList[x]],byte_width,v4,&uList[x*4]);
- for (long x=border_x1;x<dst_width;++x)
- pDstLine[x].argb=ThreeOrderTable_Border_SSE2(Src,xList[x],srcy_sub1,v4,&uList[x*4]);
- }
- srcy_16+=yrIntFloat_16;
- ((UInt8*&)pDstLine)+=Dst.byte_width;
- }
- delete []_bufMem;
- }