图形图像处理-之-高质量的快速的图像缩放 补充 使用SSE2优化
[email protected] 2011.04.12
tag: 图像缩放,速度优化,线性插值,三次卷积插值,SSE2,scale,bilinear,bicubic,StretchBlt
摘要:
本文章对线性插值和三次卷积插值(bicubic)的实现做了一些新的优化尝试;
使用了SSE2的128bit寄存器及相关指令;并预先建立SSE2用到的缩放系数表;
实现的结果在我的i7电脑上比以前的版本分别快出145%和75%!
线性插值的速度是StretchBlt的13倍!
正文:
(请先看看我的blog里<高质量的快速的图像缩放>的前3篇文章!)
支持SSE2指令集的CPU越来越多,CPU的SSE2实现性能也好了很多(以前不比MMX好多少),
而且64位模式下的编译器(比如VC的x64编译器)不再支持MMX指令,所以尝试了SSE2的缩放优化,效果不错!
速度测试说明:
只测试内存数据到内存数据的缩放
测试图片都是800*600缩放到1024*768,单线程;fps表示每秒钟的帧数,值越大表示函数越快.
速度测试对比: (CPU:i7 920 内存:DDR3 1333 3通道)
(windows)
StretchBlt 近邻取样 869.09 fps
StretchBlt 线性插值 44.46 fps //SetStretchBltMode(dc,HALFTONE); HALFTONE的值为4
PicZoom0: 95.69 fps
PicZoom1: 158.35 fps
PicZoom2: 332.78 fps
PicZoom3: 1172.79 fps
PicZoom3_float: 874.13 fps
PicZoom3_Table: 1158.30 fps
PicZoom3_SSE: 1908.40 fps
PicZoom_Bilinear0: 28.80 fps
PicZoom_Bilinear1: 56.09 fps
PicZoom_Bilinear2: 97.09 fps
PicZoom_Bilinear_Common: 119.83 fps
PicZoom_Bilinear_MMX: 180.12 fps
PicZoom_Bilinear_MMX_Ex: 237.34 fps
PicZoom_ftBilinear_Common: 118.67 fps
PicZoom_ftBilinear_MMX: 213.68 fps
PicZoom_ThreeOrder0: 6.11 fps
PicZoom_ThreeOrder_Common: 25.38 fps
PicZoom_ThreeOrder_MMX: 52.32 fps
(SSE2的实现)
PicZoom_ftBilinearTable_SSE2: 588.24 fps
PicZoom_ThreeOrderTable_SSE2: 93.24 fps
PicZoom_ftBilinearTable_SSE2实现代码如下:
typedef UInt64 TMMXData64; //ftBilinearTable_SSE2(out [edi+ebx*4]; xmm5=v,xmm6=vr,xmm7=0,[ebp]=(u,ur),[edx]=srx_x,esi=PSrcLineColor,ecx=PSrcLineColorNext) //void __declspec(naked) ftBilinearTable_SSE2(){ #define ftBilinearTable_SSE2() / asm mov eax,[edx+ebx] / asm movq xmm0,qword ptr[esi+eax*4] / asm movq xmm1,qword ptr[ecx+eax*4] / asm punpcklbw xmm0,xmm7 / asm punpcklbw xmm1,xmm7 / asm pmullw xmm0,mm5 / asm pmullw xmm1,mm6 / asm paddw xmm0,xmm1 / asm pmulhw xmm0,xmmword ptr [ebp+ebx*4] / asm movdqa xmm1,xmm0 / asm punpckhqdq xmm0,xmm0 / asm paddw xmm0,xmm1 / asm packuswb xmm0,xmm7 / asm movd dword ptr [edi+ebx],xmm0 //ret //for __declspec(naked) //} //} //void __declspec(naked) ftBilinearTable_SSE2_expand2(){ #define ftBilinearTable_SSE2_expand2() / asm mov eax,[edx+ebx] / asm movq xmm0,qword ptr[esi+eax*4] / asm movq xmm1,qword ptr[ecx+eax*4] / asm mov eax,[edx+ebx+4] / asm movq xmm2,qword ptr[esi+eax*4] / asm movq xmm3,qword ptr[ecx+eax*4] / asm punpcklbw xmm0,xmm7 / asm punpcklbw xmm1,xmm7 / asm punpcklbw xmm2,xmm7 / asm punpcklbw xmm3,xmm7 / asm pmullw xmm0,mm5 / asm pmullw xmm1,mm6 / asm pmullw xmm2,mm5 / asm pmullw xmm3,mm6 / asm paddw xmm0,xmm1 / asm paddw xmm2,xmm3 / asm pmulhw xmm0,xmmword ptr [ebp+ebx*4] / asm pmulhw xmm2,xmmword ptr [ebp+ebx*4+16] / asm movdqa xmm1,xmm0 / asm punpcklqdq xmm0,xmm2 / asm punpckhqdq xmm1,xmm2 / asm paddw xmm0,xmm1 / asm packuswb xmm0,xmm7 / asm movq qword ptr [edi+ebx],xmm0 / //ret //for __declspec(naked) //} //} void PicZoom_ftBilinearTable_SSE2(const TPixels32Ref& Dst,const TPixels32Ref& Src) { if ( (0==Dst.width)||(0==Dst.height) ||(2>Src.width)||(2>Src.height)) return; long xrIntFloat_16=((Src.width-1)<<16)/Dst.width; long yrIntFloat_16=((Src.height-1)<<16)/Dst.height; long dst_width=Dst.width; UInt8* _bufMem=new UInt8[(dst_width*2*sizeof(TMMXData64)+15)+dst_width*sizeof(Int32)]; TMMXData64* uList=(TMMXData64*)((((ptrdiff_t)_bufMem)+15)>>4<<4); //16byte对齐 Int32* xList=(Int32*)(uList+dst_width*2); {//init u table long 
srcx_16=0; for (long x=0;x<dst_width*2;x+=2){ xList[x>>1]=(srcx_16>>16); unsigned long u=(srcx_16>>8)&0xFF; unsigned long ur=(256-u)<<1; u=u<<1; uList[x+0]=(ur|(ur<<16)); uList[x+0]|=uList[x+0]<<32; uList[x+1]=u|(u<<16); uList[x+1]|=uList[x+1]<<32; srcx_16+=xrIntFloat_16; } } Color32* pDstLine=Dst.pdata; long srcy_16=0; asm pxor xmm7,xmm7 //xmm7=0 for (long y=0;y<Dst.height;++y){ unsigned long v=(srcy_16>>8) & 0xFF; unsigned long vr=(256-v)>>1; v>>=1; Color32* PSrcLineColor= (Color32*)((UInt8*)(Src.pdata)+Src.byte_width*(srcy_16>>16)) ; Color32* PSrcLineColorNext= (Color32*)((UInt8*)(PSrcLineColor)+Src.byte_width) ; asm{ movd xmm5,vr movd xmm6,v punpcklwd xmm5,xmm5 punpcklwd xmm6,xmm6 punpckldq xmm5,xmm5 punpckldq xmm6,xmm6 punpcklqdq xmm5,xmm5 punpcklqdq xmm6,xmm6 mov esi,PSrcLineColor mov ecx,PSrcLineColorNext mov edx,xList //x mov ebx,dst_width mov edi,pDstLine push ebp mov ebp,uList push ebx and ebx,(not 1) test ebx,ebx jle end_loop2 lea ebx,[ebx*4] lea edi,[edi+ebx] lea edx,[edx+ebx] lea ebp,[ebp+ebx*4] neg ebx loop2_start: //call ftBilinearTable_SSE2_expand2 ftBilinearTable_SSE2_expand2() add ebx,8 jnz loop2_start end_loop2: pop ebx and ebx,1 test ebx,ebx jle end_write lea ebx,[ebx*4] lea edi,[edi+ebx] lea edx,[edx+ebx] lea ebp,[ebp+ebx*4] neg ebx loop1_start: //call ftBilinearTable_SSE2 ftBilinearTable_SSE2() add ebx,4 jnz loop1_start end_write: pop ebp } srcy_16+=yrIntFloat_16; ((UInt8*&)pDstLine)+=Dst.byte_width; } delete []_bufMem; }
PicZoom_ThreeOrderTable_SSE2实现代码如下:
static TMMXData64 SinXDivX_Table64_MMX[(2<<8)+1]; class _CAutoInti_SinXDivX_Table64_MMX { private: void _Inti_SinXDivX_Table64_MMX() { for (long i=0;i<=(2<<8);++i) { unsigned short t=(unsigned short)(0.5+(1<<14)*SinXDivX(i*(1.0/(256)))); unsigned long tl=t|(((unsigned long)t)<<16); TMMXData64 tll=tl|(((TMMXData64)tl)<<32); SinXDivX_Table64_MMX[i]=tll; } }; public: _CAutoInti_SinXDivX_Table64_MMX() { _Inti_SinXDivX_Table64_MMX(); } }; static _CAutoInti_SinXDivX_Table64_MMX __tmp_CAutoInti_SinXDivX_Table64_MMX; //void __declspec(naked) _private_ThreeOrderTable_Fast_SSE2_2(){ #define _private_ThreeOrderTable_Fast_SSE2_2() / asm movq xmm0,qword ptr [eax] / asm movq xmm1,qword ptr [eax+8] / asm movq xmm2,qword ptr [eax+edx] / asm movq xmm3,qword ptr [eax+edx+8] / asm punpcklbw xmm0,xmm7 / asm punpcklbw xmm1,xmm7 / asm punpcklbw xmm2,xmm7 / asm punpcklbw xmm3,xmm7 / asm psllw xmm0,7 / asm psllw xmm1,7 / asm psllw xmm2,7 / asm psllw xmm3,7 / asm pmulhw xmm0,xmmword ptr [ecx] / asm pmulhw xmm1,xmmword ptr [ecx+16] / asm pmulhw xmm2,xmmword ptr [ecx] / asm pmulhw xmm3,xmmword ptr [ecx+16] / asm paddsw xmm0,xmm1 / asm paddsw xmm2,xmm3 / asm pmulhw xmm0,xmmword ptr [ebx] / asm pmulhw xmm2,xmmword ptr [ebx+16] / asm paddsw xmm0,xmm2 / //asm ret //for __declspec(naked) //} must_inline UInt32 ThreeOrderTable_Fast_SSE2(const Color32* pixel,long byte_width,const TMMXData64* v4,const TMMXData64* u4){ asm mov eax,pixel asm mov edx,byte_width asm mov ebx,v4 asm mov ecx,u4 //asm call _private_ThreeOrderTable_Fast_SSE2_2 _private_ThreeOrderTable_Fast_SSE2_2(); asm movdqa xmm6,xmm0 asm lea eax,[eax+edx*2] //+pic.byte_width asm lea ebx,[ebx+32] //asm call _private_ThreeOrderTable_Fast_SSE2_2 _private_ThreeOrderTable_Fast_SSE2_2(); asm paddsw xmm6,xmm0 asm movdqa xmm5,xmm6 asm psrldq xmm6,8 //srl 8*8 bit! 
asm paddsw xmm5,xmm6 asm psraw xmm5,3 asm packuswb xmm5,xmm7 asm movd eax,xmm5 } must_inline long getSizeBorder(long x,long maxx){ if (x<=0) return 0; else if (x>=maxx) return maxx; else return x; } must_inline UInt32 ThreeOrderTable_Border_SSE2(const TPixels32Ref& pic,const long x0_sub1,const long y0_sub1,const TMMXData64* v4,const TMMXData64* u4){ Color32 pixel[16]; long height_sub_1=pic.height-1; long width_sub_1=pic.width-1; Color32* pbuf=pixel; for (long i=0;i<4;++i,pbuf+=4){ long y=getSizeBorder(y0_sub1+i,height_sub_1); Color32* pLine=pic.getLinePixels(y); pbuf[0]=pLine[getSizeBorder(x0_sub1+0,width_sub_1)]; pbuf[1]=pLine[getSizeBorder(x0_sub1+1,width_sub_1)]; pbuf[2]=pLine[getSizeBorder(x0_sub1+2,width_sub_1)]; pbuf[3]=pLine[getSizeBorder(x0_sub1+3,width_sub_1)]; } return ThreeOrderTable_Fast_SSE2(pixel,4*sizeof(Color32),v4,u4); } void PicZoom_ThreeOrderTable_SSE2(const TPixels32Ref& Dst,const TPixels32Ref& Src) { if ( (0==Dst.width)||(0==Dst.height) ||(0==Src.width)||(0==Src.height)) return; long dst_width=Dst.width; long dst_height=Dst.height; long xrIntFloat_16=((Src.width)<<16)/dst_width+1; long yrIntFloat_16=((Src.height)<<16)/dst_height+1; const long csDErrorX=-(1<<15)+(xrIntFloat_16>>1); const long csDErrorY=-(1<<15)+(yrIntFloat_16>>1); //计算出需要特殊处理的边界 long border_y0=((1<<16)-csDErrorY)/yrIntFloat_16+1;//y0+y*yr>=1; y0=csDErrorY => y>=(1-csDErrorY)/yr if (border_y0>=dst_height) border_y0=dst_height; long border_x0=((1<<16)-csDErrorX)/xrIntFloat_16+1; if (border_x0>=dst_width ) border_x0=dst_width; long border_y1=(((Src.height-3)<<16)-csDErrorY)/yrIntFloat_16+1; //y0+y*yr<=(height-3) => y<=(height-3-csDErrorY)/yr if (border_y1<border_y0) border_y1=border_y0; long border_x1=(((Src.width-3)<<16)-csDErrorX)/xrIntFloat_16+1;; if (border_x1<border_x0) border_x1=border_x0; UInt8* _bufMem=new UInt8[(dst_width*4*sizeof(TMMXData64)+15)+dst_width*sizeof(Int32)]; TMMXData64* uList=(TMMXData64*)((((ptrdiff_t)_bufMem)+15)>>4<<4); //16byte对齐 Int32* 
xList=(Int32*)(uList+dst_width*4); {//init u table long srcx_16=csDErrorX; for (long x=0;x<dst_width*4;x+=4){ xList[x>>2]=(srcx_16>>16)-1; long u=(srcx_16>>8)&0xFF; uList[x+0]=SinXDivX_Table64_MMX[256+u]; uList[x+1]=SinXDivX_Table64_MMX[u]; uList[x+2]=SinXDivX_Table64_MMX[256-u]; uList[x+3]=SinXDivX_Table64_MMX[512-u]; srcx_16+=xrIntFloat_16; } } TMMXData64 _v4[8+2]; TMMXData64* v4=(&_v4[0]); v4=(TMMXData64*)( (((ptrdiff_t)v4)+15)>>4<<4); asm pxor xmm7,xmm7 Color32* pDstLine=Dst.pdata; long srcy_16=csDErrorY; for (long y=0;y<dst_height;++y){ //v table const long srcy_sub1=(srcy_16>>16)-1; const long v=(srcy_16>>8)&0xFF; v4[0]=SinXDivX_Table64_MMX[256+v]; v4[1]=v4[0]; v4[2]=SinXDivX_Table64_MMX[v]; v4[3]=v4[2]; v4[4]=SinXDivX_Table64_MMX[256-v]; v4[5]=v4[4]; v4[6]=SinXDivX_Table64_MMX[512-v]; v4[7]=v4[6]; if ((y<border_y0)||(y>=border_y1)){ for (long x=0;x<dst_width;++x) pDstLine[x].argb=ThreeOrderTable_Border_SSE2(Src,xList[x],srcy_sub1,v4,&uList[x*4]); //border }else{ for (long x=0;x<border_x0;++x) pDstLine[x].argb=ThreeOrderTable_Border_SSE2(Src,xList[x],srcy_sub1,v4,&uList[x*4]);//border const Color32* pixelLine=Src.getLinePixels(srcy_sub1); long byte_width=Src.byte_width; for (long x=border_x0;x<border_x1;++x) pDstLine[x].argb=ThreeOrderTable_Fast_SSE2(&pixelLine[xList[x]],byte_width,v4,&uList[x*4]);//fast MMX ! for (long x=border_x1;x<dst_width;++x) pDstLine[x].argb=ThreeOrderTable_Border_SSE2(Src,xList[x],srcy_sub1,v4,&uList[x*4]);//border } srcy_16+=yrIntFloat_16; ((UInt8*&)pDstLine)+=Dst.byte_width; } delete []_bufMem; }