Resize 优化

CPU优化测试

结论:在预处理速度方面,采用第三方库进行处理时速度还不是瓶颈;难点在于自行实现预处理时,要让结果和速度都与第三方库保持一致。

实现与测试内容:resize 函数实现和优化(双线性插值)

测试工程代码github: https://github.com/sisong/demoForHssBlog/tree/master/ZoomDemo

测试环境: i5-8400 2.8Ghz 6核心

图像缩放大小: 1960 * 1080 -> 1360 * 720

优化方式 函数名称 时间 (ms)
1、基础版本(浮点实现版本) PicZoom_Bilinear0 23.66
2、浮点优化为整数 PicZoom_Bilinear1 12.18
3、图像边界与其他区域分开计算 PicZoom_Bilinear2 7.11
4、边界处使用近似值处理 PicZoom_ftBilinear_Common 5.36
5、MMX指令改写(3) PicZoom_Bilinear_MMX 4.01
6、SSE2指令集改写(4) PicZoom_ftBilinear_SSE2 2.83
7、SSE2指令集改写(4)+ 预计算缩放系数表 PicZoom_ftBilinearTable_SSE2 1.12
Opencv3.1 安装版 1.36
双线性插值公式
Resize 优化_第1张图片
img

如图,已知Q12,Q22,Q11,Q21,但是要插值的点为P点,这就要用双线性插值,

首先在x轴方向上,对R1和R2两个点进行插值,然后根据R1和R2对P点进行插值,这就是所谓的双线性插值

在图像处理的时候,我们先根据
srcX=dstX* (srcWidth/dstWidth)   
srcY = dstY * (srcHeight/dstHeight) 来计算目标像素在源图像中的位置,这里计算的srcX和srcY一般都是浮点数,比如f(1.2, 3.4)这个像素点是虚拟存在的,先找到与它临近的四个实际存在的像素点

(1,3) (2,3)   (1,4) (2,4)   
写成f(i+u,j+v)的形式,则u=0.2,v=0.4, i=1, j=3   
直接整理一步计算,f(i+u,j+v) = (1-u)(1-v)f(i,j) + (1-u)vf(i,j+1) + u(1-v)f(i+1,j) + uvf(i+1,j+1) 。

假设源图像是3 * 3,中心点坐标(1,1)目标图像是9 * 9,中心点坐标(4,4),我们在进行插值映射的时候,尽可能希望均匀的用到源图像的像素信息,最直观的就是(4,4)映射到(1,1)现在直接计算srcX=4*3/9=1.3333!=1,也就是我们在插值的时候所利用的像素集中在图像的右下方,而不是均匀分布整个图像。

为了保证图像缩放时候均匀的用到源图像的像素信息,我们在原始的浮点坐标上加上了0.5*(srcWidth/dstWidth-1)这样一个控制因子,即:

srcX=dstX* (srcWidth/dstWidth)+0.5*(srcWidth/dstWidth-1)

此时 srcX=(4+0.5)*3/9-0.5=1

所以,在双线性插值计算时候 ,大多都采用中心对齐方式(Opencv,Matlab也是);中心对齐公式 SrcX=(dstX+0.5)* (srcWidth/dstWidth) -0.5 SrcY=(dstY+0.5) * (srcHeight/dstHeight)-0.5

代码块

注:为了让代码更便于阅读,均将被调用的子函数写在了主函数的下方(未将声明写在主函数前);

一、像素格式

//图像数据区的描述信息
struct TPixels32Ref{
public:
    Color32*    pdata;        //图像数据区首地址  即 y==0行的颜色首地址
    long        byte_width;   //一行图像数据的字节宽度  正负值都有可能 
    long        width;        //图像宽度
    long        height;       //图像高度
    inline TPixels32Ref() :pdata(0),byte_width(0),width(0),height(0){}
    inline TPixels32Ref(const TPixels32Ref& ref) :pdata(ref.pdata),byte_width(ref.byte_width),width(ref.width),height(ref.height){}
    
    //访问(x,y)坐标处的颜色
    inline Color32& pixels(const long x,const long y) const { return getLinePixels(y)[x]; }
    //得到y行的颜色首地址
    inline Color32* getLinePixels(const long y) const { return (Color32*) ( ((UInt8*)pdata) + byte_width*y ); }

    //是否是空图像区
    inline bool getIsEmpty()const { return ((width<=0)||(height<=0)); }
    //将pline指向下一行颜色
    inline void nextLine(Color32*& pline)const {  ((UInt8*&)pline)+=byte_width;   }

    //坐标边界饱和  如果(x,y)坐标在图片数据区外,(x,y)值会被设置到图片最近的边界内,并返回false(否则什么也不做,返回true) //警告! 图片区域不能为空
    inline bool clipToBorder(long& x, long& y)const{ //a=2  /a+1 
        bool isIn = true;
        if (x < 0) { 
            isIn = false; x = 0;
        } else if (x >= width) { 
            isIn = false; x = width - 1;
        }
        if (y < 0) {
            isIn = false; y = 0;
        } else if (y >= height) {
            isIn = false; y = height - 1;
        }
        return isIn;
    }
    //获取一个点的颜色,默认执行边界饱和测试  当坐标超出区域的时候返回的颜色为最近的边界上的颜色值并且其alpha通道置零  //警告! 图片区域不能为空 速度很慢 
    inline Color32 getPixelsBorder(long x, long y) const {
        bool isInPic = clipToBorder(x,y);
        Color32 result = pixels(x,y);
        if (!isInPic)
            result.a=0;
        return result;
    }
};

二、双线性插值函数:

1、基础版本(浮点实现版本)

//主函数
void PicZoom_Bilinear0(const TPixels32Ref& Dst,const TPixels32Ref& Src)
{
    if (  (0==Dst.width)||(0==Dst.height)
        ||(0==Src.width)||(0==Src.height)) return;

    long dst_width=Dst.width;
    Color32* pDstLine=Dst.pdata;
    for (long y=0;yfx) --x; //x=floor(fx);    
        long y=(long)fy; if (y>fy) --y; //y=floor(fy);
        
        Color32 Color0=pic.getPixelsBorder(x,y);
        Color32 Color2=pic.getPixelsBorder(x+1,y);
        Color32 Color1=pic.getPixelsBorder(x,y+1);
        Color32 Color3=pic.getPixelsBorder(x+1,y+1);

        double u=fx-x;
        double v=fy-y;
        double pm3=u*v;
        double pm2=u*(1-v);
        double pm1=v*(1-u);
        double pm0=(1-u)*(1-v);

        result->a=(UInt8)(pm0*Color0.a+pm1*Color1.a+pm2*Color2.a+pm3*Color3.a);
        result->r=(UInt8)(pm0*Color0.r+pm1*Color1.r+pm2*Color2.r+pm3*Color3.r);
        result->g=(UInt8)(pm0*Color0.g+pm1*Color1.g+pm2*Color2.g+pm3*Color3.g);
        result->b=(UInt8)(pm0*Color0.b+pm1*Color1.b+pm2*Color2.b+pm3*Color3.b);
    }

2、浮点优化为整数

//主函数 
//将浮点数改成整数计算
void PicZoom_Bilinear1(const TPixels32Ref& Dst,const TPixels32Ref& Src)
{
    if (  (0==Dst.width)||(0==Dst.height)
        ||(0==Src.width)||(0==Src.height)) return;

    long xrIntFloat_16=((Src.width)<<16)/Dst.width+1; 
    long yrIntFloat_16=((Src.height)<<16)/Dst.height+1;
    const long csDErrorX=-(1<<15)+(xrIntFloat_16>>1);
    const long csDErrorY=-(1<<15)+(yrIntFloat_16>>1);

    long dst_width=Dst.width;

    Color32* pDstLine=Dst.pdata;
    long srcy_16=csDErrorY;
    long y;
    for (y=0;y>16;
        long y=y_16>>16;
        Color32 Color0=pic.getPixelsBorder(x,y);
        Color32 Color2=pic.getPixelsBorder(x+1,y);
        Color32 Color1=pic.getPixelsBorder(x,y+1);
        Color32 Color3=pic.getPixelsBorder(x+1,y+1);

        unsigned long u_8=(x_16 & 0xFFFF)>>8;
        unsigned long v_8=(y_16 & 0xFFFF)>>8;
        unsigned long pm3_16=(u_8*v_8);
        unsigned long pm2_16=(u_8*(unsigned long)(256-v_8));
        unsigned long pm1_16=(v_8*(unsigned long)(256-u_8));
        unsigned long pm0_16=((256-u_8)*(256-v_8));

        result->a=(UInt8)((pm0_16*Color0.a+pm1_16*Color1.a+pm2_16*Color2.a+pm3_16*Color3.a)>>16);
        result->r=(UInt8)((pm0_16*Color0.r+pm1_16*Color1.r+pm2_16*Color2.r+pm3_16*Color3.r)>>16);
        result->g=(UInt8)((pm0_16*Color0.g+pm1_16*Color1.g+pm2_16*Color2.g+pm3_16*Color3.g)>>16);
        result->b=(UInt8)((pm0_16*Color0.b+pm1_16*Color1.b+pm2_16*Color2.b+pm3_16*Color3.b)>>16);
    }

3、图像边界与其他区域分开计算

 //主函数
void PicZoom_Bilinear2(const TPixels32Ref& Dst,const TPixels32Ref& Src)
{
    if (  (0==Dst.width)||(0==Dst.height)
        ||(0==Src.width)||(0==Src.height)) return;

    long xrIntFloat_16=((Src.width)<<16)/Dst.width+1; 
    long yrIntFloat_16=((Src.height)<<16)/Dst.height+1;
    const long csDErrorX=-(1<<15)+(xrIntFloat_16>>1);
    const long csDErrorY=-(1<<15)+(yrIntFloat_16>>1);

    long dst_width=Dst.width;

    //计算出需要特殊处理的边界
    long border_y0=-csDErrorY/yrIntFloat_16+1;              //y0+y*yr>=0; y0=csDErrorY => y>=-csDErrorY/yr
    if (border_y0>=Dst.height) border_y0=Dst.height;
    long border_x0=-csDErrorX/xrIntFloat_16+1;     
    if (border_x0>=Dst.width ) border_x0=Dst.width; 
    long border_y1=(((Src.height-2)<<16)-csDErrorY)/yrIntFloat_16+1; //y0+y*yr<=(height-2) => y<=(height-2-csDErrorY)/yr
    if (border_y1>8;
            Color32* PSrcLineColor= (Color32*)((UInt8*)(Src.pdata)+Src_byte_width*(srcy_16>>16)) ;
            for (long x=border_x0;x>16];
                Color32* PColor1=(Color32*)((UInt8*)(PColor0)+Src_byte_width);
                Bilinear2_Fast(PColor0,PColor1,(srcx_16 & 0xFFFF)>>8,v_8,&pDstLine[x]);
                srcx_16+=xrIntFloat_16;
            }
        }

        for (x=border_x1;xa=(UInt8)((pm0_16*PColor0[0].a+pm2_16*PColor0[1].a+pm1_16*PColor1[0].a+pm3_16*PColor1[1].a)>>16);
        result->r=(UInt8)((pm0_16*PColor0[0].r+pm2_16*PColor0[1].r+pm1_16*PColor1[0].r+pm3_16*PColor1[1].r)>>16);
        result->g=(UInt8)((pm0_16*PColor0[0].g+pm2_16*PColor0[1].g+pm1_16*PColor1[0].g+pm3_16*PColor1[1].g)>>16);
        result->b=(UInt8)((pm0_16*PColor0[0].b+pm2_16*PColor0[1].b+pm1_16*PColor1[0].b+pm3_16*PColor1[1].b)>>16);
    }

    inline void Bilinear2_Border(const TPixels32Ref& pic,const long x_16,const long y_16,Color32* result)
    {
        long x=(x_16>>16);
        long y=(y_16>>16);
        unsigned long u_16=((unsigned short)(x_16));
        unsigned long v_16=((unsigned short)(y_16));

        Color32 pixel[4];
        pixel[0]=pic.getPixelsBorder(x,y);
        pixel[1]=pic.getPixelsBorder(x+1,y);
        pixel[2]=pic.getPixelsBorder(x,y+1);
        pixel[3]=pic.getPixelsBorder(x+1,y+1);
        
        Bilinear2_Fast(&pixel[0],&pixel[2],u_16>>8,v_16>>8,result);
    }

4、边界处使用近似值处理

如果不想处理边界访问超界问题,可以考虑扩大源图片的尺寸,加一个边框(“哨兵”优化);这样插值算法就不用考虑边界问题了,程序写起来也简单很多!如果对缩放结果的边界像素级精度要求不是太高,可使用如下缩放公式: Sx=Dx*(SW-1)/DW; Sy=Dy*(SH-1)/DH; (源图片宽和高:SW>=2;SH>=2)

这个公式不会造成内存访问超界:

//主函数
void PicZoom_ftBilinear_Common(const TPixels32Ref& Dst,const TPixels32Ref& Src)
{
    if (  (0==Dst.width)||(0==Dst.height)
        ||(2>Src.width)||(2>Src.height)) return;

    // Sx=Dx*(SW-1)/DW; Sy=Dy*(SH-1)/DH
    long xrIntFloat_16=((Src.width-1)<<16)/Dst.width; 
    long yrIntFloat_16=((Src.height-1)<<16)/Dst.height;

    long dst_width=Dst.width;
    long Src_byte_width=Src.byte_width;
    Color32* pDstLine=Dst.pdata;
    long srcy_16=0;
    for (long y=0;y>8;
        Color32* PSrcLineColor= (Color32*)((UInt8*)(Src.pdata)+Src_byte_width*(srcy_16>>16)) ;
        long srcx_16=0;
        for (long x=0;x>16];
            Bilinear_Fast_Common(PColor0,(Color32*)((UInt8*)(PColor0)+Src_byte_width),(srcx_16 & 0xFFFF)>>8,v_8,&pDstLine[x]);
            srcx_16+=xrIntFloat_16;
        }
        srcy_16+=yrIntFloat_16;
        ((UInt8*&)pDstLine)+=Dst.byte_width;
    }
}

// Blends the 2x2 neighbourhood PColor0[0..1] (upper row) / PColor1[0..1] (lower
// row) with the 8-bit fractions u_8 (horizontal) and v_8 (vertical) and stores the
// packed pixel in *result. Two channels are processed per multiplication by
// splitting each 32-bit pixel into its 0x00FF00FF and 0xFF00FF00 halves.
// Pixels are accessed as 32-bit words through `unsigned int`: `unsigned long` is
// 8 bytes on LP64 targets, where it would load the wrong neighbour and store
// 8 bytes into a 4-byte pixel.
// NOTE(review): assumes sizeof(Color32)==4 and sizeof(unsigned int)==4 - the masks
// and the [1] neighbour index rely on it; confirm for the target platform.
must_inline void Bilinear_Fast_Common(Color32* PColor0,Color32* PColor1,unsigned long u_8,unsigned long v_8,Color32* result)
    {
        typedef unsigned int Packed32;

        // 8-bit weights that sum to 256
        const Packed32 pm3_8=(Packed32)((u_8*v_8)>>8);
        const Packed32 pm2_8=(Packed32)u_8-pm3_8;
        const Packed32 pm1_8=(Packed32)v_8-pm3_8;
        const Packed32 pm0_8=256-pm1_8-pm2_8-pm3_8;

        // each 16-bit lane accumulates at most 255*256, so lanes never overflow
        Packed32 Color=*(const Packed32*)(PColor0);
        Packed32 BR=(Color & 0x00FF00FF)*pm0_8;
        Packed32 GA=((Color & 0xFF00FF00)>>8)*pm0_8;
                 Color=((const Packed32*)(PColor0))[1];
                 GA+=((Color & 0xFF00FF00)>>8)*pm2_8;
                 BR+=(Color & 0x00FF00FF)*pm2_8;
                 Color=*(const Packed32*)(PColor1);
                 GA+=((Color & 0xFF00FF00)>>8)*pm1_8;
                 BR+=(Color & 0x00FF00FF)*pm1_8;
                 Color=((const Packed32*)(PColor1))[1];
                 GA+=((Color & 0xFF00FF00)>>8)*pm3_8;
                 BR+=(Color & 0x00FF00FF)*pm3_8;

        // keep the high byte of every lane and re-interleave the two halves
        *(Packed32*)(result)=(GA & 0xFF00FF00)|((BR & 0xFF00FF00)>>8);
    }

5、MMX指令改写(3)

//主函数
void PicZoom_Bilinear_MMX(const TPixels32Ref& Dst,const TPixels32Ref& Src)
{
    if (  (0==Dst.width)||(0==Dst.height)
        ||(0==Src.width)||(0==Src.height)) return;

    long xrIntFloat_16=((Src.width)<<16)/Dst.width+1; 
    long yrIntFloat_16=((Src.height)<<16)/Dst.height+1;
    const long csDErrorX=-(1<<15)+(xrIntFloat_16>>1);
    const long csDErrorY=-(1<<15)+(yrIntFloat_16>>1);

    long dst_width=Dst.width;

    //计算出需要特殊处理的边界
    long border_y0=-csDErrorY/yrIntFloat_16+1;              //y0+y*yr>=0; y0=csDErrorY => y>=-csDErrorY/yr
    if (border_y0>=Dst.height) border_y0=Dst.height;
    long border_x0=-csDErrorX/xrIntFloat_16+1;     
    if (border_x0>=Dst.width ) border_x0=Dst.width; 
    long border_y1=(((Src.height-2)<<16)-csDErrorY)/yrIntFloat_16+1; //y0+y*yr<=(height-2) => y<=(height-2-csDErrorY)/yr
    if (border_y1>8;
            Color32* PSrcLineColor= (Color32*)((UInt8*)(Src.pdata)+Src_byte_width*(srcy_16>>16)) ;
            for (long x=border_x0;x>16];
                Color32* PColor1=(Color32*)((UInt8*)(PColor0)+Src_byte_width);
                Bilinear_Fast_MMX(PColor0,PColor1,(srcx_16 & 0xFFFF)>>8,v_8,&pDstLine[x]);
                srcx_16+=xrIntFloat_16;
            }
        }

        for (x=border_x1;x>16);
        long y=(y_16>>16);
        unsigned long u_16=((unsigned short)(x_16));
        unsigned long v_16=((unsigned short)(y_16));

        Color32 pixel[4];
        pixel[0]=pic.getPixelsBorder(x,y);
        pixel[1]=pic.getPixelsBorder(x+1,y);
        pixel[2]=pic.getPixelsBorder(x,y+1);
        pixel[3]=pic.getPixelsBorder(x+1,y+1);
        
        Bilinear_Fast_MMX(&pixel[0],&pixel[2],u_16>>8,v_16>>8,result);
    }

// MMX blend kernel: combines the 2x2 neighbourhood PColor0[0..1] / PColor1[0..1]
// and stores the packed 32-bit result in *result.
// u_8 weights the second pixel of each row and v_8 weights one row against the
// other; both act as 8-bit fixed-point fractions (products are shifted right by 8).
// The MMX state is not cleared here (the emms at the end is commented out) -
// presumably the caller issues emms once after its loops; TODO confirm.
must_inline void  Bilinear_Fast_MMX(Color32* PColor0,Color32* PColor1,unsigned long u_8,unsigned long v_8,Color32* result)
    {
        asm
        {    
              // MM6=v_8, MM5=u_8, edx=PColor0, eax=PColor1, MM7=0 (zero for byte->word unpacking)
              MOVD      MM6,v_8
              MOVD      MM5,u_8
              mov       edx,PColor0
              mov       eax,PColor1
              PXOR      mm7,mm7

              // load the four pixels; meanwhile broadcast u_8 (MM5) and v_8 (MM6) to all four words
              MOVD         MM2,dword ptr [eax]  
              MOVD         MM0,dword ptr [eax+4]
              PUNPCKLWD    MM5,MM5
              PUNPCKLWD    MM6,MM6
              MOVD         MM3,dword ptr [edx]  
              MOVD         MM1,dword ptr [edx+4]
              PUNPCKLDQ    MM5,MM5 
              // widen each pixel's four bytes to four 16-bit words
              PUNPCKLBW    MM0,MM7
              PUNPCKLBW    MM1,MM7
              PUNPCKLBW    MM2,MM7
              PUNPCKLBW    MM3,MM7
              // horizontal blend per row: (p[0]<<8) + (p[1]-p[0])*u_8
              PSUBw        MM0,MM2
              PSUBw        MM1,MM3
              PSLLw        MM2,8
              PSLLw        MM3,8
              PMULlw       MM0,MM5
              PMULlw       MM1,MM5
              PUNPCKLDQ    MM6,MM6 
              PADDw        MM0,MM2
              PADDw        MM1,MM3

              // vertical blend: (row0<<8) + (row1-row0)*v_8, with MM1=PColor0 row, MM0=PColor1 row
              PSRLw        MM0,8
              PSRLw        MM1,8
              PSUBw        MM0,MM1
              PSLLw        MM1,8
              PMULlw       MM0,MM6
              mov       eax,result
              PADDw        MM0,MM1

              // drop the 8 fraction bits, pack words back to bytes with unsigned saturation, store one pixel
              PSRLw        MM0,8
              PACKUSwb     MM0,MM7
              movd      [eax],MM0 
              //emms
        }
    }

6、SSE2指令集改写(4)

//主函数
void PicZoom_ftBilinear_SSE2(const TPixels32Ref& Dst,const TPixels32Ref& Src)
{
    if (  (0==Dst.width)||(0==Dst.height)
        ||(2>Src.width)||(2>Src.height)) return;

    long xrIntFloat_16=((Src.width-1)<<16)/Dst.width; 
    long yrIntFloat_16=((Src.height-1)<<16)/Dst.height;

    long dst_width=Dst.width;
    long Src_byte_width=Src.byte_width;
    Color32* pDstLine=Dst.pdata;
    long srcy_16=0;
    asm pxor  xmm7,xmm7 //xmm7=0
    for (long y=0;y>8;
        Color32* PSrcLineColor= (Color32*)((UInt8*)(Src.pdata)+Src_byte_width*(srcy_16>>16)) ;
        Color32* PSrcLineColorNext= (Color32*)((UInt8*)(PSrcLineColor)+Src_byte_width) ;
        asm
        {
              movd        xmm6,v_8
              PUNPCKLWD   xmm6,xmm6
              PUNPCKLDQ   xmm6,xmm6
              PUNPCKLQDQ  xmm6,xmm6//xmm6=v_8
            
              mov       esi,PSrcLineColor
              mov       ecx,PSrcLineColorNext
              xor       edx,edx   //srcx_16=0
              mov       ebx,dst_width
              mov       edi,pDstLine
              push      ebp
              mov       ebp,xrIntFloat_16
              push      ebx
              and       ebx,(not 1)
              test      ebx,ebx   //nop
              jle     end_loop2


              lea       edi,[edi+ebx*4]
              neg       ebx
        loop2_start:
              call ftBilinear_SSE2_expand2
              lea       edx,[edx+ebp*2]
              add       ebx,2

              jnz       loop2_start


        end_loop2:
            pop    ebx
            and    ebx,1  
            test   ebx,ebx
            jle    end_write

              lea       edi,[edi+ebx*4]
              neg       ebx
        loop1_start:
              call ftBilinear_SSE2
              lea       edx,[edx+ebp]
              add       ebx,1

              jnz       loop1_start
        end_write:

              pop       ebp
        }
        srcy_16+=yrIntFloat_16;
        ((UInt8*&)pDstLine)+=Dst.byte_width;
    }
    asm emms
}

//ftBilinear_SSE2_expand2(out [edi+ebx*4];xmm6=v_8,xmm7=0,edx=srcx_16,esi=PSrcLineColor,ecx=PSrcLineColorNext,ebp=xrIntFloat_16)
// Naked assembly kernel (no prologue/epilogue; register-only interface listed above).
// Interpolates two consecutive destination pixels, sampled at srcx_16 (edx) and
// srcx_16+xrIntFloat_16 (edx+ebp), and stores both at [edi+ebx*4].
// Overwrites eax, xmm0-xmm5 and mm4. The result is written through MMX register
// mm4 and no emms is executed here.
void __declspec(naked) ftBilinear_SSE2_expand2()
    {
        asm
        {
              // XMM5 <- 8-bit horizontal fractions of both sample positions
              lea       eax,[edx+ebp]
              MOVD      XMM5,edx
              MOVD      XMM4,eax
              PUNPCKLWD XMM5,XMM4
              PSRLW     XMM5,8

              // load the pixel pair at the first position from both source rows
              mov       eax,edx
              shr       eax,16     //srcx_16>>16
              PUNPCKLWD    XMM5,XMM5
              MOVQ         XMM2,  qword ptr [ecx+eax*4]//XMM2=0  0  Color0 Color2
              MOVQ         XMM3,  qword ptr [esi+eax*4]//XMM3=0  0  Color1 Color3
              // load the pixel pair at the second position and interleave it with the first
              lea       eax,[edx+ebp]
              shr       eax,16     //srcx_16>>16
              PUNPCKLDQ    XMM5,XMM5 //XMM5=u_8' u_8' u_8' u_8' u_8 u_8 u_8 u_8 
              movq   xmm4,qword ptr [ecx+eax*4]
              PUNPCKLDQ    XMM2,xmm4//XMM2=Color0' Color0  Color2' Color2
              movq   xmm4,qword ptr [esi+eax*4]
              PUNPCKLDQ    XMM3,xmm4//XMM3=Color1' Color1  Color3' Color3
              // copy the upper halves (right-hand pixels of both pairs) down into XMM0/XMM1
              MOVHLPS      XMM0,XMM2 //XMM0= X  X  Color0' Color0
              MOVHLPS      XMM1,XMM3 //XMM1= X  X  Color1' Color1

              // widen bytes to words, then blend horizontally: (left<<8) + (right-left)*u_8
              PUNPCKLBW    XMM0,XMM7
              PUNPCKLBW    XMM1,XMM7
              PUNPCKLBW    XMM2,XMM7
              PUNPCKLBW    XMM3,XMM7
              PSUBw        XMM0,XMM2
              PSUBw        XMM1,XMM3
              PSLLw        XMM2,8
              PSLLw        XMM3,8
              PMULlw       XMM0,XMM5
              PMULlw       XMM1,XMM5
              PADDw        XMM0,XMM2
              PADDw        XMM1,XMM3

              // vertical blend: (cur<<8) + (next-cur)*v_8, with XMM1=row at esi, XMM0=row at ecx
              PSRLw        XMM0,8
              PSRLw        XMM1,8
              PSUBw        XMM0,XMM1
              PSLLw        XMM1,8
              PMULlw       XMM0,XMM6
              PADDw        XMM0,XMM1

              // drop the fraction bits and pack words back to bytes with unsigned saturation
              PSRLw     XMM0,8
              PACKUSwb  XMM0,XMM7

              // store both pixels with MOVNTQ via mm4 (the plain MOVQ store is left commented out)
              //MOVQ qword ptr [edi+ebx*4], xmm0//write two DstColor
              MOVDQ2Q   mm4,xmm0
              movntq  qword ptr  [edi+ebx*4],mm4

              ret
        }
    }

 //ftBilinear_SSE2(out [edi+ebx*4];xmm6=v_8,xmm7=0,edx=srcx_16,esi=PSrcLineColor,ecx=PSrcLineColorNext,ebp=xrIntFloat_16)
    // Naked assembly kernel (no prologue/epilogue; register-only interface listed above).
    // Interpolates a single destination pixel sampled at srcx_16 (edx) and stores it
    // at [edi+ebx*4]. Overwrites eax and xmm0-xmm3, xmm5; ebp is not read here.
    void __declspec(naked) ftBilinear_SSE2()
    {
        asm
        {
              // eax <- bits 8..15 of srcx_16, i.e. the 8-bit horizontal fraction u_8
              mov       eax,edx
              shl       eax,16
              shr       eax,24
              //== movzx       eax,dh  //eax=u_8
              MOVD      XMM5,eax
              mov       eax,edx
              shr       eax,16     //srcx_16>>16

              // load the 2x2 neighbourhood; meanwhile broadcast u_8 into the low four words of XMM5
              MOVD         XMM0,  dword ptr [ecx+eax*4+4]//XMM0=Color2
              MOVD         XMM2,  dword ptr [ecx+eax*4]  //XMM2=Color0
              PUNPCKLWD    XMM5,XMM5
              MOVD         XMM1,  dword ptr [esi+eax*4+4]//XMM1=Color3
              MOVD         XMM3,  dword ptr [esi+eax*4]  //XMM3=Color1
              PUNPCKLDQ    XMM5,XMM5 //XMM5=u_8
              // widen bytes to words, then blend horizontally: (left<<8) + (right-left)*u_8
              PUNPCKLBW    XMM0,XMM7
              PUNPCKLBW    XMM1,XMM7
              PUNPCKLBW    XMM2,XMM7
              PUNPCKLBW    XMM3,XMM7
              PSUBw        XMM0,XMM2
              PSUBw        XMM1,XMM3
              PSLLw        XMM2,8
              PSLLw        XMM3,8
              PMULlw       XMM0,XMM5
              PMULlw       XMM1,XMM5
              PADDw        XMM0,XMM2
              PADDw        XMM1,XMM3

              // vertical blend: (cur<<8) + (next-cur)*v_8, with XMM1=row at esi, XMM0=row at ecx
              PSRLw        XMM0,8
              PSRLw        XMM1,8
              PSUBw        XMM0,XMM1
              PSLLw        XMM1,8
              PMULlw       XMM0,XMM6
              PADDw        XMM0,XMM1

              // drop the fraction bits, pack to bytes with unsigned saturation, store one pixel
              PSRLw     XMM0,8
              PACKUSwb  XMM0,XMM7
              MOVd  dword ptr  [edi+ebx*4],XMM0 //write DstColor

              ret
        }
    }

7、SSE2指令集改写(4)+预计算缩放系数表

void PicZoom_ftBilinearTable_SSE2(const TPixels32Ref& Dst,const TPixels32Ref& Src)
{
    if (  (0==Dst.width)||(0==Dst.height)
        ||(2>Src.width)||(2>Src.height)) return;

    long xrIntFloat_16=((Src.width-1)<<16)/Dst.width; 
    long yrIntFloat_16=((Src.height-1)<<16)/Dst.height;

    long dst_width=Dst.width;
    UInt8* _bufMem=new UInt8[(dst_width*2*sizeof(TMMXData64)+15)+dst_width*sizeof(Int32)];
    TMMXData64* uList=(TMMXData64*)((((ptrdiff_t)_bufMem)+15)>>4<<4); //16byte对齐 
    Int32* xList=(Int32*)(uList+dst_width*2);
    {//init u table
        long srcx_16=0;
        for (long x=0;x>1]=(srcx_16>>16);
            unsigned long u=(srcx_16>>8)&0xFF;
            unsigned long ur=(256-u)<<1;
            u=u<<1;
            uList[x+0]=(ur|(ur<<16));
            uList[x+0]|=uList[x+0]<<32;
            uList[x+1]=u|(u<<16);
            uList[x+1]|=uList[x+1]<<32;
            srcx_16+=xrIntFloat_16;
        }
    }

    Color32* pDstLine=Dst.pdata;
    long srcy_16=0;
    asm pxor  xmm7,xmm7 //xmm7=0
    for (long y=0;y>8) & 0xFF;
        unsigned long vr=(256-v)>>1;
        v>>=1;
        Color32* PSrcLineColor= (Color32*)((UInt8*)(Src.pdata)+Src.byte_width*(srcy_16>>16)) ;
        Color32* PSrcLineColorNext= (Color32*)((UInt8*)(PSrcLineColor)+Src.byte_width) ;
        asm{
              movd        xmm5,vr
              movd        xmm6,v
              punpcklwd   xmm5,xmm5
              punpcklwd   xmm6,xmm6
              punpckldq   xmm5,xmm5
              punpckldq   xmm6,xmm6
              punpcklqdq  xmm5,xmm5
              punpcklqdq  xmm6,xmm6
            
              mov       esi,PSrcLineColor
              mov       ecx,PSrcLineColorNext
              mov       edx,xList //x
              mov       ebx,dst_width
              mov       edi,pDstLine
              push      ebp
              mov       ebp,uList
              
              push      ebx
              and       ebx,(not 1)
              test      ebx,ebx
              jle     end_loop2


              lea       ebx,[ebx*4]
              lea       edi,[edi+ebx]
              lea       edx,[edx+ebx]
              lea       ebp,[ebp+ebx*4]
              neg       ebx
        loop2_start:
              //call ftBilinearTable_SSE2_expand2
              ftBilinearTable_SSE2_expand2()
              add       ebx,8

              jnz       loop2_start

        end_loop2:
            pop    ebx
            and    ebx,1  
            test   ebx,ebx
            jle    end_write

              lea       ebx,[ebx*4]
              lea       edi,[edi+ebx]
              lea       edx,[edx+ebx]
              lea       ebp,[ebp+ebx*4]
              neg       ebx
        loop1_start:
              //call ftBilinearTable_SSE2
              ftBilinearTable_SSE2()
              add       ebx,4

              jnz       loop1_start
        end_write:

              pop       ebp
        }
        srcy_16+=yrIntFloat_16;
        ((UInt8*&)pDstLine)+=Dst.byte_width;
    }
    delete []_bufMem;
}

 //ftBilinearTable_SSE2(out [edi+ebx]; xmm5=vr,xmm6=v,xmm7=0,[ebp+ebx*4]=(ur,u),[edx+ebx]=src_x,esi=PSrcLineColor,ecx=PSrcLineColorNext)
    // Inline-assembly macro producing ONE destination pixel; ebx is the byte offset
    // of that pixel relative to edi (and of its column index relative to edx).
    // Steps: load the column index; load two adjacent pixels from the row at esi and
    // from the row at ecx; widen bytes to words; weight the esi row by xmm5 and the
    // ecx row by xmm6 and add; multiply by the 128-bit table entry at [ebp+ebx*4]
    // keeping the high 16 bits (pmulhw); add the upper half (right pixel) to the
    // lower half (left pixel); pack to bytes and store 4 bytes.
    // Overwrites eax, xmm0 and xmm1. No comments may be placed inside the macro body:
    // every body line ends in a line continuation.
    //void __declspec(naked) ftBilinearTable_SSE2(){
    #define  ftBilinearTable_SSE2()                     \
        asm mov         eax,[edx+ebx]                   \
        asm movq        xmm0,qword ptr[esi+eax*4]       \
        asm movq        xmm1,qword ptr[ecx+eax*4]       \
        asm punpcklbw   xmm0,xmm7                       \
        asm punpcklbw   xmm1,xmm7                       \
        asm pmullw      xmm0,xmm5                       \
        asm pmullw      xmm1,xmm6                       \
        asm paddw       xmm0,xmm1                       \
        asm pmulhw      xmm0,xmmword ptr [ebp+ebx*4]    \
        asm movdqa      xmm1,xmm0                       \
        asm punpckhqdq  xmm0,xmm0                       \
        asm paddw       xmm0,xmm1                       \
        asm packuswb    xmm0,xmm7                       \
        asm movd  dword ptr  [edi+ebx],xmm0             
        //ret //for  __declspec(naked)
        //}
    //}

 // ftBilinearTable_SSE2_expand2: inline-assembly macro producing TWO consecutive
    // destination pixels, stored as 8 bytes at [edi+ebx]. It reads the same registers
    // as the single-pixel macro: [edx+ebx] and [edx+ebx+4] are the two source column
    // indices, [ebp+ebx*4] and [ebp+ebx*4+16] the two 128-bit weight entries,
    // esi/ecx the two source rows, xmm5/xmm6 the weights applied to the esi/ecx row,
    // xmm7 zero.
    // After the vertical blend and the pmulhw with the table entries, punpcklqdq /
    // punpckhqdq gather the two left-pixel halves and the two right-pixel halves so
    // that a single paddw completes both pixels.
    // Overwrites eax and xmm0-xmm3. The last body line ends in a line continuation,
    // so the comment line that follows it is spliced into the macro.
    //void __declspec(naked) ftBilinearTable_SSE2_expand2(){
    #define  ftBilinearTable_SSE2_expand2()             \
        asm mov         eax,[edx+ebx]                   \
        asm movq        xmm0,qword ptr[esi+eax*4]       \
        asm movq        xmm1,qword ptr[ecx+eax*4]       \
        asm mov         eax,[edx+ebx+4]                 \
        asm movq        xmm2,qword ptr[esi+eax*4]       \
        asm movq        xmm3,qword ptr[ecx+eax*4]       \
        asm punpcklbw   xmm0,xmm7                       \
        asm punpcklbw   xmm1,xmm7                       \
        asm punpcklbw   xmm2,xmm7                       \
        asm punpcklbw   xmm3,xmm7                       \
        asm pmullw      xmm0,xmm5                       \
        asm pmullw      xmm1,xmm6                       \
        asm pmullw      xmm2,xmm5                       \
        asm pmullw      xmm3,xmm6                       \
        asm paddw       xmm0,xmm1                       \
        asm paddw       xmm2,xmm3                       \
        asm pmulhw      xmm0,xmmword ptr [ebp+ebx*4]    \
        asm pmulhw      xmm2,xmmword ptr [ebp+ebx*4+16] \
        asm movdqa      xmm1,xmm0                       \
        asm punpcklqdq  xmm0,xmm2                       \
        asm punpckhqdq  xmm1,xmm2                       \
        asm paddw       xmm0,xmm1                       \
        asm packuswb    xmm0,xmm7                       \
        asm movq  qword ptr  [edi+ebx],xmm0             \
        //ret //for  __declspec(naked)
        //}
    //}

你可能感兴趣的:(Resize 优化)