CPU优化测试
结论:在预处理速度方面,采用三方库进行处理时预处理还不是瓶颈;难点在于自行实现预处理时,要让结果和速度都与三方库保持一致。
实现与测试内容:resize 函数实现和优化(双线性插值)
测试工程代码github: https://github.com/sisong/demoForHssBlog/tree/master/ZoomDemo
测试环境: i5-8400 2.8Ghz 6核心
图像缩放大小: 1960 * 1080 -> 1360 * 720
优化方式 | 函数名称 | 时间 (ms) |
---|---|---|
1、基础版本(浮点实现版本) | PicZoom_Bilinear0 | 23.66 |
2、浮点优化为整数 | PicZoom_Bilinear1 | 12.18 |
3、图像边界与其他区域分开计算 | PicZoom_Bilinear2 | 7.11 |
4、边界处使用近似值处理 | PicZoom_ftBilinear_Common | 5.36 |
5、MMX指令改写(3) | PicZoom_Bilinear_MMX | 4.01 |
6、SSE2指令集改写(4) | PicZoom_ftBilinear_SSE2 | 2.83 |
7、SSE2指令集改写(4)+ 预计算缩放系数表 | PicZoom_ftBilinearTable_SSE2 | 1.12 |
Opencv3.1 安装版 | - | 1.36 |
双线性插值公式
如图,已知Q12,Q22,Q11,Q21,但是要插值的点为P点,这就要用双线性插值,
首先在x轴方向上,对R1和R2两个点进行插值,然后根据R1和R2对P点进行插值,这就是所谓的双线性插值
在图像处理的时候,我们先根据
srcX=dstX* (srcWidth/dstWidth)
srcY = dstY * (srcHeight/dstHeight) 来计算目标像素在源图像中的位置,这里计算的srcX和srcY一般都是浮点数,比如f(1.2, 3.4)这个像素点是虚拟存在的,先找到与它临近的四个实际存在的像素点
(1,3) (2,3) (1,4) (2,4)
写成f(i+u,j+v)的形式,则u=0.2,v=0.4, i=1, j=3
直接整理一步计算,f(i+u,j+v) = (1-u)(1-v)f(i,j) + (1-u)vf(i,j+1) + u(1-v)f(i+1,j) + uvf(i+1,j+1) 。
假设源图像是3 * 3,中心点坐标(1,1)目标图像是9 * 9,中心点坐标(4,4),我们在进行插值映射的时候,尽可能希望均匀的用到源图像的像素信息,最直观的就是(4,4)映射到(1,1)现在直接计算srcX=4*3/9=1.3333!=1,也就是我们在插值的时候所利用的像素集中在图像的右下方,而不是均匀分布整个图像。
为了保证图像缩放时候均匀的用到源图像的像素信息,我们在原始的浮点坐标上加上了0.5*(srcWidth/dstWidth-1)这样一个控制因子,即:
srcX=dstX* (srcWidth/dstWidth)+0.5*(srcWidth/dstWidth-1)
此时 srcX=(4+0.5)*3/9-0.5=1
所以,在双线性插值计算时候 ,大多都采用中心对齐方式(Opencv,Matlab也是);中心对齐公式 SrcX=(dstX+0.5)* (srcWidth/dstWidth) -0.5 SrcY=(dstY+0.5) * (srcHeight/dstHeight)-0.5
代码块:
注:为了让代码更便于阅读,均将被调用的子函数写在了主函数的下方(未将声明写在主函数前);
一、像素格式
//图像数据区的描述信息
struct TPixels32Ref{
public:
Color32* pdata; //图像数据区首地址 即 y==0行的颜色首地址
long byte_width; //一行图像数据的字节宽度 正负值都有可能
long width; //图像宽度
long height; //图像高度
inline TPixels32Ref() :pdata(0),byte_width(0),width(0),height(0){}
inline TPixels32Ref(const TPixels32Ref& ref) :pdata(ref.pdata),byte_width(ref.byte_width),width(ref.width),height(ref.height){}
//访问(x,y)坐标处的颜色
inline Color32& pixels(const long x,const long y) const { return getLinePixels(y)[x]; }
//得到y行的颜色首地址
inline Color32* getLinePixels(const long y) const { return (Color32*) ( ((UInt8*)pdata) + byte_width*y ); }
//是否是空图像区
inline bool getIsEmpty()const { return ((width<=0)||(height<=0)); }
//将pline指向下一行颜色
inline void nextLine(Color32*& pline)const { ((UInt8*&)pline)+=byte_width; }
//坐标边界饱和 如果(x,y)坐标在图片数据区外,(x,y)值会被设置到图片最近的边界内,并返回false(否则什么也不做,返回true) //警告! 图片区域不能为空
inline bool clipToBorder(long& x, long& y)const{ //a=2 /a+1
bool isIn = true;
if (x < 0) {
isIn = false; x = 0;
} else if (x >= width) {
isIn = false; x = width - 1;
}
if (y < 0) {
isIn = false; y = 0;
} else if (y >= height) {
isIn = false; y = height - 1;
}
return isIn;
}
//获取一个点的颜色,默认执行边界饱和测试 当坐标超出区域的时候返回的颜色为最近的边界上的颜色值并且其alpha通道置零 //警告! 图片区域不能为空 速度很慢
inline Color32 getPixelsBorder(long x, long y) const {
bool isInPic = clipToBorder(x,y);
Color32 result = pixels(x,y);
if (!isInPic)
result.a=0;
return result;
}
};
二、双线性插值函数:
1、基础版本(浮点实现版本)
//主函数
void PicZoom_Bilinear0(const TPixels32Ref& Dst,const TPixels32Ref& Src)
{
if ( (0==Dst.width)||(0==Dst.height)
||(0==Src.width)||(0==Src.height)) return;
long dst_width=Dst.width;
Color32* pDstLine=Dst.pdata;
for (long y=0;yfx) --x; //x=floor(fx);
long y=(long)fy; if (y>fy) --y; //y=floor(fy);
Color32 Color0=pic.getPixelsBorder(x,y);
Color32 Color2=pic.getPixelsBorder(x+1,y);
Color32 Color1=pic.getPixelsBorder(x,y+1);
Color32 Color3=pic.getPixelsBorder(x+1,y+1);
double u=fx-x;
double v=fy-y;
double pm3=u*v;
double pm2=u*(1-v);
double pm1=v*(1-u);
double pm0=(1-u)*(1-v);
result->a=(UInt8)(pm0*Color0.a+pm1*Color1.a+pm2*Color2.a+pm3*Color3.a);
result->r=(UInt8)(pm0*Color0.r+pm1*Color1.r+pm2*Color2.r+pm3*Color3.r);
result->g=(UInt8)(pm0*Color0.g+pm1*Color1.g+pm2*Color2.g+pm3*Color3.g);
result->b=(UInt8)(pm0*Color0.b+pm1*Color1.b+pm2*Color2.b+pm3*Color3.b);
}
2、浮点优化为整数
//主函数
//将浮点数改成整数计算
void PicZoom_Bilinear1(const TPixels32Ref& Dst,const TPixels32Ref& Src)
{
if ( (0==Dst.width)||(0==Dst.height)
||(0==Src.width)||(0==Src.height)) return;
long xrIntFloat_16=((Src.width)<<16)/Dst.width+1;
long yrIntFloat_16=((Src.height)<<16)/Dst.height+1;
const long csDErrorX=-(1<<15)+(xrIntFloat_16>>1);
const long csDErrorY=-(1<<15)+(yrIntFloat_16>>1);
long dst_width=Dst.width;
Color32* pDstLine=Dst.pdata;
long srcy_16=csDErrorY;
long y;
for (y=0;y>16;
long y=y_16>>16;
Color32 Color0=pic.getPixelsBorder(x,y);
Color32 Color2=pic.getPixelsBorder(x+1,y);
Color32 Color1=pic.getPixelsBorder(x,y+1);
Color32 Color3=pic.getPixelsBorder(x+1,y+1);
unsigned long u_8=(x_16 & 0xFFFF)>>8;
unsigned long v_8=(y_16 & 0xFFFF)>>8;
unsigned long pm3_16=(u_8*v_8);
unsigned long pm2_16=(u_8*(unsigned long)(256-v_8));
unsigned long pm1_16=(v_8*(unsigned long)(256-u_8));
unsigned long pm0_16=((256-u_8)*(256-v_8));
result->a=(UInt8)((pm0_16*Color0.a+pm1_16*Color1.a+pm2_16*Color2.a+pm3_16*Color3.a)>>16);
result->r=(UInt8)((pm0_16*Color0.r+pm1_16*Color1.r+pm2_16*Color2.r+pm3_16*Color3.r)>>16);
result->g=(UInt8)((pm0_16*Color0.g+pm1_16*Color1.g+pm2_16*Color2.g+pm3_16*Color3.g)>>16);
result->b=(UInt8)((pm0_16*Color0.b+pm1_16*Color1.b+pm2_16*Color2.b+pm3_16*Color3.b)>>16);
}
3、图像边界与其他区域分开计算
// Main function: fixed-point bilinear zoom that computes a "border" range of
// destination coordinates (border_x0/border_y0/border_y1 below) separately from the
// interior, where the 2x2 source neighbourhood is read directly via Bilinear2_Fast.
// Does nothing when either image has a zero width or height.
// NOTE(review): this listing is damaged — on three lines below, the text between a
// '<' and the following '>' is missing, so loops, declarations and the header of
// Bilinear2_Fast are lost. Restore it from the upstream ZoomDemo sources before use.
void PicZoom_Bilinear2(const TPixels32Ref& Dst,const TPixels32Ref& Src)
{
if ( (0==Dst.width)||(0==Dst.height)
||(0==Src.width)||(0==Src.height)) return;
// Source step per destination pixel, 16.16 fixed point.
long xrIntFloat_16=((Src.width)<<16)/Dst.width+1;
long yrIntFloat_16=((Src.height)<<16)/Dst.height+1;
// Centre-alignment offset 0.5*step-0.5 in 16.16 fixed point.
const long csDErrorX=-(1<<15)+(xrIntFloat_16>>1);
const long csDErrorY=-(1<<15)+(yrIntFloat_16>>1);
long dst_width=Dst.width;
// Compute the borders that need special handling
long border_y0=-csDErrorY/yrIntFloat_16+1; //y0+y*yr>=0; y0=csDErrorY => y>=-csDErrorY/yr
if (border_y0>=Dst.height) border_y0=Dst.height;
long border_x0=-csDErrorX/xrIntFloat_16+1;
if (border_x0>=Dst.width ) border_x0=Dst.width;
long border_y1=(((Src.height-2)<<16)-csDErrorY)/yrIntFloat_16+1; //y0+y*yr<=(height-2) => y<=(height-2-csDErrorY)/yr
// NOTE(review): truncated line — everything from the comparison after "border_y1"
// up to the "(... )>>8;" that initialises v_8 is missing. The declarations of
// border_x1, pDstLine, Src_byte_width, srcy_16 and x, which are used below, are
// presumably in the lost text.
if (border_y1>8;
// Start of the source row addressed by the integer part of srcy_16.
Color32* PSrcLineColor= (Color32*)((UInt8*)(Src.pdata)+Src_byte_width*(srcy_16>>16)) ;
// NOTE(review): truncated line — the loop condition/increment and the start of the
// PColor0 initialisation ("...[srcx_16>>16];") are missing.
for (long x=border_x0;x>16];
// PColor1 is the pixel one source row after PColor0.
Color32* PColor1=(Color32*)((UInt8*)(PColor0)+Src_byte_width);
Bilinear2_Fast(PColor0,PColor1,(srcx_16 & 0xFFFF)>>8,v_8,&pDstLine[x]);
srcx_16+=xrIntFloat_16;
}
}
// NOTE(review): truncated line — the remainder of PicZoom_Bilinear2 and the header
// and weight setup (pm0_16..pm3_16) of Bilinear2_Fast are missing; what follows is
// the tail of Bilinear2_Fast.
// Each channel is the 2^16-scaled weighted sum of PColor0[0], PColor0[1],
// PColor1[0] and PColor1[1], shifted back down by 16 bits.
for (x=border_x1;xa=(UInt8)((pm0_16*PColor0[0].a+pm2_16*PColor0[1].a+pm1_16*PColor1[0].a+pm3_16*PColor1[1].a)>>16);
result->r=(UInt8)((pm0_16*PColor0[0].r+pm2_16*PColor0[1].r+pm1_16*PColor1[0].r+pm3_16*PColor1[1].r)>>16);
result->g=(UInt8)((pm0_16*PColor0[0].g+pm2_16*PColor0[1].g+pm1_16*PColor1[0].g+pm3_16*PColor1[1].g)>>16);
result->b=(UInt8)((pm0_16*PColor0[0].b+pm2_16*PColor0[1].b+pm1_16*PColor1[0].b+pm3_16*PColor1[1].b)>>16);
}
inline void Bilinear2_Border(const TPixels32Ref& pic,const long x_16,const long y_16,Color32* result)
{
long x=(x_16>>16);
long y=(y_16>>16);
unsigned long u_16=((unsigned short)(x_16));
unsigned long v_16=((unsigned short)(y_16));
Color32 pixel[4];
pixel[0]=pic.getPixelsBorder(x,y);
pixel[1]=pic.getPixelsBorder(x+1,y);
pixel[2]=pic.getPixelsBorder(x,y+1);
pixel[3]=pic.getPixelsBorder(x+1,y+1);
Bilinear2_Fast(&pixel[0],&pixel[2],u_16>>8,v_16>>8,result);
}
4、边界处使用近似值处理
如果不想处理边界访问超界问题,可以考虑扩大源图片的尺寸,加一个边框 (“哨兵”优化); 这样插值算法就不用考虑边界问题了,程序写起来也简单很多! 如果对缩放结果的边界像素级精度要求不是太高,可使用如下缩放公式: Sx=Dx*(SW-1)/DW; Sy=Dy*(SH-1)/DH; (源图片宽和高:SW>=2;SH>=2)
这个公式不会造成内存访问超界:
//主函数
void PicZoom_ftBilinear_Common(const TPixels32Ref& Dst,const TPixels32Ref& Src)
{
if ( (0==Dst.width)||(0==Dst.height)
||(2>Src.width)||(2>Src.height)) return;
// Sx=Dx*(SW-1)/DW; Sy=Dy*(SH-1)/DH
long xrIntFloat_16=((Src.width-1)<<16)/Dst.width;
long yrIntFloat_16=((Src.height-1)<<16)/Dst.height;
long dst_width=Dst.width;
long Src_byte_width=Src.byte_width;
Color32* pDstLine=Dst.pdata;
long srcy_16=0;
for (long y=0;y>8;
Color32* PSrcLineColor= (Color32*)((UInt8*)(Src.pdata)+Src_byte_width*(srcy_16>>16)) ;
long srcx_16=0;
for (long x=0;x>16];
Bilinear_Fast_Common(PColor0,(Color32*)((UInt8*)(PColor0)+Src_byte_width),(srcx_16 & 0xFFFF)>>8,v_8,&pDstLine[x]);
srcx_16+=xrIntFloat_16;
}
srcy_16+=yrIntFloat_16;
((UInt8*&)pDstLine)+=Dst.byte_width;
}
}
// Blends one destination pixel from a 2x2 source neighbourhood with plain integer
// math, processing two 8-bit channels per multiplication.
//   PColor0 : upper pixel pair (PColor0[0], PColor0[1])
//   PColor1 : lower pixel pair (PColor1[0], PColor1[1])
//   u_8,v_8 : horizontal / vertical weights; callers pass values in 0..255
//   result  : receives the blended pixel
// Pixels are accessed as 32-bit words. The original used `unsigned long` for this,
// which is 64 bits wide on LP64 targets: each load then covered two pixels, index
// [1] addressed the wrong pixel, and the final store wrote 8 bytes through
// `result`. `unsigned int` is used for the pixel words instead.
// Assumes sizeof(Color32)==4 and a 32-bit unsigned int.
must_inline void Bilinear_Fast_Common(Color32* PColor0,Color32* PColor1,unsigned long u_8,unsigned long v_8,Color32* result)
{
    typedef unsigned int TPixelWord;   // one packed 32-bit pixel

    // Area weights scaled by 2^8; the four of them sum to exactly 256.
    const TPixelWord pm3_8=(TPixelWord)((u_8*v_8)>>8);
    const TPixelWord pm2_8=(TPixelWord)u_8-pm3_8;
    const TPixelWord pm1_8=(TPixelWord)v_8-pm3_8;
    const TPixelWord pm0_8=256-pm1_8-pm2_8-pm3_8;

    const TPixelWord* upper=(const TPixelWord*)(PColor0);
    const TPixelWord* lower=(const TPixelWord*)(PColor1);

    // Bytes 0 and 2 are accumulated in BR and bytes 1 and 3 in GA, each in its own
    // 16-bit lane. Because the weights sum to 256, a lane never exceeds 0xFF00 and
    // cannot carry into its neighbour.
    TPixelWord Color=upper[0];
    TPixelWord BR=(Color & 0x00FF00FF)*pm0_8;
    TPixelWord GA=((Color & 0xFF00FF00)>>8)*pm0_8;

    Color=upper[1];
    GA+=((Color & 0xFF00FF00)>>8)*pm2_8;
    BR+=(Color & 0x00FF00FF)*pm2_8;

    Color=lower[0];
    GA+=((Color & 0xFF00FF00)>>8)*pm1_8;
    BR+=(Color & 0x00FF00FF)*pm1_8;

    Color=lower[1];
    GA+=((Color & 0xFF00FF00)>>8)*pm3_8;
    BR+=(Color & 0x00FF00FF)*pm3_8;

    // Keep the high byte of every lane (i.e. divide by 256) and re-interleave.
    *(TPixelWord*)(result)=(GA & 0xFF00FF00)|((BR & 0xFF00FF00)>>8);
}
5、MMX指令改写(3)
// Main function: same border/interior split as PicZoom_Bilinear2, with the
// per-pixel blend done by Bilinear_Fast_MMX.
// Does nothing when either image has a zero width or height.
// NOTE(review): this listing is damaged — on three lines below, the text between a
// '<' and the following '>' is missing, so loops, declarations and the header of
// the border helper are lost. Restore it from the upstream ZoomDemo sources before use.
void PicZoom_Bilinear_MMX(const TPixels32Ref& Dst,const TPixels32Ref& Src)
{
if ( (0==Dst.width)||(0==Dst.height)
||(0==Src.width)||(0==Src.height)) return;
// Source step per destination pixel, 16.16 fixed point.
long xrIntFloat_16=((Src.width)<<16)/Dst.width+1;
long yrIntFloat_16=((Src.height)<<16)/Dst.height+1;
// Centre-alignment offset 0.5*step-0.5 in 16.16 fixed point.
const long csDErrorX=-(1<<15)+(xrIntFloat_16>>1);
const long csDErrorY=-(1<<15)+(yrIntFloat_16>>1);
long dst_width=Dst.width;
// Compute the borders that need special handling
long border_y0=-csDErrorY/yrIntFloat_16+1; //y0+y*yr>=0; y0=csDErrorY => y>=-csDErrorY/yr
if (border_y0>=Dst.height) border_y0=Dst.height;
long border_x0=-csDErrorX/xrIntFloat_16+1;
if (border_x0>=Dst.width ) border_x0=Dst.width;
long border_y1=(((Src.height-2)<<16)-csDErrorY)/yrIntFloat_16+1; //y0+y*yr<=(height-2) => y<=(height-2-csDErrorY)/yr
// NOTE(review): truncated line — everything from the comparison after "border_y1"
// up to the "(... )>>8;" that initialises v_8 is missing. The declarations of
// border_x1, pDstLine, Src_byte_width, srcy_16 and x, which are used below, are
// presumably in the lost text.
if (border_y1>8;
// Start of the source row addressed by the integer part of srcy_16.
Color32* PSrcLineColor= (Color32*)((UInt8*)(Src.pdata)+Src_byte_width*(srcy_16>>16)) ;
// NOTE(review): truncated line — the loop condition/increment and the start of the
// PColor0 initialisation ("...[srcx_16>>16];") are missing.
for (long x=border_x0;x>16];
// PColor1 is the pixel one source row after PColor0.
Color32* PColor1=(Color32*)((UInt8*)(PColor0)+Src_byte_width);
Bilinear_Fast_MMX(PColor0,PColor1,(srcx_16 & 0xFFFF)>>8,v_8,&pDstLine[x]);
srcx_16+=xrIntFloat_16;
}
}
// NOTE(review): truncated line — the remainder of PicZoom_Bilinear_MMX and the
// header of the border helper (which takes pic, x_16, y_16 and result, judging by
// the names used below) are missing; what follows is the tail of that helper.
// Bilinear_Fast_MMX leaves the MMX state active (its emms is commented out), so an
// emms is presumably issued in the lost part of the main function — verify.
for (x=border_x1;x>16);
long y=(y_16>>16);
// Low 16 bits = fractional part of the 16.16 coordinates.
unsigned long u_16=((unsigned short)(x_16));
unsigned long v_16=((unsigned short)(y_16));
// Gather the 2x2 neighbourhood with edge clamping: pixel[0..1] upper pair, pixel[2..3] lower pair.
Color32 pixel[4];
pixel[0]=pic.getPixelsBorder(x,y);
pixel[1]=pic.getPixelsBorder(x+1,y);
pixel[2]=pic.getPixelsBorder(x,y+1);
pixel[3]=pic.getPixelsBorder(x+1,y+1);
Bilinear_Fast_MMX(&pixel[0],&pixel[2],u_16>>8,v_16>>8,result);
}
// Blends one destination pixel from a 2x2 source neighbourhood using MMX.
//   PColor0 : first pixel pair  (read at [PColor0] and [PColor0+4])
//   PColor1 : second pixel pair (read at [PColor1] and [PColor1+4])
//   u_8,v_8 : horizontal / vertical weights; callers pass values in 0..255
//   result  : receives the blended 32-bit pixel
// Each pair is first interpolated horizontally with u_8, then the two results are
// interpolated vertically with v_8. All four channels are processed in parallel as
// 16-bit words; intermediate products wrap modulo 2^16.
// The MMX state is NOT cleared here (emms is commented out at the end).
must_inline void Bilinear_Fast_MMX(Color32* PColor0,Color32* PColor1,unsigned long u_8,unsigned long v_8,Color32* result)
{
asm
{
MOVD MM6,v_8
MOVD MM5,u_8
mov edx,PColor0
mov eax,PColor1
// MM7 = 0, used to zero-extend bytes to words and for the final pack
PXOR mm7,mm7
// MM2 = PColor1[0], MM0 = PColor1[1]
MOVD MM2,dword ptr [eax]
MOVD MM0,dword ptr [eax+4]
// start broadcasting u_8 / v_8 into all four words
PUNPCKLWD MM5,MM5
PUNPCKLWD MM6,MM6
// MM3 = PColor0[0], MM1 = PColor0[1]
MOVD MM3,dword ptr [edx]
MOVD MM1,dword ptr [edx+4]
// MM5 = u_8 in all four words
PUNPCKLDQ MM5,MM5
// widen the four pixels from 8-bit to 16-bit channels
PUNPCKLBW MM0,MM7
PUNPCKLBW MM1,MM7
PUNPCKLBW MM2,MM7
PUNPCKLBW MM3,MM7
// horizontal lerp per pair: ((right-left)*u_8 + (left<<8)) >> 8
PSUBw MM0,MM2
PSUBw MM1,MM3
PSLLw MM2,8
PSLLw MM3,8
PMULlw MM0,MM5
PMULlw MM1,MM5
// MM6 = v_8 in all four words
PUNPCKLDQ MM6,MM6
PADDw MM0,MM2
PADDw MM1,MM3
PSRLw MM0,8
PSRLw MM1,8
// vertical lerp: ((pair1-pair0)*v_8 + (pair0<<8)) >> 8
PSUBw MM0,MM1
PSLLw MM1,8
PMULlw MM0,MM6
mov eax,result
PADDw MM0,MM1
PSRLw MM0,8
// pack the four words back to bytes with unsigned saturation and store the pixel
PACKUSwb MM0,MM7
movd [eax],MM0
//emms
}
}
6、SSE2指令集改写(4)
//主函数
void PicZoom_ftBilinear_SSE2(const TPixels32Ref& Dst,const TPixels32Ref& Src)
{
if ( (0==Dst.width)||(0==Dst.height)
||(2>Src.width)||(2>Src.height)) return;
long xrIntFloat_16=((Src.width-1)<<16)/Dst.width;
long yrIntFloat_16=((Src.height-1)<<16)/Dst.height;
long dst_width=Dst.width;
long Src_byte_width=Src.byte_width;
Color32* pDstLine=Dst.pdata;
long srcy_16=0;
asm pxor xmm7,xmm7 //xmm7=0
for (long y=0;y>8;
Color32* PSrcLineColor= (Color32*)((UInt8*)(Src.pdata)+Src_byte_width*(srcy_16>>16)) ;
Color32* PSrcLineColorNext= (Color32*)((UInt8*)(PSrcLineColor)+Src_byte_width) ;
asm
{
movd xmm6,v_8
PUNPCKLWD xmm6,xmm6
PUNPCKLDQ xmm6,xmm6
PUNPCKLQDQ xmm6,xmm6//xmm6=v_8
mov esi,PSrcLineColor
mov ecx,PSrcLineColorNext
xor edx,edx //srcx_16=0
mov ebx,dst_width
mov edi,pDstLine
push ebp
mov ebp,xrIntFloat_16
push ebx
and ebx,(not 1)
test ebx,ebx //nop
jle end_loop2
lea edi,[edi+ebx*4]
neg ebx
loop2_start:
call ftBilinear_SSE2_expand2
lea edx,[edx+ebp*2]
add ebx,2
jnz loop2_start
end_loop2:
pop ebx
and ebx,1
test ebx,ebx
jle end_write
lea edi,[edi+ebx*4]
neg ebx
loop1_start:
call ftBilinear_SSE2
lea edx,[edx+ebp]
add ebx,1
jnz loop1_start
end_write:
pop ebp
}
srcy_16+=yrIntFloat_16;
((UInt8*&)pDstLine)+=Dst.byte_width;
}
asm emms
}
//ftBilinear_SSE2_expand2(out [edi+ebx*4];xmm6=v_8,xmm7=0,edx=srcx_16,esi=PSrcLineColor,ecx=PSrcLineColorNext,ebp=xrIntFloat_16)
// Naked helper with a register-based contract (see the line above): blends TWO
// consecutive destination pixels, the first sampled at srcx_16 (edx) and the second
// at srcx_16+xrIntFloat_16 (edx+ebp), and writes them as one 8-byte store to
// [edi+ebx*4]. Clobbers eax, xmm0-xmm5 and mm4. Because it uses mm4, the caller has
// to execute emms before any x87 floating-point code runs.
void __declspec(naked) ftBilinear_SSE2_expand2()
{
asm
{
// word0 = u_8 of the first pixel, word1 = u_8' of the second pixel
lea eax,[edx+ebp]
MOVD XMM5,edx
MOVD XMM4,eax
PUNPCKLWD XMM5,XMM4
PSRLW XMM5,8
// first pixel: load the pixel pair at column srcx_16>>16 from both rows
mov eax,edx
shr eax,16 //srcx_16>>16
PUNPCKLWD XMM5,XMM5
MOVQ XMM2, qword ptr [ecx+eax*4]//XMM2=0 0 Color0 Color2
MOVQ XMM3, qword ptr [esi+eax*4]//XMM3=0 0 Color1 Color3
// second pixel: same loads at column (srcx_16+xrIntFloat_16)>>16
lea eax,[edx+ebp]
shr eax,16 //srcx_16>>16
PUNPCKLDQ XMM5,XMM5 //mm5=u_8' u_8' u_8' u_8' u_8 u_8 u_8 u_8
movq xmm4,qword ptr [ecx+eax*4]
PUNPCKLDQ XMM2,xmm4//XMM2=Color0' Color0 Color2' Color2
movq xmm4,qword ptr [esi+eax*4]
PUNPCKLDQ XMM3,xmm4//XMM3=Color1' Color1 Color3' Color3
// after the interleave, the low qword holds both left-hand pixels and the high
// qword both right-hand pixels; copy the right-hand ones down into XMM0/XMM1
MOVHLPS XMM0,XMM2 //XMM0= X X Color0' Color0
MOVHLPS XMM1,XMM3 //XMM1= X X Color1' Color1
// widen to 16-bit channels (two pixels per register)
PUNPCKLBW XMM0,XMM7
PUNPCKLBW XMM1,XMM7
PUNPCKLBW XMM2,XMM7
PUNPCKLBW XMM3,XMM7
// horizontal lerp per row: ((right-left)*u_8 + (left<<8)) >> 8
PSUBw XMM0,XMM2
PSUBw XMM1,XMM3
PSLLw XMM2,8
PSLLw XMM3,8
PMULlw XMM0,XMM5
PMULlw XMM1,XMM5
PADDw XMM0,XMM2
PADDw XMM1,XMM3
PSRLw XMM0,8
PSRLw XMM1,8
// vertical lerp with v_8 (xmm6): ((rowNext-row)*v_8 + (row<<8)) >> 8
PSUBw XMM0,XMM1
PSLLw XMM1,8
PMULlw XMM0,XMM6
PADDw XMM0,XMM1
PSRLw XMM0,8
// pack to bytes with unsigned saturation; the low 8 bytes are the two pixels
PACKUSwb XMM0,XMM7
//MOVQ qword ptr [edi+ebx*4], xmm0//write two DstColor
// non-temporal 8-byte store via an MMX register instead of the plain MOVQ above
MOVDQ2Q mm4,xmm0
movntq qword ptr [edi+ebx*4],mm4
ret
}
}
//ftBilinear_SSE2(out [edi+ebx*4];xmm6=v_8,xmm7=0,edx=srcx_16,esi=PSrcLineColor,ecx=PSrcLineColorNext,ebp=xrIntFloat_16)
// Naked helper with a register-based contract (see the line above): blends ONE
// destination pixel sampled at srcx_16 (edx) and writes it to [edi+ebx*4].
// Clobbers eax and xmm0-xmm3, xmm5. ebp is part of the shared contract but is not
// read here.
void __declspec(naked) ftBilinear_SSE2()
{
asm
{
// eax = bits 8..15 of srcx_16 = horizontal weight u_8
mov eax,edx
shl eax,16
shr eax,24
//== movzx eax,dh //eax=u_8
MOVD XMM5,eax
// eax = source column
mov eax,edx
shr eax,16 //srcx_16>>16
MOVD XMM0, dword ptr [ecx+eax*4+4]//XMM0=Color2
MOVD XMM2, dword ptr [ecx+eax*4] //XMM2=Color0
PUNPCKLWD XMM5,XMM5
MOVD XMM1, dword ptr [esi+eax*4+4]//XMM1=Color3
MOVD XMM3, dword ptr [esi+eax*4] //XMM3=Color1
// u_8 broadcast into the low four words (enough for one pixel)
PUNPCKLDQ XMM5,XMM5 //mm5=u_8
// widen to 16-bit channels
PUNPCKLBW XMM0,XMM7
PUNPCKLBW XMM1,XMM7
PUNPCKLBW XMM2,XMM7
PUNPCKLBW XMM3,XMM7
// horizontal lerp per row: ((right-left)*u_8 + (left<<8)) >> 8
PSUBw XMM0,XMM2
PSUBw XMM1,XMM3
PSLLw XMM2,8
PSLLw XMM3,8
PMULlw XMM0,XMM5
PMULlw XMM1,XMM5
PADDw XMM0,XMM2
PADDw XMM1,XMM3
PSRLw XMM0,8
PSRLw XMM1,8
// vertical lerp with v_8 (xmm6): ((rowNext-row)*v_8 + (row<<8)) >> 8
PSUBw XMM0,XMM1
PSLLw XMM1,8
PMULlw XMM0,XMM6
PADDw XMM0,XMM1
PSRLw XMM0,8
// pack to bytes with unsigned saturation and store the pixel
PACKUSwb XMM0,XMM7
MOVd dword ptr [edi+ebx*4],XMM0 //write DstColor
ret
}
}
7、SSE2指令集改写(4)+预计算缩放系数表
void PicZoom_ftBilinearTable_SSE2(const TPixels32Ref& Dst,const TPixels32Ref& Src)
{
if ( (0==Dst.width)||(0==Dst.height)
||(2>Src.width)||(2>Src.height)) return;
long xrIntFloat_16=((Src.width-1)<<16)/Dst.width;
long yrIntFloat_16=((Src.height-1)<<16)/Dst.height;
long dst_width=Dst.width;
UInt8* _bufMem=new UInt8[(dst_width*2*sizeof(TMMXData64)+15)+dst_width*sizeof(Int32)];
TMMXData64* uList=(TMMXData64*)((((ptrdiff_t)_bufMem)+15)>>4<<4); //16byte对齐
Int32* xList=(Int32*)(uList+dst_width*2);
{//init u table
long srcx_16=0;
for (long x=0;x>1]=(srcx_16>>16);
unsigned long u=(srcx_16>>8)&0xFF;
unsigned long ur=(256-u)<<1;
u=u<<1;
uList[x+0]=(ur|(ur<<16));
uList[x+0]|=uList[x+0]<<32;
uList[x+1]=u|(u<<16);
uList[x+1]|=uList[x+1]<<32;
srcx_16+=xrIntFloat_16;
}
}
Color32* pDstLine=Dst.pdata;
long srcy_16=0;
asm pxor xmm7,xmm7 //xmm7=0
for (long y=0;y>8) & 0xFF;
unsigned long vr=(256-v)>>1;
v>>=1;
Color32* PSrcLineColor= (Color32*)((UInt8*)(Src.pdata)+Src.byte_width*(srcy_16>>16)) ;
Color32* PSrcLineColorNext= (Color32*)((UInt8*)(PSrcLineColor)+Src.byte_width) ;
asm{
movd xmm5,vr
movd xmm6,v
punpcklwd xmm5,xmm5
punpcklwd xmm6,xmm6
punpckldq xmm5,xmm5
punpckldq xmm6,xmm6
punpcklqdq xmm5,xmm5
punpcklqdq xmm6,xmm6
mov esi,PSrcLineColor
mov ecx,PSrcLineColorNext
mov edx,xList //x
mov ebx,dst_width
mov edi,pDstLine
push ebp
mov ebp,uList
push ebx
and ebx,(not 1)
test ebx,ebx
jle end_loop2
lea ebx,[ebx*4]
lea edi,[edi+ebx]
lea edx,[edx+ebx]
lea ebp,[ebp+ebx*4]
neg ebx
loop2_start:
//call ftBilinearTable_SSE2_expand2
ftBilinearTable_SSE2_expand2()
add ebx,8
jnz loop2_start
end_loop2:
pop ebx
and ebx,1
test ebx,ebx
jle end_write
lea ebx,[ebx*4]
lea edi,[edi+ebx]
lea edx,[edx+ebx]
lea ebp,[ebp+ebx*4]
neg ebx
loop1_start:
//call ftBilinearTable_SSE2
ftBilinearTable_SSE2()
add ebx,4
jnz loop1_start
end_write:
pop ebp
}
srcy_16+=yrIntFloat_16;
((UInt8*&)pDstLine)+=Dst.byte_width;
}
delete []_bufMem;
}
//ftBilinearTable_SSE2(out [edi+ebx]; xmm5=vr,xmm6=v,xmm7=0,[ebp+ebx*4]=(ur,u),[edx+ebx]=src_x,esi=PSrcLineColor,ecx=PSrcLineColorNext)
// Blends ONE destination pixel using the precomputed tables (ebx is a byte offset):
//  1. eax = source column from the table at [edx+ebx];
//  2. load the pixel pair at that column from the current row (esi) and the next
//     row (ecx) and widen to 16-bit channels;
//  3. vertical blend: current*xmm5 + next*xmm6 (in PicZoom_ftBilinearTable_SSE2,
//     xmm5 is loaded with vr and xmm6 with v);
//  4. horizontal weights: pmulhw with the 16-byte entry at [ebp+ebx*4], which
//     scales the left pixel (low four words) by ur and the right pixel (high four
//     words) by u, keeping the high 16 bits of each product;
//  5. add the high half to the low half, pack to bytes and store 4 bytes at [edi+ebx].
// Clobbers eax, xmm0 and xmm1.
// No comments are placed inside the macro body because every line ends in a line
// continuation.
//void __declspec(naked) ftBilinearTable_SSE2(){
#define ftBilinearTable_SSE2() \
asm mov eax,[edx+ebx] \
asm movq xmm0,qword ptr[esi+eax*4] \
asm movq xmm1,qword ptr[ecx+eax*4] \
asm punpcklbw xmm0,xmm7 \
asm punpcklbw xmm1,xmm7 \
asm pmullw xmm0,xmm5 \
asm pmullw xmm1,xmm6 \
asm paddw xmm0,xmm1 \
asm pmulhw xmm0,xmmword ptr [ebp+ebx*4] \
asm movdqa xmm1,xmm0 \
asm punpckhqdq xmm0,xmm0 \
asm paddw xmm0,xmm1 \
asm packuswb xmm0,xmm7 \
asm movd dword ptr [edi+ebx],xmm0
//ret //for __declspec(naked)
//}
//}
// ftBilinearTable_SSE2_expand2: same register contract as ftBilinearTable_SSE2, but
// blends TWO consecutive destination pixels per expansion:
//  - source columns come from [edx+ebx] and [edx+ebx+4];
//  - horizontal weights come from the 16-byte entries at [ebp+ebx*4] and
//    [ebp+ebx*4+16];
//  - after the vertical blend (xmm5/xmm6) and pmulhw, the low halves of both
//    results are gathered into xmm0 and the high halves into xmm1, added, packed
//    to bytes, and stored as 8 bytes at [edi+ebx].
// Clobbers eax and xmm0-xmm3.
// NOTE(review): the last macro line ends in a line continuation, so the "//ret"
// comment line that follows is spliced into the macro definition; this is harmless
// only as long as that following line stays a comment.
//void __declspec(naked) ftBilinearTable_SSE2_expand2(){
#define ftBilinearTable_SSE2_expand2() \
asm mov eax,[edx+ebx] \
asm movq xmm0,qword ptr[esi+eax*4] \
asm movq xmm1,qword ptr[ecx+eax*4] \
asm mov eax,[edx+ebx+4] \
asm movq xmm2,qword ptr[esi+eax*4] \
asm movq xmm3,qword ptr[ecx+eax*4] \
asm punpcklbw xmm0,xmm7 \
asm punpcklbw xmm1,xmm7 \
asm punpcklbw xmm2,xmm7 \
asm punpcklbw xmm3,xmm7 \
asm pmullw xmm0,xmm5 \
asm pmullw xmm1,xmm6 \
asm pmullw xmm2,xmm5 \
asm pmullw xmm3,xmm6 \
asm paddw xmm0,xmm1 \
asm paddw xmm2,xmm3 \
asm pmulhw xmm0,xmmword ptr [ebp+ebx*4] \
asm pmulhw xmm2,xmmword ptr [ebp+ebx*4+16] \
asm movdqa xmm1,xmm0 \
asm punpcklqdq xmm0,xmm2 \
asm punpckhqdq xmm1,xmm2 \
asm paddw xmm0,xmm1 \
asm packuswb xmm0,xmm7 \
asm movq qword ptr [edi+ebx],xmm0 \
//ret //for __declspec(naked)
//}
//}