http://blog.csdn.net/housisong/
[email protected] 2007.06.26
tag:图像旋转,任意角度,图像缩放,速度优化,定点数优化,近邻取样插值,二次线性插值,
三次卷积插值,MipMap链,三次线性插值,MMX/SSE优化,CPU缓存优化,AlphaBlend,颜色混合
摘要:首先给出一个基本的图像旋转算法,然后一步一步的优化其速度和旋转质量,打破不能软件旋转的神话!
任意角度的高质量的快速的图像旋转 全文 分为:
上篇 纯软件的任意角度的快速旋转
下篇 高质量的旋转
正文:
为了便于讨论,这里只处理32bit的ARGB颜色;
代码使用C++;涉及到汇编优化的时候假定为x86平台;使用的编译器为vc2005;
为了代码的可读性,没有加入异常处理代码;
测试使用的CPU为AMD64x2 4200+,测试时使用的单线程执行;
(一些基础代码和插值原理的详细说明参见作者的《图形图像处理-之-高质量的快速的图像缩放》系列文章
旋转原理和基础参考《图形图像处理-之-任意角度的高质量的快速的图像旋转 上篇 纯软件的任意角度的快速旋转》)
速度测试说明:
只测试内存数据到内存数据的缩放
测试图片都是800*600旋转到1004*1004,测试成绩取各个旋转角度的平均速度值; fps表示每秒钟的帧数,值越大表示函数越快
A:一些颜色和图片的数据定义:
#define
asm __asm
typedef unsigned
char
TUInt8;
//
[0..255]
struct
TARGB32
//
32 bit color
{
TUInt8 b,g,r,a;
//
a is alpha
};
struct
TPicRegion
//
一块颜色数据区的描述,便于参数传递
{
TARGB32
*
pdata;
//
颜色数据首地址
long
byte_width;
//
一行数据的物理宽度(字节宽度);
//
abs(byte_width)有可能大于等于width*sizeof(TARGB32);
long
width;
//
像素宽度
long
height;
//
像素高度
};
inline TARGB32
&
Pixels(TARGB32
*
pcolor,
const
long
byte_width,
const
long
x,
const
long
y)
{
return
( (TARGB32
*
)((TUInt8
*
)pcolor
+
byte_width
*
y) )[x];
}
//
那么访问一个点的函数可以写为:
inline TARGB32
&
Pixels(
const
TPicRegion
&
pic,
const
long
x,
const
long
y)
{
return
Pixels(pic.pdata,pic.byte_width,x,y);
}
//
判断一个点是否在图片中
inline
bool
PixelsIsInPic(
const
TPicRegion
&
pic,
const
long
x,
const
long
y)
{
return
( (x
>=
0
)
&&
(x
<
pic.width)
&&
(y
>=
0
)
&&
(y
<
pic.height) );
}
//
访问一个点的函数,(x,y)坐标可能超出图片边界;
//
边界处理模式:边界饱和
inline TARGB32
&
Pixels_Bound(
const
TPicRegion
&
pic,
long
x,
long
y)
{
//
assert((pic.width>0)&&(pic.height>0));
if
(x
<
0
) x
=
0
;
else
if
(x
>=
pic.width ) x
=
pic.width
-
1
;
if
(y
<
0
) y
=
0
;
else
if
(y
>=
pic.height) y
=
pic.height
-
1
;
return
Pixels(pic,x,y);
}
inline TARGB32
&
Pixels_Bound(
const
TPicRegion
&
pic,
long
x,
long
y,
bool
&
IsInPic)
{
//
assert((pic.width>0)&&(pic.height>0));
IsInPic
=
true
;
if
(x
<
0
) {x
=
0
; IsInPic
=
false
; }
else
if
(x
>=
pic.width ) {x
=
pic.width
-
1
; IsInPic
=
false
; }
if
(y
<
0
) {y
=
0
; IsInPic
=
false
; }
else
if
(y
>=
pic.height) {y
=
pic.height
-
1
; IsInPic
=
false
; }
return
Pixels(pic,x,y);
}
B:实现二次线性插值的旋转
(插值原理参见我的blog文章《图形图像处理-之-高质量的快速的图像缩放 中篇 二次线性插值和三次卷积插值》)
a.首先改写用于边界扫描的类TRotaryClipData;在图片边缘插值的时候,插值的颜色数据可能
部分在图片外,部分颜色数据在图片内,所以TRotaryClipData需要同时找出“插值边界以外”、
“插值边界”、“插值边界以内”
扫描线图示: 外 | 边界 | 内 | 边界 | 外
struct
TRotaryClipData{
public
:
long
src_width;
long
src_height;
long
dst_width;
long
dst_height;
long
Ax_16;
long
Ay_16;
long
Bx_16;
long
By_16;
long
Cx_16;
long
Cy_16;
long
border_width;
//
插值边界宽度
private
:
long
cur_dst_up_x0;
long
cur_dst_up_x1;
long
cur_dst_down_x0;
long
cur_dst_down_x1;
inline
bool
is_border_src(
long
src_x_16,
long
src_y_16)
{
return
( ( (src_x_16
>=
(
-
(border_width
<<
16
)))
&&
((src_x_16
>>
16
)
<
(src_width
+
border_width)) )
&&
( (src_y_16
>=
(
-
(border_width
<<
16
)))
&&
((src_y_16
>>
16
)
<
(src_height
+
border_width)) ) );
}
inline
bool
is_in_src(
long
src_x_16,
long
src_y_16)
{
return
( ( (src_x_16
>=
(border_width
<<
16
))
&&
((src_x_16
>>
16
)
<
(src_width
-
border_width) ) )
&&
( (src_y_16
>=
(border_width
<<
16
))
&&
((src_y_16
>>
16
)
<
(src_height
-
border_width)) ) );
}
void
find_begin_in(
long
dst_y,
long
&
out_dst_x,
long
&
src_x_16,
long
&
src_y_16)
{
src_x_16
-=
Ax_16;
src_y_16
-=
Ay_16;
while
(is_border_src(src_x_16,src_y_16))
{
--
out_dst_x;
src_x_16
-=
Ax_16;
src_y_16
-=
Ay_16;
}
src_x_16
+=
Ax_16;
src_y_16
+=
Ay_16;
}
bool
find_begin(
long
dst_y,
long
&
out_dst_x0,
long
dst_x1)
{
long
test_dst_x0
=
out_dst_x0
-
1
;
long
src_x_16
=
Ax_16
*
test_dst_x0
+
Bx_16
*
dst_y
+
Cx_16;
long
src_y_16
=
Ay_16
*
test_dst_x0
+
By_16
*
dst_y
+
Cy_16;
for
(
long
i
=
test_dst_x0;i
<=
dst_x1;
++
i)
{
if
(is_border_src(src_x_16,src_y_16))
{
out_dst_x0
=
i;
if
(i
==
test_dst_x0)
find_begin_in(dst_y,out_dst_x0,src_x_16,src_y_16);
if
(out_dst_x0
<
0
)
{
src_x_16
-=
(Ax_16
*
out_dst_x0);
src_y_16
-=
(Ay_16
*
out_dst_x0);
}
out_src_x0_16
=
src_x_16;
out_src_y0_16
=
src_y_16;
return
true
;
}
else
{
src_x_16
+=
Ax_16;
src_y_16
+=
Ay_16;
}
}
return
false
;
}
void
find_end(
long
dst_y,
long
dst_x0,
long
&
out_dst_x1)
{
long
test_dst_x1
=
out_dst_x1;
if
(test_dst_x1
<
dst_x0) test_dst_x1
=
dst_x0;
long
src_x_16
=
Ax_16
*
test_dst_x1
+
Bx_16
*
dst_y
+
Cx_16;
long
src_y_16
=
Ay_16
*
test_dst_x1
+
By_16
*
dst_y
+
Cy_16;
if
(is_border_src(src_x_16,src_y_16))
{
++
test_dst_x1;
src_x_16
+=
Ax_16;
src_y_16
+=
Ay_16;
while
(is_border_src(src_x_16,src_y_16))
{
++
test_dst_x1;
src_x_16
+=
Ax_16;
src_y_16
+=
Ay_16;
}
out_dst_x1
=
test_dst_x1;
}
else
{
src_x_16
-=
Ax_16;
src_y_16
-=
Ay_16;
while
(
!
is_border_src(src_x_16,src_y_16))
{
--
test_dst_x1;
src_x_16
-=
Ax_16;
src_y_16
-=
Ay_16;
}
out_dst_x1
=
test_dst_x1;
}
}
inline
void
update_out_dst_x_in()
{
if
((
0
==
border_width)
||
(out_dst_x0_boder
>=
out_dst_x1_boder) )
{
out_dst_x0_in
=
out_dst_x0_boder;
out_dst_x1_in
=
out_dst_x1_boder;
}
else
{
long
src_x_16
=
out_src_x0_16;
long
src_y_16
=
out_src_y0_16;
long
i
=
out_dst_x0_boder;
while
(i
<
out_dst_x1_boder)
{
if
(is_in_src(src_x_16,src_y_16))
break
;
src_x_16
+=
Ax_16;
src_y_16
+=
Ay_16;
++
i;
}
out_dst_x0_in
=
i;
src_x_16
=
out_src_x0_16
+
(out_dst_x1_boder
-
out_dst_x0_boder)
*
Ax_16;
src_y_16
=
out_src_y0_16
+
(out_dst_x1_boder
-
out_dst_x0_boder)
*
Ay_16;
i
=
out_dst_x1_boder;
while
(i
>
out_dst_x0_in)
{
src_x_16
-=
Ax_16;
src_y_16
-=
Ay_16;
if
(is_in_src(src_x_16,src_y_16))
break
;
--
i;
}
out_dst_x1_in
=
i;
}
}
inline
void
update_out_dst_up_x()
{
if
(cur_dst_up_x0
<
0
)
out_dst_x0_boder
=
0
;
else
out_dst_x0_boder
=
cur_dst_up_x0;
if
(cur_dst_up_x1
>=
dst_width)
out_dst_x1_boder
=
dst_width;
else
out_dst_x1_boder
=
cur_dst_up_x1;
update_out_dst_x_in();
}
inline
void
update_out_dst_down_x()
{
if
(cur_dst_down_x0
<
0
)
out_dst_x0_boder
=
0
;
else
out_dst_x0_boder
=
cur_dst_down_x0;
if
(cur_dst_down_x1
>=
dst_width)
out_dst_x1_boder
=
dst_width;
else
out_dst_x1_boder
=
cur_dst_down_x1;
update_out_dst_x_in();
}
public
:
long
out_src_x0_16;
long
out_src_y0_16;
long
out_dst_up_y;
long
out_dst_down_y;
long
out_dst_x0_boder;
long
out_dst_x0_in;
long
out_dst_x1_in;
long
out_dst_x1_boder;
public
:
bool
inti_clip(
double
move_x,
double
move_y,unsigned
long
aborder_width)
{
border_width
=
aborder_width;
//
计算src中心点映射到dst后的坐标
out_dst_down_y
=
(
long
)(src_height
*
0.5
+
move_y);
cur_dst_down_x0
=
(
long
)(src_width
*
0.5
+
move_x);
cur_dst_down_x1
=
cur_dst_down_x0;
//
得到初始扫描线
if
(find_begin(out_dst_down_y,cur_dst_down_x0,cur_dst_down_x1))
find_end(out_dst_down_y,cur_dst_down_x0,cur_dst_down_x1);
out_dst_up_y
=
out_dst_down_y;
cur_dst_up_x0
=
cur_dst_down_x0;
cur_dst_up_x1
=
cur_dst_down_x1;
update_out_dst_up_x();
return
(cur_dst_down_x0
<
cur_dst_down_x1);
}
bool
next_clip_line_down()
{
++
out_dst_down_y;
if
(
!
find_begin(out_dst_down_y,cur_dst_down_x0,cur_dst_down_x1))
return
false
;
find_end(out_dst_down_y,cur_dst_down_x0,cur_dst_down_x1);
update_out_dst_down_x();
return
(cur_dst_down_x0
<
cur_dst_down_x1);
}
bool
next_clip_line_up()
{
--
out_dst_up_y;
if
(
!
find_begin(out_dst_up_y,cur_dst_up_x0,cur_dst_up_x1))
return
false
;
find_end(out_dst_up_y,cur_dst_up_x0,cur_dst_up_x1);
update_out_dst_up_x();
return
(cur_dst_up_x0
<
cur_dst_up_x1);
}
};
b. 边界插值的特殊处理
对于“插值边界以外”很简单,不用处理直接跳过插值;
对于“插值边界以内”,也比较容易处理,直接调用快速的差值算法就可以了,不用担心内存访问问题;
插值实现:
(从《图形图像处理-之-高质量的快速的图像缩放 中篇 二次线性插值和三次卷积插值》文章来的,后面不再说明)
inline
void
BilInear_Fast(
const
TPicRegion
&
pic,
const
long
x_16,
const
long
y_16,TARGB32
*
result)
{
TARGB32
*
PColor0
=&
Pixels(pic,x_16
>>
16
,y_16
>>
16
);
TARGB32
*
PColor1
=
(TARGB32
*
)((TUInt8
*
)PColor0
+
pic.byte_width);
unsigned
long
u_8
=
(unsigned
char
)(x_16
>>
8
);
unsigned
long
v_8
=
(unsigned
char
)(y_16
>>
8
);
unsigned
long
pm3_8
=
(u_8
*
v_8)
>>
8
;
unsigned
long
pm2_8
=
u_8
-
pm3_8;
unsigned
long
pm1_8
=
v_8
-
pm3_8;
unsigned
long
pm0_8
=
256
-
pm1_8
-
pm2_8
-
pm3_8;
unsigned
long
Color
=*
(unsigned
long
*
)(PColor0);
unsigned
long
BR
=
(Color
&
0x00FF00FF
)
*
pm0_8;
unsigned
long
GA
=
((Color
&
0xFF00FF00
)
>>
8
)
*
pm0_8;
Color
=
((unsigned
long
*
)(PColor0))[
1
];
GA
+=
((Color
&
0xFF00FF00
)
>>
8
)
*
pm2_8;
BR
+=
(Color
&
0x00FF00FF
)
*
pm2_8;
Color
=*
(unsigned
long
*
)(PColor1);
GA
+=
((Color
&
0xFF00FF00
)
>>
8
)
*
pm1_8;
BR
+=
(Color
&
0x00FF00FF
)
*
pm1_8;
Color
=
((unsigned
long
*
)(PColor1))[
1
];
GA
+=
((Color
&
0xFF00FF00
)
>>
8
)
*
pm3_8;
BR
+=
(Color
&
0x00FF00FF
)
*
pm3_8;
*
(unsigned
long
*
)result
=
(GA
&
0xFF00FF00
)
|
((BR
&
0xFF00FF00
)
>>
8
);
}
对于“插值边界”,就需要特殊处理了,很多插值旋转的实现可能都在这里打了折扣;要想完美的解决
这块区域,可以引入AlphaBlend(带Alpha通道的颜色混合) ;
其实AlphaBlend的原理也很简单,就是按不同的比例混合两种颜色:
new_color=dst_color*(1-alpha)+src_color*alpha;
对于ARGB32bit颜色,需要用该公式分别处理4个颜色通道,并假设Alpha为[0..255]的整数,那么完整的实现函数为:
inline TARGB32 AlphaBlend(TARGB32 dst,TARGB32 src)
{
//
AlphaBlend颜色混合公式(对其中的每个颜色分量分别处理):
//
new_color=(dst_color*(255-src_color.alpha)+src_color*src_color.alpha)/255
//
提示:除法指令是很慢的操作,但vc2005可以把x/255编译为完全等价的"(x*M)>>N"类似的快速计算代码;
unsigned
long
a
=
src.a;
//
if (a==0) return dst;
//
else if (a==255) return src;
unsigned
long
ra
=
255
-
a;
unsigned
long
result_b
=
(dst.b
*
ra
+
src.b
*
a)
/
255
;
unsigned
long
result_g
=
(dst.g
*
ra
+
src.g
*
a)
/
255
;
unsigned
long
result_r
=
(dst.r
*
ra
+
src.r
*
a)
/
255
;
unsigned
long
result_a
=
(dst.a
*
ra
+
a
*
a)
/
255
;
unsigned
long
result
=
(result_b)
|
(result_g
<<
8
)
|
(result_r
<<
16
)
|
(result_a
<<
24
);
return
*
(TARGB32
*
)
&
result;
}
优化AlphaBlend,颜色处理中,也可以这样近似计算: x/255 => x>>8 ,所以有:
inline TARGB32 AlphaBlend(TARGB32 dst,TARGB32 src)
{
unsigned
long
a
=
src.a;
unsigned
long
ra
=
255
-
a;
unsigned
long
result_b
=
(dst.b
*
ra
+
src.b
*
a)
>>
8
;
unsigned
long
result_g
=
(dst.g
*
ra
+
src.g
*
a)
>>
8
;
unsigned
long
result_r
=
(dst.r
*
ra
+
src.r
*
a)
>>
8
;
unsigned
long
result_a
=
(dst.a
*
ra
+
a
*
a)
>>
8
;
unsigned
long
result
=
(result_b)
|
(result_g
<<
8
)
|
(result_r
<<
16
)
|
(result_a
<<
24
);
return
*
(TARGB32
*
)
&
result;
}
(dst*(255-alpha)+src*alpha)>>8 可以近似为:(dst*(256-alpha)+src*alpha)>>8
即 (dst<<8+(src-dst)*alpha)>>8 从而用一个移位代替一个乘法 (*256会被优化为移位)
inline TARGB32 AlphaBlend(TARGB32 dst,TARGB32 src)
{
long
a
=
src.a;
unsigned
long
result_b
=
((unsigned
long
)(((
long
)dst.b)
*
256
+
((
long
)src.b
-
(
long
)dst.b)
*
a))
>>
8
;
unsigned
long
result_g
=
((unsigned
long
)(((
long
)dst.g)
*
256
+
((
long
)src.g
-
(
long
)dst.g)
*
a))
>>
8
;
unsigned
long
result_r
=
((unsigned
long
)(((
long
)dst.r)
*
256
+
((
long
)src.r
-
(
long
)dst.r)
*
a))
>>
8
;
unsigned
long
result_a
=
((unsigned
long
)(((
long
)dst.a)
*
256
+
((
long
)a
-
(
long
)dst.a)
*
a))
>>
8
;
unsigned
long
result
=
(result_b)
|
(result_g
<<
8
)
|
(result_r
<<
16
)
|
(result_a
<<
24
);
return
*
(TARGB32
*
)
&
result;
}
继续优化,同时运算两路颜色分量的AlphaBlend实现:
inline TARGB32 AlphaBlend(TARGB32 dst,TARGB32 src)
{
unsigned
long
Src_ARGB
=*
(unsigned
long
*
)
&
src;
unsigned
long
Dst_ARGB
=*
(unsigned
long
*
)
&
dst;
unsigned
long
a
=
Src_ARGB
>>
24
;
unsigned
long
ra
=
255
-
a;
unsigned
long
result_RB
=
((Dst_ARGB
&
0x00FF00FF
)
*
ra
+
(Src_ARGB
&
0x00FF00FF
)
*
a);
unsigned
long
result_AG
=
(((Dst_ARGB
&
0xFF00FF00
)
>>
8
)
*
ra
+
((Src_ARGB
&
0xFF00FF00
)
>>
8
)
*
a);
unsigned
long
result
=
((result_RB
&
0xFF00FF00
)
>>
8
)
|
(result_AG
&
0xFF00FF00
);
return
*
(TARGB32
*
)
&
result;
}
回到我们的主线:完美解决“旋转插值边界”
怎么利用AlphaBlend呢? 我们可以在处理边界的时候,对于颜色数据在图片内的颜色,把Alpha通道分量设置为255,
在图片外的颜色数据(使用Pixels_Bound会返回最接近的一个内部颜色)的Alpha通道分量设置为0;
这个任务就交给边界插值函数了:
inline
void
BilInear_Border(
const
TPicRegion
&
pic,
const
long
x_16,
const
long
y_16,TARGB32
*
result)
{
unsigned
long
x0
=
(x_16
>>
16
);
unsigned
long
y0
=
(y_16
>>
16
);
TARGB32 pixel[
4
];
bool
IsInPic;
pixel[
0
]
=
Pixels_Bound(pic,x0,y0,IsInPic);
if
(
!
IsInPic) pixel[
0
].a
=
0
;
else
pixel[
0
].a
=
255
;
pixel[
2
]
=
Pixels_Bound(pic,x0,y0
+
1
,IsInPic);
if
(
!
IsInPic) pixel[
2
].a
=
0
;
else
pixel[
2
].a
=
255
;
pixel[
1
]
=
Pixels_Bound(pic,x0
+
1
,y0,IsInPic);
if
(
!
IsInPic) pixel[
1
].a
=
0
;
else
pixel[
1
].a
=
255
;
pixel[
3
]
=
Pixels_Bound(pic,x0
+
1
,y0
+
1
,IsInPic);
if
(
!
IsInPic) pixel[
3
].a
=
0
;
else
pixel[
3
].a
=
255
;
TPicRegion npic;
npic.pdata
=&
pixel[
0
];
npic.byte_width
=
2
*
sizeof
(TARGB32);
//
npic.width =2;
//
npic.height =2;
BilInear_Fast(npic,(unsigned
short
)x_16,(unsigned
short
)y_16,result);
}
返回的颜色中的Alpha的值就代表了颜色的有效强度(一般介于0..255之间);
那么,对边界上的插值就可以用类似这样的代码处理好了:
TARGB32 tmp_color;
BilInear_Border(SrcPic,srcx0_16,srcy0_16,&tmp_color);
pDstLine[x]=AlphaBlend(pDstLine[x],tmp_color);
c. OK,给出完整的函数:
void
PicRotary_BilInear_CopyLine(TARGB32
*
pDstLine,
long
dst_border_x0,
long
dst_in_x0,
long
dst_in_x1,
long
dst_border_x1,
const
TPicRegion
&
SrcPic,
long
srcx0_16,
long
srcy0_16,
long
Ax_16,
long
Ay_16)
{
long
x;
for
(x
=
dst_border_x0;x
<
dst_in_x0;
++
x)
{
TARGB32 src_color;
BilInear_Border(SrcPic,srcx0_16,srcy0_16,
&
src_color);
pDstLine[x]
=
AlphaBlend(pDstLine[x],src_color);
srcx0_16
+=
Ax_16;
srcy0_16
+=
Ay_16;
}
for
(x
=
dst_in_x0;x
<
dst_in_x1;
++
x)
{
BilInear_Fast(SrcPic,srcx0_16,srcy0_16,
&
pDstLine[x]);
srcx0_16
+=
Ax_16;
srcy0_16
+=
Ay_16;
}
for
(x
=
dst_in_x1;x
<
dst_border_x1;
++
x)
{
TARGB32 src_color;
BilInear_Border(SrcPic,srcx0_16,srcy0_16,
&
src_color);
pDstLine[x]
=
AlphaBlend(pDstLine[x],src_color);
srcx0_16
+=
Ax_16;
srcy0_16
+=
Ay_16;
}
}
void
PicRotaryBilInear(
const
TPicRegion
&
Dst,
const
TPicRegion
&
Src,
double
RotaryAngle,
double
ZoomX,
double
ZoomY,
double
move_x,
double
move_y)
{
if
( (fabs(ZoomX
*
Src.width)
<
1.0e-4
)
||
(fabs(ZoomY
*
Src.height)
<
1.0e-4
) )
return
;
//
太小的缩放比例认为已经不可见
double
tmprZoomXY
=
1.0
/
(ZoomX
*
ZoomY);
double
rZoomX
=
tmprZoomXY
*
ZoomY;
double
rZoomY
=
tmprZoomXY
*
ZoomX;
double
sinA,cosA;
SinCos(RotaryAngle,sinA,cosA);
long
Ax_16
=
(
long
)(rZoomX
*
cosA
*
(
1
<<
16
));
long
Ay_16
=
(
long
)(rZoomX
*
sinA
*
(
1
<<
16
));
long
Bx_16
=
(
long
)(
-
rZoomY
*
sinA
*
(
1
<<
16
));
long
By_16
=
(
long
)(rZoomY
*
cosA
*
(
1
<<
16
));
double
rx0
=
Src.width
*
0.5
;
//
(rx0,ry0)为旋转中心
double
ry0
=
Src.height
*
0.5
;
long
Cx_16
=
(
long
)((
-
(rx0
+
move_x)
*
rZoomX
*
cosA
+
(ry0
+
move_y)
*
rZoomY
*
sinA
+
rx0)
*
(
1
<<
16
));
long
Cy_16
=
(
long
)((
-
(rx0
+
move_x)
*
rZoomX
*
sinA
-
(ry0
+
move_y)
*
rZoomY
*
cosA
+
ry0)
*
(
1
<<
16
));
TRotaryClipData rcData;
rcData.Ax_16
=
Ax_16;
rcData.Bx_16
=
Bx_16;
rcData.Cx_16
=
Cx_16;
rcData.Ay_16
=
Ay_16;
rcData.By_16
=
By_16;
rcData.Cy_16
=
Cy_16;
rcData.dst_width
=
Dst.width;
rcData.dst_height
=
Dst.height;
rcData.src_width
=
Src.width;
rcData.src_height
=
Src.height;
if
(
!
rcData.inti_clip(move_x,move_y,
1
))
return
;
TARGB32
*
pDstLine
=
Dst.pdata;
((TUInt8
*&
)pDstLine)
+=
(Dst.byte_width
*
rcData.out_dst_down_y);
while
(
true
)
//
to down
{
long
y
=
rcData.out_dst_down_y;
if
(y
>=
Dst.height)
break
;
if
(y
>=
0
)
{
PicRotary_BilInear_CopyLine(pDstLine,rcData.out_dst_x0_boder,rcData.out_dst_x0_in,
rcData.out_dst_x1_in,rcData.out_dst_x1_boder,Src,rcData.out_src_x0_16,rcData.out_src_y0_16,Ax_16,Ay_16);
}
if
(
!
rcData.next_clip_line_down())
break
;
((TUInt8
*&
)pDstLine)
+=
Dst.byte_width;
}
pDstLine
=
Dst.pdata;
((TUInt8
*&
)pDstLine)
+=
(Dst.byte_width
*
rcData.out_dst_up_y);
while
(rcData.next_clip_line_up())
//
to up
{
long
y
=
rcData.out_dst_up_y;
if
(y
<
0
)
break
;
((TUInt8
*&
)pDstLine)
-=
Dst.byte_width;
if
(y
<
Dst.height)
{
PicRotary_BilInear_CopyLine(pDstLine,rcData.out_dst_x0_boder,rcData.out_dst_x0_in,
rcData.out_dst_x1_in,rcData.out_dst_x1_boder,Src,rcData.out_src_x0_16,rcData.out_src_y0_16,Ax_16,Ay_16);
}
}
}
//注:测试图片都是800*600的图片旋转到1004*1004的图片中心 测试成绩取各个旋转角度的平均速度值
////////////////////////////////////////////////////////////////////////////////
//速度测试:
//==============================================================================
// PicRotaryBilInear 64.5 fps
////////////////////////////////////////////////////////////////////////////////
二次线性插值旋转结果图示:
30度 60度 90度
120度 150度 180度
![图形图像处理-之-任意角度的高质量的快速的图像旋转 下篇 高质量的旋转_第9张图片](http://img.e-com-net.com/image/info5/dd58f3462dcd42e891dfb8e856c30de0.jpg)
210度 240度 270度
300度 330度 360度
C:用MMX指令来改进二次线性插值的旋转
inline TARGB32 AlphaBlend_MMX(TARGB32 dst,TARGB32 src)
{
unsigned
long
result;
asm
{
PXOR MM7,MM7
MOVD MM0,src
MOVD MM2,dst
PUNPCKLBW MM0,MM7
PUNPCKLBW MM2,MM7
MOVQ MM1,MM0
PUNPCKHWD MM1,MM1
PSUBW MM0,MM2
PUNPCKHDQ MM1,MM1
PSLLW MM2,
8
PMULLW MM0,MM1
PADDW MM2,MM0
PSRLW MM2,
8
PACKUSWB MM2,MM7
MOVD result,MM2
}
return
*
(TARGB32
*
)
&
result;
}
void
__declspec(naked) __stdcall BilInear_Fast_MMX(
const
TPicRegion
&
pic,
const
long
x_16,
const
long
y_16,TARGB32
*
result)
{
asm
{
mov edx,[esp
+
12
]
//
y_16
mov eax,[esp
+
8
]
//
x_16
PXOR mm7,mm7
shl edx,
16
shl eax,
16
shr edx,
24
//
edx=v_8
shr eax,
24
//
eax=u_8
MOVD MM6,edx
MOVD MM5,eax
mov ecx,[esp
+
4
]
//
pic
mov edx,[esp
+
12
]
//
y_16
mov eax,[ecx]TPicRegion.byte_width
sar edx,
16
imul edx,eax
add edx,[ecx]TPicRegion.pdata
add eax,edx
mov ecx,[esp
+
8
]
//
x_16
sar ecx,
16
//
srcx_16>>16
MOVD MM2,dword ptr [eax
+
ecx
*
4
]
//
MM2=Color1
MOVD MM0,dword ptr [eax
+
ecx
*
4
+
4
]
//
MM0=Color3
PUNPCKLWD MM5,MM5
PUNPCKLWD MM6,MM6
MOVD MM3,dword ptr [edx
+
ecx
*
4
]
//
MM3=Color0
MOVD MM1,dword ptr [edx
+
ecx
*
4
+
4
]
//
MM1=Color2
PUNPCKLDQ MM5,MM5
//
mm5=u_8
PUNPCKLBW MM0,MM7
PUNPCKLBW MM1,MM7
PUNPCKLBW MM2,MM7
PUNPCKLBW MM3,MM7
PSUBw MM0,MM2
PSUBw MM1,MM3
PSLLw MM2,
8
PSLLw MM3,
8
PMULlw MM0,MM5
PMULlw MM1,MM5
PUNPCKLDQ MM6,MM6
//
mm6=v_8
PADDw MM0,MM2
PADDw MM1,MM3
PSRLw MM0,
8
PSRLw MM1,
8
PSUBw MM0,MM1
PSLLw MM1,
8
PMULlw MM0,MM6
mov eax,[esp
+
16
]
//
result
PADDw MM0,MM1
PSRLw MM0,
8
PACKUSwb MM0,MM7
movd [eax],MM0
//
emms
ret
16
}
}
void
BilInear_Border_MMX(
const
TPicRegion
&
pic,
const
long
x_16,
const
long
y_16,TARGB32
*
result)
{
unsigned
long
x0
=
(x_16
>>
16
);
unsigned
long
y0
=
(y_16
>>
16
);
TARGB32 pixel[
4
];
bool
IsInPic;
pixel[
0
]
=
Pixels_Bound(pic,x0,y0,IsInPic);
if
(
!
IsInPic) pixel[
0
].a
=
0
;
else
pixel[
0
].a
=
255
;
pixel[
2
]
=
Pixels_Bound(pic,x0,y0
+
1
,IsInPic);
if
(
!
IsInPic) pixel[
2
].a
=
0
;
else
pixel[
2
].a
=
255
;
pixel[
1
]
=
Pixels_Bound(pic,x0
+
1
,y0,IsInPic);
if
(
!
IsInPic) pixel[
1
].a
=
0
;
else
pixel[
1
].a
=
255
;
pixel[
3
]
=
Pixels_Bound(pic,x0
+
1
,y0
+
1
,IsInPic);
if
(
!
IsInPic) pixel[
3
].a
=
0
;
else
pixel[
3
].a
=
255
;
TPicRegion npic;
npic.pdata
=&
pixel[
0
];
npic.byte_width
=
2
*
sizeof
(TARGB32);
//
npic.width =2;
//
npic.height =2;
BilInear_Fast_MMX(npic,(unsigned
short
)x_16,(unsigned
short
)y_16,result);
}
void
PicRotary_BilInear_CopyLine_MMX(TARGB32
*
pDstLine,
long
dst_border_x0,
long
dst_in_x0,
long
dst_in_x1,
long
dst_border_x1,
const
TPicRegion
&
SrcPic,
long
srcx0_16,
long
srcy0_16,
long
Ax_16,
long
Ay_16)
{
long
x;
for
(x
=
dst_border_x0;x
<
dst_in_x0;
++
x)
{
TARGB32 src_color;
BilInear_Border_MMX(SrcPic,srcx0_16,srcy0_16,
&
src_color);
pDstLine[x]
=
AlphaBlend_MMX(pDstLine[x],src_color);
srcx0_16
+=
Ax_16;
srcy0_16
+=
Ay_16;
}
for
(x
=
dst_in_x0;x
<
dst_in_x1;
++
x)
{
BilInear_Fast_MMX(SrcPic,srcx0_16,srcy0_16,
&
pDstLine[x]);
srcx0_16
+=
Ax_16;
srcy0_16
+=
Ay_16;
}
for
(x
=
dst_in_x1;x
<
dst_border_x1;
++
x)
{
TARGB32 src_color;
BilInear_Border_MMX(SrcPic,srcx0_16,srcy0_16,
&
src_color);
pDstLine[x]
=
AlphaBlend_MMX(pDstLine[x],src_color);
srcx0_16
+=
Ax_16;
srcy0_16
+=
Ay_16;
}
asm emms
}
void
PicRotaryBilInear_MMX(
const
TPicRegion
&
Dst,
const
TPicRegion
&
Src,
double
RotaryAngle,
double
ZoomX,
double
ZoomY,
double
move_x,
double
move_y)
{
if
( (fabs(ZoomX
*
Src.width)
<
1.0e-4
)
||
(fabs(ZoomY
*
Src.height)
<
1.0e-4
) )
return
;
//
太小的缩放比例认为已经不可见
double
tmprZoomXY
=
1.0
/
(ZoomX
*
ZoomY);
double
rZoomX
=
tmprZoomXY
*
ZoomY;
double
rZoomY
=
tmprZoomXY
*
ZoomX;
double
sinA,cosA;
SinCos(RotaryAngle,sinA,cosA);
long
Ax_16
=
(
long
)(rZoomX
*
cosA
*
(
1
<<
16
));
long
Ay_16
=
(
long
)(rZoomX
*
sinA
*
(
1
<<
16
));
long
Bx_16
=
(
long
)(
-
rZoomY
*
sinA
*
(
1
<<
16
));
long
By_16
=
(
long
)(rZoomY
*
cosA
*
(
1
<<
16
));
double
rx0
=
Src.width
*
0.5
;
//
(rx0,ry0)为旋转中心
double
ry0
=
Src.height
*
0.5
;
long
Cx_16
=
(
long
)((
-
(rx0
+
move_x)
*
rZoomX
*
cosA
+
(ry0
+
move_y)
*
rZoomY
*
sinA
+
rx0)
*
(
1
<<
16
));
long
Cy_16
=
(
long
)((
-
(rx0
+
move_x)
*
rZoomX
*
sinA
-
(ry0
+
move_y)
*
rZoomY
*
cosA
+
ry0)
*
(
1
<<
16
));
TRotaryClipData rcData;
rcData.Ax_16
=
Ax_16;
rcData.Bx_16
=
Bx_16;
rcData.Cx_16
=
Cx_16;
rcData.Ay_16
=
Ay_16;
rcData.By_16
=
By_16;
rcData.Cy_16
=
Cy_16;
rcData.dst_width
=
Dst.width;
rcData.dst_height
=
Dst.height;
rcData.src_width
=
Src.width;
rcData.src_height
=
Src.height;
if
(
!
rcData.inti_clip(move_x,move_y,
1
))
return
;
TARGB32
*
pDstLine
=
Dst.pdata;
((TUInt8
*&
)pDstLine)
+=
(Dst.byte_width
*
rcData.out_dst_down_y);
while
(
true
)
//
to down
{
long
y
=
rcData.out_dst_down_y;
if
(y
>=
Dst.height)
break
;
if
(y
>=
0
)
{
PicRotary_BilInear_CopyLine_MMX(pDstLine,rcData.out_dst_x0_boder,rcData.out_dst_x0_in,
rcData.out_dst_x1_in,rcData.out_dst_x1_boder,Src,rcData.out_src_x0_16,rcData.out_src_y0_16,Ax_16,Ay_16);
}
if
(
!
rcData.next_clip_line_down())
break
;
((TUInt8
*&
)pDstLine)
+=
Dst.byte_width;
}
pDstLine
=
Dst.pdata;
((TUInt8
*&
)pDstLine)
+=
(Dst.byte_width
*
rcData.out_dst_up_y);
while
(rcData.next_clip_line_up())
//
to up
{
long
y
=
rcData.out_dst_up_y;
if
(y
<
0
)
break
;
((TUInt8
*&
)pDstLine)
-=
Dst.byte_width;
if
(y
<
Dst.height)
{
PicRotary_BilInear_CopyLine_MMX(pDstLine,rcData.out_dst_x0_boder,rcData.out_dst_x0_in,
rcData.out_dst_x1_in,rcData.out_dst_x1_boder,Src,rcData.out_src_x0_16,rcData.out_src_y0_16,Ax_16,Ay_16);
}
}
}
//注:测试图片都是800*600的图片旋转到1004*1004的图片中心 测试成绩取各个旋转角度的平均速度值
////////////////////////////////////////////////////////////////////////////////
//速度测试:
//==============================================================================
// PicRotaryBilInear_MMX 93.8 fps
////////////////////////////////////////////////////////////////////////////////
D:三次卷积插值的旋转
(实现就比较简单了,几乎就是拷贝代码,然后稍微改写几个单词:)
(很多代码从《图形图像处理-之-高质量的快速的图像缩放 中篇 二次线性插值和三次卷积插值》文章来的)
inline
double
SinXDivX(
double
x)
{
//
该函数计算插值曲线sin(x*PI)/(x*PI)的值
//
PI=3.1415926535897932385;
//
下面是它的近似拟合表达式
const
float
a
=
-
1
;
//
a还可以取 a=-2,-1,-0.75,-0.5等等,起到调节锐化或模糊程度的作用
if
(x
<
0
) x
=-
x;
//
x=abs(x);
double
x2
=
x
*
x;
double
x3
=
x2
*
x;
if
(x
<=
1
)
return
(a
+
2
)
*
x3
-
(a
+
3
)
*
x2
+
1
;
else
if
(x
<=
2
)
return
a
*
x3
-
(
5
*
a)
*
x2
+
(
8
*
a)
*
x
-
(
4
*
a);
else
return
0
;
}
inline TUInt8 ColorBound(
long
Color)
{
if
(Color
<=
0
)
return
0
;
else
if
(Color
>=
255
)
return
255
;
else
return
Color;
}
static
long
SinXDivX_Table_8[(
2
<<
8
)
+
1
];
class
_CAutoInti_SinXDivX_Table {
private
:
void
_Inti_SinXDivX_Table()
{
for
(
long
i
=
0
;i
<=
(
2
<<
8
);
++
i)
SinXDivX_Table_8[i]
=
long
(
0.5
+
256
*
SinXDivX(i
*
(
1.0
/
(
256
))));
};
public
:
_CAutoInti_SinXDivX_Table() { _Inti_SinXDivX_Table(); }
};
static
_CAutoInti_SinXDivX_Table __tmp_CAutoInti_SinXDivX_Table;
void
ThreeOrder_Fast(
const
TPicRegion
&
pic,
const
long
x_16,
const
long
y_16,TARGB32
*
result)
{
long
u_8
=
(unsigned
char
)((x_16)
>>
8
);
long
v_8
=
(unsigned
char
)((y_16)
>>
8
);
const
TARGB32
*
pixel
=&
Pixels(pic,(x_16
>>
16
)
-
1
,(y_16
>>
16
)
-
1
);
long
pic_byte_width
=
pic.byte_width;
long
au_8[
4
],av_8[
4
];
//
au_8[
0
]
=
SinXDivX_Table_8[(
1
<<
8
)
+
u_8];
au_8[
1
]
=
SinXDivX_Table_8[u_8];
au_8[
2
]
=
SinXDivX_Table_8[(
1
<<
8
)
-
u_8];
au_8[
3
]
=
SinXDivX_Table_8[(
2
<<
8
)
-
u_8];
av_8[
0
]
=
SinXDivX_Table_8[(
1
<<
8
)
+
v_8];
av_8[
1
]
=
SinXDivX_Table_8[v_8];
av_8[
2
]
=
SinXDivX_Table_8[(
1
<<
8
)
-
v_8];
av_8[
3
]
=
SinXDivX_Table_8[(
2
<<
8
)
-
v_8];
long
sR
=
0
,sG
=
0
,sB
=
0
,sA
=
0
;
for
(
long
i
=
0
;i
<
4
;
++
i)
{
long
aR
=
0
,aG
=
0
,aB
=
0
,aA
=
0
;
for
(
long
j
=
0
;j
<
4
;
++
j)
{
aA
+=
au_8[j]
*
pixel[j].a;
aR
+=
au_8[j]
*
pixel[j].r;
aG
+=
au_8[j]
*
pixel[j].g;
aB
+=
au_8[j]
*
pixel[j].b;
}
sA
+=
aA
*
av_8[i];
sR
+=
aR
*
av_8[i];
sG
+=
aG
*
av_8[i];
sB
+=
aB
*
av_8[i];
((TUInt8
*&
)pixel)
+=
pic_byte_width;
}
*
(unsigned
long
*
)result
=
ColorBound(sB
>>
16
)
|
(ColorBound(sG
>>
16
)
<<
8
)
|
(ColorBound(sR
>>
16
)
<<
16
)
|
(ColorBound(sA
>>
16
)
<<
24
);
}
void
ThreeOrder_Border(
const
TPicRegion
&
pic,
const
long
x_16,
const
long
y_16,TARGB32
*
result)
{
unsigned
long
x0_sub1
=
(x_16
>>
16
)
-
1
;
unsigned
long
y0_sub1
=
(y_16
>>
16
)
-
1
;
long
u_16_add1
=
((unsigned
short
)(x_16))
+
(
1
<<
16
);
long
v_16_add1
=
((unsigned
short
)(y_16))
+
(
1
<<
16
);
TARGB32 pixel[
16
];
long
i,j;
for
(i
=
0
;i
<
4
;
++
i)
{
long
y
=
y0_sub1
+
i;
for
(j
=
0
;j
<
4
;
++
j)
{
long
x
=
x0_sub1
+
j;
bool
IsInPic;
pixel[i
*
4
+
j]
=
Pixels_Bound(pic,x,y,IsInPic);
if
(
!
IsInPic) pixel[i
*
4
+
j].a
=
0
;
else
pixel[i
*
4
+
j].a
=
255
;
}
}
TPicRegion npic;
npic.pdata
=&
pixel[
0
];
npic.byte_width
=
4
*
sizeof
(TARGB32);
//
npic.width =4;
//
npic.height =4;
ThreeOrder_Fast(npic,u_16_add1,v_16_add1,result);
}
void
PicRotary_ThreeOrder_CopyLine(TARGB32
*
pDstLine,
long
dst_border_x0,
long
dst_in_x0,
long
dst_in_x1,
long
dst_border_x1,
const
TPicRegion
&
SrcPic,
long
srcx0_16,
long
srcy0_16,
long
Ax_16,
long
Ay_16)
{
long
x;
for
(x
=
dst_border_x0;x
<
dst_in_x0;
++
x)
{
TARGB32 src_color;
ThreeOrder_Border(SrcPic,srcx0_16,srcy0_16,
&
src_color);
pDstLine[x]
=
AlphaBlend(pDstLine[x],src_color);
srcx0_16
+=
Ax_16;
srcy0_16
+=
Ay_16;
}
for
(x
=
dst_in_x0;x
<
dst_in_x1;
++
x)
{
ThreeOrder_Fast(SrcPic,srcx0_16,srcy0_16,
&
pDstLine[x]);
srcx0_16
+=
Ax_16;
srcy0_16
+=
Ay_16;
}
for
(x
=
dst_in_x1;x
<
dst_border_x1;
++
x)
{
TARGB32 src_color;
ThreeOrder_Border(SrcPic,srcx0_16,srcy0_16,
&
src_color);
pDstLine[x]
=
AlphaBlend(pDstLine[x],src_color);
srcx0_16
+=
Ax_16;
srcy0_16
+=
Ay_16;
}
}
void
PicRotaryThreeOrder(
const
TPicRegion
&
Dst,
const
TPicRegion
&
Src,
double
RotaryAngle,
double
ZoomX,
double
ZoomY,
double
move_x,
double
move_y)
{
if
( (fabs(ZoomX
*
Src.width)
<
1.0e-4
)
||
(fabs(ZoomY
*
Src.height)
<
1.0e-4
) )
return
;
//
太小的缩放比例认为已经不可见
double
tmprZoomXY
=
1.0
/
(ZoomX
*
ZoomY);
double
rZoomX
=
tmprZoomXY
*
ZoomY;
double
rZoomY
=
tmprZoomXY
*
ZoomX;
double
sinA,cosA;
SinCos(RotaryAngle,sinA,cosA);
long
Ax_16
=
(
long
)(rZoomX
*
cosA
*
(
1
<<
16
));
long
Ay_16
=
(
long
)(rZoomX
*
sinA
*
(
1
<<
16
));
long
Bx_16
=
(
long
)(
-
rZoomY
*
sinA
*
(
1
<<
16
));
long
By_16
=
(
long
)(rZoomY
*
cosA
*
(
1
<<
16
));
double
rx0
=
Src.width
*
0.5
;
//
(rx0,ry0)为旋转中心
double
ry0
=
Src.height
*
0.5
;
long
Cx_16
=
(
long
)((
-
(rx0
+
move_x)
*
rZoomX
*
cosA
+
(ry0
+
move_y)
*
rZoomY
*
sinA
+
rx0)
*
(
1
<<
16
));
long
Cy_16
=
(
long
)((
-
(rx0
+
move_x)
*
rZoomX
*
sinA
-
(ry0
+
move_y)
*
rZoomY
*
cosA
+
ry0)
*
(
1
<<
16
));
TRotaryClipData rcData;
rcData.Ax_16
=
Ax_16;
rcData.Bx_16
=
Bx_16;
rcData.Cx_16
=
Cx_16;
rcData.Ay_16
=
Ay_16;
rcData.By_16
=
By_16;
rcData.Cy_16
=
Cy_16;
rcData.dst_width
=
Dst.width;
rcData.dst_height
=
Dst.height;
rcData.src_width
=
Src.width;
rcData.src_height
=
Src.height;
if
(
!
rcData.inti_clip(move_x,move_y,
2
))
return
;
TARGB32
*
pDstLine
=
Dst.pdata;
((TUInt8
*&
)pDstLine)
+=
(Dst.byte_width
*
rcData.out_dst_down_y);
while
(
true
)
//
to down
{
long
y
=
rcData.out_dst_down_y;
if
(y
>=
Dst.height)
break
;
if
(y
>=
0
)
{
PicRotary_ThreeOrder_CopyLine(pDstLine,rcData.out_dst_x0_boder,rcData.out_dst_x0_in,
rcData.out_dst_x1_in,rcData.out_dst_x1_boder,Src,rcData.out_src_x0_16,rcData.out_src_y0_16,Ax_16,Ay_16);
}
if
(
!
rcData.next_clip_line_down())
break
;
((TUInt8
*&
)pDstLine)
+=
Dst.byte_width;
}
pDstLine
=
Dst.pdata;
((TUInt8
*&
)pDstLine)
+=
(Dst.byte_width
*
rcData.out_dst_up_y);
while
(rcData.next_clip_line_up())
//
to up
{
long
y
=
rcData.out_dst_up_y;
if
(y
<
0
)
break
;
((TUInt8
*&
)pDstLine)
-=
Dst.byte_width;
if
(y
<
Dst.height)
{
PicRotary_ThreeOrder_CopyLine(pDstLine,rcData.out_dst_x0_boder,rcData.out_dst_x0_in,
rcData.out_dst_x1_in,rcData.out_dst_x1_boder,Src,rcData.out_src_x0_16,rcData.out_src_y0_16,Ax_16,Ay_16);
}
}
}
//注:测试图片都是800*600的图片旋转到1004*1004的图片中心 测试成绩取各个旋转角度的平均速度值
////////////////////////////////////////////////////////////////////////////////
//速度测试:
//==============================================================================
// PicRotaryThreeOrder 20.3 fps
////////////////////////////////////////////////////////////////////////////////
三次卷积插值旋转结果图示:
30度 60度 90度
120度 150度 180度
![图形图像处理-之-任意角度的高质量的快速的图像旋转 下篇 高质量的旋转_第21张图片](http://img.e-com-net.com/image/info5/03447dba1da7497098ea54c8f14eb76a.jpg)
210度 240度 270度
300度 330度 360度
E:用MMX优化三次卷积插值的旋转
typedef unsigned
long
TMMXData32;
static
TMMXData32 SinXDivX_Table_7_MMX[(
2
<<
8
)
+
1
];
class
_CAutoInti_SinXDivX_Table_MMX {
private
:
void
_Inti_SinXDivX_Table_MMX()
{
for
(
long
i
=
0
;i
<=
(
2
<<
8
);
++
i)
{
unsigned
short
t
=
long
(
0.5
+
256
/
2
*
SinXDivX(i
*
(
1.0
/
(
256
))));
//
==SinXDivX_Table_8[i]/2
unsigned
long
tl
=
t
|
(((unsigned
long
)t)
<<
16
);
SinXDivX_Table_7_MMX[i]
=
tl;
}
};
public
:
_CAutoInti_SinXDivX_Table_MMX() { _Inti_SinXDivX_Table_MMX(); }
};
static
_CAutoInti_SinXDivX_Table_MMX __tmp_CAutoInti_SinXDivX_Table_MMX;
void
ThreeOrder_Fast_MMX(
const
TPicRegion
&
pic,
const
long
x_16,
const
long
y_16,TARGB32
*
result)
{
TMMXData32 auv_7[
4
+
4
];
asm
{
mov ecx,pic
mov eax,y_16
mov ebx,x_16
movzx edi,ah
//
v_8
mov edx,[ecx
+
TPicRegion.pdata]
shr eax,
16
mov esi,[ecx
+
TPicRegion.byte_width]
dec eax
movzx ecx,bh
//
u_8
shr ebx,
16
imul eax,esi
lea edx,[edx
+
ebx
*
4
-
4
]
add edx,eax
//
pixel
lea eax,auv_7
mov ebx,[SinXDivX_Table_7_MMX
+
256
*
4
+
ecx
*
4
]
mov [eax
+
0
*
4
],ebx
mov ebx,[SinXDivX_Table_7_MMX
+
ecx
*
4
]
neg ecx
mov [eax
+
1
*
4
],ebx
mov ebx,[SinXDivX_Table_7_MMX
+
256
*
4
+
ecx
*
4
]
mov [eax
+
2
*
4
],ebx
mov ebx,[SinXDivX_Table_7_MMX
+
512
*
4
+
ecx
*
4
]
mov [eax
+
3
*
4
],ebx
mov ecx,edi
mov ebx,[SinXDivX_Table_7_MMX
+
256
*
4
+
ecx
*
4
]
mov [
4
*
4
+
eax
+
0
*
4
],ebx
mov ebx,[SinXDivX_Table_7_MMX
+
ecx
*
4
]
neg ecx
mov [
4
*
4
+
eax
+
1
*
4
],ebx
mov ebx,[SinXDivX_Table_7_MMX
+
256
*
4
+
ecx
*
4
]
mov [
4
*
4
+
eax
+
2
*
4
],ebx
mov ebx,[SinXDivX_Table_7_MMX
+
512
*
4
+
ecx
*
4
]
mov [
4
*
4
+
eax
+
3
*
4
],ebx
pxor mm7,mm7
//
0
//
mov edx,pixel
pxor mm0,mm0
//
result=0
//
lea eax,auv_7
mov ecx,
-
4
for4_begin:
movd mm1,dword ptr [edx]
movd mm2,dword ptr [edx
+
4
]
movd mm3,dword ptr [edx
+
8
]
movd mm4,dword ptr [edx
+
12
]
movd mm5,dword ptr [eax]
movd mm6,dword ptr [eax
+
4
]
punpcklbw mm1,mm7
punpcklbw mm2,mm7
punpcklwd mm5,mm5
punpcklwd mm6,mm6
pmullw mm1,mm5
pmullw mm2,mm6
punpcklbw mm3,mm7
punpcklbw mm4,mm7
movd mm5,dword ptr [eax
+
8
]
movd mm6,dword ptr [eax
+
12
]
punpcklwd mm5,mm5
punpcklwd mm6,mm6
pmullw mm3,mm5
pmullw mm4,mm6
psraw mm1,
7
psraw mm2,
7
psraw mm3,
7
psraw mm4,
7
movd mm6,dword ptr [
4
*
8
+
eax
+
ecx
*
4
]
//
v
paddsw mm1,mm2
paddsw mm3,mm4
punpcklwd mm6,mm6
paddsw mm1,mm3
add edx,esi
//
+pic.byte_width
psraw mm1,
1
pmullw mm1,mm6
inc ecx
paddsw mm0,mm1
jnz for4_begin
psraw mm0,
6
packuswb mm0,mm7
mov eax,result
movd [eax],mm0
emms
}
}
void
ThreeOrder_Border_MMX(
const
TPicRegion
&
pic,
const
long
x_16,
const
long
y_16,TARGB32
*
result)
{
unsigned
long
x0_sub1
=
(x_16
>>
16
)
-
1
;
unsigned
long
y0_sub1
=
(y_16
>>
16
)
-
1
;
long
u_16_add1
=
((unsigned
short
)(x_16))
+
(
1
<<
16
);
long
v_16_add1
=
((unsigned
short
)(y_16))
+
(
1
<<
16
);
TARGB32 pixel[
16
];
long
i,j;
for
(i
=
0
;i
<
4
;
++
i)
{
long
y
=
y0_sub1
+
i;
for
(j
=
0
;j
<
4
;
++
j)
{
long
x
=
x0_sub1
+
j;
bool
IsInPic;
pixel[i
*
4
+
j]
=
Pixels_Bound(pic,x,y,IsInPic);
if
(
!
IsInPic) pixel[i
*
4
+
j].a
=
0
;
else
pixel[i
*
4
+
j].a
=
255
;
}
}
TPicRegion npic;
npic.pdata
=&
pixel[
0
];
npic.byte_width
=
4
*
sizeof
(TARGB32);
//
npic.width =4;
//
npic.height =4;
ThreeOrder_Fast_MMX(npic,u_16_add1,v_16_add1,result);
}
void
PicRotary_ThreeOrder_CopyLine_MMX(TARGB32
*
pDstLine,
long
dst_border_x0,
long
dst_in_x0,
long
dst_in_x1,
long
dst_border_x1,
const
TPicRegion
&
SrcPic,
long
srcx0_16,
long
srcy0_16,
long
Ax_16,
long
Ay_16)
{
long
x;
for
(x
=
dst_border_x0;x
<
dst_in_x0;
++
x)
{
TARGB32 src_color;
ThreeOrder_Border_MMX(SrcPic,srcx0_16,srcy0_16,
&
src_color);
pDstLine[x]
=
AlphaBlend_MMX(pDstLine[x],src_color);
srcx0_16
+=
Ax_16;
srcy0_16
+=
Ay_16;
}
for
(x
=
dst_in_x0;x
<
dst_in_x1;
++
x)
{
ThreeOrder_Fast_MMX(SrcPic,srcx0_16,srcy0_16,
&
pDstLine[x]);
srcx0_16
+=
Ax_16;
srcy0_16
+=
Ay_16;
}
for
(x
=
dst_in_x1;x
<
dst_border_x1;
++
x)
{
TARGB32 src_color;
ThreeOrder_Border_MMX(SrcPic,srcx0_16,srcy0_16,
&
src_color);
pDstLine[x]
=
AlphaBlend_MMX(pDstLine[x],src_color);
srcx0_16
+=
Ax_16;
srcy0_16
+=
Ay_16;
}
asm emms
}
void
PicRotaryThreeOrder_MMX(
const
TPicRegion
&
Dst,
const
TPicRegion
&
Src,
double
RotaryAngle,
double
ZoomX,
double
ZoomY,
double
move_x,
double
move_y)
{
if
( (fabs(ZoomX
*
Src.width)
<
1.0e-4
)
||
(fabs(ZoomY
*
Src.height)
<
1.0e-4
) )
return
;
//
太小的缩放比例认为已经不可见
double
tmprZoomXY
=
1.0
/
(ZoomX
*
ZoomY);
double
rZoomX
=
tmprZoomXY
*
ZoomY;
double
rZoomY
=
tmprZoomXY
*
ZoomX;
double
sinA,cosA;
SinCos(RotaryAngle,sinA,cosA);
long
Ax_16
=
(
long
)(rZoomX
*
cosA
*
(
1
<<
16
));
long
Ay_16
=
(
long
)(rZoomX
*
sinA
*
(
1
<<
16
));
long
Bx_16
=
(
long
)(
-
rZoomY
*
sinA
*
(
1
<<
16
));
long
By_16
=
(
long
)(rZoomY
*
cosA
*
(
1
<<
16
));
double
rx0
=
Src.width
*
0.5
;
//
(rx0,ry0)为旋转中心
double
ry0
=
Src.height
*
0.5
;
long
Cx_16
=
(
long
)((
-
(rx0
+
move_x)
*
rZoomX
*
cosA
+
(ry0
+
move_y)
*
rZoomY
*
sinA
+
rx0)
*
(
1
<<
16
));
long
Cy_16
=
(
long
)((
-
(rx0
+
move_x)
*
rZoomX
*
sinA
-
(ry0
+
move_y)
*
rZoomY
*
cosA
+
ry0)
*
(
1
<<
16
));
TRotaryClipData rcData;
rcData.Ax_16
=
Ax_16;
rcData.Bx_16
=
Bx_16;
rcData.Cx_16
=
Cx_16;
rcData.Ay_16
=
Ay_16;
rcData.By_16
=
By_16;
rcData.Cy_16
=
Cy_16;
rcData.dst_width
=
Dst.width;
rcData.dst_height
=
Dst.height;
rcData.src_width
=
Src.width;
rcData.src_height
=
Src.height;
if
(
!
rcData.inti_clip(move_x,move_y,
2
))
return
;
TARGB32
*
pDstLine
=
Dst.pdata;
((TUInt8
*&
)pDstLine)
+=
(Dst.byte_width
*
rcData.out_dst_down_y);
while
(
true
)
//
to down
{
long
y
=
rcData.out_dst_down_y;
if
(y
>=
Dst.height)
break
;
if
(y
>=
0
)
{
PicRotary_ThreeOrder_CopyLine_MMX(pDstLine,rcData.out_dst_x0_boder,rcData.out_dst_x0_in,
rcData.out_dst_x1_in,rcData.out_dst_x1_boder,Src,rcData.out_src_x0_16,rcData.out_src_y0_16,Ax_16,Ay_16);
}
if
(
!
rcData.next_clip_line_down())
break
;
((TUInt8
*&
)pDstLine)
+=
Dst.byte_width;
}
pDstLine
=
Dst.pdata;
((TUInt8
*&
)pDstLine)
+=
(Dst.byte_width
*
rcData.out_dst_up_y);
while
(rcData.next_clip_line_up())
//
to up
{
long
y
=
rcData.out_dst_up_y;
if
(y
<
0
)
break
;
((TUInt8
*&
)pDstLine)
-=
Dst.byte_width;
if
(y
<
Dst.height)
{
PicRotary_ThreeOrder_CopyLine_MMX(pDstLine,rcData.out_dst_x0_boder,rcData.out_dst_x0_in,
rcData.out_dst_x1_in,rcData.out_dst_x1_boder,Src,rcData.out_src_x0_16,rcData.out_src_y0_16,Ax_16,Ay_16);
}
}
}
//注:测试图片都是800*600的图片旋转到1004*1004的图片中心 测试成绩取各个旋转角度的平均速度值
////////////////////////////////////////////////////////////////////////////////
//速度测试:
//==============================================================================
// PicRotaryThreeOrder_MMX 34.4 fps
////////////////////////////////////////////////////////////////////////////////
F 效果图:
//程序使用的调用参数:
const long testcount=2000;
long dst_wh=1004;
for (int i=0;i<testcount;++i)
{
double zoom=rand()*(1.0/RAND_MAX)+0.5;
PicRotary_XXX(ppicDst,ppicSrc,rand()*(PI*2/RAND_MAX),zoom,zoom,((dst_wh+ppicSrc.width)*rand()*(1.0/RAND_MAX)-ppicSrc.width),(dst_wh+ppicSrc.height)*rand()*(1.0/RAND_MAX)-ppicSrc.height);
}
近邻取样插值旋转效果图:
![图形图像处理-之-任意角度的高质量的快速的图像旋转 下篇 高质量的旋转_第25张图片](http://img.e-com-net.com/image/info5/2a82c79f71584f449d04619aff66e44f.jpg)
二次线性插值旋转效果图:
![图形图像处理-之-任意角度的高质量的快速的图像旋转 下篇 高质量的旋转_第26张图片](http://img.e-com-net.com/image/info5/e902d6e30d244784afa52893b66059d0.jpg)
三次卷积插值旋转效果图:
![图形图像处理-之-任意角度的高质量的快速的图像旋转 下篇 高质量的旋转_第27张图片](http://img.e-com-net.com/image/info5/c18a78c767784bbe98a988fd4399aafc.jpg)
G:旋转测试的结果放到一起:
//注:测试图片都是800*600的图片旋转到1004*1004的图片中心,测试成绩取各个旋转角度的平均速度值
////////////////////////////////////////////////////////////////////////////////
//速度测试: (测试CPU为AMD64x2 4200+,单线程)
//==============================================================================
// PicRotary3 251.7 fps
// PicRotarySEE 280.1 fps
// PicRotarySEE2 281.2 fps
//
// PicRotaryBilInear 64.5 fps
// PicRotaryBilInear_MMX 93.8 fps
// PicRotaryThreeOrder 20.3 fps
// PicRotaryThreeOrder_MMX 34.4 fps
////////////////////////////////////////////////////////////////////////////////
(对于旋转的MipMap处理和三次线性插值,可以参考《图形图像处理-之-高质量的快速的图像缩放 下篇 三次线性插值和MipMap链》文章)
(这里为了函数的独立性和容易理解,都是拷贝代码然后稍作修改;实际的程序中,建议把他们合并到同一个函数中,
减少代码量,提高可维护性; 对于MMX、SSE、SSE2等的使用建议用CPUID指令测试看CPU是否支持这些指令,
动态决定调用不同的实现)