warpAffine 是图像处理中比较常见的一种变换,可以将图像校正或对齐。
对于线性插值方式,OpenCV 首先将坐标映射保存成两张图,然后调用 remap 函数。第二步是比较耗时的部分,并且 warpPerspective 亦采用此处理。
remap 通过构建查找表来存储系数乘积,这样减少了乘法运算次数。
由于篇幅过长,将文章分成 warpAffine 和 remap 两部分。
检查输入通道数量以及插值类型。
如果目的矩阵为空则创建内存,如果是原地操作则复制一份源数据。
CV_INSTRUMENT_REGION();
int interpolation = flags & INTER_MAX;
CV_Assert( _src.channels() <= 4 || (interpolation != INTER_LANCZOS4 &&
interpolation != INTER_CUBIC) );
CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat() &&
_src.cols() <= SHRT_MAX && _src.rows() <= SHRT_MAX,
ocl_warpTransform_cols4(_src, _dst, _M0, dsize, flags, borderType,
borderValue, OCL_OP_AFFINE))
CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat(),
ocl_warpTransform(_src, _dst, _M0, dsize, flags, borderType,
borderValue, OCL_OP_AFFINE))
Mat src = _src.getMat(), M0 = _M0.getMat();
_dst.create( dsize.empty() ? src.size() : dsize, src.type() );
Mat dst = _dst.getMat();
CV_Assert( src.cols > 0 && src.rows > 0 );
if( dst.data == src.data )
src = src.clone();
如果未设置WARP_INVERSE_MAP
,则对参数矩阵M
求逆。
double M[6] = {0};
Mat matM(2, 3, CV_64F, M);
if( interpolation == INTER_AREA )
interpolation = INTER_LINEAR;
CV_Assert( (M0.type() == CV_32F || M0.type() == CV_64F) && M0.rows == 2 && M0.cols == 3 );
M0.convertTo(matM, matM.type());
if( !(flags & WARP_INVERSE_MAP) )
{
double D = M[0]*M[4] - M[1]*M[3];
D = D != 0 ? 1./D : 0;
double A11 = M[4]*D, A22=M[0]*D;
M[0] = A11; M[1] *= -D;
M[3] *= -D; M[4] = A22;
double b1 = -M[0]*M[2] - M[1]*M[5];
double b2 = -M[3]*M[2] - M[4]*M[5];
M[2] = b1; M[5] = b2;
}
#if defined (HAVE_IPP) && IPP_VERSION_X100 >= 810 && !IPP_DISABLE_WARPAFFINE
CV_IPP_CHECK()
{
int type = src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
if( ( depth == CV_8U || depth == CV_16U || depth == CV_32F ) &&
( cn == 1 || cn == 3 || cn == 4 ) &&
( interpolation == INTER_NEAREST || interpolation == INTER_LINEAR || interpolation == INTER_CUBIC) &&
( borderType == cv::BORDER_TRANSPARENT || borderType == cv::BORDER_CONSTANT) )
{
ippiWarpAffineBackFunc ippFunc = 0;
if ((flags & WARP_INVERSE_MAP) != 0)
{
ippFunc =
type == CV_8UC1 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_8u_C1R :
type == CV_8UC3 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_8u_C3R :
type == CV_8UC4 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_8u_C4R :
type == CV_16UC1 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_16u_C1R :
type == CV_16UC3 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_16u_C3R :
type == CV_16UC4 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_16u_C4R :
type == CV_32FC1 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_32f_C1R :
type == CV_32FC3 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_32f_C3R :
type == CV_32FC4 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_32f_C4R :
0;
}
else
{
ippFunc =
type == CV_8UC1 ? (ippiWarpAffineBackFunc)ippiWarpAffine_8u_C1R :
type == CV_8UC3 ? (ippiWarpAffineBackFunc)ippiWarpAffine_8u_C3R :
type == CV_8UC4 ? (ippiWarpAffineBackFunc)ippiWarpAffine_8u_C4R :
type == CV_16UC1 ? (ippiWarpAffineBackFunc)ippiWarpAffine_16u_C1R :
type == CV_16UC3 ? (ippiWarpAffineBackFunc)ippiWarpAffine_16u_C3R :
type == CV_16UC4 ? (ippiWarpAffineBackFunc)ippiWarpAffine_16u_C4R :
type == CV_32FC1 ? (ippiWarpAffineBackFunc)ippiWarpAffine_32f_C1R :
type == CV_32FC3 ? (ippiWarpAffineBackFunc)ippiWarpAffine_32f_C3R :
type == CV_32FC4 ? (ippiWarpAffineBackFunc)ippiWarpAffine_32f_C4R :
0;
}
int mode =
interpolation == INTER_LINEAR ? IPPI_INTER_LINEAR :
interpolation == INTER_NEAREST ? IPPI_INTER_NN :
interpolation == INTER_CUBIC ? IPPI_INTER_CUBIC :
0;
CV_Assert(mode && ippFunc);
double coeffs[2][3];
for( int i = 0; i < 2; i++ )
for( int j = 0; j < 3; j++ )
coeffs[i][j] = matM.at<double>(i, j);
bool ok;
Range range(0, dst.rows);
IPPWarpAffineInvoker invoker(src, dst, coeffs, mode, borderType, borderValue, ippFunc, &ok);
parallel_for_(range, invoker, dst.total()/(double)(1<<16));
if( ok )
{
CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
return;
}
setIppErrorStatus();
}
}
#endif
调用命名空间中的 hal::warpAffine 函数,各参数独立传入。
hal::warpAffine(src.type(), src.data, src.step, src.cols, src.rows, dst.data, dst.step, dst.cols, dst.rows,
M, interpolation, borderType, borderValue.val);
d s t ( x , y ) = s r c ( M 11 x + M 12 y + M 13 , M 21 x + M 22 y + M 23 ) \mathrm{dst}(x,y)=\mathrm{src}(M_{11}x+M_{12}y+M_{13}, M_{21}x+M_{22}y+M_{23}) dst(x,y)=src(M11x+M12y+M13,M21x+M22y+M23)
OpenCV 屏蔽了 Carotene 中的实现。
根据描述构建出源和目的矩阵。
INTER_BITS 为5,即插值所用比特数。
预先计算行元素变换系数,adelta
为 M 11 x M_{11}x M11x,bdelta
为 M 21 x M_{21}x M21x。
CALL_HAL(warpAffine, cv_hal_warpAffine, src_type, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, dst_height, M, interpolation, borderType, borderValue);
Mat src(Size(src_width, src_height), src_type, const_cast<uchar*>(src_data), src_step);
Mat dst(Size(dst_width, dst_height), src_type, dst_data, dst_step);
int x;
AutoBuffer<int> _abdelta(dst.cols*2);
int* adelta = &_abdelta[0], *bdelta = adelta + dst.cols;
const int AB_BITS = MAX(10, (int)INTER_BITS);
const int AB_SCALE = 1 << AB_BITS;
for( x = 0; x < dst.cols; x++ )
{
adelta[x] = saturate_cast<int>(M[0]*x*AB_SCALE);
bdelta[x] = saturate_cast<int>(M[3]*x*AB_SCALE);
}
创建调用者 WarpAffineInvoker,parallel_for_ 函数并行调用。
Range range(0, dst.rows);
WarpAffineInvoker invoker(src, dst, interpolation, borderType,
Scalar(borderValue[0], borderValue[1], borderValue[2], borderValue[3]),
adelta, bdelta, M);
parallel_for_(range, invoker, dst.total()/(double)(1<<16));
public:
WarpAffineInvoker(const Mat &_src, Mat &_dst, int _interpolation, int _borderType,
const Scalar &_borderValue, int *_adelta, int *_bdelta, const double *_M) :
ParallelLoopBody(), src(_src), dst(_dst), interpolation(_interpolation),
borderType(_borderType), borderValue(_borderValue), adelta(_adelta), bdelta(_bdelta),
M(_M)
{
}
处理操作符函数。
BLOCK_SZ
指定分块大小。
XY
存储源图上对应的坐标,A
为辅助系数。
INTER_BITS 为5。
INTER_TAB_SIZE 为 2 I N T E R _ B I T S = 32 2^{\mathrm{INTER\_BITS}}=32 2INTER_BITS=32,这样双线性插值表大小为32x32x4。
AB_SCALE
为 2 10 2^{10} 210。这样计算结果的 10.6 的格式存储。
处理区域为BLOCK_SZ/2
行、BLOCK_SZ*2
列的矩形。为什么?
virtual void operator() (const Range& range) const CV_OVERRIDE
{
const int BLOCK_SZ = 64;
AutoBuffer<short, 0> __XY(BLOCK_SZ * BLOCK_SZ * 2), __A(BLOCK_SZ * BLOCK_SZ);
short *XY = __XY.data(), *A = __A.data();
const int AB_BITS = MAX(10, (int)INTER_BITS);
const int AB_SCALE = 1 << AB_BITS;
int round_delta = interpolation == INTER_NEAREST ? AB_SCALE/2 : AB_SCALE/INTER_TAB_SIZE/2, x, y, x1, y1;
#if CV_TRY_AVX2
bool useAVX2 = CV_CPU_HAS_SUPPORT_AVX2;
#endif
#if CV_TRY_SSE4_1
bool useSSE4_1 = CV_CPU_HAS_SUPPORT_SSE4_1;
#endif
int bh0 = std::min(BLOCK_SZ/2, dst.rows);
int bw0 = std::min(BLOCK_SZ*BLOCK_SZ/bh0, dst.cols);
bh0 = std::min(BLOCK_SZ*BLOCK_SZ/bw0, dst.rows);
xy
指向结果矩阵的当前行行首。
X0
为 M 12 y + M 13 M_{12}y+M_{13} M12y+M13
Y0
为 M 22 x + M 23 M_{22}x+M_{23} M22x+M23
round_delta
为16。
_XY
将缓冲区__XY
封装成了矩阵。
dpart
为结果的 RoI 区域。
for( y = range.start; y < range.end; y += bh0 )
{
for( x = 0; x < dst.cols; x += bw0 )
{
int bw = std::min( bw0, dst.cols - x);
int bh = std::min( bh0, range.end - y);
Mat _XY(bh, bw, CV_16SC2, XY), matA;
Mat dpart(dst, Rect(x, y, bw, bh));
for( y1 = 0; y1 < bh; y1++ )
{
short* xy = XY + y1*bw*2;
int X0 = saturate_cast<int>((M[1]*(y + y1) + M[2])*AB_SCALE) + round_delta;
int Y0 = saturate_cast<int>((M[4]*(y + y1) + M[5])*AB_SCALE) + round_delta;
如果是最近邻方法。
WarpAffineInvoker_Blockline_SSE41
if( interpolation == INTER_NEAREST )
{
x1 = 0;
#if CV_TRY_SSE4_1
if( useSSE4_1 )
opt_SSE4_1::WarpAffineInvoker_Blockline_SSE41(adelta + x, bdelta + x, xy, X0, Y0, bw);
else
#endif
{
#if CV_SIMD128
{
v_int32x4 v_X0 = v_setall_s32(X0), v_Y0 = v_setall_s32(Y0);
int span = v_uint16x8::nlanes;
for( ; x1 <= bw - span; x1 += span )
{
v_int16x8 v_dst[2];
#define CV_CONVERT_MAP(ptr,offset,shift) v_pack(v_shr(shift+v_load(ptr + offset)),\
v_shr(shift+v_load(ptr + offset + 4)))
v_dst[0] = CV_CONVERT_MAP(adelta, x+x1, v_X0);
v_dst[1] = CV_CONVERT_MAP(bdelta, x+x1, v_Y0);
#undef CV_CONVERT_MAP
v_store_interleave(xy + (x1 << 1), v_dst[0], v_dst[1]);
}
}
#endif
for( ; x1 < bw; x1++ )
{
int X = (X0 + adelta[x+x1]) >> AB_BITS;
int Y = (Y0 + bdelta[x+x1]) >> AB_BITS;
xy[x1*2] = saturate_cast<short>(X);
xy[x1*2+1] = saturate_cast<short>(Y);
}
}
}
对于双线性插值。
alpha
指向当前行。
avx2可以调用 warpAffineBlockline
v_mask
为向量掩码。
每次处理span*2
个数。
v_setall_s32 通过 OPENCV_HAL_IMPL_NEON_INIT 宏来定义。
v_load 从存储器中加载寄存器内容。
计算 M 11 x + M 12 y + M 13 M_{11}x+M_{12}y+M_{13} M11x+M12y+M13 和 M 21 x + M 22 y + M 23 M_{21}x+M_{22}y+M_{23} M21x+M22y+M23 时,是两个v_int32x4
相加(vaddq_s32),得到v_int32x4
的结果。然后右移 (10-5)位。
v_shr 右移。
v_pack 将两个数据宽度减半后拼到一起。
两次右移后
v_store_interleave 将2个寄存器中的数据交错存储到内存中。
v_shl 左移。
v_store 将寄存器内容存储到内存中。
v_Y0
和v_X0
合并。
else
{
short* alpha = A + y1*bw;
x1 = 0;
#if CV_TRY_AVX2
if ( useAVX2 )
x1 = opt_AVX2::warpAffineBlockline(adelta + x, bdelta + x, xy, alpha, X0, Y0, bw);
#endif
#if CV_SIMD128
{
v_int32x4 v__X0 = v_setall_s32(X0), v__Y0 = v_setall_s32(Y0);
v_int32x4 v_mask = v_setall_s32(INTER_TAB_SIZE - 1);
int span = v_float32x4::nlanes;
for( ; x1 <= bw - span * 2; x1 += span * 2 )
{
v_int32x4 v_X0 = v_shr<AB_BITS - INTER_BITS>(v__X0 + v_load(adelta + x + x1));
v_int32x4 v_Y0 = v_shr<AB_BITS - INTER_BITS>(v__Y0 + v_load(bdelta + x + x1));
v_int32x4 v_X1 = v_shr<AB_BITS - INTER_BITS>(v__X0 + v_load(adelta + x + x1 + span));
v_int32x4 v_Y1 = v_shr<AB_BITS - INTER_BITS>(v__Y0 + v_load(bdelta + x + x1 + span));
v_int16x8 v_xy[2];
v_xy[0] = v_pack(v_shr<INTER_BITS>(v_X0), v_shr<INTER_BITS>(v_X1));
v_xy[1] = v_pack(v_shr<INTER_BITS>(v_Y0), v_shr<INTER_BITS>(v_Y1));
v_store_interleave(xy + (x1 << 1), v_xy[0], v_xy[1]);
v_int32x4 v_alpha0 = v_shl<INTER_BITS>(v_Y0 & v_mask) | (v_X0 & v_mask);
v_int32x4 v_alpha1 = v_shl<INTER_BITS>(v_Y1 & v_mask) | (v_X1 & v_mask);
v_store(alpha + x1, v_pack(v_alpha0, v_alpha1));
}
}
#endif
X
和Y
为映射到源图上的像素坐标。alpha
将Y
和X
的小数部分合并存储。
for( ; x1 < bw; x1++ )
{
int X = (X0 + adelta[x+x1]) >> (AB_BITS - INTER_BITS);
int Y = (Y0 + bdelta[x+x1]) >> (AB_BITS - INTER_BITS);
xy[x1*2] = saturate_cast<short>(X >> INTER_BITS);
xy[x1*2+1] = saturate_cast<short>(Y >> INTER_BITS);
alpha[x1] = (short)((Y & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE +
(X & (INTER_TAB_SIZE-1)));
}
}
}
cv::remap 从src
中找到dpart
的源值。最近邻方式不需要第二张图。
if( interpolation == INTER_NEAREST )
remap( src, dpart, _XY, Mat(), interpolation, borderType, borderValue );
else
{
Mat _matA(bh, bw, CV_16U, A);
remap( src, dpart, _XY, _matA, interpolation, borderType, borderValue );
}
}
}
}
成员变量。
private:
Mat src;
Mat dst;
int interpolation, borderType;
Scalar borderValue;
int *adelta, *bdelta;
const double *M;