上一篇文章中提到 warpAffine 会分块处理,将坐标映射和插值系数分别存储下来,然后借助 remap 来实现最终的映射。而 remap 会根据映射关系取源像素并加权计算出目的像素值。其最核心的计算为 RemapVec_8u。
d s t ( x , y ) = s r c ( m a p x ( x , y ) , m a p y ( x , y ) ) \mathrm{dst}(x,y)=\mathrm{src}(\mathrm{map}_x(x,y), \mathrm{map}_y(x,y)) dst(x,y)=src(mapx(x,y),mapy(x,y))
定义不同类型的函数表。
remapBilinear
CV_INSTRUMENT_REGION();
static RemapNNFunc nn_tab[] =
{
remapNearest<uchar>, remapNearest<schar>, remapNearest<ushort>, remapNearest<short>,
remapNearest<int>, remapNearest<float>, remapNearest<double>, 0
};
static RemapFunc linear_tab[] =
{
remapBilinear<FixedPtCast<int, uchar, INTER_REMAP_COEF_BITS>, RemapVec_8u, short>, 0,
remapBilinear<Cast<float, ushort>, RemapNoVec, float>,
remapBilinear<Cast<float, short>, RemapNoVec, float>, 0,
remapBilinear<Cast<float, float>, RemapNoVec, float>,
remapBilinear<Cast<double, double>, RemapNoVec, float>, 0
};
static RemapFunc cubic_tab[] =
{
remapBicubic<FixedPtCast<int, uchar, INTER_REMAP_COEF_BITS>, short, INTER_REMAP_COEF_SCALE>, 0,
remapBicubic<Cast<float, ushort>, float, 1>,
remapBicubic<Cast<float, short>, float, 1>, 0,
remapBicubic<Cast<float, float>, float, 1>,
remapBicubic<Cast<double, double>, float, 1>, 0
};
static RemapFunc lanczos4_tab[] =
{
remapLanczos4<FixedPtCast<int, uchar, INTER_REMAP_COEF_BITS>, short, INTER_REMAP_COEF_SCALE>, 0,
remapLanczos4<Cast<float, ushort>, float, 1>,
remapLanczos4<Cast<float, short>, float, 1>, 0,
remapLanczos4<Cast<float, float>, float, 1>,
remapLanczos4<Cast<double, double>, float, 1>, 0
};
_map1
不能为空,_map2
可以。
CV_Assert( !_map1.empty() );
CV_Assert( _map2.empty() || (_map2.size() == _map1.size()));
CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat(),
ocl_remap(_src, _dst, _map1, _map2, interpolation, borderType, borderValue))
Mat src = _src.getMat(), map1 = _map1.getMat(), map2 = _map2.getMat();
_dst.create( map1.size(), src.type() );
Mat dst = _dst.getMat();
CV_OVX_RUN(
src.type() == CV_8UC1 && dst.type() == CV_8UC1 &&
!ovx::skipSmallImages<VX_KERNEL_REMAP>(src.cols, src.rows) &&
(borderType& ~BORDER_ISOLATED) == BORDER_CONSTANT &&
((map1.type() == CV_32FC2 && map2.empty() && map1.size == dst.size) ||
(map1.type() == CV_32FC1 && map2.type() == CV_32FC1 && map1.size == dst.size && map2.size == dst.size) ||
(map1.empty() && map2.type() == CV_32FC2 && map2.size == dst.size)) &&
((borderType & BORDER_ISOLATED) != 0 || !src.isSubmatrix()),
openvx_remap(src, dst, map1, map2, interpolation, borderValue));
输入输出不能过大。
如果输入输出相同则进行拷贝。
CV_Assert( dst.cols < SHRT_MAX && dst.rows < SHRT_MAX && src.cols < SHRT_MAX && src.rows < SHRT_MAX );
if( dst.data == src.data )
src = src.clone();
if( interpolation == INTER_AREA )
interpolation = INTER_LINEAR;
int type = src.type(), depth = CV_MAT_DEPTH(type);
#if defined HAVE_IPP && !IPP_DISABLE_REMAP
CV_IPP_CHECK()
{
if ((interpolation == INTER_LINEAR || interpolation == INTER_CUBIC || interpolation == INTER_NEAREST) &&
map1.type() == CV_32FC1 && map2.type() == CV_32FC1 &&
(borderType == BORDER_CONSTANT || borderType == BORDER_TRANSPARENT))
{
int ippInterpolation =
interpolation == INTER_NEAREST ? IPPI_INTER_NN :
interpolation == INTER_LINEAR ? IPPI_INTER_LINEAR : IPPI_INTER_CUBIC;
ippiRemap ippFunc =
type == CV_8UC1 ? (ippiRemap)ippiRemap_8u_C1R :
type == CV_8UC3 ? (ippiRemap)ippiRemap_8u_C3R :
type == CV_8UC4 ? (ippiRemap)ippiRemap_8u_C4R :
type == CV_16UC1 ? (ippiRemap)ippiRemap_16u_C1R :
type == CV_16UC3 ? (ippiRemap)ippiRemap_16u_C3R :
type == CV_16UC4 ? (ippiRemap)ippiRemap_16u_C4R :
type == CV_32FC1 ? (ippiRemap)ippiRemap_32f_C1R :
type == CV_32FC3 ? (ippiRemap)ippiRemap_32f_C3R :
type == CV_32FC4 ? (ippiRemap)ippiRemap_32f_C4R : 0;
if (ippFunc)
{
bool ok;
IPPRemapInvoker invoker(src, dst, map1, map2, ippFunc, ippInterpolation,
borderType, borderValue, &ok);
Range range(0, dst.rows);
parallel_for_(range, invoker, dst.total() / (double)(1 << 16));
if (ok)
{
CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
return;
}
setIppErrorStatus();
}
}
}
#endif
如果是最近邻插值设置nnfunc
,否则设置ifunc
。
initInterTab2D 返回静态数组的指针给ctab
。
RemapNNFunc nnfunc = 0;
RemapFunc ifunc = 0;
const void* ctab = 0;
bool fixpt = depth == CV_8U;
bool planar_input = false;
if( interpolation == INTER_NEAREST )
{
nnfunc = nn_tab[depth];
CV_Assert( nnfunc != 0 );
}
else
{
if( interpolation == INTER_LINEAR )
ifunc = linear_tab[depth];
else if( interpolation == INTER_CUBIC ){
ifunc = cubic_tab[depth];
CV_Assert( _src.channels() <= 4 );
}
else if( interpolation == INTER_LANCZOS4 ){
ifunc = lanczos4_tab[depth];
CV_Assert( _src.channels() <= 4 );
}
else
CV_Error( CV_StsBadArg, "Unknown interpolation method" );
CV_Assert( ifunc != 0 );
ctab = initInterTab2D( interpolation, fixpt );
}
const Mat *m1 = &map1, *m2 = &map2;
if( (map1.type() == CV_16SC2 && (map2.type() == CV_16UC1 || map2.type() == CV_16SC1 || map2.empty())) ||
(map2.type() == CV_16SC2 && (map1.type() == CV_16UC1 || map1.type() == CV_16SC1 || map1.empty())) )
{
if( map1.type() != CV_16SC2 )
std::swap(m1, m2);
}
else
{
CV_Assert( ((map1.type() == CV_32FC2 || map1.type() == CV_16SC2) && map2.empty()) ||
(map1.type() == CV_32FC1 && map2.type() == CV_32FC1) );
planar_input = map1.channels() == 1;
}
调用 RemapInvoker 的函数。
RemapInvoker invoker(src, dst, m1, m2,
borderType, borderValue, planar_input, nnfunc, ifunc,
ctab);
parallel_for_(Range(0, dst.rows), invoker, dst.total()/(double)(1<<16));
initInterTab2D 存储_tab
中两个核参数相乘的结果。
static bool inittab[INTER_MAX+1] = {false};
float* tab = 0;
short* itab = 0;
int ksize = 0;
if( method == INTER_LINEAR )
tab = BilinearTab_f[0][0], itab = BilinearTab_i[0][0], ksize=2;
else if( method == INTER_CUBIC )
tab = BicubicTab_f[0][0], itab = BicubicTab_i[0][0], ksize=4;
else if( method == INTER_LANCZOS4 )
tab = Lanczos4Tab_f[0][0], itab = Lanczos4Tab_i[0][0], ksize=8;
else
CV_Error( CV_StsBadArg, "Unknown/unsupported interpolation type" );
initInterTab1D 计算不同方法的一维表值,即原始系数。
INTER_TAB_SIZE 为 2 I N T E R _ B I T S 2^{\mathrm{INTER\_BITS}} 2INTER_BITS,表的大小取决于插值位宽。
tab
尺寸为 INTER_TAB_SIZE2,从而能存储不同系数就的乘积。
NNDeltaTab_i
的作用是什么?
内层循环计算中心系数乘积。vy
为当前行的中心系数值。
INTER_REMAP_COEF_SCALE 为 2 I N T E R _ R E M A P _ C O E F _ B I T S 2^{\mathrm{INTER\_REMAP\_COEF\_BITS}} 2INTER_REMAP_COEF_BITS。
isum
为itab
当前位置上的中心元素之和。
if( !inittab[method] )
{
AutoBuffer<float> _tab(8*INTER_TAB_SIZE);
int i, j, k1, k2;
initInterTab1D(method, _tab.data(), INTER_TAB_SIZE);
for( i = 0; i < INTER_TAB_SIZE; i++ )
for( j = 0; j < INTER_TAB_SIZE; j++, tab += ksize*ksize, itab += ksize*ksize )
{
int isum = 0;
NNDeltaTab_i[i*INTER_TAB_SIZE+j][0] = j < INTER_TAB_SIZE/2;
NNDeltaTab_i[i*INTER_TAB_SIZE+j][1] = i < INTER_TAB_SIZE/2;
for( k1 = 0; k1 < ksize; k1++ )
{
float vy = _tab[i*ksize + k1];
for( k2 = 0; k2 < ksize; k2++ )
{
float v = vy*_tab[j*ksize + k2];
tab[k1*ksize + k2] = v;
isum += itab[k1*ksize + k2] = saturate_cast<short>(v*INTER_REMAP_COEF_SCALE);
}
}
计算完成后,tab
和itab
重新指向首地址。
if( isum != INTER_REMAP_COEF_SCALE )
{
int diff = isum - INTER_REMAP_COEF_SCALE;
int ksize2 = ksize/2, Mk1=ksize2, Mk2=ksize2, mk1=ksize2, mk2=ksize2;
for( k1 = ksize2; k1 < ksize2+2; k1++ )
for( k2 = ksize2; k2 < ksize2+2; k2++ )
{
if( itab[k1*ksize+k2] < itab[mk1*ksize+mk2] )
mk1 = k1, mk2 = k2;
else if( itab[k1*ksize+k2] > itab[Mk1*ksize+Mk2] )
Mk1 = k1, Mk2 = k2;
}
if( diff < 0 )
itab[Mk1*ksize + Mk2] = (short)(itab[Mk1*ksize + Mk2] - diff);
else
itab[mk1*ksize + mk2] = (short)(itab[mk1*ksize + mk2] - diff);
}
}
tab -= INTER_TAB_SIZE2*ksize*ksize;
itab -= INTER_TAB_SIZE2*ksize*ksize;
#if CV_SIMD128
if( method == INTER_LINEAR )
{
for( i = 0; i < INTER_TAB_SIZE2; i++ )
for( j = 0; j < 4; j++ )
{
BilinearTab_iC4[i][0][j*2] = BilinearTab_i[i][0][0];
BilinearTab_iC4[i][0][j*2+1] = BilinearTab_i[i][0][1];
BilinearTab_iC4[i][1][j*2] = BilinearTab_i[i][1][0];
BilinearTab_iC4[i][1][j*2+1] = BilinearTab_i[i][1][1];
}
}
#endif
inittab[method] = true;
}
return fixpt ? (const void*)itab : (const void*)tab;
float scale = 1.f/tabsz;
if( method == INTER_LINEAR )
{
for( int i = 0; i < tabsz; i++, tab += 2 )
interpolateLinear( i*scale, tab );
}
else if( method == INTER_CUBIC )
{
for( int i = 0; i < tabsz; i++, tab += 4 )
interpolateCubic( i*scale, tab );
}
else if( method == INTER_LANCZOS4 )
{
for( int i = 0; i < tabsz; i++, tab += 8 )
interpolateLanczos4( i*scale, tab );
}
else
CV_Error( CV_StsBadArg, "Unknown interpolation method" );
计算插值系数。
coeffs[0] = 1.f - x;
coeffs[1] = x;
public:
RemapInvoker(const Mat& _src, Mat& _dst, const Mat *_m1,
const Mat *_m2, int _borderType, const Scalar &_borderValue,
int _planar_input, RemapNNFunc _nnfunc, RemapFunc _ifunc, const void *_ctab) :
ParallelLoopBody(), src(&_src), dst(&_dst), m1(_m1), m2(_m2),
borderType(_borderType), borderValue(_borderValue),
planar_input(_planar_input), nnfunc(_nnfunc), ifunc(_ifunc), ctab(_ctab)
{
}
brows0
为缓冲区行数,bcols0
为列数。
dpart
为目的矩阵的当前处理区域。
_bufxy
为x
和y
的缓冲区。bufxy
为当前 RoI 区域。
virtual void operator() (const Range& range) const CV_OVERRIDE
{
int x, y, x1, y1;
const int buf_size = 1 << 14;
int brows0 = std::min(128, dst->rows), map_depth = m1->depth();
int bcols0 = std::min(buf_size/brows0, dst->cols);
brows0 = std::min(buf_size/bcols0, dst->rows);
Mat _bufxy(brows0, bcols0, CV_16SC2), _bufa;
if( !nnfunc )
_bufa.create(brows0, bcols0, CV_16UC1);
for( y = range.start; y < range.end; y += brows0 )
{
for( x = 0; x < dst->cols; x += bcols0 )
{
int brows = std::min(brows0, range.end - y);
int bcols = std::min(bcols0, dst->cols - x);
Mat dpart(*dst, Rect(x, y, bcols, brows));
Mat bufxy(_bufxy, Rect(0, 0, bcols, brows));
如果为最近邻映射。
if( nnfunc )
{
if( m1->type() == CV_16SC2 && m2->empty() ) // the data is already in the right format
bufxy = (*m1)(Rect(x, y, bcols, brows));
else if( map_depth != CV_32F )
{
for( y1 = 0; y1 < brows; y1++ )
{
short* XY = bufxy.ptr<short>(y1);
const short* sXY = m1->ptr<short>(y+y1) + x*2;
const ushort* sA = m2->ptr<ushort>(y+y1) + x;
for( x1 = 0; x1 < bcols; x1++ )
{
int a = sA[x1] & (INTER_TAB_SIZE2-1);
XY[x1*2] = sXY[x1*2] + NNDeltaTab_i[a][0];
XY[x1*2+1] = sXY[x1*2+1] + NNDeltaTab_i[a][1];
}
}
}
else if( !planar_input )
(*m1)(Rect(x, y, bcols, brows)).convertTo(bufxy, bufxy.depth());
else
{
for( y1 = 0; y1 < brows; y1++ )
{
short* XY = bufxy.ptr<short>(y1);
const float* sX = m1->ptr<float>(y+y1) + x;
const float* sY = m2->ptr<float>(y+y1) + x;
x1 = 0;
#if CV_SIMD128
{
int span = v_float32x4::nlanes;
for( ; x1 <= bcols - span * 2; x1 += span * 2 )
{
v_int32x4 ix0 = v_round(v_load(sX + x1));
v_int32x4 iy0 = v_round(v_load(sY + x1));
v_int32x4 ix1 = v_round(v_load(sX + x1 + span));
v_int32x4 iy1 = v_round(v_load(sY + x1 + span));
v_int16x8 dx, dy;
dx = v_pack(ix0, ix1);
dy = v_pack(iy0, iy1);
v_store_interleave(XY + x1 * 2, dx, dy);
}
}
#endif
for( ; x1 < bcols; x1++ )
{
XY[x1*2] = saturate_cast<short>(sX[x1]);
XY[x1*2+1] = saturate_cast<short>(sY[x1]);
}
}
}
nnfunc( *src, dpart, bufxy, borderType, borderValue );
continue;
}
否则为线性插值。
XY
指向bufxy
的当前行,A
指向bufa
的当前行。
如果m1
为双通道整型且m2
为单通道整型, bufxy
封装m1
的当前处理分块,即源坐标值。
sA
为m2
矩阵的当前位置。
INTER_TAB_SIZE2 为 2 10 2^{10} 210
由掩码得到小数部分保存到A
中。
Mat bufa(_bufa, Rect(0, 0, bcols, brows));
for( y1 = 0; y1 < brows; y1++ )
{
short* XY = bufxy.ptr<short>(y1);
ushort* A = bufa.ptr<ushort>(y1);
if( m1->type() == CV_16SC2 && (m2->type() == CV_16UC1 || m2->type() == CV_16SC1) )
{
bufxy = (*m1)(Rect(x, y, bcols, brows));
const ushort* sA = m2->ptr<ushort>(y+y1) + x;
x1 = 0;
#if CV_SIMD128
{
v_uint16x8 v_scale = v_setall_u16(INTER_TAB_SIZE2 - 1);
int span = v_uint16x8::nlanes;
for( ; x1 <= bcols - span; x1 += span )
v_store((unsigned short*)(A + x1), v_load(sA + x1) & v_scale);
}
#endif
for( ; x1 < bcols; x1++ )
A[x1] = (ushort)(sA[x1] & (INTER_TAB_SIZE2-1));
}
如果m1
为单通道
else if( planar_input )
{
const float* sX = m1->ptr<float>(y+y1) + x;
const float* sY = m2->ptr<float>(y+y1) + x;
x1 = 0;
#if CV_SIMD128
{
v_float32x4 v_scale = v_setall_f32((float)INTER_TAB_SIZE);
v_int32x4 v_scale2 = v_setall_s32(INTER_TAB_SIZE - 1);
int span = v_float32x4::nlanes;
for( ; x1 <= bcols - span * 2; x1 += span * 2 )
{
v_int32x4 v_sx0 = v_round(v_scale * v_load(sX + x1));
v_int32x4 v_sy0 = v_round(v_scale * v_load(sY + x1));
v_int32x4 v_sx1 = v_round(v_scale * v_load(sX + x1 + span));
v_int32x4 v_sy1 = v_round(v_scale * v_load(sY + x1 + span));
v_uint16x8 v_sx8 = v_reinterpret_as_u16(v_pack(v_sx0 & v_scale2, v_sx1 & v_scale2));
v_uint16x8 v_sy8 = v_reinterpret_as_u16(v_pack(v_sy0 & v_scale2, v_sy1 & v_scale2));
v_uint16x8 v_v = v_shl<INTER_BITS>(v_sy8) | (v_sx8);
v_store(A + x1, v_v);
v_int16x8 v_d0 = v_pack(v_shr<INTER_BITS>(v_sx0), v_shr<INTER_BITS>(v_sx1));
v_int16x8 v_d1 = v_pack(v_shr<INTER_BITS>(v_sy0), v_shr<INTER_BITS>(v_sy1));
v_store_interleave(XY + (x1 << 1), v_d0, v_d1);
}
}
#endif
for( ; x1 < bcols; x1++ )
{
int sx = cvRound(sX[x1]*INTER_TAB_SIZE);
int sy = cvRound(sY[x1]*INTER_TAB_SIZE);
int v = (sy & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE + (sx & (INTER_TAB_SIZE-1));
XY[x1*2] = saturate_cast<short>(sx >> INTER_BITS);
XY[x1*2+1] = saturate_cast<short>(sy >> INTER_BITS);
A[x1] = (ushort)v;
}
}
else
{
const float* sXY = m1->ptr<float>(y+y1) + x*2;
x1 = 0;
#if CV_SIMD128
{
v_float32x4 v_scale = v_setall_f32((float)INTER_TAB_SIZE);
v_int32x4 v_scale2 = v_setall_s32(INTER_TAB_SIZE - 1), v_scale3 = v_setall_s32(INTER_TAB_SIZE);
int span = v_float32x4::nlanes;
for( ; x1 <= bcols - span * 2; x1 += span * 2 )
{
v_float32x4 v_fx, v_fy;
v_load_deinterleave(sXY + (x1 << 1), v_fx, v_fy);
v_int32x4 v_sx0 = v_round(v_fx * v_scale);
v_int32x4 v_sy0 = v_round(v_fy * v_scale);
v_load_deinterleave(sXY + ((x1 + span) << 1), v_fx, v_fy);
v_int32x4 v_sx1 = v_round(v_fx * v_scale);
v_int32x4 v_sy1 = v_round(v_fy * v_scale);
v_int32x4 v_v0 = v_muladd(v_scale3, (v_sy0 & v_scale2), (v_sx0 & v_scale2));
v_int32x4 v_v1 = v_muladd(v_scale3, (v_sy1 & v_scale2), (v_sx1 & v_scale2));
v_uint16x8 v_v8 = v_reinterpret_as_u16(v_pack(v_v0, v_v1));
v_store(A + x1, v_v8);
v_int16x8 v_dx = v_pack(v_shr<INTER_BITS>(v_sx0), v_shr<INTER_BITS>(v_sx1));
v_int16x8 v_dy = v_pack(v_shr<INTER_BITS>(v_sy0), v_shr<INTER_BITS>(v_sy1));
v_store_interleave(XY + (x1 << 1), v_dx, v_dy);
}
}
#endif
for( ; x1 < bcols; x1++ )
{
int sx = cvRound(sXY[x1*2]*INTER_TAB_SIZE);
int sy = cvRound(sXY[x1*2+1]*INTER_TAB_SIZE);
int v = (sy & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE + (sx & (INTER_TAB_SIZE-1));
XY[x1*2] = saturate_cast<short>(sx >> INTER_BITS);
XY[x1*2+1] = saturate_cast<short>(sy >> INTER_BITS);
A[x1] = (ushort)v;
}
}
}
调用ifunc
完成插值操作。
ifunc(*src, dpart, bufxy, bufa, ctab, borderType, borderValue);
}
}
}
类成员。
private:
const Mat* src;
Mat* dst;
const Mat *m1, *m2;
int borderType;
Scalar borderValue;
int planar_input;
RemapNNFunc nnfunc;
RemapFunc ifunc;
const void *ctab;
remapBilinear 在必要时填充边界,图像映射通过 RemapVec_8u 实现加速。
sstep
为源图行跨度。MatStep 能够进行隐式类型转换。
cval
为填充像素值。
width1
和height1
为横向和纵向上的最大取值。
typedef typename CastOp::rtype T;
typedef typename CastOp::type1 WT;
Size ssize = _src.size(), dsize = _dst.size();
const int cn = _src.channels();
const AT* wtab = (const AT*)_wtab;
const T* S0 = _src.ptr<T>();
size_t sstep = _src.step/sizeof(S0[0]);
T cval[CV_CN_MAX];
CastOp castOp;
VecOp vecOp;
for(int k = 0; k < cn; k++ )
cval[k] = saturate_cast<T>(_borderValue[k & 3]);
unsigned width1 = std::max(ssize.width-1, 0), height1 = std::max(ssize.height-1, 0);
CV_Assert( !ssize.empty() );
#if CV_SIMD128
if( _src.type() == CV_8UC3 )
width1 = std::max(ssize.width-2, 0);
#endif
D
指向目的图上的当前行。
XY
和FXY
指向整数和小数部分的值。
curInlier
判断是否超出边界。
如果dx
在区域内,根据XY
中的值是否超出边界来判断。
X0
是上一次处理完后的行位置,也是本次起点,X1
是本次处理的终点。
dy = 0
时,由于X0=0
,所以跳过不处理。如果prevInlier
和curInlier
相等则跳过不处理。
dx = dsize.width
本是一个不存在的位置,这使得在行尾prevInlier
和curInlier
不相等,马上进行处理。
for(int dy = 0; dy < dsize.height; dy++ )
{
T* D = _dst.ptr<T>(dy);
const short* XY = _xy.ptr<short>(dy);
const ushort* FXY = _fxy.ptr<ushort>(dy);
int X0 = 0;
bool prevInlier = false;
for(int dx = 0; dx <= dsize.width; dx++ )
{
bool curInlier = dx < dsize.width ?
(unsigned)XY[dx*2] < width1 &&
(unsigned)XY[dx*2+1] < height1 : !prevInlier;
if( curInlier == prevInlier )
continue;
int X1 = dx;
dx = X0;
X0 = X1;
prevInlier = curInlier;
f ( x , y ) = a 0 α 0 β 0 + a 1 α 1 β 0 + b 0 α 0 β 1 + b 1 α 1 β 1 \begin{aligned} f(x, y) &= a_0\alpha_0\beta_0+ a_1\alpha_1\beta_0 + b_0\alpha_0\beta_1+ b_1\alpha_1\beta_1 \end{aligned} f(x,y)=a0α0β0+a1α1β0+b0α0β1+b1α1β1
如果不是内露层,调用vecOp
处理可向量化的数据;剩余部分循环处理。总共处理X1-X0
个像素。
S
为对应到源图的左上点地址,w
为融合后的4个权重参数。以FXY
数组的值作为索引,可以获得4个中心系数乘积。
if( !curInlier )
{
int len = vecOp( _src, D, XY + dx*2, FXY + dx, wtab, X1 - dx );
D += len*cn;
dx += len;
if( cn == 1 )
{
for( ; dx < X1; dx++, D++ )
{
int sx = XY[dx*2], sy = XY[dx*2+1];
const AT* w = wtab + FXY[dx]*4;
const T* S = S0 + sy*sstep + sx;
*D = castOp(WT(S[0]*w[0] + S[1]*w[1] + S[sstep]*w[2] + S[sstep+1]*w[3]));
}
}
else if( cn == 2 )
for( ; dx < X1; dx++, D += 2 )
{
int sx = XY[dx*2], sy = XY[dx*2+1];
const AT* w = wtab + FXY[dx]*4;
const T* S = S0 + sy*sstep + sx*2;
WT t0 = S[0]*w[0] + S[2]*w[1] + S[sstep]*w[2] + S[sstep+2]*w[3];
WT t1 = S[1]*w[0] + S[3]*w[1] + S[sstep+1]*w[2] + S[sstep+3]*w[3];
D[0] = castOp(t0); D[1] = castOp(t1);
}
else if( cn == 3 )
for( ; dx < X1; dx++, D += 3 )
{
int sx = XY[dx*2], sy = XY[dx*2+1];
const AT* w = wtab + FXY[dx]*4;
const T* S = S0 + sy*sstep + sx*3;
WT t0 = S[0]*w[0] + S[3]*w[1] + S[sstep]*w[2] + S[sstep+3]*w[3];
WT t1 = S[1]*w[0] + S[4]*w[1] + S[sstep+1]*w[2] + S[sstep+4]*w[3];
WT t2 = S[2]*w[0] + S[5]*w[1] + S[sstep+2]*w[2] + S[sstep+5]*w[3];
D[0] = castOp(t0); D[1] = castOp(t1); D[2] = castOp(t2);
}
else if( cn == 4 )
for( ; dx < X1; dx++, D += 4 )
{
int sx = XY[dx*2], sy = XY[dx*2+1];
const AT* w = wtab + FXY[dx]*4;
const T* S = S0 + sy*sstep + sx*4;
WT t0 = S[0]*w[0] + S[4]*w[1] + S[sstep]*w[2] + S[sstep+4]*w[3];
WT t1 = S[1]*w[0] + S[5]*w[1] + S[sstep+1]*w[2] + S[sstep+5]*w[3];
D[0] = castOp(t0); D[1] = castOp(t1);
t0 = S[2]*w[0] + S[6]*w[1] + S[sstep+2]*w[2] + S[sstep+6]*w[3];
t1 = S[3]*w[0] + S[7]*w[1] + S[sstep+3]*w[2] + S[sstep+7]*w[3];
D[2] = castOp(t0); D[3] = castOp(t1);
}
else
for( ; dx < X1; dx++, D += cn )
{
int sx = XY[dx*2], sy = XY[dx*2+1];
const AT* w = wtab + FXY[dx]*4;
const T* S = S0 + sy*sstep + sx*cn;
for(int k = 0; k < cn; k++ )
{
WT t0 = S[k]*w[0] + S[k+cn]*w[1] + S[sstep+k]*w[2] + S[sstep+k+cn]*w[3];
D[k] = castOp(t0);
}
}
}
否则处理边界的情况。
BORDER_TRANSPARENT
直接跳过。
else
{
if( borderType == BORDER_TRANSPARENT && cn != 3 )
{
D += (X1 - dx)*cn;
dx = X1;
continue;
}
单通道的常量边界、重复边界或者其他类型。
(sx
,sy
)为原图上的坐标。sx0
、sy0
、sx1
和sy1
是周围修剪后的坐标,用于插值。
cv::borderInterpolate 计算外推像素的源位置。
if( cn == 1 )
for( ; dx < X1; dx++, D++ )
{
int sx = XY[dx*2], sy = XY[dx*2+1];
if( borderType == BORDER_CONSTANT &&
(sx >= ssize.width || sx+1 < 0 ||
sy >= ssize.height || sy+1 < 0) )
{
D[0] = cval[0];
}
else
{
int sx0, sx1, sy0, sy1;
T v0, v1, v2, v3;
const AT* w = wtab + FXY[dx]*4;
if( borderType == BORDER_REPLICATE )
{
sx0 = clip(sx, 0, ssize.width);
sx1 = clip(sx+1, 0, ssize.width);
sy0 = clip(sy, 0, ssize.height);
sy1 = clip(sy+1, 0, ssize.height);
v0 = S0[sy0*sstep + sx0];
v1 = S0[sy0*sstep + sx1];
v2 = S0[sy1*sstep + sx0];
v3 = S0[sy1*sstep + sx1];
}
else
{
sx0 = borderInterpolate(sx, ssize.width, borderType);
sx1 = borderInterpolate(sx+1, ssize.width, borderType);
sy0 = borderInterpolate(sy, ssize.height, borderType);
sy1 = borderInterpolate(sy+1, ssize.height, borderType);
v0 = sx0 >= 0 && sy0 >= 0 ? S0[sy0*sstep + sx0] : cval[0];
v1 = sx1 >= 0 && sy0 >= 0 ? S0[sy0*sstep + sx1] : cval[0];
v2 = sx0 >= 0 && sy1 >= 0 ? S0[sy1*sstep + sx0] : cval[0];
v3 = sx1 >= 0 && sy1 >= 0 ? S0[sy1*sstep + sx1] : cval[0];
}
D[0] = castOp(WT(v0*w[0] + v1*w[1] + v2*w[2] + v3*w[3]));
}
}
多通道的边界处理。
else
for( ; dx < X1; dx++, D += cn )
{
int sx = XY[dx*2], sy = XY[dx*2+1];
if( borderType == BORDER_CONSTANT &&
(sx >= ssize.width || sx+1 < 0 ||
sy >= ssize.height || sy+1 < 0) )
{
for(int k = 0; k < cn; k++ )
D[k] = cval[k];
}
else
{
int sx0, sx1, sy0, sy1;
const T *v0, *v1, *v2, *v3;
const AT* w = wtab + FXY[dx]*4;
if( borderType == BORDER_REPLICATE )
{
sx0 = clip(sx, 0, ssize.width);
sx1 = clip(sx+1, 0, ssize.width);
sy0 = clip(sy, 0, ssize.height);
sy1 = clip(sy+1, 0, ssize.height);
v0 = S0 + sy0*sstep + sx0*cn;
v1 = S0 + sy0*sstep + sx1*cn;
v2 = S0 + sy1*sstep + sx0*cn;
v3 = S0 + sy1*sstep + sx1*cn;
}
else if( borderType == BORDER_TRANSPARENT &&
((unsigned)sx >= (unsigned)(ssize.width-1) ||
(unsigned)sy >= (unsigned)(ssize.height-1)))
continue;
else
{
sx0 = borderInterpolate(sx, ssize.width, borderType);
sx1 = borderInterpolate(sx+1, ssize.width, borderType);
sy0 = borderInterpolate(sy, ssize.height, borderType);
sy1 = borderInterpolate(sy+1, ssize.height, borderType);
v0 = sx0 >= 0 && sy0 >= 0 ? S0 + sy0*sstep + sx0*cn : &cval[0];
v1 = sx1 >= 0 && sy0 >= 0 ? S0 + sy0*sstep + sx1*cn : &cval[0];
v2 = sx0 >= 0 && sy1 >= 0 ? S0 + sy1*sstep + sx0*cn : &cval[0];
v3 = sx1 >= 0 && sy1 >= 0 ? S0 + sy1*sstep + sx1*cn : &cval[0];
}
for(int k = 0; k < cn; k++ )
D[k] = castOp(WT(v0[k]*w[0] + v1[k]*w[1] + v2[k]*w[2] + v3[k]*w[3]));
}
}
}
}
}
处理1、3或4通道的数据,并且跨度不能过大。
S0
为源图第0行,S1
为源图第1行。
对于单通道数据,wtab
指向输入的数组;否则指向 BilinearTab_iC4 数组。
D
为目的数据的指针。
INTER_REMAP_COEF_SCALE 为 2 15 2^{15} 215
delta
为 2 14 2^{14} 214
xy2ofs
的高低字节分别为源图上y
和x
元素的偏移。
int operator()( const Mat& _src, void* _dst, const short* XY,
const ushort* FXY, const void* _wtab, int width ) const
{
int cn = _src.channels(), x = 0, sstep = (int)_src.step;
if( (cn != 1 && cn != 3 && cn != 4) || sstep >= 0x8000 )
return 0;
const uchar *S0 = _src.ptr(), *S1 = _src.ptr(1);
const short* wtab = cn == 1 ? (const short*)_wtab : &BilinearTab_iC4[0][0][0];
uchar* D = (uchar*)_dst;
v_int32x4 delta = v_setall_s32(INTER_REMAP_COEF_SCALE / 2);
v_int16x8 xy2ofs = v_reinterpret_as_s16(v_setall_s32(cn + (sstep << 16)));
int CV_DECL_ALIGNED(16) iofs0[4], iofs1[4];
const uchar* src_limit_8bytes = _src.datalimit - v_int16x8::nlanes;
#define CV_PICK_AND_PACK_RGB(ptr, offset, result) \
{ \
const uchar* const p = ((const uchar*)ptr) + (offset); \
if (p <= src_limit_8bytes) \
{ \
v_uint8x16 rrggbb, dummy; \
v_uint16x8 rrggbb8, dummy8; \
v_uint8x16 rgb0 = v_reinterpret_as_u8(v_int32x4(*(unaligned_int*)(p), 0, 0, 0)); \
v_uint8x16 rgb1 = v_reinterpret_as_u8(v_int32x4(*(unaligned_int*)(p + 3), 0, 0, 0)); \
v_zip(rgb0, rgb1, rrggbb, dummy); \
v_expand(rrggbb, rrggbb8, dummy8); \
result = v_reinterpret_as_s16(rrggbb8); \
} \
else \
{ \
result = v_int16x8((short)p[0], (short)p[3], /* r0r1 */ \
(short)p[1], (short)p[4], /* g0g1 */ \
(short)p[2], (short)p[5], /* b0b1 */ 0, 0); \
} \
}
#define CV_PICK_AND_PACK_RGBA(ptr, offset, result) \
{ \
const uchar* const p = ((const uchar*)ptr) + (offset); \
CV_DbgAssert(p <= src_limit_8bytes); \
v_uint8x16 rrggbbaa, dummy; \
v_uint16x8 rrggbbaa8, dummy8; \
v_uint8x16 rgba0 = v_reinterpret_as_u8(v_int32x4(*(unaligned_int*)(p), 0, 0, 0)); \
v_uint8x16 rgba1 = v_reinterpret_as_u8(v_int32x4(*(unaligned_int*)(p + v_int32x4::nlanes), 0, 0, 0)); \
v_zip(rgba0, rgba1, rrggbbaa, dummy); \
v_expand(rrggbbaa, rrggbbaa8, dummy8); \
result = v_reinterpret_as_s16(rrggbbaa8); \
}
#define CV_PICK_AND_PACK4(base,offset) \
v_uint16x8(*(unaligned_ushort*)(base + offset[0]), *(unaligned_ushort*)(base + offset[1]), \
*(unaligned_ushort*)(base + offset[2]), *(unaligned_ushort*)(base + offset[3]), \
0, 0, 0, 0)
如果是单通道的数据。
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP 定义 v_load,从内存加载到寄存器(vld1q_s16)。
_xy0
和_xy1
是对应到源图上的坐标。
v0
、v1
、v2
和v3
是像素值, a0
、a1
、b0
和b1
是系数。
if( cn == 1 )
{
for( ; x <= width - 8; x += 8 )
{
v_int16x8 _xy0 = v_load(XY + x*2);
v_int16x8 _xy1 = v_load(XY + x*2 + 8);
v_int32x4 v0, v1, v2, v3, a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2;
v_dotprod 将两个寄存器中的值相乘,并对相邻的结果对求和。先调用 _v128_unzip,后者会使用 vuzp1q_s16 和 vuzp2q_s16 解压向量。
xy0
和xy1
为对应到源图上的一维地址。
v_store 由 OPENCV_HAL_IMPL_NEON_LOADSTORE_OP 定义,将数据存储到内存(vst1q_s32)。
保存到iofs0
和iofs1
这一步能否省掉?
v_int32x4 xy0 = v_dotprod( _xy0, xy2ofs );
v_int32x4 xy1 = v_dotprod( _xy1, xy2ofs );
v_store( iofs0, xy0 );
v_store( iofs1, xy1 );
CV_PICK_AND_PACK4 根据基地址和偏移地址获取内存中的4个值然后构造一个 v_uint16x8 对象。这里存在一半的浪费。
vec16
为源图上的对应像素,每个uint16
存储了相邻的两个像素。
v_reinterpret_as_u8 由 OPENCV_HAL_IMPL_NEON_INIT 定义,向量重新解释强制转换操作 (vreinterpretq_u8_u16)。
v_expand 将寄存器的内容复制到两个2倍宽包装类型的寄存器中(vget_low_u8+vmovl_u8+vget_high_u8)。
stub
包含真正的8个uint16
元素。
v0
再次将每对元素拼到一起。
加载下一行得到v1
。
v_uint16x8 stub, dummy;
v_uint16x8 vec16;
vec16 = CV_PICK_AND_PACK4(S0, iofs0);
v_expand(v_reinterpret_as_u8(vec16), stub, dummy);
v0 = v_reinterpret_as_s32(stub);
vec16 = CV_PICK_AND_PACK4(S1, iofs0);
v_expand(v_reinterpret_as_u8(vec16), stub, dummy);
v1 = v_reinterpret_as_s32(stub);
v_load_low 将数据加载到低位,高位未定义(vcombine_s32+vld1_s32+vdup_n_s32)。
OPENCV_HAL_IMPL_NEON_UNPACKS 定义 v_zip,从成对的两个源寄存器的下半部分读取相邻的向量元素,将这些对进行交织并写入目标寄存器(vzip1q_s16)。
v_zip 交织两个向量。
wtab
原本为int16
,按照int32
加载使得第一行的两个系数成对。
a0
和b0
各自存储了两个像素的4个参数。加载方式比较低效?
a1
和b1
中为垃圾值。
v_recombine 由 OPENCV_HAL_IMPL_NEON_UNPACKS 宏定义,分别合并来两个向量的较低部分和较高部分(vcombine_s32+vget_low_s32+vget_high_s32)。
a2
和b2
为交换后的值,即4个像素第一行和第二行的系数。
v_dotprod 点元素的乘积并将第三个元素添加到相邻对的总和中。
f ( x , y ) = ( a 0 α 0 + a 1 α 1 ) β 0 + ( b 0 α 0 + b 1 α 1 ) β 1 = a 0 α 0 β 0 + a 1 α 1 β 0 + b 0 α 0 β 1 + b 1 α 1 β 1 = f ( Q 11 ) ( x 2 − x ) ( y 2 − y ) + f ( Q 12 ) ( x − x 1 ) ( y 2 − y ) + f ( Q 21 ) ( x 2 − x ) ( y 2 − y ) + f ( Q 22 ) ( x − x 1 ) ( y − y 1 ) \begin{aligned} f(x, y) &= (a_0\alpha_0+ a_1\alpha_1)\beta_0 + (b_0\alpha_0+ b_1\alpha_1)\beta_1\\ &= a_0\alpha_0\beta_0+ a_1\alpha_1\beta_0 + b_0\alpha_0\beta_1+ b_1\alpha_1\beta_1\\ &= f(Q_{11})(x_2 -x)(y_2 -y) + f(Q_{12})(x-x_1)(y_2 -y) + f(Q_{21})(x_2 -x)(y_2 -y) + f(Q_{22})(x-x_1)(y-y_1) \end{aligned} f(x,y)=(a0α0+a1α1)β0+(b0α0+b1α1)β1=a0α0β0+a1α1β0+b0α0β1+b1α1β1=f(Q11)(x2−x)(y2−y)+f(Q12)(x−x1)(y2−y)+f(Q21)(x2−x)(y2−y)+f(Q22)(x−x1)(y−y1)
v_zip(v_load_low((int*)(wtab + FXY[x] * 4)), v_load_low((int*)(wtab + FXY[x + 1] * 4)), a0, a1);
v_zip(v_load_low((int*)(wtab + FXY[x + 2] * 4)), v_load_low((int*)(wtab + FXY[x + 3] * 4)), b0, b1);
v_recombine(a0, b0, a2, b2);
v1 = v_dotprod(v_reinterpret_as_s16(v1), v_reinterpret_as_s16(b2), delta);
v0 = v_dotprod(v_reinterpret_as_s16(v0), v_reinterpret_as_s16(a2), v1);
对于iofs1
重复以上操作得到v2
。
vec16 = CV_PICK_AND_PACK4(S0, iofs1);
v_expand(v_reinterpret_as_u8(vec16), stub, dummy);
v2 = v_reinterpret_as_s32(stub);
vec16 = CV_PICK_AND_PACK4(S1, iofs1);
v_expand(v_reinterpret_as_u8(vec16), stub, dummy);
v3 = v_reinterpret_as_s32(stub);
v_zip(v_load_low((int*)(wtab + FXY[x + 4] * 4)), v_load_low((int*)(wtab + FXY[x + 5] * 4)), c0, c1);
v_zip(v_load_low((int*)(wtab + FXY[x + 6] * 4)), v_load_low((int*)(wtab + FXY[x + 7] * 4)), d0, d1);
v_recombine(c0, d0, c2, d2);
v3 = v_dotprod(v_reinterpret_as_s16(v3), v_reinterpret_as_s16(d2), delta);
v2 = v_dotprod(v_reinterpret_as_s16(v2), v_reinterpret_as_s16(c2), v3);
v_pack_u_store 由 OPENCV_HAL_IMPL_NEON_PACK 宏定义。
v_pack 将两个值缩短(vqmovn_s32)后合并为一个较大的向量 (vcombine_s16)。
v_pack_u_store 将两个值缩短(vqmovun_s16)后存储到内存(vst1_u8)。
v0
和v2
是对应的。
v0 = v0 >> INTER_REMAP_COEF_BITS;
v2 = v2 >> INTER_REMAP_COEF_BITS;
v_pack_u_store(D + x, v_pack(v0, v2));
}
}
如果是三通道数据。
else if( cn == 3 )
{
for( ; x <= width - 5; x += 4, D += 12 )
{
v_int16x8 u0, v0, u1, v1;
v_int16x8 _xy0 = v_load(XY + x * 2);
v_int32x4 xy0 = v_dotprod(_xy0, xy2ofs);
v_store(iofs0, xy0);
int offset0 = FXY[x] * 16;
int offset1 = FXY[x + 1] * 16;
int offset2 = FXY[x + 2] * 16;
int offset3 = FXY[x + 3] * 16;
v_int16x8 w00 = v_load(wtab + offset0);
v_int16x8 w01 = v_load(wtab + offset0 + 8);
v_int16x8 w10 = v_load(wtab + offset1);
v_int16x8 w11 = v_load(wtab + offset1 + 8);
CV_PICK_AND_PACK_RGB(S0, iofs0[0], u0);
CV_PICK_AND_PACK_RGB(S1, iofs0[0], v0);
CV_PICK_AND_PACK_RGB(S0, iofs0[1], u1);
CV_PICK_AND_PACK_RGB(S1, iofs0[1], v1);
v_int32x4 result0 = v_dotprod(u0, w00, v_dotprod(v0, w01, delta)) >> INTER_REMAP_COEF_BITS;
v_int32x4 result1 = v_dotprod(u1, w10, v_dotprod(v1, w11, delta)) >> INTER_REMAP_COEF_BITS;
result0 = v_rotate_left<1>(result0);
v_int16x8 result8 = v_pack(result0, result1);
v_uint8x16 result16 = v_pack_u(result8, result8);
v_store_low(D, v_rotate_right<1>(result16));
w00 = v_load(wtab + offset2);
w01 = v_load(wtab + offset2 + 8);
w10 = v_load(wtab + offset3);
w11 = v_load(wtab + offset3 + 8);
CV_PICK_AND_PACK_RGB(S0, iofs0[2], u0);
CV_PICK_AND_PACK_RGB(S1, iofs0[2], v0);
CV_PICK_AND_PACK_RGB(S0, iofs0[3], u1);
CV_PICK_AND_PACK_RGB(S1, iofs0[3], v1);
result0 = v_dotprod(u0, w00, v_dotprod(v0, w01, delta)) >> INTER_REMAP_COEF_BITS;
result1 = v_dotprod(u1, w10, v_dotprod(v1, w11, delta)) >> INTER_REMAP_COEF_BITS;
result0 = v_rotate_left<1>(result0);
result8 = v_pack(result0, result1);
result16 = v_pack_u(result8, result8);
v_store_low(D + 6, v_rotate_right<1>(result16));
}
}
else if( cn == 4 )
{
for( ; x <= width - 4; x += 4, D += 16 )
{
v_int16x8 _xy0 = v_load(XY + x * 2);
v_int16x8 u0, v0, u1, v1;
v_int32x4 xy0 = v_dotprod( _xy0, xy2ofs );
v_store(iofs0, xy0);
int offset0 = FXY[x] * 16;
int offset1 = FXY[x + 1] * 16;
int offset2 = FXY[x + 2] * 16;
int offset3 = FXY[x + 3] * 16;
v_int16x8 w00 = v_load(wtab + offset0);
v_int16x8 w01 = v_load(wtab + offset0 + 8);
v_int16x8 w10 = v_load(wtab + offset1);
v_int16x8 w11 = v_load(wtab + offset1 + 8);
CV_PICK_AND_PACK_RGBA(S0, iofs0[0], u0);
CV_PICK_AND_PACK_RGBA(S1, iofs0[0], v0);
CV_PICK_AND_PACK_RGBA(S0, iofs0[1], u1);
CV_PICK_AND_PACK_RGBA(S1, iofs0[1], v1);
v_int32x4 result0 = v_dotprod(u0, w00, v_dotprod(v0, w01, delta)) >> INTER_REMAP_COEF_BITS;
v_int32x4 result1 = v_dotprod(u1, w10, v_dotprod(v1, w11, delta)) >> INTER_REMAP_COEF_BITS;
v_int16x8 result8 = v_pack(result0, result1);
v_pack_u_store(D, result8);
w00 = v_load(wtab + offset2);
w01 = v_load(wtab + offset2 + 8);
w10 = v_load(wtab + offset3);
w11 = v_load(wtab + offset3 + 8);
CV_PICK_AND_PACK_RGBA(S0, iofs0[2], u0);
CV_PICK_AND_PACK_RGBA(S1, iofs0[2], v0);
CV_PICK_AND_PACK_RGBA(S0, iofs0[3], u1);
CV_PICK_AND_PACK_RGBA(S1, iofs0[3], v1);
result0 = v_dotprod(u0, w00, v_dotprod(v0, w01, delta)) >> INTER_REMAP_COEF_BITS;
result1 = v_dotprod(u1, w10, v_dotprod(v1, w11, delta)) >> INTER_REMAP_COEF_BITS;
result8 = v_pack(result0, result1);
v_pack_u_store(D + 8, result8);
}
}
return x;
}