// Note: items marked with question marks and the parts not yet annotated will be updated in the x265-1.8 version.
/*****************************************************************************
* Copyright (C) 2013 x265 project
*
* Authors: Steve Borho
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at license @ x265.com.
*****************************************************************************/
#include "common.h"
#include "primitives.h"
#include "lowres.h"
#include "motion.h"
#include "x265.h"
#if _MSC_VER
#pragma warning(disable: 4127) // conditional expression is constant (macros use this construct)
#endif
using namespace x265;
namespace {
// Parameters that configure the sub-pixel (fractional-pel) search.
// One instance describes the amount of work done at a given refinement level.
struct SubpelWorkload
{
int hpel_iters; // number of half-pel search iterations
int hpel_dirs; // number of half-pel search directions (points)
int qpel_iters; // number of quarter-pel search iterations
int qpel_dirs; // number of quarter-pel search directions (points)
bool hpel_satd; // whether SATD is used in the sub-pel search (half-pel stage, judging by the field name)
};
// Sub-pel search settings, one row per refinement level; the table is indexed by the
// subpelRefine input parameter.
// Search quality increases from level 0 to 7, and so does the search complexity.
// Column order: hpel_iters, hpel_dirs, qpel_iters, qpel_dirs, hpel_satd.
const SubpelWorkload workload[X265_MAX_SUBPEL_LEVEL + 1] =
{
{ 1, 4, 0, 4, false }, // 4 SAD HPEL only
{ 1, 4, 1, 4, false }, // 4 SAD HPEL + 4 SATD QPEL
{ 1, 4, 1, 4, true }, // 4 SATD HPEL + 4 SATD QPEL
{ 2, 4, 1, 4, true }, // 2x4 SATD HPEL + 4 SATD QPEL
{ 2, 4, 2, 4, true }, // 2x4 SATD HPEL + 2x4 SATD QPEL
{ 1, 8, 1, 8, true }, // 8 SATD HPEL + 8 SATD QPEL (default)
{ 2, 8, 1, 8, true }, // 2x8 SATD HPEL + 8 SATD QPEL
{ 2, 8, 2, 8, true }, // 2x8 SATD HPEL + 2x8 SATD QPEL
};
// Per-partition scale factors, indexed by the luma partition enum. Filled in by
// MotionEstimate::initScales() and read by the SAD_THRESH macro.
int sizeScale[NUM_PU_SIZES];
/* True when the current best cost (bcost) is below the threshold v after it has been
 * scaled for the current partition size: ((v) >> 4) * sizeScale[partEnum].
 * Expects `bcost` and `partEnum` to be visible at the expansion site.
 * The argument is parenthesized so that an expression argument (e.g. `a | b`) is
 * shifted as a whole instead of having only its last operand shifted. */
#define SAD_THRESH(v) (bcost < ((((v) >> 4) * sizeScale[partEnum])))
/* radius 2 hexagon. repeated entries are to avoid having to compute mod6 every time. */
const MV hex2[8] = { MV(-1, -2), MV(-2, 0), MV(-1, 2), MV(1, 2), MV(2, 0), MV(1, -2), MV(-1, -2), MV(-2, 0) }; // search points of the hexagon pattern
/* Lookup table: entry k holds (k - 1) mod 6 for k = 0..7. */
const uint8_t mod6m1[8] = { 5, 0, 1, 2, 3, 4, 5, 0 }; /* (x-1)%6 */
/* Entry 0 is the centre (0, 0); entries 1..8 are the eight surrounding points
 * (up, down, left, right, then the four diagonals). */
const MV square1[9] = { MV(0, 0), MV(0, -1), MV(0, 1), MV(-1, 0), MV(1, 0), MV(-1, -1), MV(-1, 1), MV(1, -1), MV(1, 1) }; // coordinates of the eight surrounding points
/* 16-point pattern with a horizontal/vertical extent of 4. In the visible code it is
 * multiplied by a scale factor (hex4[j] * i) to form the candidates of the UMH
 * "hexagon grid" stage. */
const MV hex4[16] =
{
MV(0, -4), MV(0, 4), MV(-2, -3), MV(2, -3),
MV(-4, -2), MV(4, -2), MV(-4, -1), MV(4, -1),
MV(-4, 0), MV(4, 0), MV(-4, 1), MV(4, 1),
MV(-4, 2), MV(4, 2), MV(-2, 3), MV(2, 3),
};
/* Sixteen unit offsets laid out as eight rows of two MVs each.
 * NOTE(review): the consumer of this table is not in the visible part of the file;
 * presumably each row is the pair of points tested for one direction - confirm
 * against the Two Point Search code. */
const MV offsets[] =
{
MV(-1, 0), MV(0, -1),
MV(-1, -1), MV(1, -1),
MV(-1, 0), MV(1, 0),
MV(-1, 1), MV(-1, -1),
MV(1, -1), MV(1, 1),
MV(-1, 0), MV(0, 1),
MV(-1, 1), MV(1, 1),
MV(1, 0), MV(0, 1),
}; // offsets for Two Point Search
/* sum of absolute differences between MV candidates, used for adaptive ME range */
inline int predictorDifference(const MV *mvc, intptr_t numCandidates)
{
int sum = 0;
for (int i = 0; i < numCandidates - 1; i++)
{
sum += abs(mvc[i].x - mvc[i + 1].x)
+ abs(mvc[i].y - mvc[i + 1].y);
}
return sum;
}
}
/** Default constructor: puts the estimator into its "no source PU selected" state.
 * The search method defaults to hexagon search and the sub-pel refinement level to 2. */
MotionEstimate::MotionEstimate()
{
    /* default search configuration */
    searchMethod = X265_HEX_SEARCH;
    subpelRefine = 2;

    /* no CTU / partition selected yet */
    ctuAddr = -1;
    absPartIdx = -1;

    /* no source block loaded yet */
    blockwidth = 0;
    blockheight = 0;
    blockOffset = 0;

    /* chroma SATD stays off until setSourcePU() decides otherwise */
    chromaSatd = NULL;
    bChromaSATD = false;
}
/** Stores the search configuration and creates the buffer that caches the block to be searched.
 * \param method  search method (e.g. dia, hex, umh, star); stored in searchMethod
 * \param refine  sub-pel refinement (subme) level; stored in subpelRefine
 * \param csp     picture colour format, forwarded to fencPUYuv.create()
 * \return        nothing
 *
 * NOTE(review): the result of fencPUYuv.create() is not checked here - confirm
 * whether it can fail and how callers would notice. */
void MotionEstimate::init(int method, int refine, int csp)
{
    /* sub-pel refinement strength and integer search algorithm */
    subpelRefine = refine;
    searchMethod = method;

    /* cache for the block being searched; the block is copied into it by setSourcePU() */
    fencPUYuv.create(FENC_STRIDE, csp);
}
/** Fills the file-local sizeScale[] table. For a luma partition WxH the entry is
 * (H * H) >> 4, i.e. only the partition height enters the formula. The entries
 * below are grouped by height; each group comment gives the resulting value. */
void MotionEstimate::initScales(void)
{
#define SET_SIZE_SCALE(W, H) sizeScale[LUMA_ ## W ## x ## H] = ((H) * (H)) >> 4

    /* height 4 -> 1 */
    SET_SIZE_SCALE(4, 4);
    SET_SIZE_SCALE(8, 4);
    SET_SIZE_SCALE(16, 4);

    /* height 8 -> 4 */
    SET_SIZE_SCALE(4, 8);
    SET_SIZE_SCALE(8, 8);
    SET_SIZE_SCALE(16, 8);
    SET_SIZE_SCALE(32, 8);

    /* height 12 -> 9 */
    SET_SIZE_SCALE(16, 12);

    /* height 16 -> 16 */
    SET_SIZE_SCALE(4, 16);
    SET_SIZE_SCALE(8, 16);
    SET_SIZE_SCALE(12, 16);
    SET_SIZE_SCALE(16, 16);
    SET_SIZE_SCALE(32, 16);
    SET_SIZE_SCALE(64, 16);

    /* height 24 -> 36 */
    SET_SIZE_SCALE(32, 24);

    /* height 32 -> 64 */
    SET_SIZE_SCALE(8, 32);
    SET_SIZE_SCALE(16, 32);
    SET_SIZE_SCALE(24, 32);
    SET_SIZE_SCALE(32, 32);
    SET_SIZE_SCALE(64, 32);

    /* height 48 -> 144 */
    SET_SIZE_SCALE(64, 48);

    /* height 64 -> 256 */
    SET_SIZE_SCALE(16, 64);
    SET_SIZE_SCALE(32, 64);
    SET_SIZE_SCALE(48, 64);
    SET_SIZE_SCALE(64, 64);

#undef SET_SIZE_SCALE
}
int MotionEstimate::hpelIterationCount(int subme)
{
return workload[subme].hpel_iters +
workload[subme].qpel_iters / 2;
}
/** Destructor: releases memory by destroying fencPUYuv, the source-block cache
 * that init() creates.
 * \return nothing */
MotionEstimate::~MotionEstimate()
{
fencPUYuv.destroy();
}
/** Selects the distortion primitives for the given block size and copies the block
 * to be searched into the source-block cache.
 * Called by lookahead, luma only, no use of PicYuv.
 * \param fencY    start address of the luma plane of the frame being encoded
 * \param stride   row stride of the source frame
 * \param offset   offset of the block's first pixel relative to fencY
 * \param pwidth   width of the block to be searched
 * \param pheight  height of the block to be searched
 * \return         nothing
 *
 * NOTE(review): pheight is only used to select the partition; blockheight is not
 * updated here. */
void MotionEstimate::setSourcePU(pixel *fencY, intptr_t stride, intptr_t offset, int pwidth, int pheight)
{
    /* partition label for this block size, e.g. LUMA_8x8, LUMA_16x16, ... */
    partEnum = partitionFromSizes(pwidth, pheight);
    X265_CHECK(LUMA_4x4 != partEnum, "4x4 inter partition detected!\n");

    /* this code path is not tied to a CTU / partition index */
    ctuAddr = -1;
    absPartIdx = -1;

    /* block geometry */
    blockOffset = offset;
    blockwidth = pwidth;

    /* distortion function pointers for this partition size:
     * sad_x3 / sad_x4 take three / four reference pointers in one call */
    sad_x4 = primitives.pu[partEnum].sad_x4;
    sad_x3 = primitives.pu[partEnum].sad_x3;
    satd = primitives.pu[partEnum].satd;
    sad = primitives.pu[partEnum].sad;

    /* copy PU block into cache */
    pixel *src = fencY + offset;
    primitives.pu[partEnum].copy_pp(fencPUYuv.m_buf[0], FENC_STRIDE, src, stride);

    X265_CHECK(!bChromaSATD, "chroma distortion measurements impossible in this code path\n");
}
/** Prepares the estimator for a PU taken from a CU's source Yuv.
 * Called by Search::predInterSearch() or --pme equivalent, chroma residual might be considered.
 * \param srcFencYuv  source pixels the PU is copied from
 * \param _ctuAddr    stored in ctuAddr
 * \param cuPartIdx   added to puPartIdx to form absPartIdx
 * \param puPartIdx   PU index, also forwarded to copyPUFromYuv()
 * \param pwidth      PU width
 * \param pheight     PU height (only used to select the partition) */
void MotionEstimate::setSourcePU(const Yuv& srcFencYuv, int _ctuAddr, int cuPartIdx, int puPartIdx, int pwidth, int pheight)
{
    partEnum = partitionFromSizes(pwidth, pheight);
    X265_CHECK(LUMA_4x4 != partEnum, "4x4 inter partition detected!\n");

    /* block position bookkeeping; the offset is reset to 0 in this code path */
    blockOffset = 0;
    blockwidth = pwidth;
    absPartIdx = cuPartIdx + puPartIdx;
    ctuAddr = _ctuAddr;

    /* luma distortion primitives for this partition size */
    sad_x4 = primitives.pu[partEnum].sad_x4;
    sad_x3 = primitives.pu[partEnum].sad_x3;
    satd = primitives.pu[partEnum].satd;
    sad = primitives.pu[partEnum].sad;

    /* Enable chroma residual cost if subpelRefine level is greater than 2 and chroma block size
     * is an even multiple of 4x4 pixels (indicated by non-null chromaSatd pointer).
     * In other words, chroma SATD is only measured during sub-pel ME when subpelRefine > 2. */
    chromaSatd = primitives.chroma[fencPUYuv.m_csp].pu[partEnum].satd;
    bChromaSATD = chromaSatd && subpelRefine > 2;
    X265_CHECK(!(bChromaSATD && !workload[subpelRefine].hpel_satd), "Chroma SATD cannot be used with SAD hpel\n");

    /* copy PU from CU Yuv */
    fencPUYuv.copyPUFromYuv(srcFencYuv, puPartIdx, partEnum, bChromaSATD);
}
/* Evaluates the full-pel candidate (mx, my): cost = sad() of the cached source block
 * against fref at that offset, plus mvcost() of the candidate shifted left by 2
 * (quarter-pel units, per the other notes in this file). When the cost is strictly
 * lower than bcost, the candidate becomes the new best: bcost, bmv, bPointNr and
 * bDistance are all updated.
 * Expects fenc, fref, stride, sad, mvcost, bcost, bmv, bPointNr and bDistance to be
 * visible at the expansion site.
 * The arguments are parenthesized in the address computation (as COST_MV already
 * does) so that expression arguments such as `y + 1` address the intended pixel. */
#define COST_MV_PT_DIST(mx, my, point, dist) \
    do \
    { \
        MV tmv(mx, my); \
        int cost = sad(fenc, FENC_STRIDE, fref + (mx) + (my) * stride, stride); \
        cost += mvcost(tmv << 2); \
        if (cost < bcost) { \
            bcost = cost; \
            bmv = tmv; \
            bPointNr = (point); \
            bDistance = (dist); \
        } \
    } while (0)
/* Evaluates the full-pel candidate (mx, my): cost = sad() of the cached source block
 * against fref at that offset, plus mvcost() of the candidate shifted left by 2.
 * The result is handed to COPY2_IF_LT together with bcost/bmv (presumably: if the
 * cost is lower, bcost and bmv are replaced - the helper is defined elsewhere).
 * Expects fenc, fref, stride, sad, mvcost, bcost and bmv at the expansion site. */
#define COST_MV(mx, my) \
do \
{ \
int cost = sad(fenc, FENC_STRIDE, fref + (mx) + (my) * stride, stride); \
cost += mvcost(MV(mx, my) << 2); \
COPY2_IF_LT(bcost, cost, bmv, MV(mx, my)); \
} while (0)
/* Searches three points of the hexagon and stores their costs in `costs`.
 * The three candidates are given as offsets (m0x,m0y), (m1x,m1y), (m2x,m2y) relative
 * to the current best MV (bmv). One sad_x3 call produces the three SADs; the MV cost
 * of each candidate is then added, so costs[0..2] hold SAD + MV cost. bcost and bmv
 * are not modified here - the caller compares the costs.
 * base_mvx / base_mvy point at the m_cost_mvx / m_cost_mvy entries for (bmv + m0) << 2,
 * so the other two candidates are looked up relative to the first one. The X265_CHECK
 * lines assert that each table lookup equals mvcost() of the same MV.
 * Expects fenc, fref, stride, sad_x3, bmv, m_cost_mvx and m_cost_mvy at the expansion site. */
#define COST_MV_X3_DIR(m0x, m0y, m1x, m1y, m2x, m2y, costs) \
{ \
pixel *pix_base = fref + bmv.x + bmv.y * stride; \
sad_x3(fenc, \
pix_base + (m0x) + (m0y) * stride, \
pix_base + (m1x) + (m1y) * stride, \
pix_base + (m2x) + (m2y) * stride, \
stride, costs); \
const uint16_t *base_mvx = &m_cost_mvx[(bmv.x + (m0x)) << 2]; \
const uint16_t *base_mvy = &m_cost_mvy[(bmv.y + (m0y)) << 2]; \
X265_CHECK(mvcost((bmv + MV(m0x, m0y)) << 2) == (base_mvx[((m0x) - (m0x)) << 2] + base_mvy[((m0y) - (m0y)) << 2]), "mvcost() check failure\n"); \
X265_CHECK(mvcost((bmv + MV(m1x, m1y)) << 2) == (base_mvx[((m1x) - (m0x)) << 2] + base_mvy[((m1y) - (m0y)) << 2]), "mvcost() check failure\n"); \
X265_CHECK(mvcost((bmv + MV(m2x, m2y)) << 2) == (base_mvx[((m2x) - (m0x)) << 2] + base_mvy[((m2y) - (m0y)) << 2]), "mvcost() check failure\n"); \
(costs)[0] += (base_mvx[((m0x) - (m0x)) << 2] + base_mvy[((m0y) - (m0y)) << 2]); \
(costs)[1] += (base_mvx[((m1x) - (m0x)) << 2] + base_mvy[((m1y) - (m0y)) << 2]); \
(costs)[2] += (base_mvx[((m2x) - (m0x)) << 2] + base_mvy[((m2y) - (m0y)) << 2]); \
}
/* Evaluates four full-pel candidates (m0..m3, absolute positions relative to fref)
 * with one sad_x4 call, adds mvcost() of each candidate (shifted left by 2) and then
 * passes each result to COPY4_IF_LT together with the candidate's MV, point number
 * (p0..p3) and distance (d0..d3) (presumably: if the cost is lower than bcost, then
 * bcost, bmv, bPointNr and bDistance are replaced - the helper is defined elsewhere).
 * Uses the ambient array `costs` (at least four ints) as scratch space.
 * Expects fenc, fref, stride, sad_x4, mvcost, costs, bcost, bmv, bPointNr and
 * bDistance at the expansion site. */
#define COST_MV_PT_DIST_X4(m0x, m0y, p0, d0, m1x, m1y, p1, d1, m2x, m2y, p2, d2, m3x, m3y, p3, d3) \
{ \
sad_x4(fenc, \
fref + (m0x) + (m0y) * stride, \
fref + (m1x) + (m1y) * stride, \
fref + (m2x) + (m2y) * stride, \
fref + (m3x) + (m3y) * stride, \
stride, costs); \
(costs)[0] += mvcost(MV(m0x, m0y) << 2); \
(costs)[1] += mvcost(MV(m1x, m1y) << 2); \
(costs)[2] += mvcost(MV(m2x, m2y) << 2); \
(costs)[3] += mvcost(MV(m3x, m3y) << 2); \
COPY4_IF_LT(bcost, costs[0], bmv, MV(m0x, m0y), bPointNr, p0, bDistance, d0); \
COPY4_IF_LT(bcost, costs[1], bmv, MV(m1x, m1y), bPointNr, p1, bDistance, d1); \
COPY4_IF_LT(bcost, costs[2], bmv, MV(m2x, m2y), bPointNr, p2, bDistance, d2); \
COPY4_IF_LT(bcost, costs[3], bmv, MV(m3x, m3y), bPointNr, p3, bDistance, d3); \
}
/* Evaluates four candidates given as offsets (m0..m3) relative to the search origin
 * omv. One sad_x4 call produces the SADs; the MV cost of each candidate is read from
 * m_cost_mvx / m_cost_mvy (base pointers positioned at omv << 2, indexed by
 * offset << 2) and added. Each result is then passed to COPY2_IF_LT with bcost/bmv and
 * the absolute MV omv + offset (presumably: a lower cost replaces bcost and bmv).
 * The X265_CHECK lines assert that each table lookup equals mvcost() of the same MV.
 * Uses the ambient array `costs` as scratch space.
 * Expects fenc, fref, stride, sad_x4, omv, costs, bcost, bmv, m_cost_mvx and
 * m_cost_mvy at the expansion site. */
#define COST_MV_X4(m0x, m0y, m1x, m1y, m2x, m2y, m3x, m3y) \
{ \
pixel *pix_base = fref + omv.x + omv.y * stride; \
sad_x4(fenc, \
pix_base + (m0x) + (m0y) * stride, \
pix_base + (m1x) + (m1y) * stride, \
pix_base + (m2x) + (m2y) * stride, \
pix_base + (m3x) + (m3y) * stride, \
stride, costs); \
const uint16_t *base_mvx = &m_cost_mvx[(omv.x << 2)]; \
const uint16_t *base_mvy = &m_cost_mvy[(omv.y << 2)]; \
X265_CHECK(mvcost((omv + MV(m0x, m0y)) << 2) == (base_mvx[(m0x) << 2] + base_mvy[(m0y) << 2]), "mvcost() check failure\n"); \
X265_CHECK(mvcost((omv + MV(m1x, m1y)) << 2) == (base_mvx[(m1x) << 2] + base_mvy[(m1y) << 2]), "mvcost() check failure\n"); \
X265_CHECK(mvcost((omv + MV(m2x, m2y)) << 2) == (base_mvx[(m2x) << 2] + base_mvy[(m2y) << 2]), "mvcost() check failure\n"); \
X265_CHECK(mvcost((omv + MV(m3x, m3y)) << 2) == (base_mvx[(m3x) << 2] + base_mvy[(m3y) << 2]), "mvcost() check failure\n"); \
costs[0] += (base_mvx[(m0x) << 2] + base_mvy[(m0y) << 2]); \
costs[1] += (base_mvx[(m1x) << 2] + base_mvy[(m1y) << 2]); \
costs[2] += (base_mvx[(m2x) << 2] + base_mvy[(m2y) << 2]); \
costs[3] += (base_mvx[(m3x) << 2] + base_mvy[(m3y) << 2]); \
COPY2_IF_LT(bcost, costs[0], bmv, omv + MV(m0x, m0y)); \
COPY2_IF_LT(bcost, costs[1], bmv, omv + MV(m1x, m1y)); \
COPY2_IF_LT(bcost, costs[2], bmv, omv + MV(m2x, m2y)); \
COPY2_IF_LT(bcost, costs[3], bmv, omv + MV(m3x, m3y)); \
}
/* Searches the four vertices of a diamond or square and stores their costs in `costs`.
 * The candidates are given as offsets (m0..m3) relative to the current best MV (bmv).
 * One sad_x4 call produces the four SADs; the MV cost of each candidate is then added,
 * so costs[0..3] hold SAD + MV cost. bcost and bmv are not modified here.
 * How the MV cost is obtained: base_mvx / base_mvy are positioned at the table entries
 * of the previous best MV (bmv << 2); indexing them with offset << 2 reaches the entry
 * of bmv + offset, i.e. the cost of the difference between the candidate MV and the
 * MVP. The X265_CHECK lines assert that each lookup equals mvcost() of the same MV.
 * Expects fenc, fref, stride, sad_x4, bmv, m_cost_mvx and m_cost_mvy at the expansion site. */
#define COST_MV_X4_DIR(m0x, m0y, m1x, m1y, m2x, m2y, m3x, m3y, costs) \
{ \
pixel *pix_base = fref + bmv.x + bmv.y * stride; \
sad_x4(fenc, \
pix_base + (m0x) + (m0y) * stride, \
pix_base + (m1x) + (m1y) * stride, \
pix_base + (m2x) + (m2y) * stride, \
pix_base + (m3x) + (m3y) * stride, \
stride, costs); \
/* TODO: use restrict keyword in ICL */ \
const uint16_t *base_mvx = &m_cost_mvx[(bmv.x << 2)]; \
const uint16_t *base_mvy = &m_cost_mvy[(bmv.y << 2)]; \
X265_CHECK(mvcost((bmv + MV(m0x, m0y)) << 2) == (base_mvx[(m0x) << 2] + base_mvy[(m0y) << 2]), "mvcost() check failure\n"); \
X265_CHECK(mvcost((bmv + MV(m1x, m1y)) << 2) == (base_mvx[(m1x) << 2] + base_mvy[(m1y) << 2]), "mvcost() check failure\n"); \
X265_CHECK(mvcost((bmv + MV(m2x, m2y)) << 2) == (base_mvx[(m2x) << 2] + base_mvy[(m2y) << 2]), "mvcost() check failure\n"); \
X265_CHECK(mvcost((bmv + MV(m3x, m3y)) << 2) == (base_mvx[(m3x) << 2] + base_mvy[(m3y) << 2]), "mvcost() check failure\n"); \
(costs)[0] += (base_mvx[(m0x) << 2] + base_mvy[(m0y) << 2]); \
(costs)[1] += (base_mvx[(m1x) << 2] + base_mvy[(m1y) << 2]); \
(costs)[2] += (base_mvx[(m2x) << 2] + base_mvy[(m2y) << 2]); \
(costs)[3] += (base_mvx[(m3x) << 2] + base_mvy[(m3y) << 2]); \
}
/* One radius-1 diamond step: moves the search origin omv to (mx, my) and evaluates its
 * four neighbours (up, down, left, right) with COST_MV_X4, which updates bcost/bmv.
 * Note that omv is overwritten as a side effect. */
#define DIA1_ITER(mx, my) \
{ \
omv.x = mx; omv.y = my; \
COST_MV_X4(0, -1, 0, 1, -1, 0, 1, 0); \
}
/* Cross search centred on omv.
 * Horizontal arm first: candidates at omv.x +/- i for i = start, start + 2, ... while
 * i < x_max. If x_max fits inside the MV bounds on both sides, a fast path evaluates
 * +/-i and +/-(i + 2) together with COST_MV_X4 (stepping i by 4); whatever remains is
 * handled by the second loop, which steps by 2 and bounds-checks each candidate
 * against mvmin/mvmax before calling COST_MV.
 * The vertical arm then repeats the same procedure with y_max, restarting from
 * `start`.
 * Expects omv, mvmin and mvmax plus everything COST_MV / COST_MV_X4 need at the
 * expansion site. */
#define CROSS(start, x_max, y_max) \
{ \
int16_t i = start; \
if ((x_max) <= X265_MIN(mvmax.x - omv.x, omv.x - mvmin.x)) \
for (; i < (x_max) - 2; i += 4) { \
COST_MV_X4(i, 0, -i, 0, i + 2, 0, -i - 2, 0); } \
for (; i < (x_max); i += 2) \
{ \
if (omv.x + i <= mvmax.x) \
COST_MV(omv.x + i, omv.y); \
if (omv.x - i >= mvmin.x) \
COST_MV(omv.x - i, omv.y); \
} \
i = start; \
if ((y_max) <= X265_MIN(mvmax.y - omv.y, omv.y - mvmin.y)) \
for (; i < (y_max) - 2; i += 4) { \
COST_MV_X4(0, i, 0, -i, 0, i + 2, 0, -i - 2); } \
for (; i < (y_max); i += 2) \
{ \
if (omv.y + i <= mvmax.y) \
COST_MV(omv.x, omv.y + i); \
if (omv.y - i >= mvmin.y) \
COST_MV(omv.x, omv.y - i); \
} \
}
/**
 * Star-pattern integer motion search.
 * Per the original note, this is only called from MotionEstimate::motionEstimate.
 *
 * The search runs in three stages with growing step size (dist = 1; dist = 2, 4, 8;
 * dist = 16, 32, ... up to merange). All candidates are placed around the MV that bmv
 * holds on entry (omv); the centre is not moved during the search.
 *
 * \param ref             reference frame; fpelPlane[0] and lumaStride are read
 * \param mvmin           lower MV bound (x: left limit, y: top limit) for candidates
 * \param mvmax           upper MV bound (x: right limit, y: bottom limit) for candidates
 * \param bmv             in: predicted MV used as the search centre; out: best MV found
 * \param bcost           in: cost of the predicted MV; out: best cost found
 * \param bPointNr        out: point number of the best candidate (numbering shown in the
 *                        pattern diagrams below); only written when a better candidate is found
 * \param bDistance       out: step size of the best candidate; only written when a better
 *                        candidate is found
 * \param earlyExitIters  number of consecutive rounds without improvement after which the
 *                        search returns early
 * \param merange         ME search range; upper limit for the step size of the last stage
 */
void MotionEstimate::StarPatternSearch(ReferencePlanes *ref,
const MV & mvmin,
const MV & mvmax,
MV & bmv,
int & bcost,
int & bPointNr,
int & bDistance,
int earlyExitIters,
int merange)
{
ALIGN_VAR_16(int, costs[16]);
pixel* fenc = fencPUYuv.m_buf[0]; // luma (Y) data of the block being searched
pixel* fref = ref->fpelPlane[0] + blockOffset; // luma (Y) data at the co-located position in the reference frame
intptr_t stride = ref->lumaStride; // luma stride of the reference frame
MV omv = bmv;
int saved = bcost;
int rounds = 0; // number of consecutive rounds that did not improve the best MV; once it reaches earlyExitIters the search stops early
// Stage 1: small diamond search with step size 1
{
int16_t dist = 1;
/* bPointNr
       2
     4 * 5
       7
*/
// compute the positions of the small cross and check them against the MV bounds
const int16_t top = omv.y - dist;
const int16_t bottom = omv.y + dist;
const int16_t left = omv.x - dist;
const int16_t right = omv.x + dist;
if (top >= mvmin.y && left >= mvmin.x && right <= mvmax.x && bottom <= mvmax.y) // all four points are within bounds: evaluate them together (sad + mvcost)
{
COST_MV_PT_DIST_X4(omv.x, top, 2, dist,
left, omv.y, 4, dist,
right, omv.y, 5, dist,
omv.x, bottom, 7, dist);
}
else // some points are out of bounds: evaluate only the in-bounds ones (sad + mvcost)
{
if (top >= mvmin.y) // check top
{
COST_MV_PT_DIST(omv.x, top, 2, dist);
}
if (left >= mvmin.x) // check middle left
{
COST_MV_PT_DIST(left, omv.y, 4, dist);
}
if (right <= mvmax.x) // check middle right
{
COST_MV_PT_DIST(right, omv.y, 5, dist);
}
if (bottom <= mvmax.y) // check bottom
{
COST_MV_PT_DIST(omv.x, bottom, 7, dist);
}
}
if (bcost < saved) // a better point was found: reset rounds
rounds = 0;
else if (++rounds >= earlyExitIters) // no better MV within the allowed number of rounds: stop the search early
return;
}
// Stage 2: diamond search for dist = 2, 4, 8 (the step size doubles every round)
for (int16_t dist = 2; dist <= 8; dist <<= 1)
{
/* bPointNr
         2
       1   3
     4   *   5
       6   8
         7
   Points 2, 4, 5, 7 are dist
   Points 1, 3, 6, 8 are dist>>1
*/
// compute the positions of the diamond and check them against the MV bounds
const int16_t top = omv.y - dist;
const int16_t bottom = omv.y + dist;
const int16_t left = omv.x - dist;
const int16_t right = omv.x + dist;
const int16_t top2 = omv.y - (dist >> 1);
const int16_t bottom2 = omv.y + (dist >> 1);
const int16_t left2 = omv.x - (dist >> 1);
const int16_t right2 = omv.x + (dist >> 1);
saved = bcost; // remember the best cost of the previous round
if (top >= mvmin.y && left >= mvmin.x &&
right <= mvmax.x && bottom <= mvmax.y) // all eight points are within bounds: evaluate them in two groups of four (sad + mvcost)
{
COST_MV_PT_DIST_X4(omv.x, top, 2, dist,
left2, top2, 1, dist >> 1,
right2, top2, 3, dist >> 1,
left, omv.y, 4, dist);
COST_MV_PT_DIST_X4(right, omv.y, 5, dist,
left2, bottom2, 6, dist >> 1,
right2, bottom2, 8, dist >> 1,
omv.x, bottom, 7, dist);
}
else // some points are out of bounds: evaluate only the in-bounds ones (sad + mvcost)
{
if (top >= mvmin.y) // check top
{
COST_MV_PT_DIST(omv.x, top, 2, dist);
}
if (top2 >= mvmin.y) // check half top
{
if (left2 >= mvmin.x) // check half left
{
COST_MV_PT_DIST(left2, top2, 1, (dist >> 1));
}
if (right2 <= mvmax.x) // check half right
{
COST_MV_PT_DIST(right2, top2, 3, (dist >> 1));
}
}
if (left >= mvmin.x) // check left
{
COST_MV_PT_DIST(left, omv.y, 4, dist);
}
if (right <= mvmax.x) // check right
{
COST_MV_PT_DIST(right, omv.y, 5, dist);
}
if (bottom2 <= mvmax.y) // check half bottom
{
if (left2 >= mvmin.x) // check half left
{
COST_MV_PT_DIST(left2, bottom2, 6, (dist >> 1));
}
if (right2 <= mvmax.x) // check half right
{
COST_MV_PT_DIST(right2, bottom2, 8, (dist >> 1));
}
}
if (bottom <= mvmax.y) // check bottom
{
COST_MV_PT_DIST(omv.x, bottom, 7, dist);
}
}
if (bcost < saved) // a better point was found: reset rounds
rounds = 0;
else if (++rounds >= earlyExitIters) // no better MV within the allowed number of rounds: stop the search early
return;
}
// Stage 3: for dist >= 16 (doubling up to merange), a large pattern of 16 points per round
for (int16_t dist = 16; dist <= (int16_t)merange; dist <<= 1)
{
// compute the outer positions of the pattern and check them against the MV bounds
const int16_t top = omv.y - dist;
const int16_t bottom = omv.y + dist;
const int16_t left = omv.x - dist;
const int16_t right = omv.x + dist;
saved = bcost; // remember the best cost of the previous round
if (top >= mvmin.y && left >= mvmin.x &&
right <= mvmax.x && bottom <= mvmax.y) // all points are within bounds: evaluate them in groups of four (sad + mvcost)
{
/* index
            0
            3
            2
            1
    0 3 2 1 * 1 2 3 0
            1
            2
            3
            0
   Index 0 marks the four outermost points at distance dist on the axes.
   For index = 1..3 the code below evaluates the four points
   (omv.x -/+ index * (dist >> 2), top + index * (dist >> 2)) and
   (omv.x -/+ index * (dist >> 2), bottom - index * (dist >> 2)).
*/
// first evaluate the outermost points (index 0)
COST_MV_PT_DIST_X4(omv.x, top, 0, dist,
left, omv.y, 0, dist,
right, omv.y, 0, dist,
omv.x, bottom, 0, dist);
// then move step by step towards the centre
for (int16_t index = 1; index < 4; index++)
{
int16_t posYT = top + ((dist >> 2) * index);
int16_t posYB = bottom - ((dist >> 2) * index);
int16_t posXL = omv.x - ((dist >> 2) * index);
int16_t posXR = omv.x + ((dist >> 2) * index);
COST_MV_PT_DIST_X4(posXL, posYT, 0, dist,
posXR, posYT, 0, dist,
posXL, posYB, 0, dist,
posXR, posYB, 0, dist);
}
}
else // some points are out of bounds: evaluate only the in-bounds ones (sad + mvcost)
{
if (top >= mvmin.y) // check top
{
COST_MV_PT_DIST(omv.x, top, 0, dist);
}
if (left >= mvmin.x) // check left
{
COST_MV_PT_DIST(left, omv.y, 0, dist);
}
if (right <= mvmax.x) // check right
{
COST_MV_PT_DIST(right, omv.y, 0, dist);
}
if (bottom <= mvmax.y) // check bottom
{
COST_MV_PT_DIST(omv.x, bottom, 0, dist);
}
for (int16_t index = 1; index < 4; index++)
{
int16_t posYT = top + ((dist >> 2) * index);
int16_t posYB = bottom - ((dist >> 2) * index);
int16_t posXL = omv.x - ((dist >> 2) * index);
int16_t posXR = omv.x + ((dist >> 2) * index);
if (posYT >= mvmin.y) // check top
{
if (posXL >= mvmin.x) // check left
{
COST_MV_PT_DIST(posXL, posYT, 0, dist);
}
if (posXR <= mvmax.x) // check right
{
COST_MV_PT_DIST(posXR, posYT, 0, dist);
}
}
if (posYB <= mvmax.y) // check bottom
{
if (posXL >= mvmin.x) // check left
{
COST_MV_PT_DIST(posXL, posYB, 0, dist);
}
if (posXR <= mvmax.x) // check right
{
COST_MV_PT_DIST(posXR, posYB, 0, dist);
}
}
}
}
if (bcost < saved) // a better point was found: reset rounds
rounds = 0;
else if (++rounds >= earlyExitIters) // no better MV within the allowed number of rounds: stop the search early
return;
}
}
/** 函数功能 : 运动估计,获取最优的MV
/* 调用范围 :只在Search::predInterSearch、singleMotionEstimation和CostEstimateGroup::estimateCUCost函数中被调用
* \参数 mvmin :最小MV(整像素精度)
* \参数 mvmax :最大MV(整像素精度)
* \参数 qmvp :MVP(分像素精度(1/4))
* \参数 numCandidates :当前的候选参考帧个数 ?????
* \参数 mvc :当前的MVC(MV candidates)列表
* \参数 merange :当前的搜索窗口
* \参数 outQMv :返回最优的MV
* \返回 :返回最优MV所花费的cost **/
int MotionEstimate::motionEstimate(ReferencePlanes *ref,
const MV & mvmin,
const MV & mvmax,
const MV & qmvp,
int numCandidates,
const MV * mvc,
int merange,
MV & outQMv)
{
ALIGN_VAR_16(int, costs[16]);
if (ctuAddr >= 0)
blockOffset = ref->reconPic->getLumaAddr(ctuAddr, absPartIdx) - ref->reconPic->getLumaAddr(0); //CostEstimateGroup::estimateCUCost不会进入 ?????
intptr_t stride = ref->lumaStride; //获取参考帧的步长
pixel* fenc = fencPUYuv.m_buf[0]; //获取当前搜索块原始像素值
pixel* fref = ref->fpelPlane[0] + blockOffset; //fref 当前搜索块在参考帧的对应位置,如 当前搜索块为(x,y) 则fref在参考帧的位置也为(x,y)
setMVP(qmvp); //设置当前的MVP
MV qmvmin = mvmin.toQPel(); //将最小mv扩大到分像素精度(1/4)
MV qmvmax = mvmax.toQPel(); //将最大mv扩大到分像素精度(1/4)
/* The term cost used here means satd/sad values for that particular search.
* The costs used in ME integer search only includes the SAD cost of motion
* residual and sqrtLambda times MVD bits. The subpel refine steps use SATD
* cost of residual and sqrtLambda * MVD bits. Mode decision will be based
* on video distortion cost (SSE/PSNR) plus lambda times all signaling bits
* (mode + MVD bits). */
// measure SAD cost at clipped QPEL MVP
MV pmv = qmvp.clipped(qmvmin, qmvmax); //防止mvp越界 clip操作
MV bestpre = pmv; //存储周边块最优的pmv
int bprecost; //存储周边块最优的cost
if (ref->isLowres)
bprecost = ref->lowresQPelCost(fenc, blockOffset, pmv, sad); //如果当前搜索的参考帧是1/2分辨率采样参考帧:获取伪分像素插值的sad值
else
bprecost = subpelCompare(ref, pmv, sad); //如果当前为普通参考帧,则进行标准的分像素搜索
/* re-measure full pel rounded MVP with SAD as search start point */
MV bmv = pmv.roundToFPel(); //存储最优的整像素MV,初始化pmv,从pmv开始搜索
int bcost = bprecost; //存储最优的cost值,初始化为pmv的sad值
if (pmv.isSubpel())
bcost = sad(fenc, FENC_STRIDE, fref + bmv.x + bmv.y * stride, stride) + mvcost(bmv << 2); //如果当前pmv有分像素精度,则将bcost更新为:整像素点的sad值加上整像素点的mvcost(MV与MVP之间的差(MVD)占用的bits-cost)
//因为下面的搜索算法是先按照整像素点进行搜索,所以在此先排除分像素插值带来的影响
// measure SAD cost at MV(0) if MVP is not zero
if (pmv.notZero()) //如果pmv不是零向量,尝试MV(0,0)当作搜索原点是否更优
{
int cost = sad(fenc, FENC_STRIDE, fref, stride) + mvcost(MV(0, 0)); //获取MV(0,0)的代价值:sad+mvcost
if (cost < bcost)
{
bcost = cost;
bmv = 0;
}
}
// measure SAD cost at each QPEL motion vector candidate
if (ref->isLowres) // 如果当前搜索的参考帧是1/2下采样参考帧
{
for (int i = 0; i < numCandidates; i++) // 探测1/2下采样的其它mvc是否更优 (其实numCandidates = 0 这里属于冗余代码 1/2下采样中传入的numCandidates为0)
{
MV m = mvc[i].clipped(qmvmin, qmvmax);
if (m.notZero() && m != pmv && m != bestpre) // check already measured
{
int cost = ref->lowresQPelCost(fenc, blockOffset, m, sad) + mvcost(m); // 获取伪分像素插值的sad值+mvcost
if (cost < bprecost)
{
bprecost = cost;
bestpre = m;
}
}
}
}
else // 如果当前为普通参考帧
{
for (int i = 0; i < numCandidates; i++)
{
MV m = mvc[i].clipped(qmvmin, qmvmax);
if (m.notZero() && m != pmv && m != bestpre) // check already measured
{
int cost = subpelCompare(ref, m, sad) + mvcost(m);
if (cost < bprecost)
{
bprecost = cost;
bestpre = m;
}
}
}
}
pmv = pmv.roundToFPel(); //将pmv四舍五入取整,在umh算法中用到
MV omv = bmv; // current search origin or starting point 设置搜索原点
switch (searchMethod)
{
case X265_DIA_SEARCH: // 菱形(迭代)搜索,仅在preset为ultrafast级别时,才选择这种搜索方法
{
/* diamond search, radius 1 */
bcost <<= 4; // 左移4位,空出来的低4位用于判断是否有更好的MV(即可以得到更小的cost)
int i = merange;
do
{
// 1
//4 * 12
// 3
// 1/3/4/12用于标示不同的MV值,通过移位来实现从这些标号到不同MV的转换,具体如下:
// 在X方向通过(bcost << 28) >> 30得到MV的x分量;在Y方向通过(bcost << 30) >> 30得到MV的y分量
// 标号1, X方向:1 << 28 = 0x1000_0000,(1 << 28) >> 30 = 0;Y方向:1 << 30 = 0x4000_0000,(1 << 30) >> 30 = 1
// 标号3, X方向:3 << 28 = 0x3000_0000,(1 << 28) >> 30 = 0;Y方向:3 << 30 = 0xc000_0000,(1 << 30) >> 30 = 0x8000_0001 = -1
// 标号4, X方向:4 << 28 = 0x4000_0000,(1 << 28) >> 30 = 1;Y方向:4 << 30 = 0x0000_0000,(1 << 30) >> 30 = 0
// 标号12 = 0xc,X方向:c << 28 = 0xc000_0000,(1 << 28) >> 30 = 0x1000_0001 = -1;Y方向:c << 30 = 0x0000_0000,(1 << 30) >> 30 = 0
// X/Y方向的MV实际上是上面计算出来的MV的相反数,所以在之前加个负号即可。
COST_MV_X4_DIR(0, -1, 0, 1, -1, 0, 1, 0, costs); // 以上一次菱形迭代得到的最优点为中心,进行新一次菱形迭代,搜索上下左右四个点
COPY1_IF_LT(bcost, (costs[0] << 4) + 1);
COPY1_IF_LT(bcost, (costs[1] << 4) + 3);
COPY1_IF_LT(bcost, (costs[2] << 4) + 4);
COPY1_IF_LT(bcost, (costs[3] << 4) + 12);
if (!(bcost & 15)) // 假如检查的四个点中没有更好的MV,则直接结束菱形搜索
break;
bmv.x -= (bcost << 28) >> 30; // 按照标号计算出相对于bmv的MV,并更新最优的MV
bmv.y -= (bcost << 30) >> 30;
bcost &= ~15; // 清除低4位的MV标示
}
while (--i && bmv.checkRange(mvmin, mvmax)); // 直到超出搜索窗格或者超过搜索允许的范围,则停止搜索
bcost >>= 4; // 右移四位,得出实际的最优cost
break; // 结束ME
}
case X265_HEX_SEARCH:
{
me_hex2: //goto 标号,在umh算法中会用到
/* hexagon search, radius 2 */
#if 0
for (int i = 0; i < merange / 2; i++) // 迭代六边形搜索,以merange/2为最大搜索范围
{
omv = bmv;
COST_MV(omv.x - 2, omv.y);
COST_MV(omv.x - 1, omv.y + 2);
COST_MV(omv.x + 1, omv.y + 2);
COST_MV(omv.x + 2, omv.y);
COST_MV(omv.x + 1, omv.y - 2);
COST_MV(omv.x - 1, omv.y - 2);
if (omv == bmv) // 当前一次的六边形搜索无法找到比上一次搜索到的最优点更好的MV,则结束六边形搜索
break;
if (!bmv.checkRange(mvmin, mvmax)) // 如果搜索的MV超过设定的边界,则结束六边形搜索
break;
}
#else // if 0
/* equivalent to the above, but eliminates duplicate candidates */
/*
假设当前搜索帧为:
86 90 97 96 91 91 105 98 110 100 104 108 113 78 46 35
96 101 107 117 129 127 134 135 120 100 104 97 53 37 34 32
137 139 141 137 137 137 139 141 125 127 92 41 37 32 30 31
140 136 151 147 152 156 154 139 145 74 41 37 33 30 33 34
61 64 62 57 103 67 90 83 62 48 45 35 32 33 34 32
66 73 69 90 75 67 92 66 49 44 35 34 33 30 32 32
75 93 59 99 60 67 64 50 48 41 37 33 33 32 32 32
78 83 65 73 73 62 48 49 45 39 34 34 31 33 31 34
71 52 83 96 68 49 53 45 44 34 32 33 32 31 34 31
46 49 62 70 55 50 49 43 37 36 37 35 30 35 29 30
51 65 87 63 48 51 50 44 36 37 37 35 34 30 31 35
60 84 94 46 47 50 49 43 44 37 36 37 30 32 34 36
93 80 53 47 50 51 46 45 42 39 39 32 38 39 39 40
99 68 46 47 52 48 44 44 41 38 33 36 35 34 38 38
84 52 46 50 49 47 45 45 43 31 36 38 37 40 35 38
72 44 50 52 49 47 47 47 33 34 40 35 36 33 35 36
其搜索块为右下角8x8:block_enc =
44 34 32 33 32 31 34 31
37 36 37 35 30 35 29 30
36 37 37 35 34 30 31 35
44 37 36 37 30 32 34 36
42 39 39 32 38 39 39 40
41 38 33 36 35 34 38 38
43 31 36 38 37 40 35 38
33 34 40 35 36 33 35 36
参考帧为:
87 94 101 98 97 96 105 102 103 93 98 103 61 39 35 33 33 31 37 43
102 104 104 122 127 130 132 135 119 106 100 44 38 34 30 31 32 38 38 46
153 148 141 151 148 147 146 136 136 82 39 37 33 30 30 34 35 36 43 45
126 122 132 138 146 140 133 130 76 43 39 35 32 31 34 32 31 35 40 44
62 66 62 56 102 70 90 68 53 45 36 34 35 33 32 32 33 38 42 46
63 74 61 108 62 74 68 51 45 40 36 34 33 32 33 32 35 37 42 40
82 94 54 94 63 61 50 48 42 41 34 32 32 33 33 33 37 36 36 49
83 74 71 78 65 50 51 45 39 36 32 31 33 31 32 33 30 35 43 40
64 52 89 80 53 50 47 44 39 33 35 34 31 33 34 30 34 39 41 42
47 57 66 60 49 51 47 45 37 35 38 37 33 31 30 34 36 38 41 44
54 75 86 46 51 48 48 42 38 36 34 35 34 31 32 35 36 41 39 46
70 83 53 49 51 52 47 42 42 36 37 36 33 37 39 40 40 40 39 43
91 67 44 46 49 48 45 42 41 38 34 32 34 33 35 36 35 39 48 56
85 49 45 48 48 47 43 46 41 34 31 36 34 38 38 34 37 35 43 47
66 44 47 52 48 46 44 46 38 30 36 34 34 34 31 32 33 46 40 43
48 48 51 50 48 47 46 44 30 37 41 36 34 32 33 40 47 31 33 48
42 49 50 47 45 49 49 33 37 45 41 38 37 34 37 38 27 40 52 41
46 46 49 44 46 50 44 33 45 47 40 37 38 45 47 31 45 62 59 45
44 47 48 40 47 45 30 42 49 42 38 38 49 42 31 36 44 56 55 51
49 51 45 46 53 40 43 48 54 48 50 55 42 28 34 38 49 49 55 39
其对应位置的参考块为:右下角8x8 (注意:当前不是16x16,而是20x20)
39 33 35 34 31 33 34 30
37 35 38 37 33 31 30 34
38 36 34 35 34 31 32 35
42 36 37 36 33 37 39 40
41 38 34 32 34 33 35 36
41 34 31 36 34 38 38 34
38 30 36 34 34 34 31 32
30 37 41 36 34 32 33 40
假设当前的整像素搜索原点为bmv = (0,-1)(整像素精度) ,MVP为(1,-4)(分像素精度) qp = 12
当前的搜索算法是六边形:
*(-1,-2) * (1,-2)
(-2,0) * *(2,0)
* (-1,2) * (1,2)
**/
/*
以(-2,0) 为例:则当前的偏移坐标为 bmv+(-2,0) = (-2,-1)
所以其参考块为:block_ref
51 45 39 36 32 31 33 31
47 44 39 33 35 34 31 33
47 45 37 35 38 37 33 31
48 42 38 36 34 35 34 31
47 42 42 36 37 36 33 37
45 42 41 38 34 32 34 33
43 46 41 34 31 36 34 38
44 46 38 30 36 34 34 34
求其SAD得到= Σabs(block_enc - block_ref) = 249
当前MV(整像素精度)(-2,-1) 其分像素精度 (-8,-4)
当前的MVD = MV-MVP = (-8,-4) - (1,-4) = (-9,0) ,qp = 12
mvcostx = λ*bits = 2^(qp/6-2) * s_bitsizes[i] =2^(qp/6-2)* (2*(log2(9+1))+e-1) = 2log2(10) + e -1 = 8.3621 四舍五入: 8
mvcosty = λ*bits = 2^(qp/6-2) * s_bitsizes[i] =2^(qp/6-2)* (e-2) = e - 2 = 0.7183 四舍五入:1
mvcost = mvcostx + mvcosty = 8+1 =9
cost = SAD + mvcost = 249+9 = 258
同理求得其它点为:188,263
**/
// 六边形搜索算法各个位置的标号及其对应的MV如下:
// 7(-1,-2) 6(1,-2)
// 2(-2,0) 5(2,0)
// 3(-1,2) 4(1,2)
COST_MV_X3_DIR(-2, 0, -1, 2, 1, 2, costs);//搜索六边形的下三点,并将cost存入costs中
bcost <<= 3; //将当前的最优cost扩大3位,低3位用于存储MV的位置标号
COPY1_IF_LT(bcost, (costs[0] << 3) + 2); //依次比较bcost, 其中cost 分别+2 +3 +4 +5 +6 +7,通过这些标号可以直接用获取最优的mv
COPY1_IF_LT(bcost, (costs[1] << 3) + 3);
COPY1_IF_LT(bcost, (costs[2] << 3) + 4);
COST_MV_X3_DIR(2, 0, 1, -2, -1, -2, costs);//搜索六边形的上三点,并将cost存入costs中
COPY1_IF_LT(bcost, (costs[0] << 3) + 5);
COPY1_IF_LT(bcost, (costs[1] << 3) + 6);
COPY1_IF_LT(bcost, (costs[2] << 3) + 7);
if (bcost & 7) //如果当前搜索的点有比bcost小的, 否则直接退出
{
int dir = (bcost & 7) - 2; //计算dir,MV的标号 2 3 4 5 6 7 对应的dir为: 0 1 2 3 4 5
bmv += hex2[dir + 1]; //找到最优mv(其实是相对bmv的最优mv),更新最优bmv。并以最新的点作为新的搜索起点
//const MV hex2[8] = { MV(-1, -2), MV(-2, 0), MV(-1, 2), MV(1, 2), MV(2, 0), MV(1, -2), MV(-1, -2), MV(-2, 0) };
/* half hexagon, not overlapping the previous iteration */
for (int i = (merange >> 1) - 1; i > 0 && bmv.checkRange(mvmin, mvmax); i--) //快速算法,只搜索半个六边形
{
COST_MV_X3_DIR(hex2[dir + 0].x, hex2[dir + 0].y,
hex2[dir + 1].x, hex2[dir + 1].y,
hex2[dir + 2].x, hex2[dir + 2].y,
costs);
bcost &= ~7; // 清除cost中用于标示mv的低3位
COPY1_IF_LT(bcost, (costs[0] << 3) + 1); // 使用标号1 2 3来标示搜索的半个六边形的3个点
COPY1_IF_LT(bcost, (costs[1] << 3) + 2);
COPY1_IF_LT(bcost, (costs[2] << 3) + 3);
if (!(bcost & 7)) //如果当前搜索无法获得更优的MV,则直接退出
break;
dir += (bcost & 7) - 2; // (bcost & 7) - 2 取值是 -1 0 1
dir = mod6m1[dir + 1]; // 更新dir找到下一次六边形搜索得中心点, const uint8_t mod6m1[8] = { 5, 0, 1, 2, 3, 4, 5, 0 }; /* (x-1)%6 */
bmv += hex2[dir + 1]; // 更新最优的mv
}
}
bcost >>= 3; // 最后恢复cost
#endif // if 0
/* square refine */
// 用六边形搜索的可能不够准确,再在当前最优搜索点的四周进行一次8点的方形搜索,寻求最优mv
// 搜索模板:
// 5 1 7
// 3 0 4
// 6 2 8
int dir = 0;
COST_MV_X4_DIR(0, -1, 0, 1, -1, 0, 1, 0, costs);
COPY2_IF_LT(bcost, costs[0], dir, 1);
COPY2_IF_LT(bcost, costs[1], dir, 2);
COPY2_IF_LT(bcost, costs[2], dir, 3);
COPY2_IF_LT(bcost, costs[3], dir, 4);
COST_MV_X4_DIR(-1, -1, -1, 1, 1, -1, 1, 1, costs);
COPY2_IF_LT(bcost, costs[0], dir, 5);
COPY2_IF_LT(bcost, costs[1], dir, 6);
COPY2_IF_LT(bcost, costs[2], dir, 7);
COPY2_IF_LT(bcost, costs[3], dir, 8);
bmv += square1[dir];
break;
}
// UMH(Unsymmetric-Cross Multi-Hexagon-Grid)搜索
// 在x265不同配置下默认都不会被调用
case X265_UMH_SEARCH:
{
int ucost1, ucost2;
int16_t cross_start = 1;
/* refine predictors */
omv = bmv;
ucost1 = bcost; // ucost1存储pmv的cost
DIA1_ITER(pmv.x, pmv.y); // 检查pmv周围(上下左右)四个点,并更新bmv
if (pmv.notZero()) // 假设pmv不为0,则检查(0,0)点
DIA1_ITER(0, 0);
ucost2 = bcost; // ucost2存储pmv及其周边点和(0,0)中最优的cost
if (bmv.notZero() && bmv != pmv) // 假如bmv与pmv不相等,且bmv不为零,则还需检查bmv的周围四个点
DIA1_ITER(bmv.x, bmv.y);
if (bcost == ucost2)
cross_start = 3;
/* Early Termination */
// 进行提前终止检查
omv = bmv;
if (bcost == ucost2 && SAD_THRESH(2000)) // 假如最优的cost与ucost2相等,则说明最优的MV就在附近。如果cost满足设定的阈值(2000),则可检查是否可以提前终止
{ // (0,-2)
// (-1,-1) (1,-1)
// (-2,0) (2,0)
// (-1,1) (1,1)
// (0,2)
COST_MV_X4(0, -2, -1, -1, 1, -1, -2, 0); // 在周边进行8点的菱形搜索
COST_MV_X4(2, 0, -1, 1, 1, 1, 0, 2);
if (bcost == ucost1 && SAD_THRESH(500)) // 如果最优点就是pmv,并且cost满足阈值条件(500),则直接结束搜索
break;
if (bcost == ucost2) // 如果最优点就是pmv周边mv,并且cost满足阈值条件(500),则直接结束搜索
{
int16_t range = (int16_t)(merange >> 1) | 1;
CROSS(3, range, range); // 进行十字搜索
COST_MV_X4(-1, -2, 1, -2, -2, -1, 2, -1); // 搜索最优点的周边8个点
COST_MV_X4(-2, 1, 2, 1, -1, 2, 1, 2);
if (bcost == ucost2) // 如果最优的cost没有变,仍为之前计算的ucost2,则结束搜索
break;
cross_start = range + 2; // 否则加大十字搜索的起始点
}
}
// TODO: Need to study x264's logic for building mvc list to understand why they
// have special cases here for 16x16, and whether they apply to HEVC CTU
// adaptive search range based on mvc variability
if (numCandidates)
{
/* range multipliers based on casual inspection of some statistics of
* average distance between current predictor and final mv found by ESA.
* these have not been tuned much by actual encoding. */
static const uint8_t range_mul[4][4] =
{
{ 3, 3, 4, 4 },
{ 3, 4, 4, 4 },
{ 4, 4, 4, 5 },
{ 4, 4, 5, 6 },
};
int mvd;
int sad_ctx, mvd_ctx;
int denom = 1;
if (numCandidates == 1)
{
if (LUMA_64x64 == partEnum)
/* mvc is probably the same as mvp, so the difference isn't meaningful.
* but prediction usually isn't too bad, so just use medium range */
mvd = 25;
else
mvd = abs(qmvp.x - mvc[0].x) + abs(qmvp.y - mvc[0].y);
}
else
{
/* calculate the degree of agreement between predictors. */
/* in 64x64, mvc includes all the neighbors used to make mvp,
* so don't count mvp separately. */
denom = numCandidates - 1;
mvd = 0;
if (partEnum != LUMA_64x64)
{
mvd = abs(qmvp.x - mvc[0].x) + abs(qmvp.y - mvc[0].y);
denom++;
}
mvd += predictorDifference(mvc, numCandidates);
}
sad_ctx = SAD_THRESH(1000) ? 0
: SAD_THRESH(2000) ? 1
: SAD_THRESH(4000) ? 2 : 3;
mvd_ctx = mvd < 10 * denom ? 0
: mvd < 20 * denom ? 1
: mvd < 40 * denom ? 2 : 3;
merange = (merange * range_mul[mvd_ctx][sad_ctx]) >> 2; // 更新merange
}
/* FIXME if the above DIA2/OCT2/CROSS found a new mv, it has not updated omx/omy.
* we are still centered on the same place as the DIA2. is this desirable? */
CROSS(cross_start, merange, merange >> 1); // 进行十字搜索
COST_MV_X4(-2, -2, -2, 2, 2, -2, 2, 2); // 进行一次方形搜索
/* hexagon grid */
omv = bmv;
const uint16_t *p_cost_omvx = m_cost_mvx + omv.x * 4;
const uint16_t *p_cost_omvy = m_cost_mvy + omv.y * 4;
uint16_t i = 1;
do
{
if (4 * i > X265_MIN4(mvmax.x - omv.x, omv.x - mvmin.x,
mvmax.y - omv.y, omv.y - mvmin.y))
{
for (int j = 0; j < 16; j++)
{
MV mv = omv + (hex4[j] * i);
if (mv.checkRange(mvmin, mvmax))
COST_MV(mv.x, mv.y);
}
}
else
{
int16_t dir = 0; // 用于保存MV信息,dir = (mvx << 4) + mvy & 0xf
pixel *fref_base = fref + omv.x + (omv.y - 4 * i) * stride;
size_t dy = (size_t)i * stride;
// 同时计算四个MV的cost,这个宏定义省略了一些加减号,所以调用时一定要给出
#define SADS(k, x0, y0, x1, y1, x2, y2, x3, y3) \
sad_x4(fenc, \
fref_base x0 * i + (y0 - 2 * k + 4) * dy, \
fref_base x1 * i + (y1 - 2 * k + 4) * dy, \
fref_base x2 * i + (y2 - 2 * k + 4) * dy, \
fref_base x3 * i + (y3 - 2 * k + 4) * dy, \
stride, costs + 4 * k); \
fref_base += 2 * dy;
#define ADD_MVCOST(k, x, y) costs[k] += p_cost_omvx[x * 4 * i] + p_cost_omvy[y * 4 * i]
#define MIN_MV(k, x, y) COPY2_IF_LT(bcost, costs[k], dir, x * 16 + (y & 15))
// 进行16个点的六边形搜索
// (0,-4)
// (-2,-3) (2, -3)
// (-4,-2) (4,-2)
// (-4,-1) (4,-1)
// (-4, 0) (4, 0)
// (-4, 1) (4, 1)
// (-4, 2) (4, 2)
// (-2, 3) (+2, 3)
// (0, 4)
SADS(0, +0, -4, +0, +4, -2, -3, +2, -3);
SADS(1, -4, -2, +4, -2, -4, -1, +4, -1);
SADS(2, -4, +0, +4, +0, -4, +1, +4, +1);
SADS(3, -4, +2, +4, +2, -2, +3, +2, +3);
// 将sad与MVD相加得到最终的cost
ADD_MVCOST(0, 0, -4);
ADD_MVCOST(1, 0, 4);
ADD_MVCOST(2, -2, -3);
ADD_MVCOST(3, 2, -3);
ADD_MVCOST(4, -4, -2);
ADD_MVCOST(5, 4, -2);
ADD_MVCOST(6, -4, -1);
ADD_MVCOST(7, 4, -1);
ADD_MVCOST(8, -4, 0);
ADD_MVCOST(9, 4, 0);
ADD_MVCOST(10, -4, 1);
ADD_MVCOST(11, 4, 1);
ADD_MVCOST(12, -4, 2);
ADD_MVCOST(13, 4, 2);
ADD_MVCOST(14, -2, 3);
ADD_MVCOST(15, 2, 3);
// 找到这16个点中最优的MV
MIN_MV(0, 0, -4);
MIN_MV(1, 0, 4);
MIN_MV(2, -2, -3);
MIN_MV(3, 2, -3);
MIN_MV(4, -4, -2);
MIN_MV(5, 4, -2);
MIN_MV(6, -4, -1);
MIN_MV(7, 4, -1);
MIN_MV(8, -4, 0);
MIN_MV(9, 4, 0);
MIN_MV(10, -4, 1);
MIN_MV(11, 4, 1);
MIN_MV(12, -4, 2);
MIN_MV(13, 4, 2);
MIN_MV(14, -2, 3);
MIN_MV(15, 2, 3);
#undef SADS
#undef ADD_MVCOST
#undef MIN_MV
if (dir) // 如果找到了最优点则更新最优的MV
{
bmv.x = omv.x + i * (dir >> 4); // 去掉低四位
bmv.y = omv.y + i * ((dir << 28) >> 28); // 只保留低四位
}
}
}
while (++i <= merange >> 2);
if (bmv.checkRange(mvmin, mvmax)) // 如果当前的最优bmv没有越界,则跳转到六边形搜索,使用六边形搜索进行最后的refine操作
goto me_hex2;
break;
}
case X265_STAR_SEARCH: // Adapted from HM ME
{
int bPointNr = 0; // 最优MV在搜索模型中的标号,如果未找到更优的MV,则bPointNr = 0
int bDistance = 0; // 最优点的搜索步长(星形搜索/HM中TZ搜索,就是根据最优点的搜索步长进行不同策略的搜索,具体策略如下:)
// 如果最优点发生在步长为1的位置则说明最优MV就在该位置附近,无需再进行大范围搜索;
// 如果最优点的搜索步长较大,需要以该点为中心重新搜索;
// 如果最优点的搜索步长过大(>RasterDistance),则需要在全范围内,进行raster搜索,再以最优点为中心进行搜索
const int EarlyExitIters = 3; // 迭代次数,如果在搜索中使用不同步长的搜索次数大于设置的迭代次数,则提前终止搜索
StarPatternSearch(ref, mvmin, mvmax, bmv, bcost, bPointNr, bDistance, EarlyExitIters, merange); // 进行一次星形搜索
if (bDistance == 1) // 如果最MV的步长为1,则进行两点搜索,这两点为离当前最优位置最近的两个点
{
// if best distance was only 1, check two missing points. If no new point is found, stop
if (bPointNr) // 如果找到了比预测MV更优的MV,则进行两点搜索
{
/* For a given direction 1 to 8, check nearest two outer X pixels
X X
X 1 2 3 X
4 * 5
X 6 7 8 X
X X
*/
int saved = bcost;
const MV mv1 = bmv + offsets[(bPointNr - 1) * 2];
const MV mv2 = bmv + offsets[(bPointNr - 1) * 2 + 1];
if (mv1.checkRange(mvmin, mvmax)) // 如果mv1在允许的搜索范围内
{
COST_MV(mv1.x, mv1.y);
}
if (mv2.checkRange(mvmin, mvmax)) // 如果mv2在允许的搜索范围内
{
COST_MV(mv2.x, mv2.y);
}
if (bcost == saved) // 如果无法在两点搜索中找到更优的MV,则直接结束星形搜索,即找到了最终的MV
break;
}
else // 如果没有找到比预测MV更优的MV,则直接结束
break;
}
const int RasterDistance = 5;
if (bDistance > RasterDistance) // 如果搜索到的最优MV步长过大,则进行光栅搜索,光栅搜索以固定的步长(5)在整个搜索范围内搜索
{
// raster search refinement if original search distance was too big
MV tmv;
for (tmv.y = mvmin.y; tmv.y <= mvmax.y; tmv.y += RasterDistance)
{
for (tmv.x = mvmin.x; tmv.x <= mvmax.x; tmv.x += RasterDistance)
{
if (tmv.x + (RasterDistance * 3) <= mvmax.x) // 同时计算四个位置的SAD,并将四个位置的cost与最优cost比较
{
pixel *pix_base = fref + tmv.y * stride + tmv.x;
sad_x4(fenc,
pix_base,
pix_base + RasterDistance,
pix_base + RasterDistance * 2,
pix_base + RasterDistance * 3,
stride, costs);
costs[0] += mvcost(tmv << 2);
COPY2_IF_LT(bcost, costs[0], bmv, tmv);
tmv.x += RasterDistance;
costs[1] += mvcost(tmv << 2);
COPY2_IF_LT(bcost, costs[1], bmv, tmv);
tmv.x += RasterDistance;
costs[2] += mvcost(tmv << 2);
COPY2_IF_LT(bcost, costs[2], bmv, tmv);
tmv.x += RasterDistance;
costs[3] += mvcost(tmv << 3);
COPY2_IF_LT(bcost, costs[3], bmv, tmv);
}
else
COST_MV(tmv.x, tmv.y);
}
}
}
while (bDistance > 0) // 进行星形搜索和两点搜索的迭代,直到某一次迭代时无法找到更优的MV,则迭代结束
{
// center a new search around current best
bDistance = 0;
bPointNr = 0;
const int MaxIters = 32; // 迭代次数,如果在搜索中使用不同步长的搜索次数大于设置的迭代次数,则提前终止搜索
StarPatternSearch(ref, mvmin, mvmax, bmv, bcost, bPointNr, bDistance, MaxIters, merange); // 进行一次星形搜索
if (bDistance == 1) // 如果最MV的步长为1,则进行两点搜索,这两点为离当前最优位置最近的两个点
{
if (!bPointNr) // 如果没有找到更优的MV,则直接结束搜索
break;
/* For a given direction 1 to 8, check nearest 2 outer X pixels
X X
X 1 2 3 X
4 * 5
X 6 7 8 X
X X
*/
const MV mv1 = bmv + offsets[(bPointNr - 1) * 2];
const MV mv2 = bmv + offsets[(bPointNr - 1) * 2 + 1];
if (mv1.checkRange(mvmin, mvmax)) // 如果mv1在允许的搜索范围内
{
COST_MV(mv1.x, mv1.y);
}
if (mv2.checkRange(mvmin, mvmax)) // 如果mv2在允许的搜索范围内
{
COST_MV(mv2.x, mv2.y);
}
break;
}
}
break;
}
case X265_FULL_SEARCH: // 全搜索
{
// dead slow exhaustive search, but at least it uses sad_x4()
MV tmv;
for (tmv.y = mvmin.y; tmv.y <= mvmax.y; tmv.y++)
{
for (tmv.x = mvmin.x; tmv.x <= mvmax.x; tmv.x++)
{
if (tmv.x + 3 <= mvmax.x) // 同时计算四个位置的SAD,并将四个位置的cost与最优cost比较
{
pixel *pix_base = fref + tmv.y * stride + tmv.x;
sad_x4(fenc,
pix_base,
pix_base + 1,
pix_base + 2,
pix_base + 3,
stride, costs);
costs[0] += mvcost(tmv << 2);
COPY2_IF_LT(bcost, costs[0], bmv, tmv);
tmv.x++;
costs[1] += mvcost(tmv << 2);
COPY2_IF_LT(bcost, costs[1], bmv, tmv);
tmv.x++;
costs[2] += mvcost(tmv << 2);
COPY2_IF_LT(bcost, costs[2], bmv, tmv);
tmv.x++;
costs[3] += mvcost(tmv << 2);
COPY2_IF_LT(bcost, costs[3], bmv, tmv);
}
else
COST_MV(tmv.x, tmv.y);
}
}
break;
}
default:
X265_CHECK(0, "invalid motion estimate mode\n");
break;
}
if (bprecost < bcost) //如果当前搜索的最优mv 的编码代价不如周边块预测mv的编码代价,更新当前的最优mv:bmv 和最优cost:bcost
{
bmv = bestpre;
bcost = bprecost;
}
else
bmv = bmv.toQPel(); // promote search bmv to qpel 将搜索的整像素精度的mv扩展到1/4分像素精度
const SubpelWorkload& wl = workload[this->subpelRefine]; // 获取相应分像素搜索的参数集
if (!bcost) // 如果没有残差,跳过子像素搜索,直接获取其mvcost
{
/* if there was zero residual at the clipped MVP, we can skip subpel
* refine, but we do need to include the mvcost in the returned cost */
bcost = mvcost(bmv);
}
else if (ref->isLowres) // 如果当前搜索的参考帧是1/2下采样参考帧,使用快速的分像素MV搜索方法
{
int bdir = 0; // 存储最优的周边mv点的位置索引
for (int i = 1; i <= wl.hpel_dirs; i++) //使用sad为评价指标,遍历当前最优mv周边的1/2像素位置mv,hpel_dirs在SubpelWorkload结构体中设置。输入subpelRefine参数越大,hpel_dirs越大
{
/*
搜索模板
* * *
* 0 *
* * *
*/
MV qmv = bmv + square1[i] * 2; // 将半像素点乘2得到1/4像素精度的MV
int cost = ref->lowresQPelCost(fenc, blockOffset, qmv, sad) + mvcost(qmv); //获取当前的cost
COPY2_IF_LT(bcost, cost, bdir, i); //更新最优cost
}
bmv += square1[bdir] * 2; //更新最优mv
bcost = ref->lowresQPelCost(fenc, blockOffset, bmv, satd) + mvcost(bmv); //获取当前的cost值: satd + mvcost
bdir = 0;
for (int i = 1; i <= wl.qpel_dirs; i++) //使用stad cost为评价指标,遍历当前最优mv周边的1/4像素位置
{
MV qmv = bmv + square1[i];
int cost = ref->lowresQPelCost(fenc, blockOffset, qmv, satd) + mvcost(qmv); //获取当前的cost: satd + mvcost
COPY2_IF_LT(bcost, cost, bdir, i); //更新最优cost
}
bmv += square1[bdir]; //更新最优mv
}
else // 如果当前为普通参考帧,进行标准的分像素搜索方法
{
pixelcmp_t hpelcomp;
if (wl.hpel_satd) // 如果分像素搜索中使用satd,则进行标准的分像素插值并使用satd计算cost
{
bcost = subpelCompare(ref, bmv, satd) + mvcost(bmv);
hpelcomp = satd;
}
else // 否则,使用sad计算cost
hpelcomp = sad;
// 1/2分像素运动搜索
for (int iter = 0; iter < wl.hpel_iters; iter++) // 按照规定的迭代次数进行搜索
{
int bdir = 0;
for (int i = 1; i <= wl.hpel_dirs; i++)
{
MV qmv = bmv + square1[i] * 2; // 由于是1/4像素精度,所以将偏移MV乘2
int cost = subpelCompare(ref, qmv, hpelcomp) + mvcost(qmv); // 计算得到cost
COPY2_IF_LT(bcost, cost, bdir, i); // 更新最优的分像素MV
}
if (bdir) // 如果找到更好的分像素MV,则将最好的相对MV加在原MV上
bmv += square1[bdir] * 2;
else
break;
}
/* if HPEL search used SAD, remeasure with SATD before QPEL */
if (!wl.hpel_satd) // 加入分像素搜索使用的是sad,则使用satd再做一次搜索
bcost = subpelCompare(ref, bmv, satd) + mvcost(bmv);
// 1/4分像素运动搜索,1/4分像素点都使用satd计算cost
for (int iter = 0; iter < wl.qpel_iters; iter++)
{
int bdir = 0;
for (int i = 1; i <= wl.qpel_dirs; i++)
{
MV qmv = bmv + square1[i];
int cost = subpelCompare(ref, qmv, satd) + mvcost(qmv); // 计算得到cost
COPY2_IF_LT(bcost, cost, bdir, i); // 更新最优的分像素MV
}
if (bdir) // 如果找到更好的分像素MV,则将最好的相对MV加在原MV上
bmv += square1[bdir];
else
break;
}
}
x265_emms(); //清除MMX寄存器中的内容,即初始化(以避免和浮点数操作发生冲突)。
outQMv = bmv; //存储最优的mv(1/4分像素精度)
return bcost; //返回cost值: 如果当前搜索的参考帧是1/2分辨率采样参考帧:satd + mvcost 如果当前为普通参考帧:则返回标准分像素搜索得到的cost
}
/**
 * Evaluate a single quarter-pel motion vector candidate for the current PU.
 *
 * The prediction block is taken straight from the reference when the vector is
 * integer-pel and is interpolated otherwise; it is then compared against the
 * source (fenc) block.
 *
 * \param ref  reference planes the prediction is read from
 * \param qmv  candidate motion vector in quarter-pel units
 * \param cmp  distortion function applied to the luma block
 * \return     luma distortion from cmp, plus the Cb and Cr chromaSatd()
 *             distortion when bChromaSATD is set. The motion vector bit cost
 *             is not added here.
 */
int MotionEstimate::subpelCompare(ReferencePlanes *ref, const MV& qmv, pixelcmp_t cmp)
{
    const intptr_t lclStride = fencPUYuv.m_size;
    X265_CHECK(lclStride == FENC_STRIDE, "fenc buffer is assumed to have FENC_STRIDE by sad_x3 and sad_x4\n");

    /* ---- luma ---- */
    const intptr_t lumaRefStride = ref->lumaStride;

    /* the integer part of the vector (qmv >> 2) selects the reference block,
     * the low two bits are the quarter-pel fraction */
    pixel *lumaSrc = ref->fpelPlane[0] + blockOffset + (qmv.x >> 2) + (qmv.y >> 2) * lumaRefStride;
    int fracX = qmv.x & 0x3;
    int fracY = qmv.y & 0x3;
    int cost;

    if (fracX | fracY)
    {
        /* we are taking a short-cut here if the reference is weighted. To be
         * accurate we should be interpolating unweighted pixels and weighting
         * the final 16bit values prior to rounding and down shifting. Instead we
         * are simply interpolating the weighted full-pel pixels. Not 100%
         * accurate but good enough for fast qpel ME */
        ALIGN_VAR_32(pixel, lumaPred[64 * 64]);

        if (fracX && fracY)
        {
            /* fractional in both directions: combined horizontal + vertical filter */
            primitives.pu[partEnum].luma_hvpp(lumaSrc, lumaRefStride, lumaPred, lclStride, fracX, fracY);
        }
        else if (fracX)
        {
            /* integer row: horizontal filter only */
            primitives.pu[partEnum].luma_hpp(lumaSrc, lumaRefStride, lumaPred, lclStride, fracX);
        }
        else
        {
            /* integer column: vertical filter only */
            primitives.pu[partEnum].luma_vpp(lumaSrc, lumaRefStride, lumaPred, lclStride, fracY);
        }

        /* the interpolated block was written with the fenc stride */
        cost = cmp(fencPUYuv.m_buf[0], lclStride, lumaPred, lclStride);
    }
    else
    {
        /* integer-pel vector: compare directly against the reference plane */
        cost = cmp(fencPUYuv.m_buf[0], lclStride, lumaSrc, lumaRefStride);
    }

    if (!bChromaSATD)
        return cost;

    /* ---- chroma (Cb then Cr) ---- */
    const int csp = fencPUYuv.m_csp;
    const int hshift = fencPUYuv.m_hChromaShift;
    const int vshift = fencPUYuv.m_vChromaShift;

    /* two bits of quarter-pel fraction plus the chroma subsampling shift; for
     * 4:2:0 both shifts are 3, i.e. the chroma vector has 1/8 sample precision */
    const int shiftHor = 2 + hshift;
    const int shiftVer = 2 + vshift;

    const intptr_t chromaStride = fencPUYuv.m_csize;
    const intptr_t refStrideC = ref->reconPic->m_strideC;
    const intptr_t refOffset = (qmv.x >> shiftHor) + (qmv.y >> shiftVer) * refStrideC;
    const pixel* refCb = ref->getCbAddr(ctuAddr, absPartIdx) + refOffset;
    const pixel* refCr = ref->getCrAddr(ctuAddr, absPartIdx) + refOffset;
    const pixel* chromaSrc[2] = { refCb, refCr };

    fracX = qmv.x & ((1 << shiftHor) - 1);
    fracY = qmv.y & ((1 << shiftVer) - 1);

    if (!(fracX | fracY))
    {
        /* integer chroma position: no interpolation needed.
         * fenc plane p + 1 pairs with chromaSrc[p] (1 = Cb, 2 = Cr) */
        for (int p = 0; p < 2; p++)
            cost += chromaSatd(fencPUYuv.m_buf[p + 1], chromaStride, chromaSrc[p], refStrideC);
        return cost;
    }

    ALIGN_VAR_32(pixel, chromaPred[64 * 64]);

    if (fracX && fracY)
    {
        /* fractional in both directions: horizontal pass into a 16bit
         * intermediate buffer, then vertical pass back to pixels */
        ALIGN_VAR_32(int16_t, immed[64 * (64 + NTAPS_CHROMA)]);
        const int extStride = blockwidth >> hshift;
        const int halfFilterSize = NTAPS_CHROMA >> 1;

        /* a non-subsampled direction only has a 0..3 fraction; the extra
         * shift puts the filter index on the same 0..7 scale either way */
        const int coeffX = fracX << (1 - hshift);
        const int coeffY = fracY << (1 - vshift);

        /* NOTE(review): the vertical pass starts (halfFilterSize - 1) rows into
         * immed; presumably the trailing 1 passed to filter_hps makes it emit
         * extra rows for the vertical taps - confirm against the primitive */
        int16_t* vertSrc = immed + (halfFilterSize - 1) * extStride;

        for (int p = 0; p < 2; p++)
        {
            primitives.chroma[csp].pu[partEnum].filter_hps(chromaSrc[p], refStrideC, immed, extStride, coeffX, 1);
            primitives.chroma[csp].pu[partEnum].filter_vsp(vertSrc, extStride, chromaPred, chromaStride, coeffY);
            cost += chromaSatd(fencPUYuv.m_buf[p + 1], chromaStride, chromaPred, chromaStride);
        }
    }
    else if (fracX)
    {
        /* integer row: horizontal filter only */
        const int coeffX = fracX << (1 - hshift);

        for (int p = 0; p < 2; p++)
        {
            primitives.chroma[csp].pu[partEnum].filter_hpp(chromaSrc[p], refStrideC, chromaPred, chromaStride, coeffX);
            cost += chromaSatd(fencPUYuv.m_buf[p + 1], chromaStride, chromaPred, chromaStride);
        }
    }
    else
    {
        /* integer column: vertical filter only */
        const int coeffY = fracY << (1 - vshift);

        for (int p = 0; p < 2; p++)
        {
            primitives.chroma[csp].pu[partEnum].filter_vpp(chromaSrc[p], refStrideC, chromaPred, chromaStride, coeffY);
            cost += chromaSatd(fencPUYuv.m_buf[p + 1], chromaStride, chromaPred, chromaStride);
        }
    }

    return cost;
}