/* Motion search for every prediction unit (PU) of interMode.cu.
 *
 * For each PU this routine:
 *   1. estimates a merge cost (skipped when the CU has a single PU),
 *   2. finds the best uni-directional motion per reference list, stored in
 *      interMode.bestME[puIdx][list],
 *   3. on B slices, for non-2Nx2N partitions, evaluates bi-prediction built from
 *      the two uni-directional winners (and optionally from zero MVs),
 *   4. writes the cheapest of merge / bi-pred / list 0 / list 1 into the CU,
 *   5. runs motion compensation for the PU into interMode.predYuv.
 * The bits of the selected option of every PU are summed and added to
 * interMode.sa8dBits on exit.
 *
 * interMode  mode being evaluated; its cu, bestME, amvpCand, interNeighbours,
 *            predYuv and sa8dBits members are written, fencYuv is read
 * cuGeom     geometry of the CU; depth selects the m_rqt scratch buffers and
 *            absPartIdx is used to index loaded analysis data
 * bChromaMC  forwarded to m_me.setSourcePU() and to the final motionCompensation()
 * refMasks   per-PU reference bit masks, indexed by puIdx. A zero mask means
 *            "search every reference". Bit `ref` of the low 16 bits enables that
 *            reference of list 0; the mask is shifted right by 16 before list 1
 *            is processed. */
void Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bChromaMC, uint32_t refMasks[2])
{
    ProfileCUScope(interMode.cu, motionEstimationElapsedTime, countMotionEstimate);
    CUData& cu = interMode.cu;
    Yuv* predYuv = &interMode.predYuv;
    /* MV candidate list filled by cu.getPMV(); one extra entry may be appended
     * from getLowresMV() in the full search below */
    MV mvc[(MD_ABOVE_LEFT + 1) * 2 + 2];
    const Slice *slice = m_slice;
    int numPart = cu.getNumPartInter(0);
    /* P slices only search list 0, otherwise both lists are searched */
    int numPredDir = slice->isInterP() ? 1 : 2;
    const int* numRefIdx = slice->m_numRefIdx;
    /* prediction direction chosen for the previous PU (0 = list 0, 1 = list 1,
     * 2 = bi-pred); passed to getBlkBits() for the next PU. Not updated when
     * merge is selected. */
    uint32_t lastMode = 0;
    int totalmebits = 0;
    MV mvzero(0, 0);
    Yuv& tmpPredYuv = m_rqt[cuGeom.depth].tmpPredYuv;
    MergeData merge;
    memset(&merge, 0, sizeof(merge));
    bool useAsMVP = false;
    for (int puIdx = 0; puIdx < numPart; puIdx++)
    {
        MotionData* bestME = interMode.bestME[puIdx];
        PredictionUnit pu(cu, cuGeom, puIdx);
        m_me.setSourcePU(*interMode.fencYuv, pu.ctuAddr, pu.cuAbsPartIdx, pu.puAbsPartIdx, pu.width, pu.height, m_param->searchMethod, m_param->subpelRefine, bChromaMC);
        useAsMVP = false;
        x265_analysis_inter_data* interDataCTU = NULL;
        /* index of this CU inside the frame-level loaded analysis arrays */
        int cuIdx;
        cuIdx = (interMode.cu.m_cuAddr * m_param->num4x4Partitions) + cuGeom.absPartIdx;
        if (m_param->analysisReuseLevel == 10 && m_param->interRefine > 1)
        {
            /* The loaded motion is only reused as a predictor when the loaded
             * decision has the same prediction mode, partition size and depth
             * as the current CU and was not coded as merge. */
            interDataCTU = m_frame->m_analysisData.interData;
            if ((cu.m_predMode[pu.puAbsPartIdx] == interDataCTU->modes[cuIdx + pu.puAbsPartIdx])
                && (cu.m_partSize[pu.puAbsPartIdx] == interDataCTU->partSize[cuIdx + pu.puAbsPartIdx])
                && !(interDataCTU->mergeFlag[cuIdx + puIdx])
                && (cu.m_cuDepth[0] == interDataCTU->depth[cuIdx]))
                useAsMVP = true;
        }
        /* Merge estimation is skipped for single-PU CUs (cost forced to
         * MAX_UINT so merge can never win the selection below);
         * presumably that case is evaluated by a different code path. */
        uint32_t mrgCost = numPart == 1 ? MAX_UINT : mergeEstimation(cu, cuGeom, pu, puIdx, merge);
        bestME[0].cost = MAX_UINT;
        bestME[1].cost = MAX_UINT;
        getBlkBits((PartSize)cu.m_partSize[0], slice->isInterP(), puIdx, lastMode, m_listSelBits);
        /* cleared by the two shortcut branches below once they have produced a
         * result; while it stays true the full per-reference search runs */
        bool bDoUnidir = true;
        cu.getNeighbourMV(puIdx, pu.puAbsPartIdx, interMode.interNeighbours);

        /* ---- Uni-directional prediction ---- */
        if ((m_param->analysisLoad && m_param->analysisReuseLevel > 1 && m_param->analysisReuseLevel != 10)
            || (m_param->analysisMultiPassRefine && m_param->rc.bStatRead) || (m_param->bAnalysisType == AVC_INFO) || (useAsMVP))
        {
            /* Reuse path: only one reference per list is searched, taken either
             * from the loaded analysis data (useAsMVP) or from the value already
             * present in bestME[list].ref. */
            for (int list = 0; list < numPredDir; list++)
            {
                int ref = -1;
                if (useAsMVP)
                    ref = interDataCTU->refIdx[list][cuIdx + puIdx];
                else
                    ref = bestME[list].ref;
                /* no reference known for this list: leave bestME[list].cost at
                 * MAX_UINT. If this happens for every list, bDoUnidir stays true
                 * and the full search below runs instead. */
                if (ref < 0)
                    continue;
                uint32_t bits = m_listSelBits[list] + MVP_IDX_BITS;
                bits += getTUBits(ref, numRefIdx[list]);
                int numMvc = cu.getPMV(interMode.interNeighbours, list, ref, interMode.amvpCand[list][ref], mvc);
                const MV* amvp = interMode.amvpCand[list][ref];
                int mvpIdx = selectMVP(cu, pu, amvp, list, ref);
                MV mvmin, mvmax, outmv, mvp;
                mvp = amvp[mvpIdx];
                if (m_param->searchMethod == X265_SEA)
                {
                    /* point the estimator's integral planes at this PU; puIdx is
                     * split into a column (bit 0) and row (remaining bits) offset */
                    int puX = puIdx & 1;
                    int puY = puIdx >> 1;
                    for (int planes = 0; planes < INTEGRAL_PLANE_NUM; planes++)
                        m_me.integral[planes] = interMode.fencYuv->m_integral[list][ref][planes] + puX * pu.width + puY * pu.height * m_slice->m_refFrameList[list][ref]->m_reconPic->m_stride;
                }
                setSearchRange(cu, mvp, m_param->searchRange, mvmin, mvmax);
                MV mvpIn = mvp;
                int satdCost;
                /* multi-pass refine: when the stored MVP index matches the one
                 * just selected, start the search from the stored MV */
                if (m_param->analysisMultiPassRefine && m_param->rc.bStatRead && mvpIdx == bestME[list].mvpIdx)
                    mvpIn = bestME[list].mv;
                if (useAsMVP)
                {
                    /* Up to three search predictors, tried in order:
                     *   [0] the MV loaded from the analysis data,
                     *   [1] the selected AMVP candidate   (mvRefine > 1),
                     *   [2] the other AMVP candidate      (mvRefine > 2).
                     * NOTE(review): the loop bound is m_param->mvRefine while the
                     * arrays hold 3 entries, and outmv is only written inside the
                     * loop - assumes mvRefine is in [1, 3]; confirm against the
                     * parameter validation. */
                    MV bestmv, mvpSel[3];
                    int mvpIdxSel[3];
                    satdCost = m_me.COST_MAX;
                    mvpSel[0] = interDataCTU->mv[list][cuIdx + puIdx].word;
                    mvpIdxSel[0] = interDataCTU->mvpIdx[list][cuIdx + puIdx];
                    if (m_param->mvRefine > 1)
                    {
                        mvpSel[1] = interMode.amvpCand[list][ref][mvpIdx];
                        mvpIdxSel[1] = mvpIdx;
                        if (m_param->mvRefine > 2)
                        {
                            mvpSel[2] = interMode.amvpCand[list][ref][!mvpIdx];
                            mvpIdxSel[2] = !mvpIdx;
                        }
                    }
                    for (int cand = 0; cand < m_param->mvRefine; cand++)
                    {
                        /* skip a candidate identical to an earlier one */
                        if (cand && (mvpSel[cand] == mvpSel[cand - 1] || (cand == 2 && mvpSel[cand] == mvpSel[cand - 2])))
                            continue;
                        /* NOTE(review): the range is centred on mvp (which is
                         * overwritten below whenever a candidate wins), not on
                         * mvpSel[cand], the predictor actually handed to
                         * motionEstimate() - confirm this is intended. */
                        setSearchRange(cu, mvp, m_param->searchRange, mvmin, mvmax);
                        int bcost = m_me.motionEstimate(&m_slice->m_mref[list][ref], mvmin, mvmax, mvpSel[cand], numMvc, mvc, m_param->searchRange, bestmv, m_param->maxSlices,
                            m_param->bSourceReferenceEstimation ? m_slice->m_refFrameList[list][ref]->m_fencPic->getLumaAddr(0) : 0);
                        if (satdCost > bcost)
                        {
                            satdCost = bcost;
                            outmv = bestmv;
                            mvp = mvpSel[cand];
                            mvpIdx = mvpIdxSel[cand];
                        }
                    }
                }
                else
                {
                    satdCost = m_me.motionEstimate(&slice->m_mref[list][ref], mvmin, mvmax, mvpIn, numMvc, mvc, m_param->searchRange, outmv, m_param->maxSlices,
                        m_param->bSourceReferenceEstimation ? m_slice->m_refFrameList[list][ref]->m_fencPic->getLumaAddr(0) : 0);
                }
                /* Total cost of the partition: the estimator's MV cost is
                 * subtracted from satdCost and the MV bits are charged through
                 * `bits` instead (presumably satdCost already contains mvCost,
                 * so this avoids counting it twice). */
                bits += m_me.bitcost(outmv);
                uint32_t mvCost = m_me.mvcost(outmv);
                uint32_t cost = (satdCost - mvCost) + m_rdCost.getCost(bits);
                /* Refine the MVP choice; updates mvpIdx, bits and cost */
                if (!(m_param->analysisMultiPassRefine || useAsMVP))
                    mvp = checkBestMVP(amvp, outmv, mvpIdx, bits, cost);
                else
                {
                    /* Compare the bit cost of outmv against the other AMVP
                     * candidate with its bit cost against mvpIn, and switch
                     * candidates when the other one is cheaper.
                     * NOTE(review): on the useAsMVP path mvpIn still holds the
                     * initial amvp[mvpIdx], not the predictor that won the loop
                     * above - confirm. */
                    int diffBits = m_me.bitcost(outmv, amvp[!mvpIdx]) - m_me.bitcost(outmv, mvpIn);
                    if (diffBits < 0)
                    {
                        mvpIdx = !mvpIdx;
                        uint32_t origOutBits = bits;
                        bits = origOutBits + diffBits;
                        cost = (cost - m_rdCost.getCost(origOutBits)) + m_rdCost.getCost(bits);
                    }
                    mvp = amvp[mvpIdx];
                }
                if (cost < bestME[list].cost)
                {
                    bestME[list].mv = outmv;
                    bestME[list].mvp = mvp;
                    bestME[list].mvpIdx = mvpIdx;
                    bestME[list].cost = cost;
                    bestME[list].bits = bits;
                    bestME[list].mvCost = mvCost;
                    bestME[list].ref = ref;
                }
                bDoUnidir = false;
            }
        }
        else if (m_param->bDistributeMotionEstimation)
        {
            /* Distributed path: build one job per (list, ref) pair allowed by
             * the reference mask and share them with peer threads. */
            PME pme(*this, interMode, cuGeom, pu, puIdx);
            pme.m_jobTotal = 0;
            /* presumably marks the first job as already taken: this thread
             * searches it itself via singleMotionEstimation() below */
            pme.m_jobAcquired = 1;
            uint32_t refMask = refMasks[puIdx] ? refMasks[puIdx] : (uint32_t)-1;
            for (int list = 0; list < numPredDir; list++)
            {
                int idx = 0;
                for (int ref = 0; ref < numRefIdx[list]; ref++)
                {
                    if (!(refMask & (1 << ref)))
                        continue;
                    pme.m_jobs.ref[list][idx++] = ref;
                    pme.m_jobTotal++;
                }
                pme.m_jobs.refCnt[list] = idx;
                /* the reference bits of the second list start at bit 16 */
                refMask >>= 16;
            }
            if (pme.m_jobTotal > 2)
            {
                pme.tryBondPeers(*m_frame->m_encData->m_jobProvider, pme.m_jobTotal - 1);
                processPME(pme, *this);
                /* first queued reference of list 0, or of list 1 when list 0
                 * contributed no jobs.
                 * NOTE(review): the list argument passed below is always 0,
                 * even when `ref` was taken from list 1 - confirm. */
                int ref = pme.m_jobs.refCnt[0] ? pme.m_jobs.ref[0][0] : pme.m_jobs.ref[1][0];
                singleMotionEstimation(*this, interMode, pu, puIdx, 0, ref);
                bDoUnidir = false;
                ProfileCUScopeNamed(pmeWaitScope, interMode.cu, pmeBlockTime, countPMEMasters);
                pme.waitForExit();
            }
            /* with two or fewer jobs bDoUnidir stays true and the search is
             * done locally below */
        }
        if (bDoUnidir)
        {
            /* Full search: every reference of every list that the mask allows */
            interMode.bestME[puIdx][0].ref = interMode.bestME[puIdx][1].ref = -1;
            uint32_t refMask = refMasks[puIdx] ? refMasks[puIdx] : (uint32_t)-1;
            for (int list = 0; list < numPredDir; list++)
            {
                for (int ref = 0; ref < numRefIdx[list]; ref++)
                {
                    ProfileCounter(interMode.cu, totalMotionReferences[cuGeom.depth]);
                    if (!(refMask & (1 << ref)))
                    {
                        ProfileCounter(interMode.cu, skippedMotionReferences[cuGeom.depth]);
                        continue;
                    }
                    uint32_t bits = m_listSelBits[list] + MVP_IDX_BITS;
                    bits += getTUBits(ref, numRefIdx[list]);
                    int numMvc = cu.getPMV(interMode.interNeighbours, list, ref, interMode.amvpCand[list][ref], mvc);
                    const MV* amvp = interMode.amvpCand[list][ref];
                    int mvpIdx = selectMVP(cu, pu, amvp, list, ref);
                    MV mvmin, mvmax, outmv;
                    /* Explicitly zeroed: it is only assigned when HME is on and
                     * neither analysis save nor load is active, but it is read
                     * whenever HME is on. Zero makes the HME check below see
                     * "no low-res predictor" on the paths that never assign it,
                     * without relying on what MV's default constructor does. */
                    MV mvp_lowres(0, 0);
                    MV mvp = amvp[mvpIdx];
                    bool bLowresMVP = false;
                    if (!m_param->analysisSave && !m_param->analysisLoad)
                    {
                        /* a non-zero low-res MV becomes an extra search
                         * candidate; with HME it is also kept as an alternative
                         * search predictor */
                        MV lmv = getLowresMV(cu, pu, list, ref);
                        if (lmv.notZero())
                            mvc[numMvc++] = lmv;
                        if (m_param->bEnableHME)
                            mvp_lowres = lmv;
                    }
                    if (m_param->searchMethod == X265_SEA)
                    {
                        /* same integral-plane setup as in the reuse path above */
                        int puX = puIdx & 1;
                        int puY = puIdx >> 1;
                        for (int planes = 0; planes < INTEGRAL_PLANE_NUM; planes++)
                            m_me.integral[planes] = interMode.fencYuv->m_integral[list][ref][planes] + puX * pu.width + puY * pu.height * m_slice->m_refFrameList[list][ref]->m_reconPic->m_stride;
                    }
                    setSearchRange(cu, mvp, m_param->searchRange, mvmin, mvmax);
                    int satdCost = m_me.motionEstimate(&slice->m_mref[list][ref], mvmin, mvmax, mvp, numMvc, mvc, m_param->searchRange, outmv, m_param->maxSlices,
                        m_param->bSourceReferenceEstimation ? m_slice->m_refFrameList[list][ref]->m_fencPic->getLumaAddr(0) : 0);
                    if (m_param->bEnableHME && mvp_lowres.notZero() && mvp_lowres != mvp)
                    {
                        /* second search around the low-res predictor; its result
                         * replaces the first one only when strictly cheaper */
                        MV outmv_lowres;
                        setSearchRange(cu, mvp_lowres, m_param->searchRange, mvmin, mvmax);
                        int lowresMvCost = m_me.motionEstimate(&slice->m_mref[list][ref], mvmin, mvmax, mvp_lowres, numMvc, mvc, m_param->searchRange, outmv_lowres, m_param->maxSlices,
                            m_param->bSourceReferenceEstimation ? m_slice->m_refFrameList[list][ref]->m_fencPic->getLumaAddr(0) : 0);
                        if (lowresMvCost < satdCost)
                        {
                            outmv = outmv_lowres;
                            satdCost = lowresMvCost;
                            bLowresMVP = true;
                        }
                    }
                    /* same cost construction as in the reuse path */
                    bits += m_me.bitcost(outmv);
                    uint32_t mvCost = m_me.mvcost(outmv);
                    uint32_t cost = (satdCost - mvCost) + m_rdCost.getCost(bits);
                    /* the winning MV was found with the low-res predictor:
                     * re-express bits/cost relative to the AMVP candidate */
                    if (bLowresMVP)
                        updateMVP(amvp[mvpIdx], outmv, bits, cost, mvp_lowres);
                    /* Refine the MVP choice; updates mvpIdx, bits and cost */
                    mvp = checkBestMVP(amvp, outmv, mvpIdx, bits, cost);
                    if (cost < bestME[list].cost)
                    {
                        bestME[list].mv = outmv;
                        bestME[list].mvp = mvp;
                        bestME[list].mvpIdx = mvpIdx;
                        bestME[list].ref = ref;
                        bestME[list].cost = cost;
                        bestME[list].bits = bits;
                        bestME[list].mvCost = mvCost;
                    }
                }
                /* the reference bits of the second list start at bit 16 */
                refMask >>= 16;
            }
        }

        /* ---- Bi-directional prediction ---- */
        MotionData bidir[2];
        uint32_t bidirCost = MAX_UINT;
        int bidirBits = 0;
        /* Only on B slices without the bi-pred restriction, for non-2Nx2N
         * partitions, and only when both lists produced a result. */
        if (slice->isInterB() && !cu.isBipredRestriction() &&
            cu.m_partSize[pu.puAbsPartIdx] != SIZE_2Nx2N &&
            bestME[0].cost != MAX_UINT && bestME[1].cost != MAX_UINT)
        {
            bidir[0] = bestME[0];
            bidir[1] = bestME[1];
            int satdCost;
            if (m_me.bChromaSATD)
            {
                /* write both motions into the CU, build the bi-pred block with
                 * motionCompensation() and measure luma + chroma SATD */
                cu.m_mv[0][pu.puAbsPartIdx] = bidir[0].mv;
                cu.m_refIdx[0][pu.puAbsPartIdx] = (int8_t)bidir[0].ref;
                cu.m_mv[1][pu.puAbsPartIdx] = bidir[1].mv;
                cu.m_refIdx[1][pu.puAbsPartIdx] = (int8_t)bidir[1].ref;
                motionCompensation(cu, pu, tmpPredYuv, true, true);
                satdCost = m_me.bufSATD(tmpPredYuv.getLumaAddr(pu.puAbsPartIdx), tmpPredYuv.m_size) +
                    m_me.bufChromaSATD(tmpPredYuv, pu.puAbsPartIdx);
            }
            else
            {
                /* luma only: predict from each reference, average the two
                 * predictions into tmpPredYuv and measure SATD */
                PicYuv* refPic0 = slice->m_refReconPicList[0][bestME[0].ref];
                PicYuv* refPic1 = slice->m_refReconPicList[1][bestME[1].ref];
                Yuv* bidirYuv = m_rqt[cuGeom.depth].bidirPredYuv;
                predInterLumaPixel(pu, bidirYuv[0], *refPic0, bestME[0].mv);
                predInterLumaPixel(pu, bidirYuv[1], *refPic1, bestME[1].mv);
                /* the table index picks one of two pixelavg variants depending on
                 * whether all three strides are multiples of 64 */
                primitives.pu[m_me.partEnum].pixelavg_pp[(tmpPredYuv.m_size % 64 == 0) && (bidirYuv[0].m_size % 64 == 0) && (bidirYuv[1].m_size % 64 == 0)](tmpPredYuv.m_buf[0], tmpPredYuv.m_size, bidirYuv[0].getLumaAddr(pu.puAbsPartIdx), bidirYuv[0].m_size,
                    bidirYuv[1].getLumaAddr(pu.puAbsPartIdx), bidirYuv[1].m_size, 32);
                satdCost = m_me.bufSATD(tmpPredYuv.m_buf[0], tmpPredYuv.m_size);
            }
            /* both lists' bits, with the two per-list selection costs replaced
             * by the bi-pred selection cost m_listSelBits[2] */
            bidirBits = bestME[0].bits + bestME[1].bits + m_listSelBits[2] - (m_listSelBits[0] + m_listSelBits[1]);
            bidirCost = satdCost + m_rdCost.getCost(bidirBits);
            /* Also try zero MVs on both lists, unless both winners are already zero */
            bool bTryZero = bestME[0].mv.notZero() || bestME[1].mv.notZero();
            if (bTryZero)
            {
                /* ...and only if both MVPs lie inside the range obtained from
                 * setSearchRange() around the zero MV with a picture-sized
                 * search range (presumably the shift by 2 converts the bounds
                 * to quarter-pel units - TODO confirm) */
                MV mvmin, mvmax;
                int merange = X265_MAX(m_param->sourceWidth, m_param->sourceHeight);
                setSearchRange(cu, mvzero, merange, mvmin, mvmax);
                mvmax.y += 2;
                mvmin <<= 2;
                mvmax <<= 2;
                bTryZero &= bestME[0].mvp.checkRange(mvmin, mvmax);
                bTryZero &= bestME[1].mvp.checkRange(mvmin, mvmax);
            }
            if (bTryZero)
            {
                if (m_me.bChromaSATD)
                {
                    cu.m_mv[0][pu.puAbsPartIdx] = mvzero;
                    cu.m_refIdx[0][pu.puAbsPartIdx] = (int8_t)bidir[0].ref;
                    cu.m_mv[1][pu.puAbsPartIdx] = mvzero;
                    cu.m_refIdx[1][pu.puAbsPartIdx] = (int8_t)bidir[1].ref;
                    motionCompensation(cu, pu, tmpPredYuv, true, true);
                    satdCost = m_me.bufSATD(tmpPredYuv.getLumaAddr(pu.puAbsPartIdx), tmpPredYuv.m_size) +
                        m_me.bufChromaSATD(tmpPredYuv, pu.puAbsPartIdx);
                }
                else
                {
                    /* zero motion: average the blocks at the PU's own position in
                     * the two reference pictures. The stride of m_mref[0][0] is
                     * used for both references. */
                    const pixel* ref0 = m_slice->m_mref[0][bestME[0].ref].getLumaAddr(pu.ctuAddr, pu.cuAbsPartIdx + pu.puAbsPartIdx);
                    const pixel* ref1 = m_slice->m_mref[1][bestME[1].ref].getLumaAddr(pu.ctuAddr, pu.cuAbsPartIdx + pu.puAbsPartIdx);
                    intptr_t refStride = slice->m_mref[0][0].lumaStride;
                    primitives.pu[m_me.partEnum].pixelavg_pp[(tmpPredYuv.m_size % 64 == 0) && (refStride % 64 == 0)](tmpPredYuv.m_buf[0], tmpPredYuv.m_size, ref0, refStride, ref1, refStride, 32);
                    satdCost = m_me.bufSATD(tmpPredYuv.m_buf[0], tmpPredYuv.m_size);
                }
                /* per-list bits with the MV bit cost of the winning MV swapped
                 * for the bit cost of the zero MV against the same MVP */
                MV mvp0 = bestME[0].mvp;
                int mvpIdx0 = bestME[0].mvpIdx;
                uint32_t bits0 = bestME[0].bits - m_me.bitcost(bestME[0].mv, mvp0) + m_me.bitcost(mvzero, mvp0);
                MV mvp1 = bestME[1].mvp;
                int mvpIdx1 = bestME[1].mvpIdx;
                uint32_t bits1 = bestME[1].bits - m_me.bitcost(bestME[1].mv, mvp1) + m_me.bitcost(mvzero, mvp1);
                uint32_t cost = satdCost + m_rdCost.getCost(bits0) + m_rdCost.getCost(bits1);
                /* refine the MVP choice for the zero MV; updates mvpIdx, bits, cost */
                mvp0 = checkBestMVP(interMode.amvpCand[0][bestME[0].ref], mvzero, mvpIdx0, bits0, cost);
                mvp1 = checkBestMVP(interMode.amvpCand[1][bestME[1].ref], mvzero, mvpIdx1, bits1, cost);
                if (cost < bidirCost)
                {
                    bidir[0].mv = mvzero;
                    bidir[1].mv = mvzero;
                    bidir[0].mvp = mvp0;
                    bidir[1].mvp = mvp1;
                    bidir[0].mvpIdx = mvpIdx0;
                    bidir[1].mvpIdx = mvpIdx1;
                    bidirCost = cost;
                    bidirBits = bits0 + bits1 + m_listSelBits[2] - (m_listSelBits[0] + m_listSelBits[1]);
                }
            }
        }

        /* ---- Select the cheapest option and store it into the CU ---- */
        if (mrgCost < bidirCost && mrgCost < bestME[0].cost && mrgCost < bestME[1].cost)
        {
            /* merge: must be strictly cheaper than every other option */
            cu.m_mergeFlag[pu.puAbsPartIdx] = true;
            /* the merge candidate index is stored in the list 0 MVP index slot */
            cu.m_mvpIdx[0][pu.puAbsPartIdx] = merge.index;
            cu.setPUInterDir(merge.dir, pu.puAbsPartIdx, puIdx);
            cu.setPUMv(0, merge.mvField[0].mv, pu.puAbsPartIdx, puIdx);
            cu.setPURefIdx(0, merge.mvField[0].refIdx, pu.puAbsPartIdx, puIdx);
            cu.setPUMv(1, merge.mvField[1].mv, pu.puAbsPartIdx, puIdx);
            cu.setPURefIdx(1, merge.mvField[1].refIdx, pu.puAbsPartIdx, puIdx);
            totalmebits += merge.bits;
        }
        else if (bidirCost < bestME[0].cost && bidirCost < bestME[1].cost)
        {
            /* bi-prediction (inter direction 3): MVs and MVPs come from bidir[],
             * reference indices from the uni-directional winners */
            lastMode = 2;
            cu.m_mergeFlag[pu.puAbsPartIdx] = false;
            cu.setPUInterDir(3, pu.puAbsPartIdx, puIdx);
            cu.setPUMv(0, bidir[0].mv, pu.puAbsPartIdx, puIdx);
            cu.setPURefIdx(0, bestME[0].ref, pu.puAbsPartIdx, puIdx);
            cu.m_mvd[0][pu.puAbsPartIdx] = bidir[0].mv - bidir[0].mvp;
            cu.m_mvpIdx[0][pu.puAbsPartIdx] = bidir[0].mvpIdx;
            cu.setPUMv(1, bidir[1].mv, pu.puAbsPartIdx, puIdx);
            cu.setPURefIdx(1, bestME[1].ref, pu.puAbsPartIdx, puIdx);
            cu.m_mvd[1][pu.puAbsPartIdx] = bidir[1].mv - bidir[1].mvp;
            cu.m_mvpIdx[1][pu.puAbsPartIdx] = bidir[1].mvpIdx;
            totalmebits += bidirBits;
        }
        else if (bestME[0].cost <= bestME[1].cost)
        {
            /* list 0 only (inter direction 1); ties go to list 0.
             * List 1 is marked unused. */
            lastMode = 0;
            cu.m_mergeFlag[pu.puAbsPartIdx] = false;
            cu.setPUInterDir(1, pu.puAbsPartIdx, puIdx);
            cu.setPUMv(0, bestME[0].mv, pu.puAbsPartIdx, puIdx);
            cu.setPURefIdx(0, bestME[0].ref, pu.puAbsPartIdx, puIdx);
            cu.m_mvd[0][pu.puAbsPartIdx] = bestME[0].mv - bestME[0].mvp;
            cu.m_mvpIdx[0][pu.puAbsPartIdx] = bestME[0].mvpIdx;
            cu.setPURefIdx(1, REF_NOT_VALID, pu.puAbsPartIdx, puIdx);
            cu.setPUMv(1, mvzero, pu.puAbsPartIdx, puIdx);
            totalmebits += bestME[0].bits;
        }
        else
        {
            /* list 1 only (inter direction 2). List 0 is marked unused. */
            lastMode = 1;
            cu.m_mergeFlag[pu.puAbsPartIdx] = false;
            cu.setPUInterDir(2, pu.puAbsPartIdx, puIdx);
            cu.setPUMv(1, bestME[1].mv, pu.puAbsPartIdx, puIdx);
            cu.setPURefIdx(1, bestME[1].ref, pu.puAbsPartIdx, puIdx);
            cu.m_mvd[1][pu.puAbsPartIdx] = bestME[1].mv - bestME[1].mvp;
            cu.m_mvpIdx[1][pu.puAbsPartIdx] = bestME[1].mvpIdx;
            cu.setPURefIdx(0, REF_NOT_VALID, pu.puAbsPartIdx, puIdx);
            cu.setPUMv(0, mvzero, pu.puAbsPartIdx, puIdx);
            totalmebits += bestME[1].bits;
        }
        /* build the final prediction of this PU from the motion just stored */
        motionCompensation(cu, pu, *predYuv, true, bChromaMC);
    }
    interMode.sa8dBits += totalmebits;
}