/* Estimate the cost of one CU of frame m_frames[b] predicted from m_frames[p0]
 * (list 0) and, when b < p1, from m_frames[p1] (list 1).
 *
 *   tld       - supplies the motion estimator `me` used for the source PU setup,
 *               the SATD measurements and the motion search
 *   cuX, cuY  - CU coordinates in the grid selected by `hme`
 *               (m_4x4Width x m_4x4Height when hme, else m_8x8Width x m_8x8Height)
 *   p0, p1, b - indices into m_frames of the list-0 reference, the list-1
 *               reference and the frame being estimated
 *   bDoSearch - per list: when false, the cost already stored for that list is
 *               reused instead of running a motion search
 *   lastRow   - when true, no MV candidates are taken from the row below
 *   slice     - when negative, totals are accumulated on the frame itself;
 *               otherwise they are accumulated in m_slice[slice]
 *   hme       - when true, the search runs on the lowerRes planes/arrays and the
 *               function returns right after the per-list search without
 *               touching any of the frame/slice/row cost totals
 */
void CostEstimateGroup::estimateCUCost(LookaheadTLD& tld, int cuX, int cuY, int p0, int p1, int b, bool bDoSearch[2], bool lastRow, int slice, bool hme)
{
    Lowres* fenc = m_frames[b];
    Lowres* fref0 = m_frames[p0];
    Lowres* fref1 = m_frames[p1];

    /* distance from the encoded frame to its reference, per list */
    const int listDist[2] = { b - p0, p1 - b };

    /* list 0 reads from the weighted planes when they are flagged as weighted and
     * this is not the hme pass; otherwise it reads from the reference frame itself */
    ReferencePlanes* wfref0 = fref0;
    if (fenc->weightedRef[listDist[0]].isWeighted && !hme)
        wfref0 = &fenc->weightedRef[listDist[0]];

    const int cuSize = X265_LOWRES_CU_SIZE;
    const int bBidir = (b < p1);
    const int widthInCU = hme ? m_lookahead.m_4x4Width : m_lookahead.m_8x8Width;
    const int heightInCU = hme ? m_lookahead.m_4x4Height : m_lookahead.m_8x8Height;
    const int cuXY = cuY * widthInCU + cuX;
    /* index of the co-located entry in the lowerRes arrays (half the coordinates, half the width) */
    const int cuXY_4x4 = (cuY / 2) * widthInCU / 2 + (cuX / 2);
    const intptr_t pelOffset = cuSize * cuX + cuSize * cuY * (hme ? fenc->lumaStride/2 : fenc->lumaStride);

    /* the source block is only needed when something is going to be measured against it */
    if (bBidir || bDoSearch[0] || bDoSearch[1])
    {
        if (hme)
            tld.me.setSourcePU(fenc->lowerResPlane[0], fenc->lumaStride / 2, pelOffset, cuSize, cuSize, X265_HEX_SEARCH, m_lookahead.m_param->hmeSearchMethod[0], m_lookahead.m_param->hmeSearchMethod[1], 1);
        else
            tld.me.setSourcePU(fenc->lowresPlane[0], fenc->lumaStride, pelOffset, cuSize, cuSize, X265_HEX_SEARCH, m_lookahead.m_param->hmeSearchMethod[0], m_lookahead.m_param->hmeSearchMethod[1], 1);
    }

    /* constant bias added to the best cost before it is compared with intra / stored */
    const int lowresPenalty = 4;

    int bcost = tld.me.COST_MAX;
    int listused = 0;

    /* search bounds derived from the CU position inside the grid, plus a margin of 8 */
    MV mvmin, mvmax;
    mvmin.x = (int32_t)(-cuX * cuSize - 8);
    mvmin.y = (int32_t)(-cuY * cuSize - 8);
    mvmax.x = (int32_t)((widthInCU - cuX - 1) * cuSize + 8);
    mvmax.y = (int32_t)((heightInCU - cuY - 1) * cuSize + 8);

    const int numLists = 1 + bBidir;
    for (int list = 0; list < numLists; list++)
    {
        const int dist = listDist[list];
        int& listCost = hme ? fenc->lowerResMvCosts[list][dist][cuXY] : fenc->lowresMvCosts[list][dist][cuXY];

        if (!bDoSearch[list])
        {
            /* no search requested for this list: reuse the stored cost */
            COPY2_IF_LT(bcost, listCost, listused, list + 1);
            continue;
        }

        int skipCost = INT_MAX;
        MV* fencMV = hme ? &fenc->lowerResMvs[list][dist][cuXY] : &fenc->lowresMvs[list][dist][cuXY];
        ReferencePlanes* fref = list ? fref1 : wfref0;

        /* gather MV candidates from the right neighbour and from the row below */
        MV mvc[5];
        int numc = 0;
        const bool hasRight = cuX < widthInCU - 1;
        if (hasRight)
            mvc[numc++] = fencMV[1];
        if (!lastRow)
        {
            mvc[numc++] = fencMV[widthInCU];
            if (cuX > 0)
                mvc[numc++] = fencMV[widthInCU - 1];
            if (hasRight)
                mvc[numc++] = fencMV[widthInCU + 1];
        }
        /* outside the hme pass, also try the doubled lowerRes MV when one with a positive cost is stored */
        if (fenc->lowerResMvs[0][0] && !hme && fenc->lowerResMvCosts[list][dist][cuXY_4x4] > 0)
        {
            mvc[numc++] = (fenc->lowerResMvs[list][dist][cuXY_4x4]) * 2;
        }

        MV mvp;
        if (numc)
        {
            /* measure the SATD of every candidate and keep the cheapest one as the predictor */
            ALIGN_VAR_32(pixel, subpelbuf[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE]);
            int mvpcost = MotionEstimate::COST_MAX;
            for (int c = 0; c < numc; c++)
            {
                intptr_t stride = X265_LOWRES_CU_SIZE;
                pixel *pred = fref->lowresMC(pelOffset, mvc[c], subpelbuf, stride, hme);
                int candCost = tld.me.bufSATD(pred, stride);
                COPY2_IF_LT(mvpcost, candCost, mvp, mvc[c]);
                /* NOTE(review): this tests the running best predictor, not mvc[c], so
                 * skipCost can end up holding the cost of a non-zero candidate measured
                 * while the best predictor was zero - confirm this is intended */
                if (!mvp.notZero() && bBidir)
                    skipCost = candCost;
            }
        }
        else
            mvp = 0;

        /* the hme pass additionally hands the reference's lowerRes plane to the search */
        if (hme)
            listCost = tld.me.motionEstimate(fref, mvmin, mvmax, mvp, 0, NULL, s_merange, *fencMV, m_lookahead.m_param->maxSlices, fref->lowerResPlane[0]);
        else
            listCost = tld.me.motionEstimate(fref, mvmin, mvmax, mvp, 0, NULL, s_merange, *fencMV, m_lookahead.m_param->maxSlices);

        /* for B frames, a sufficiently cheap skip estimate overrides the search result with a zero MV */
        if (bBidir && skipCost < 64 && skipCost < listCost)
        {
            listCost = skipCost;
            *fencMV = 0;
        }
        COPY2_IF_LT(bcost, listCost, listused, list + 1);
    }

    /* the hme pass only fills the lowerRes MVs and costs */
    if (hme)
        return;

    if (bBidir)
    {
        /* candidate 1: average of the two motion-compensated predictions
         * (taken from fref0 directly, not from wfref0) */
        ALIGN_VAR_32(pixel, subpelbuf0[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE]);
        ALIGN_VAR_32(pixel, subpelbuf1[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE]);
        ALIGN_VAR_32(pixel, ref[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE]);
        intptr_t stride0 = X265_LOWRES_CU_SIZE;
        intptr_t stride1 = X265_LOWRES_CU_SIZE;
        pixel *src0 = fref0->lowresMC(pelOffset, fenc->lowresMvs[0][listDist[0]][cuXY], subpelbuf0, stride0, 0);
        pixel *src1 = fref1->lowresMC(pelOffset, fenc->lowresMvs[1][listDist[1]][cuXY], subpelbuf1, stride1, 0);
        primitives.pu[LUMA_8x8].pixelavg_pp[NONALIGNED](ref, X265_LOWRES_CU_SIZE, src0, stride0, src1, stride1, 32);
        int bicost = tld.me.bufSATD(ref, X265_LOWRES_CU_SIZE);
        COPY2_IF_LT(bcost, bicost, listused, 3);

        /* candidate 2: average of the two co-located (zero displacement) blocks */
        src0 = fref0->lowresPlane[0] + pelOffset;
        src1 = fref1->lowresPlane[0] + pelOffset;
        primitives.pu[LUMA_8x8].pixelavg_pp[NONALIGNED](ref, X265_LOWRES_CU_SIZE, src0, fref0->lumaStride, src1, fref1->lumaStride, 32);
        bicost = tld.me.bufSATD(ref, X265_LOWRES_CU_SIZE);
        COPY2_IF_LT(bcost, bicost, listused, 3);

        bcost += lowresPenalty;
    }
    else
    {
        /* single-list case: the penalised inter cost competes with the stored intra cost */
        bcost += lowresPenalty;
        if (fenc->intraCost[cuXY] < bcost)
        {
            bcost = fenc->intraCost[cuXY];
            listused = 0;
        }
    }

    /* CUs on the grid border are left out of the frame/slice totals, unless the grid
     * is at most two CUs wide or tall */
    const bool bInterior = cuX > 0 && cuX < widthInCU - 1 && cuY > 0 && cuY < heightInCU - 1;
    const bool bFrameScoreCU = bInterior || widthInCU <= 2 || heightInCU <= 2;

    /* scale the cost by the per-CU inverse qscale factor (8.8 fixed point with rounding) */
    int bcostAq = bcost;
    if (bFrameScoreCU && fenc->invQscaleFactor)
    {
        /* NOTE(review): the qgSize == 8 path reads invQscaleFactor8x8 while only
         * invQscaleFactor is null-checked - presumably both are allocated together; confirm */
        if (m_lookahead.m_param->rc.qgSize == 8)
            bcostAq = (bcost * fenc->invQscaleFactor8x8[cuXY] + 128) >> 8;
        else
            bcostAq = (bcost * fenc->invQscaleFactor[cuXY] + 128) >> 8;
    }

    if (bFrameScoreCU)
    {
        /* counted as intra when no list won and the frame has a single reference list */
        const bool bIntraCU = !listused && !bBidir;
        if (slice < 0)
        {
            fenc->costEst[listDist[0]][listDist[1]] += bcost;
            fenc->costEstAq[listDist[0]][listDist[1]] += bcostAq;
            if (bIntraCU)
                fenc->intraMbs[listDist[0]]++;
        }
        else
        {
            m_slice[slice].costEst += bcost;
            m_slice[slice].costEstAq += bcostAq;
            if (bIntraCU)
                m_slice[slice].intraMbs++;
        }
    }

    /* row totals include every CU; the per-CU entry packs the clamped cost with the list used */
    fenc->rowSatds[listDist[0]][listDist[1]][cuY] += bcostAq;
    fenc->lowresCosts[listDist[0]][listDist[1]][cuXY] = (uint16_t)(X265_MIN(bcost, LOWRES_COST_MASK) | (listused << LOWRES_COST_SHIFT));
}