Relax inter-CTU dependencies when SAO is off

When using WPP and OWF, the first CTU of a row depends on the last CTU
of the row below in the reference frame. This is necessary when SAO is
enabled since we currently do SAO for a whole CTU row at a time. When
SAO is disabled, however, it is unnecessary to wait for the whole row.

Change each CTU to depend only on the CTU below it in the reference
frame (or on the CTU below and to the right, when one exists) instead
of on the whole row when WPP and OWF are enabled and SAO is disabled.
This gives a significant speedup when running on a machine with many
CPU cores.
This commit is contained in:
Arttu Ylä-Outinen 2017-06-20 16:31:04 +03:00
parent 1efa2708b2
commit bb5354f7e2
3 changed files with 69 additions and 12 deletions

View file

@ -795,9 +795,10 @@ static void encoder_state_encode_leaf(encoder_state_t * const state)
#endif
kvz_threadqueue_free_job(&state->tile->wf_jobs[lcu->id]);
state->tile->wf_jobs[lcu->id] = kvz_threadqueue_submit(state->encoder_control->threadqueue, encoder_state_worker_encode_lcu, (void*)lcu, 1, job_description);
threadqueue_job_t **job = &state->tile->wf_jobs[lcu->id];
// If a job object was returned, add dependencies and allow it to run.
if (state->tile->wf_jobs[lcu->id]) {
if (job[0]) {
// Add inter-frame dependencies when encoding more than one frame at
// once. The added dependency is for the first LCU of each wavefront
// row to depend on the reconstruction status of the row below in the
@ -806,26 +807,33 @@ static void encoder_state_encode_leaf(encoder_state_t * const state)
state->previous_encoder_state->tqj_recon_done &&
state->frame->slicetype != KVZ_SLICE_I)
{
if (!lcu->left) {
const lcu_order_element_t * const ref_lcu = &ref_state->lcu_order[i];
if (lcu->below) {
kvz_threadqueue_job_dep_add(state->tile->wf_jobs[lcu->id], ref_lcu->below->encoder_state->tqj_recon_done);
// We need to wait until the CTUs whose pixels we refer to are
// done before we can start this CTU.
if (lcu->below) {
if (lcu->below->right) {
kvz_threadqueue_job_dep_add(job[0], ref_state->tile->wf_jobs[lcu->below->right->id]);
} else {
kvz_threadqueue_job_dep_add(state->tile->wf_jobs[lcu->id], ref_lcu->encoder_state->tqj_recon_done);
kvz_threadqueue_job_dep_add(job[0], ref_state->tile->wf_jobs[lcu->below->id]);
}
} else {
if (lcu->right) {
kvz_threadqueue_job_dep_add(job[0], ref_state->tile->wf_jobs[lcu->right->id]);
} else {
kvz_threadqueue_job_dep_add(job[0], ref_state->tile->wf_jobs[lcu->id]);
}
}
}
// Add local WPP dependency to the LCU on the left.
if (lcu->left) {
kvz_threadqueue_job_dep_add(state->tile->wf_jobs[lcu->id], state->tile->wf_jobs[lcu->id - 1]);
kvz_threadqueue_job_dep_add(job[0], job[-1]);
}
// Add local WPP dependency to the LCU on the top right.
if (lcu->above) {
if (lcu->above->right) {
kvz_threadqueue_job_dep_add(state->tile->wf_jobs[lcu->id], state->tile->wf_jobs[lcu->id - state->tile->frame->width_in_lcu + 1]);
kvz_threadqueue_job_dep_add(job[0], job[-state->tile->frame->width_in_lcu + 1]);
} else {
kvz_threadqueue_job_dep_add(state->tile->wf_jobs[lcu->id], state->tile->wf_jobs[lcu->id - state->tile->frame->width_in_lcu]);
kvz_threadqueue_job_dep_add(job[0], job[-state->tile->frame->width_in_lcu]);
}
}

View file

@ -1005,6 +1005,7 @@ static INLINE bool add_mvp_candidate(const encoder_state_t *state,
{
if (!cand) return false;
assert(cand->inter.mv_dir != 0);
const int cand_list = cand->inter.mv_dir & (1 << reflist) ? reflist : !reflist;
if (scaling) {

View file

@ -40,9 +40,50 @@
*/
static INLINE bool fracmv_within_tile(const encoder_state_t *state, const vector2d_t* orig, int x, int y, int width, int height, int wpp_limit)
{
const encoder_control_t *ctrl = state->encoder_control;
if (ctrl->cfg.owf && ctrl->cfg.wpp) {
// Check that the block does not reference pixels that are not final.
// Fractional motion estimation and odd chroma interpolation need
// 4 pixels below the bottom edge of the block.
int margin = 4;
if (ctrl->cfg.sao_enable) {
// Make sure we don't refer to pixels for which SAO reconstruction
// has not been done.
margin += SAO_DELAY_PX;
} else if (ctrl->cfg.deblock_enable) {
// Make sure we don't refer to pixels that have not been deblocked.
margin += DEBLOCK_DELAY_PX;
}
// Coordinates of the top-left corner of the containing LCU.
const vector2d_t orig_lcu = {
.x = orig->x / LCU_WIDTH,
.y = orig->y / LCU_WIDTH,
};
// Difference between the coordinates of the LCU containing the
// bottom-left corner of the referenced block and the LCU containing
// this block.
const vector2d_t mv_lcu = {
.x = (((orig->x + width + margin) << 2) + x) / (LCU_WIDTH << 2) - orig_lcu.x,
.y = (((orig->y + height + margin) << 2) + y) / (LCU_WIDTH << 2) - orig_lcu.y,
};
// TODO: Remove hard coded constants.
if (mv_lcu.y > 1) {
return false;
}
// TODO: Remove hard coded constants.
if (mv_lcu.x + mv_lcu.y > 2) {
return false;
}
}
if (state->encoder_control->cfg.mv_constraint == KVZ_MV_CONSTRAIN_NONE) {
return (wpp_limit == -1 || y + (height << 2) <= (wpp_limit << 2));
};
return true;
}
int margin = 0;
if (state->encoder_control->cfg.mv_constraint == KVZ_MV_CONSTRAIN_FRAME_AND_TILE_MARGIN) {
@ -1711,6 +1752,13 @@ void kvz_search_cu_smp(encoder_state_t * const state,
search_pu_inter(state, x, y, depth, part_mode, i, lcu, &cost, &bitcost);
if (cost >= MAX_INT) {
// Could not find any motion vector.
*inter_cost = MAX_INT;
*inter_bitcost = MAX_INT;
return;
}
*inter_cost += cost;
*inter_bitcost += bitcost;