From bb5354f7e2bec32e9403e17381b4673fd4c1049e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arttu=20Yl=C3=A4-Outinen?= Date: Tue, 20 Jun 2017 16:31:04 +0300 Subject: [PATCH] Relax inter-CTU dependencies when SAO is off When using WPP and OWF, the first CTU of a row depends on the last CTU of the row below in the reference frame. This is necessary when SAO is enabled since we currently do SAO for a whole CTU row at a time. When SAO is disabled, however, it is unnecessary to wait for the whole row. Changes CTUs to depend only on the CTU below in the reference frame instead of the whole row when WPP and OWF are enabled and SAO disabled. Gives a significant speedup when running on a machine with many CPU cores. --- src/encoderstate.c | 28 ++++++++++++++++--------- src/inter.c | 1 + src/search_inter.c | 52 ++++++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 69 insertions(+), 12 deletions(-) diff --git a/src/encoderstate.c b/src/encoderstate.c index 71a068b0..da37cd20 100644 --- a/src/encoderstate.c +++ b/src/encoderstate.c @@ -795,9 +795,10 @@ static void encoder_state_encode_leaf(encoder_state_t * const state) #endif kvz_threadqueue_free_job(&state->tile->wf_jobs[lcu->id]); state->tile->wf_jobs[lcu->id] = kvz_threadqueue_submit(state->encoder_control->threadqueue, encoder_state_worker_encode_lcu, (void*)lcu, 1, job_description); - + threadqueue_job_t **job = &state->tile->wf_jobs[lcu->id]; + // If job object was returned, add dependancies and allow it to run. - if (state->tile->wf_jobs[lcu->id]) { + if (job[0]) { // Add inter frame dependancies when ecoding more than one frame at // once. 
The added dependancy is for the first LCU of each wavefront // row to depend on the reconstruction status of the row below in the @@ -806,26 +807,33 @@ static void encoder_state_encode_leaf(encoder_state_t * const state) state->previous_encoder_state->tqj_recon_done && state->frame->slicetype != KVZ_SLICE_I) { - if (!lcu->left) { - const lcu_order_element_t * const ref_lcu = &ref_state->lcu_order[i]; - if (lcu->below) { - kvz_threadqueue_job_dep_add(state->tile->wf_jobs[lcu->id], ref_lcu->below->encoder_state->tqj_recon_done); + // We need to wait until the CTUs whose pixels we refer to are + // done before we can start this CTU. + if (lcu->below) { + if (lcu->below->right) { + kvz_threadqueue_job_dep_add(job[0], ref_state->tile->wf_jobs[lcu->below->right->id]); } else { - kvz_threadqueue_job_dep_add(state->tile->wf_jobs[lcu->id], ref_lcu->encoder_state->tqj_recon_done); + kvz_threadqueue_job_dep_add(job[0], ref_state->tile->wf_jobs[lcu->below->id]); + } + } else { + if (lcu->right) { + kvz_threadqueue_job_dep_add(job[0], ref_state->tile->wf_jobs[lcu->right->id]); + } else { + kvz_threadqueue_job_dep_add(job[0], ref_state->tile->wf_jobs[lcu->id]); } } } // Add local WPP dependancy to the LCU on the left. if (lcu->left) { - kvz_threadqueue_job_dep_add(state->tile->wf_jobs[lcu->id], state->tile->wf_jobs[lcu->id - 1]); + kvz_threadqueue_job_dep_add(job[0], job[-1]); } // Add local WPP dependancy to the LCU on the top right. 
if (lcu->above) { if (lcu->above->right) { - kvz_threadqueue_job_dep_add(state->tile->wf_jobs[lcu->id], state->tile->wf_jobs[lcu->id - state->tile->frame->width_in_lcu + 1]); + kvz_threadqueue_job_dep_add(job[0], job[-state->tile->frame->width_in_lcu + 1]); } else { - kvz_threadqueue_job_dep_add(state->tile->wf_jobs[lcu->id], state->tile->wf_jobs[lcu->id - state->tile->frame->width_in_lcu]); + kvz_threadqueue_job_dep_add(job[0], job[-state->tile->frame->width_in_lcu]); } } diff --git a/src/inter.c b/src/inter.c index d0c90a88..adc16507 100644 --- a/src/inter.c +++ b/src/inter.c @@ -1005,6 +1005,7 @@ static INLINE bool add_mvp_candidate(const encoder_state_t *state, { if (!cand) return false; + assert(cand->inter.mv_dir != 0); const int cand_list = cand->inter.mv_dir & (1 << reflist) ? reflist : !reflist; if (scaling) { diff --git a/src/search_inter.c b/src/search_inter.c index 78e39978..4ff55c2c 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -40,9 +40,50 @@ */ static INLINE bool fracmv_within_tile(const encoder_state_t *state, const vector2d_t* orig, int x, int y, int width, int height, int wpp_limit) { + const encoder_control_t *ctrl = state->encoder_control; + + if (ctrl->cfg.owf && ctrl->cfg.wpp) { + // Check that the block does not reference pixels that are not final. + + // Fractional motion estimation and odd chroma interpolation need + // 4 pixels below the bottom edge of the block. + int margin = 4; + if (ctrl->cfg.sao_enable) { + // Make sure we don't refer to pixels for which SAO reconstruction + // has not been done. + margin += SAO_DELAY_PX; + } else if (ctrl->cfg.deblock_enable) { + // Make sure we don't refer to pixels that have not been deblocked. + margin += DEBLOCK_DELAY_PX; + } + + // Coordinates of the top-left corner of the containing LCU. 
+ const vector2d_t orig_lcu = { + .x = orig->x / LCU_WIDTH, + .y = orig->y / LCU_WIDTH, + }; + // Difference between the coordinates of the LCU containing the + // bottom-right corner of the referenced block and the LCU containing + // this block. + const vector2d_t mv_lcu = { + .x = (((orig->x + width + margin) << 2) + x) / (LCU_WIDTH << 2) - orig_lcu.x, + .y = (((orig->y + height + margin) << 2) + y) / (LCU_WIDTH << 2) - orig_lcu.y, + }; + + // TODO: Remove hard coded constants. + if (mv_lcu.y > 1) { + return false; + } + + // TODO: Remove hard coded constants. + if (mv_lcu.x + mv_lcu.y > 2) { + return false; + } + } + if (state->encoder_control->cfg.mv_constraint == KVZ_MV_CONSTRAIN_NONE) { - return (wpp_limit == -1 || y + (height << 2) <= (wpp_limit << 2)); - }; + return true; + } int margin = 0; if (state->encoder_control->cfg.mv_constraint == KVZ_MV_CONSTRAIN_FRAME_AND_TILE_MARGIN) { @@ -1711,6 +1752,13 @@ void kvz_search_cu_smp(encoder_state_t * const state, search_pu_inter(state, x, y, depth, part_mode, i, lcu, &cost, &bitcost); + if (cost >= MAX_INT) { + // Could not find any motion vector. + *inter_cost = MAX_INT; + *inter_bitcost = MAX_INT; + return; + } + *inter_cost += cost; *inter_bitcost += bitcost;