From 018b5ffa64e81d1e4bda0acaaaccc11d2083824c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arttu=20Yl=C3=A4-Outinen?= Date: Wed, 17 Jan 2018 13:09:47 +0200 Subject: [PATCH 1/2] Move inter CU reconstruction to a new function Moves code for reconstructing all PUs in an inter CU to a new function kvz_inter_recon_cu in inter.c. --- src/inter.c | 125 ++++++++++++++++++++++++++++++++++----------- src/inter.h | 33 +++++------- src/search.c | 42 +-------------- src/search_inter.c | 24 ++++----- 4 files changed, 121 insertions(+), 103 deletions(-) diff --git a/src/inter.c b/src/inter.c index 51cf1f05..ce9dca48 100644 --- a/src/inter.c +++ b/src/inter.c @@ -306,27 +306,27 @@ static void inter_cp_with_ext_border(const kvz_pixel *ref_buf, int ref_stride, /** - * \brief Reconstruct inter block + * \brief Reconstruct an inter PU using uniprediction. * * \param state encoder state * \param ref picture to copy the data from - * \param xpos block x position - * \param ypos block y position - * \param width block width - * \param height block height + * \param xpos PU x position + * \param ypos PU y position + * \param width PU width + * \param height PU height * \param mv_param motion vector * \param lcu destination lcu - * \param hi_prec_out destination of high precision output (null if not needed) + * \param hi_prec_out destination of high precision output, or NULL if not needed */ -void kvz_inter_recon_lcu(const encoder_state_t * const state, - const kvz_picture * const ref, - int32_t xpos, - int32_t ypos, - int32_t width, - int32_t height, - const int16_t mv_param[2], - lcu_t *lcu, - hi_prec_buf_t *hi_prec_out) +static void inter_recon_unipred(const encoder_state_t * const state, + const kvz_picture * const ref, + int32_t xpos, + int32_t ypos, + int32_t width, + int32_t height, + const int16_t mv_param[2], + lcu_t *lcu, + hi_prec_buf_t *hi_prec_out) { const vector2d_t pu_in_tile = { xpos, ypos }; const vector2d_t pu_in_lcu = { xpos % LCU_WIDTH, ypos % LCU_WIDTH }; @@ -428,27 +428,27 @@ void kvz_inter_recon_lcu(const encoder_state_t * const state, } /** - * \brief Reconstruct bi-pred inter block + * \brief Reconstruct bi-pred inter PU * * \param state encoder state * \param ref1 reference picture to copy the data from * \param ref2 other reference picture to copy the data from - * \param xpos block x position - * \param ypos block y position - * \param width block width - * \param height block height + * \param xpos PU x position + * \param ypos PU y position + * \param width PU width + * \param height PU height * \param mv_param motion vectors * \param lcu destination lcu */ -void kvz_inter_recon_lcu_bipred(const encoder_state_t * const state, - const kvz_picture * ref1, - const kvz_picture * ref2, - int32_t xpos, - int32_t ypos, - int32_t width, - int32_t height, - int16_t mv_param[2][2], - lcu_t* lcu) +void kvz_inter_recon_bipred(const encoder_state_t * const state, + const kvz_picture * ref1, + const kvz_picture * ref2, + int32_t xpos, + int32_t ypos, + int32_t width, + int32_t height, + int16_t mv_param[2][2], + lcu_t* lcu) { kvz_pixel temp_lcu_y[LCU_WIDTH*LCU_WIDTH]; kvz_pixel temp_lcu_u[LCU_WIDTH_C*LCU_WIDTH_C]; @@ -468,7 +468,7 @@ void kvz_inter_recon_lcu_bipred(const encoder_state_t * const state, if (hi_prec_chroma_rec0) high_precision_rec0 = kvz_hi_prec_buf_t_alloc(LCU_WIDTH*LCU_WIDTH); if (hi_prec_chroma_rec1) high_precision_rec1 = kvz_hi_prec_buf_t_alloc(LCU_WIDTH*LCU_WIDTH); //Reconstruct both predictors - kvz_inter_recon_lcu(state, ref1, xpos, ypos, width, height, mv_param[0], lcu, high_precision_rec0); + inter_recon_unipred(state, ref1, xpos, ypos, width, height, mv_param[0], lcu, high_precision_rec0); if (!hi_prec_luma_rec0){ memcpy(temp_lcu_y, lcu->rec.y, sizeof(kvz_pixel) * 64 * 64); } @@ -476,7 +476,7 @@ void kvz_inter_recon_lcu_bipred(const encoder_state_t * const state, memcpy(temp_lcu_u, lcu->rec.u, sizeof(kvz_pixel) * 32 * 32); memcpy(temp_lcu_v, lcu->rec.v, sizeof(kvz_pixel) * 32 * 32); } - kvz_inter_recon_lcu(state, ref2, xpos, ypos, width, height, mv_param[1], lcu, high_precision_rec1); + inter_recon_unipred(state, ref2, xpos, ypos, width, height, mv_param[1], lcu, high_precision_rec1); // After reconstruction, merge the predictors by taking an average of each pixel for (temp_y = 0; temp_y < height; ++temp_y) { @@ -506,6 +506,69 @@ void kvz_inter_recon_lcu_bipred(const encoder_state_t * const state, if (high_precision_rec1 != 0) kvz_hi_prec_buf_t_free(high_precision_rec1); } + +/** + * Reconstruct a single CU. + * + * The CU may consist of multiple PUs, each of which can use either + * uniprediction or biprediction. + * + * \param state encoder state + * \param lcu containing LCU + * \param x x-coordinate of the CU in pixels + * \param y y-coordinate of the CU in pixels + * \param width CU width + */ +void kvz_inter_recon_cu(const encoder_state_t * const state, + lcu_t *lcu, + int32_t x, + int32_t y, + int32_t width) +{ + cu_info_t *cu = LCU_GET_CU_AT_PX(lcu, SUB_SCU(x), SUB_SCU(y)); + + const int num_pu = kvz_part_mode_num_parts[cu->part_size]; + for (int i = 0; i < num_pu; ++i) { + const int pu_x = PU_GET_X(cu->part_size, width, x, i); + const int pu_y = PU_GET_Y(cu->part_size, width, y, i); + const int pu_w = PU_GET_W(cu->part_size, width, i); + const int pu_h = PU_GET_H(cu->part_size, width, i); + + cu_info_t *pu = LCU_GET_CU_AT_PX(lcu, SUB_SCU(pu_x), SUB_SCU(pu_y)); + + if (pu->inter.mv_dir == 3) { + const kvz_picture *const refs[2] = { + state->frame->ref->images[ + state->frame->ref_LX[0][ + pu->inter.mv_ref[0]]], + state->frame->ref->images[ + state->frame->ref_LX[1][ + pu->inter.mv_ref[1]]], + }; + kvz_inter_recon_bipred(state, + refs[0], refs[1], + pu_x, pu_y, + pu_w, pu_h, + pu->inter.mv, + lcu); + } else { + const int mv_idx = pu->inter.mv_dir - 1; + const kvz_picture *const ref = + state->frame->ref->images[ + state->frame->ref_LX[mv_idx][ + pu->inter.mv_ref[mv_idx]]]; + + inter_recon_unipred(state, + ref, + pu_x, pu_y, + pu_w, pu_h, + pu->inter.mv[mv_idx], + lcu, + NULL); + } + } +} + /** * \brief Clear unused L0/L1 motion vectors and reference * \param cu coding unit to clear diff --git a/src/inter.h b/src/inter.h index ee324fc1..3380cb1d 100644 --- a/src/inter.h +++ b/src/inter.h @@ -40,26 +40,21 @@ typedef struct { } inter_merge_cand_t; +void kvz_inter_recon_cu(const encoder_state_t * const state, + lcu_t *lcu, + int32_t x, + int32_t y, + int32_t width); -void kvz_inter_recon_lcu(const encoder_state_t * const state, - const kvz_picture * ref, - int32_t xpos, - int32_t ypos, - int32_t width, - int32_t height, - const int16_t mv_param[2], - lcu_t* lcu, - hi_prec_buf_t *hi_prec_out); - -void kvz_inter_recon_lcu_bipred(const encoder_state_t * const state, - const kvz_picture * ref1, - const kvz_picture * ref2, - int32_t xpos, - int32_t ypos, - int32_t width, - int32_t height, - int16_t mv_param[2][2], - lcu_t* lcu); +void kvz_inter_recon_bipred(const encoder_state_t * const state, + const kvz_picture * ref1, + const kvz_picture * ref2, + int32_t xpos, + int32_t ypos, + int32_t width, + int32_t height, + int16_t mv_param[2][2], + lcu_t* lcu); void kvz_inter_get_mv_cand(const encoder_state_t * const state, int32_t x, diff --git a/src/search.c b/src/search.c index 9943570c..31aafc87 100644 --- a/src/search.c +++ b/src/search.c @@ -538,47 +538,7 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, } kvz_lcu_set_trdepth(lcu, x, y, depth, tr_depth); - const int num_pu = kvz_part_mode_num_parts[cur_cu->part_size]; - for (int i = 0; i < num_pu; ++i) { - const int pu_x = PU_GET_X(cur_cu->part_size, cu_width, x, i); - const int pu_y = PU_GET_Y(cur_cu->part_size, cu_width, y, i); - const int pu_w = PU_GET_W(cur_cu->part_size, cu_width, i); - const int pu_h = PU_GET_H(cur_cu->part_size, cu_width, i); - - cu_info_t *cur_pu = LCU_GET_CU_AT_PX(lcu, SUB_SCU(pu_x), SUB_SCU(pu_y)); - - if (cur_pu->inter.mv_dir == 3) { - const kvz_picture *const refs[2] = { - state->frame->ref->images[ - state->frame->ref_LX[0][ - cur_pu->inter.mv_ref[0]]], - state->frame->ref->images[ - state->frame->ref_LX[1][ - cur_pu->inter.mv_ref[1]]], - }; - kvz_inter_recon_lcu_bipred(state, - refs[0], refs[1], - pu_x, pu_y, - pu_w, pu_h, - cur_pu->inter.mv, - lcu); - } else { - const int mv_idx = cur_pu->inter.mv_dir - 1; - - const kvz_picture *const ref = - state->frame->ref->images[ - state->frame->ref_LX[mv_idx][ - cur_pu->inter.mv_ref[mv_idx]]]; - - kvz_inter_recon_lcu(state, - ref, - pu_x, pu_y, - pu_w, pu_h, - cur_pu->inter.mv[mv_idx], - lcu, - 0); - } - } + kvz_inter_recon_cu(state, lcu, x, y, cu_width); const bool has_chroma = state->encoder_control->chroma_format != KVZ_CSP_400; kvz_quantize_lcu_residual(state, diff --git a/src/search_inter.c b/src/search_inter.c index 306f89e1..7e659c3b 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -1495,18 +1495,18 @@ static void search_pu_inter(encoder_state_t * const state, continue; } - kvz_inter_recon_lcu_bipred(state, - state->frame->ref->images[ - state->frame->ref_LX[0][merge_cand[i].ref[0]] - ], - state->frame->ref->images[ - state->frame->ref_LX[1][merge_cand[j].ref[1]] - ], - x, y, - width, - height, - mv, - templcu); + kvz_inter_recon_bipred(state, + state->frame->ref->images[ + state->frame->ref_LX[0][merge_cand[i].ref[0]] + ], + state->frame->ref->images[ + state->frame->ref_LX[1][merge_cand[j].ref[1]] + ], + x, y, + width, + height, + mv, + templcu); for (int ypos = 0; ypos < height; ++ypos) { int dst_y = ypos * width; From 8c534170064cabd28dbe6d16ff37bba5793d94a5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arttu=20Yl=C3=A4-Outinen?= Date: Wed, 17 Jan 2018 13:39:20 +0200 Subject: [PATCH 2/2] Check zero coefficient cost for inter Checks the cost of flushing all coefficients of an inter block to zero. This is much faster than doing full RDOQ but can still reduce bitrate significantly. Encoding speed is increased since fewer coefficient bits have to be coded with CABAC. --- src/search.c | 51 +++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 49 insertions(+), 2 deletions(-) diff --git a/src/search.c b/src/search.c index 31aafc87..c02aeebe 100644 --- a/src/search.c +++ b/src/search.c @@ -392,6 +392,7 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, const videoframe_t * const frame = state->tile->frame; int cu_width = LCU_WIDTH >> depth; double cost = MAX_INT; + double inter_zero_coeff_cost = MAX_INT; uint32_t inter_bitcost = MAX_INT; cu_info_t *cur_cu; @@ -518,7 +519,7 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, // rd2. Possibly because the luma mode search already takes chroma // into account, so there is less of a chanse of luma mode being // really bad for chroma. - if (state->encoder_control->cfg.rdo == 3) { + if (ctrl->cfg.rdo == 3) { cur_cu->intra.mode_chroma = kvz_search_cu_intra_chroma(state, x, y, depth, lcu); lcu_fill_cu_info(lcu, x_local, y_local, cu_width, cu_width, cur_cu); } @@ -540,6 +541,30 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, kvz_inter_recon_cu(state, lcu, x, y, cu_width); + if (!ctrl->cfg.lossless && !ctrl->cfg.rdoq_enable) { + const int luma_index = y_local * LCU_WIDTH + x_local; + const int chroma_index = (y_local / 2) * LCU_WIDTH_C + (x_local / 2); + + double ssd = 0.0; + ssd += LUMA_MULT * kvz_pixels_calc_ssd( + &lcu->ref.y[luma_index], &lcu->rec.y[luma_index], + LCU_WIDTH, LCU_WIDTH, cu_width + ); + ssd += CHROMA_MULT * kvz_pixels_calc_ssd( + &lcu->ref.u[chroma_index], &lcu->rec.u[chroma_index], + LCU_WIDTH_C, LCU_WIDTH_C, cu_width / 2 + ); + ssd += CHROMA_MULT * kvz_pixels_calc_ssd( + &lcu->ref.v[chroma_index], &lcu->rec.v[chroma_index], + LCU_WIDTH_C, LCU_WIDTH_C, cu_width / 2 + ); + + inter_zero_coeff_cost = ssd + inter_bitcost * state->lambda; + + // Save the pixels at a lower level of the working tree. + copy_cu_pixels(x_local, y_local, cu_width, lcu, &work_tree[depth + 1]); + } + const bool has_chroma = state->encoder_control->chroma_format != KVZ_CSP_400; kvz_quantize_lcu_residual(state, true, has_chroma, @@ -549,7 +574,7 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, int cbf = cbf_is_set_any(cur_cu->cbf, depth); - if(cur_cu->merged && !cbf && cur_cu->part_size == SIZE_2Nx2N) { + if (cur_cu->merged && !cbf && cur_cu->part_size == SIZE_2Nx2N) { cur_cu->merged = 0; cur_cu->skipped = 1; // Selecting skip reduces bits needed to code the CU @@ -575,6 +600,28 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, } cost += mode_bits * state->lambda; + + if (inter_zero_coeff_cost <= cost) { + cost = inter_zero_coeff_cost; + + // Restore saved pixels from lower level of the working tree. + copy_cu_pixels(x_local, y_local, cu_width, &work_tree[depth + 1], lcu); + + if (cur_cu->merged && cur_cu->part_size == SIZE_2Nx2N) { + cur_cu->merged = 0; + cur_cu->skipped = 1; + lcu_fill_cu_info(lcu, x_local, y_local, cu_width, cu_width, cur_cu); + } + + if (cur_cu->tr_depth != depth) { + // Reset transform depth since there are no coefficients. This + // ensures that CBF is cleared for the whole area of the CU. + kvz_lcu_set_trdepth(lcu, x, y, depth, depth); + } + + cur_cu->cbf = 0; + lcu_set_coeff(lcu, x_local, y_local, cu_width, cur_cu); + } } bool can_split_cu =