diff --git a/src/inter.c b/src/inter.c index 51cf1f05..ce9dca48 100644 --- a/src/inter.c +++ b/src/inter.c @@ -306,27 +306,27 @@ static void inter_cp_with_ext_border(const kvz_pixel *ref_buf, int ref_stride, /** - * \brief Reconstruct inter block + * \brief Reconstruct an inter PU using uniprediction. * * \param state encoder state * \param ref picture to copy the data from - * \param xpos block x position - * \param ypos block y position - * \param width block width - * \param height block height + * \param xpos PU x position + * \param ypos PU y position + * \param width PU width + * \param height PU height * \param mv_param motion vector * \param lcu destination lcu - * \param hi_prec_out destination of high precision output (null if not needed) + * \param hi_prec_out destination of high precision output, or NULL if not needed */ -void kvz_inter_recon_lcu(const encoder_state_t * const state, - const kvz_picture * const ref, - int32_t xpos, - int32_t ypos, - int32_t width, - int32_t height, - const int16_t mv_param[2], - lcu_t *lcu, - hi_prec_buf_t *hi_prec_out) +static void inter_recon_unipred(const encoder_state_t * const state, + const kvz_picture * const ref, + int32_t xpos, + int32_t ypos, + int32_t width, + int32_t height, + const int16_t mv_param[2], + lcu_t *lcu, + hi_prec_buf_t *hi_prec_out) { const vector2d_t pu_in_tile = { xpos, ypos }; const vector2d_t pu_in_lcu = { xpos % LCU_WIDTH, ypos % LCU_WIDTH }; @@ -428,27 +428,27 @@ void kvz_inter_recon_lcu(const encoder_state_t * const state, } /** - * \brief Reconstruct bi-pred inter block + * \brief Reconstruct bi-pred inter PU * * \param state encoder state * \param ref1 reference picture to copy the data from * \param ref2 other reference picture to copy the data from - * \param xpos block x position - * \param ypos block y position - * \param width block width - * \param height block height + * \param xpos PU x position + * \param ypos PU y position + * \param width PU width + * \param height PU height * \param mv_param motion vectors * \param lcu destination lcu */ -void kvz_inter_recon_lcu_bipred(const encoder_state_t * const state, - const kvz_picture * ref1, - const kvz_picture * ref2, - int32_t xpos, - int32_t ypos, - int32_t width, - int32_t height, - int16_t mv_param[2][2], - lcu_t* lcu) +void kvz_inter_recon_bipred(const encoder_state_t * const state, + const kvz_picture * ref1, + const kvz_picture * ref2, + int32_t xpos, + int32_t ypos, + int32_t width, + int32_t height, + int16_t mv_param[2][2], + lcu_t* lcu) { kvz_pixel temp_lcu_y[LCU_WIDTH*LCU_WIDTH]; kvz_pixel temp_lcu_u[LCU_WIDTH_C*LCU_WIDTH_C]; @@ -468,7 +468,7 @@ void kvz_inter_recon_lcu_bipred(const encoder_state_t * const state, if (hi_prec_chroma_rec0) high_precision_rec0 = kvz_hi_prec_buf_t_alloc(LCU_WIDTH*LCU_WIDTH); if (hi_prec_chroma_rec1) high_precision_rec1 = kvz_hi_prec_buf_t_alloc(LCU_WIDTH*LCU_WIDTH); //Reconstruct both predictors - kvz_inter_recon_lcu(state, ref1, xpos, ypos, width, height, mv_param[0], lcu, high_precision_rec0); + inter_recon_unipred(state, ref1, xpos, ypos, width, height, mv_param[0], lcu, high_precision_rec0); if (!hi_prec_luma_rec0){ memcpy(temp_lcu_y, lcu->rec.y, sizeof(kvz_pixel) * 64 * 64); } @@ -476,7 +476,7 @@ void kvz_inter_recon_lcu_bipred(const encoder_state_t * const state, memcpy(temp_lcu_u, lcu->rec.u, sizeof(kvz_pixel) * 32 * 32); memcpy(temp_lcu_v, lcu->rec.v, sizeof(kvz_pixel) * 32 * 32); } - kvz_inter_recon_lcu(state, ref2, xpos, ypos, width, height, mv_param[1], lcu, high_precision_rec1); + inter_recon_unipred(state, ref2, xpos, ypos, width, height, mv_param[1], lcu, high_precision_rec1); // After reconstruction, merge the predictors by taking an average of each pixel for (temp_y = 0; temp_y < height; ++temp_y) { @@ -506,6 +506,69 @@ void kvz_inter_recon_lcu_bipred(const encoder_state_t * const state, if (high_precision_rec1 != 0) kvz_hi_prec_buf_t_free(high_precision_rec1); } + +/** + * Reconstruct a single CU. + * + * The CU may consist of multiple PUs, each of which can use either + * uniprediction or biprediction. + * + * \param state encoder state + * \param lcu containing LCU + * \param x x-coordinate of the CU in pixels + * \param y y-coordinate of the CU in pixels + * \param width CU width + */ +void kvz_inter_recon_cu(const encoder_state_t * const state, + lcu_t *lcu, + int32_t x, + int32_t y, + int32_t width) +{ + cu_info_t *cu = LCU_GET_CU_AT_PX(lcu, SUB_SCU(x), SUB_SCU(y)); + + const int num_pu = kvz_part_mode_num_parts[cu->part_size]; + for (int i = 0; i < num_pu; ++i) { + const int pu_x = PU_GET_X(cu->part_size, width, x, i); + const int pu_y = PU_GET_Y(cu->part_size, width, y, i); + const int pu_w = PU_GET_W(cu->part_size, width, i); + const int pu_h = PU_GET_H(cu->part_size, width, i); + + cu_info_t *pu = LCU_GET_CU_AT_PX(lcu, SUB_SCU(pu_x), SUB_SCU(pu_y)); + + if (pu->inter.mv_dir == 3) { + const kvz_picture *const refs[2] = { + state->frame->ref->images[ + state->frame->ref_LX[0][ + pu->inter.mv_ref[0]]], + state->frame->ref->images[ + state->frame->ref_LX[1][ + pu->inter.mv_ref[1]]], + }; + kvz_inter_recon_bipred(state, + refs[0], refs[1], + pu_x, pu_y, + pu_w, pu_h, + pu->inter.mv, + lcu); + } else { + const int mv_idx = pu->inter.mv_dir - 1; + const kvz_picture *const ref = + state->frame->ref->images[ + state->frame->ref_LX[mv_idx][ + pu->inter.mv_ref[mv_idx]]]; + + inter_recon_unipred(state, + ref, + pu_x, pu_y, + pu_w, pu_h, + pu->inter.mv[mv_idx], + lcu, + NULL); + } + } +} + /** * \brief Clear unused L0/L1 motion vectors and reference * \param cu coding unit to clear diff --git a/src/inter.h b/src/inter.h index ee324fc1..3380cb1d 100644 --- a/src/inter.h +++ b/src/inter.h @@ -40,26 +40,21 @@ typedef struct { } inter_merge_cand_t; +void kvz_inter_recon_cu(const encoder_state_t * const state, + lcu_t *lcu, + int32_t x, + int32_t y, + int32_t width); -void kvz_inter_recon_lcu(const encoder_state_t * const state, - const kvz_picture * ref, - int32_t xpos, - int32_t ypos, - int32_t width, - int32_t height, - const int16_t mv_param[2], - lcu_t* lcu, - hi_prec_buf_t *hi_prec_out); - -void kvz_inter_recon_lcu_bipred(const encoder_state_t * const state, - const kvz_picture * ref1, - const kvz_picture * ref2, - int32_t xpos, - int32_t ypos, - int32_t width, - int32_t height, - int16_t mv_param[2][2], - lcu_t* lcu); +void kvz_inter_recon_bipred(const encoder_state_t * const state, + const kvz_picture * ref1, + const kvz_picture * ref2, + int32_t xpos, + int32_t ypos, + int32_t width, + int32_t height, + int16_t mv_param[2][2], + lcu_t* lcu); void kvz_inter_get_mv_cand(const encoder_state_t * const state, int32_t x, diff --git a/src/search.c b/src/search.c index 9943570c..c02aeebe 100644 --- a/src/search.c +++ b/src/search.c @@ -392,6 +392,7 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, const videoframe_t * const frame = state->tile->frame; int cu_width = LCU_WIDTH >> depth; double cost = MAX_INT; + double inter_zero_coeff_cost = MAX_INT; uint32_t inter_bitcost = MAX_INT; cu_info_t *cur_cu; @@ -518,7 +519,7 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, // rd2. Possibly because the luma mode search already takes chroma // into account, so there is less of a chanse of luma mode being // really bad for chroma. - if (state->encoder_control->cfg.rdo == 3) { + if (ctrl->cfg.rdo == 3) { cur_cu->intra.mode_chroma = kvz_search_cu_intra_chroma(state, x, y, depth, lcu); lcu_fill_cu_info(lcu, x_local, y_local, cu_width, cu_width, cur_cu); } @@ -538,46 +539,30 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, } kvz_lcu_set_trdepth(lcu, x, y, depth, tr_depth); - const int num_pu = kvz_part_mode_num_parts[cur_cu->part_size]; - for (int i = 0; i < num_pu; ++i) { - const int pu_x = PU_GET_X(cur_cu->part_size, cu_width, x, i); - const int pu_y = PU_GET_Y(cur_cu->part_size, cu_width, y, i); - const int pu_w = PU_GET_W(cur_cu->part_size, cu_width, i); - const int pu_h = PU_GET_H(cur_cu->part_size, cu_width, i); + kvz_inter_recon_cu(state, lcu, x, y, cu_width); - cu_info_t *cur_pu = LCU_GET_CU_AT_PX(lcu, SUB_SCU(pu_x), SUB_SCU(pu_y)); + if (!ctrl->cfg.lossless && !ctrl->cfg.rdoq_enable) { + const int luma_index = y_local * LCU_WIDTH + x_local; + const int chroma_index = (y_local / 2) * LCU_WIDTH_C + (x_local / 2); - if (cur_pu->inter.mv_dir == 3) { - const kvz_picture *const refs[2] = { - state->frame->ref->images[ - state->frame->ref_LX[0][ - cur_pu->inter.mv_ref[0]]], - state->frame->ref->images[ - state->frame->ref_LX[1][ - cur_pu->inter.mv_ref[1]]], - }; - kvz_inter_recon_lcu_bipred(state, - refs[0], refs[1], - pu_x, pu_y, - pu_w, pu_h, - cur_pu->inter.mv, - lcu); - } else { - const int mv_idx = cur_pu->inter.mv_dir - 1; - - const kvz_picture *const ref = - state->frame->ref->images[ - state->frame->ref_LX[mv_idx][ - cur_pu->inter.mv_ref[mv_idx]]]; + double ssd = 0.0; + ssd += LUMA_MULT * kvz_pixels_calc_ssd( + &lcu->ref.y[luma_index], &lcu->rec.y[luma_index], + LCU_WIDTH, LCU_WIDTH, cu_width + ); + ssd += CHROMA_MULT * kvz_pixels_calc_ssd( + &lcu->ref.u[chroma_index], &lcu->rec.u[chroma_index], + LCU_WIDTH_C, LCU_WIDTH_C, cu_width / 2 + ); + ssd += CHROMA_MULT * kvz_pixels_calc_ssd( + &lcu->ref.v[chroma_index], &lcu->rec.v[chroma_index], + LCU_WIDTH_C, LCU_WIDTH_C, cu_width / 2 + ); - kvz_inter_recon_lcu(state, - ref, - pu_x, pu_y, - pu_w, pu_h, - cur_pu->inter.mv[mv_idx], - lcu, - 0); - } + inter_zero_coeff_cost = ssd + inter_bitcost * state->lambda; + + // Save the pixels at a lower level of the working tree. + copy_cu_pixels(x_local, y_local, cu_width, lcu, &work_tree[depth + 1]); } const bool has_chroma = state->encoder_control->chroma_format != KVZ_CSP_400; @@ -589,7 +574,7 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, int cbf = cbf_is_set_any(cur_cu->cbf, depth); - if(cur_cu->merged && !cbf && cur_cu->part_size == SIZE_2Nx2N) { + if (cur_cu->merged && !cbf && cur_cu->part_size == SIZE_2Nx2N) { cur_cu->merged = 0; cur_cu->skipped = 1; // Selecting skip reduces bits needed to code the CU @@ -615,6 +600,28 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, } cost += mode_bits * state->lambda; + + if (inter_zero_coeff_cost <= cost) { + cost = inter_zero_coeff_cost; + + // Restore saved pixels from lower level of the working tree. + copy_cu_pixels(x_local, y_local, cu_width, &work_tree[depth + 1], lcu); + + if (cur_cu->merged && cur_cu->part_size == SIZE_2Nx2N) { + cur_cu->merged = 0; + cur_cu->skipped = 1; + lcu_fill_cu_info(lcu, x_local, y_local, cu_width, cu_width, cur_cu); + } + + if (cur_cu->tr_depth != depth) { + // Reset transform depth since there are no coefficients. This + // ensures that CBF is cleared for the whole area of the CU. + kvz_lcu_set_trdepth(lcu, x, y, depth, depth); + } + + cur_cu->cbf = 0; + lcu_set_coeff(lcu, x_local, y_local, cu_width, cur_cu); + } } bool can_split_cu = diff --git a/src/search_inter.c b/src/search_inter.c index 306f89e1..7e659c3b 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -1495,18 +1495,18 @@ static void search_pu_inter(encoder_state_t * const state, continue; } - kvz_inter_recon_lcu_bipred(state, - state->frame->ref->images[ - state->frame->ref_LX[0][merge_cand[i].ref[0]] - ], - state->frame->ref->images[ - state->frame->ref_LX[1][merge_cand[j].ref[1]] - ], - x, y, - width, - height, - mv, - templcu); + kvz_inter_recon_bipred(state, + state->frame->ref->images[ + state->frame->ref_LX[0][merge_cand[i].ref[0]] + ], + state->frame->ref->images[ + state->frame->ref_LX[1][merge_cand[j].ref[1]] + ], + x, y, + width, + height, + mv, + templcu); for (int ypos = 0; ypos < height; ++ypos) { int dst_y = ypos * width;