diff --git a/src/cu.h b/src/cu.h index ecb7c695..1d49d347 100644 --- a/src/cu.h +++ b/src/cu.h @@ -77,55 +77,6 @@ typedef enum { MTS_TR_NUM = 6, } mts_idx; -extern const uint8_t uvg_part_mode_num_parts[]; -extern const uint8_t uvg_part_mode_offsets[][4][2]; -extern const uint8_t uvg_part_mode_sizes[][4][2]; - -/** - * \brief Get the x coordinate of a PU. - * - * \param part_mode partition mode of the containing CU - * \param cu_width width of the containing CU - * \param cu_x x coordinate of the containing CU - * \param i number of the PU - * \return location of the left edge of the PU - */ -#define PU_GET_X(part_mode, cu_width, cu_x, i) \ - ((cu_x) + uvg_part_mode_offsets[(part_mode)][(i)][0] * (cu_width) / 4) - -/** - * \brief Get the y coordinate of a PU. - * - * \param part_mode partition mode of the containing CU - * \param cu_width width of the containing CU - * \param cu_y y coordinate of the containing CU - * \param i number of the PU - * \return location of the top edge of the PU - */ -#define PU_GET_Y(part_mode, cu_width, cu_y, i) \ - ((cu_y) + uvg_part_mode_offsets[(part_mode)][(i)][1] * (cu_width) / 4) - -/** - * \brief Get the width of a PU. - * - * \param part_mode partition mode of the containing CU - * \param cu_width width of the containing CU - * \param i number of the PU - * \return width of the PU - */ -#define PU_GET_W(part_mode, cu_width, i) \ - (uvg_part_mode_sizes[(part_mode)][(i)][0] * (cu_width) / 4) - -/** - * \brief Get the height of a PU. 
- * - * \param part_mode partition mode of the containing CU - * \param cu_width width of the containing CU - * \param i number of the PU - * \return height of the PU - */ -#define PU_GET_H(part_mode, cu_width, i) \ - (uvg_part_mode_sizes[(part_mode)][(i)][1] * (cu_width) / 4) ////////////////////////////////////////////////////////////////////////// // TYPES @@ -142,6 +93,25 @@ enum uvg_tree_type { UVG_CHROMA_T = 2 }; +enum split_type { + NO_SPLIT = 0, + QT_SPLIT = 1, + BT_HOR_SPLIT = 2, + BT_VER_SPLIT = 3, + TT_HOR_SPLIT = 4, + TT_VER_SPLIT = 5, +}; + +typedef struct { + uint32_t split_tree; + uint8_t current_depth; +} split_tree_t; + + +// Each depth takes three bits of the split tree, laid out as xxy; if either x bit is set the split is an MTT split. +// 0x6DB6DB6 masks the x bits of the nine lowest three-bit groups: a QT split is not allowed after any MTT split. +#define CAN_QT_SPLIT(x) (((x) & 0x6DB6DB6) == 0) + /** * \brief Struct for CU info */ @@ -149,7 +119,6 @@ typedef struct { uint8_t type : 3; //!< \brief block type, one of cu_type_t values uint8_t depth : 3; //!< \brief depth / size of this block - uint8_t part_size : 3; //!< \brief partition mode, one of part_mode_t values uint8_t tr_depth : 3; //!< \brief transform depth uint8_t skipped : 1; //!< \brief flag to indicate this block is skipped uint8_t merged : 1; //!< \brief flag to indicate this block is merged diff --git a/src/encode_coding_tree.c b/src/encode_coding_tree.c index 46552a12..6f6fc9d8 100644 --- a/src/encode_coding_tree.c +++ b/src/encode_coding_tree.c @@ -825,11 +825,14 @@ static void encode_transform_coeff( * \param depth Depth from LCU. 
* \return if non-zero mvd is coded */ -int uvg_encode_inter_prediction_unit(encoder_state_t * const state, - cabac_data_t * const cabac, - const cu_info_t * const cur_cu, - int x, int y, int width, int height, - int depth, lcu_t* lcu, double* bits_out) +int uvg_encode_inter_prediction_unit( + encoder_state_t * const state, + cabac_data_t * const cabac, + const cu_info_t * const cur_cu, + int depth, + lcu_t* lcu, + double* bits_out, + const cu_loc_t* const cu_loc) { // Mergeflag int16_t num_cand = 0; @@ -864,8 +867,8 @@ int uvg_encode_inter_prediction_unit(encoder_state_t * const state, // Code Inter Dir uint8_t inter_dir = cur_cu->inter.mv_dir; - if (cur_cu->part_size == SIZE_2Nx2N || (LCU_WIDTH >> depth) != 4) { // ToDo: limit on 4x8/8x4 - uint32_t inter_dir_ctx = (7 - ((uvg_math_floor_log2(width) + uvg_math_floor_log2(height) + 1) >> 1)); + if ((LCU_WIDTH >> depth) != 4) { // ToDo: limit on 4x8/8x4 + uint32_t inter_dir_ctx = (7 - ((uvg_math_floor_log2(cu_loc->width) + uvg_math_floor_log2(cu_loc->height) + 1) >> 1)); CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.inter_dir[inter_dir_ctx]), (inter_dir == 3), bits, "inter_pred_idc"); } @@ -916,16 +919,14 @@ int uvg_encode_inter_prediction_unit(encoder_state_t * const state, if (lcu) { uvg_inter_get_mv_cand( state, - x, y, width, height, - mv_cand, cur_cu, - lcu, ref_list_idx); + mv_cand, cur_cu, lcu, ref_list_idx, + cu_loc); } else { uvg_inter_get_mv_cand_cua( state, - x, y, width, height, - mv_cand, cur_cu, ref_list_idx - ); + mv_cand, cur_cu, ref_list_idx, cu_loc + ); } uint8_t cu_mv_cand = CU_GET_MV_CAND(cur_cu, ref_list_idx); @@ -1346,11 +1347,11 @@ bool uvg_write_split_flag( if (no_split && allow_split) { // Get left and top block split_flags and if they are present and true, increase model number // ToDo: should use height and width to increase model, PU_GET_W() ? 
- if (left_cu && PU_GET_H(left_cu->part_size, LCU_WIDTH >> left_cu->depth, 0) < LCU_WIDTH >> depth) { + if (left_cu && LCU_WIDTH >> left_cu->depth < LCU_WIDTH >> depth) { split_model++; } - if (above_cu && PU_GET_W(above_cu->part_size, LCU_WIDTH >> above_cu->depth, 0) < LCU_WIDTH >> depth) { + if (above_cu && LCU_WIDTH >> above_cu->depth < LCU_WIDTH >> depth) { split_model++; } @@ -1625,22 +1626,15 @@ void uvg_encode_coding_tree( if (cur_cu->type == CU_INTER || cur_cu->type == CU_IBC) { uint8_t imv_mode = UVG_IMV_OFF; - - const int num_pu = uvg_part_mode_num_parts[cur_cu->part_size]; bool non_zero_mvd = false; + + // TODO: height for non-square blocks + const cu_info_t *cur_pu = uvg_cu_array_at_const(used_array, cu_loc.x, cu_loc.y); - for (int i = 0; i < num_pu; ++i) { - // TODO: height for non-square blocks - const int pu_x = PU_GET_X(cur_cu->part_size, cu_width, x, i); - const int pu_y = PU_GET_Y(cur_cu->part_size, cu_width, y, i); - const int pu_w = PU_GET_W(cur_cu->part_size, cu_width, i); - const int pu_h = PU_GET_H(cur_cu->part_size, cu_width, i); - const cu_info_t *cur_pu = uvg_cu_array_at_const(used_array, pu_x, pu_y); - - non_zero_mvd |= uvg_encode_inter_prediction_unit(state, cabac, cur_pu, pu_x, pu_y, pu_w, pu_h, depth, NULL, NULL); - DBG_PRINT_MV(state, pu_x, pu_y, pu_w, pu_h, cur_pu); - uvg_hmvp_add_mv(state, x, y, pu_w, pu_h, cur_pu); - } + non_zero_mvd |= uvg_encode_inter_prediction_unit(state, cabac, cur_pu, depth, NULL, NULL, &cu_loc); + DBG_PRINT_MV(state, cu_loc.x, cu_loc.y, cu_loc.width, cu_loc.height, cur_pu); + uvg_hmvp_add_mv(state, x, y, width, height, cur_pu); + // imv mode, select between fullpel, half-pel and quarter-pel resolutions // 0 = off, 1 = fullpel, 2 = 4-pel, 3 = half-pel @@ -1661,7 +1655,7 @@ void uvg_encode_coding_tree( int cbf = cbf_is_set_any(cur_cu->cbf, depth); // Only need to signal coded block flag if not skipped or merged // skip = no coded residual, merge = coded residual - if (cur_cu->part_size != SIZE_2Nx2N || !cur_cu->merged) { + if
(!cur_cu->merged) { cabac->cur_ctx = &(cabac->ctx.cu_qt_root_cbf_model); CABAC_BIN(cabac, cbf, "rqt_root_cbf"); } @@ -1747,15 +1741,18 @@ end: double uvg_mock_encode_coding_unit( encoder_state_t* const state, cabac_data_t* cabac, - int x, - int y, - int depth, + const cu_loc_t* const cu_loc, lcu_t* lcu, cu_info_t* cur_cu, enum uvg_tree_type tree_type) { double bits = 0; const encoder_control_t* const ctrl = state->encoder_control; + const int x = cu_loc->x; + const int y = cu_loc->y; + + const uint8_t depth = 6 - uvg_g_convert_to_log2[cu_loc->width]; + int x_local = SUB_SCU(x) >> (tree_type == UVG_CHROMA_T); int y_local = SUB_SCU(y) >> (tree_type == UVG_CHROMA_T); @@ -1846,7 +1843,7 @@ double uvg_mock_encode_coding_unit( if (cur_cu->type == CU_INTER || cur_cu->type == CU_IBC) { const uint8_t imv_mode = UVG_IMV_OFF; - const int non_zero_mvd = uvg_encode_inter_prediction_unit(state, cabac, cur_cu, x, y, cu_width, cu_width, depth, lcu, &bits); + const int non_zero_mvd = uvg_encode_inter_prediction_unit(state, cabac, cur_cu, depth, lcu, &bits, cu_loc); if (ctrl->cfg.amvr && non_zero_mvd) { CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.imv_flag[0]), imv_mode, bits, "imv_flag"); if (imv_mode > UVG_IMV_OFF) { diff --git a/src/encode_coding_tree.h b/src/encode_coding_tree.h index 575f4afd..231e22ff 100644 --- a/src/encode_coding_tree.h +++ b/src/encode_coding_tree.h @@ -78,20 +78,19 @@ void uvg_encode_mvd(encoder_state_t * const state, double uvg_mock_encode_coding_unit( encoder_state_t* const state, cabac_data_t* cabac, - int x, - int y, - int depth, + const cu_loc_t* const cu_loc, lcu_t* lcu, cu_info_t* cur_cu, enum uvg_tree_type tree_type); -int uvg_encode_inter_prediction_unit(encoder_state_t* const state, - cabac_data_t* const cabac, - const cu_info_t* const cur_cu, - int x, int y, int width, int height, - int depth, - lcu_t* lcu, - double* bits_out); +int uvg_encode_inter_prediction_unit( + encoder_state_t* const state, + cabac_data_t* const cabac, + const cu_info_t* const 
cur_cu, + int depth, + lcu_t* lcu, + double* bits_out, + const cu_loc_t* const cu_loc); void uvg_encode_intra_luma_coding_unit(const encoder_state_t* const state, cabac_data_t* const cabac, diff --git a/src/filter.c b/src/filter.c index 2d51a17c..26a57100 100644 --- a/src/filter.c +++ b/src/filter.c @@ -855,13 +855,11 @@ static void filter_deblock_edge_luma(encoder_state_t * const state, uint8_t max_filter_length_P = 0; uint8_t max_filter_length_Q = 0; const int cu_size = LCU_WIDTH >> cu_q->depth; - const int pu_part_idx = (y + PU_GET_H(cu_q->part_size, cu_size, 0) <= y_coord ? - 1 + (uvg_part_mode_num_parts[cu_q->part_size] >> 2) : 0) - + (x + PU_GET_W(cu_q->part_size, cu_size, 0) <= x_coord ? 1 : 0); - const int pu_size = dir == EDGE_HOR ? PU_GET_H(cu_q->part_size, cu_size, pu_part_idx) - : PU_GET_W(cu_q->part_size, cu_size, pu_part_idx); - const int pu_pos = dir == EDGE_HOR ? y_coord - PU_GET_Y(cu_q->part_size, cu_size, 0, pu_part_idx) - : x_coord - PU_GET_X(cu_q->part_size, cu_size, 0, pu_part_idx); + // TODO: NON square + const int pu_size = dir == EDGE_HOR ? cu_size + : cu_size; + const int pu_pos = dir == EDGE_HOR ? y_coord + : x_coord; get_max_filter_length(&max_filter_length_P, &max_filter_length_Q, state, x_coord, y_coord, dir, tu_boundary, LCU_WIDTH >> cu_p->tr_depth, @@ -1088,13 +1086,10 @@ static void filter_deblock_edge_chroma(encoder_state_t * const state, } const int cu_size = LCU_WIDTH >> (cu_q->depth + (tree_type == UVG_CHROMA_T)); - const int pu_part_idx = ((y << (tree_type != UVG_CHROMA_T)) + PU_GET_H(cu_q->part_size, cu_size, 0) <= y_coord ? - 1 + (uvg_part_mode_num_parts[cu_q->part_size] >> 2) : 0) - + ((x << (tree_type != UVG_CHROMA_T)) + PU_GET_W(cu_q->part_size, cu_size, 0) <= x_coord ? 1 : 0); - const int pu_size = dir == EDGE_HOR ? PU_GET_H(cu_q->part_size, cu_size, pu_part_idx) - : PU_GET_W(cu_q->part_size, cu_size, pu_part_idx); - const int pu_pos = dir == EDGE_HOR ? 
y_coord - PU_GET_Y(cu_q->part_size, cu_size, 0, pu_part_idx) - : x_coord - PU_GET_X(cu_q->part_size, cu_size, 0, pu_part_idx); + // TODO: non-square + const int pu_size = dir == EDGE_HOR ? cu_size : cu_size; + const int pu_pos = dir == EDGE_HOR ? y_coord + : x_coord; uint8_t max_filter_length_P = 0; uint8_t max_filter_length_Q = 0; diff --git a/src/inter.c b/src/inter.c index 3bbef427..be353506 100644 --- a/src/inter.c +++ b/src/inter.c @@ -375,23 +375,26 @@ static void inter_cp_with_ext_border(const uvg_pixel *ref_buf, int ref_stride, * \param predict_luma Enable or disable luma prediction for this call. * \param predict_chroma Enable or disable chroma prediction for this call. */ -static unsigned inter_recon_unipred(const encoder_state_t * const state, - const uvg_picture * const ref, - int32_t pu_x, - int32_t pu_y, - int32_t pu_w, - int32_t pu_h, - int32_t out_stride_luma, - const mv_t mv_param[2], - yuv_t *yuv_px, - yuv_im_t *yuv_im, - bool predict_luma, - bool predict_chroma) +static unsigned inter_recon_unipred( + const encoder_state_t * const state, + const uvg_picture * const ref, + int32_t out_stride_luma, + const mv_t mv_param[2], + yuv_t *yuv_px, + yuv_im_t *yuv_im, + bool predict_luma, + bool predict_chroma, + const cu_loc_t* const cu_loc) { vector2d_t int_mv = { mv_param[0], mv_param[1] }; uvg_change_precision_vector2d(INTERNAL_MV_PREC, 0, &int_mv); + const int pu_x = cu_loc->x; + const int pu_y = cu_loc->y; + const int pu_w = cu_loc->width; + const int pu_h = cu_loc->height; + const vector2d_t int_mv_in_frame = { int_mv.x + pu_x + state->tile->offset_x, int_mv.y + pu_y + state->tile->offset_y @@ -507,17 +510,15 @@ static unsigned inter_recon_unipred(const encoder_state_t * const state, * \param predict_luma Enable or disable luma prediction for this call. * \param predict_chroma Enable or disable chroma prediction for this call. 
*/ -void uvg_inter_recon_bipred(const encoder_state_t *const state, +void uvg_inter_recon_bipred( + const encoder_state_t *const state, const uvg_picture *ref1, const uvg_picture *ref2, - int32_t pu_x, - int32_t pu_y, - int32_t pu_w, - int32_t pu_h, mv_t mv_param[2][2], lcu_t *lcu, bool predict_luma, - bool predict_chroma) + bool predict_chroma, + const cu_loc_t* const cu_loc) { // Allocate maximum size arrays for interpolated and copied samples ALIGNED(64) uvg_pixel px_buf_L0[LCU_LUMA_SIZE + 2 * LCU_CHROMA_SIZE]; @@ -525,6 +526,11 @@ void uvg_inter_recon_bipred(const encoder_state_t *const state, ALIGNED(64) uvg_pixel_im im_buf_L0[LCU_LUMA_SIZE + 2 * LCU_CHROMA_SIZE]; ALIGNED(64) uvg_pixel_im im_buf_L1[LCU_LUMA_SIZE + 2 * LCU_CHROMA_SIZE]; + const int pu_x = cu_loc->x; + const int pu_y = cu_loc->y; + const int pu_w = cu_loc->width; + const int pu_h = cu_loc->height; + yuv_t px_L0; px_L0.size = pu_w * pu_h; px_L0.y = &px_buf_L0[0]; @@ -551,10 +557,10 @@ void uvg_inter_recon_bipred(const encoder_state_t *const state, // Sample blocks from both reference picture lists. // Flags state if the outputs were written to high-precision / interpolated sample buffers. 
- unsigned im_flags_L0 = inter_recon_unipred(state, ref1, pu_x, pu_y, pu_w, pu_h, pu_w, mv_param[0], - &px_L0, &im_L0, predict_luma, predict_chroma); - unsigned im_flags_L1 = inter_recon_unipred(state, ref2, pu_x, pu_y, pu_w, pu_h, pu_w, mv_param[1], - &px_L1, &im_L1, predict_luma, predict_chroma); + unsigned im_flags_L0 = inter_recon_unipred(state, ref1, pu_w, mv_param[0], &px_L0, &im_L0, predict_luma, predict_chroma, + cu_loc); + unsigned im_flags_L1 = inter_recon_unipred(state, ref2, pu_w, mv_param[1], &px_L1, &im_L1, predict_luma, predict_chroma, + cu_loc); // After reconstruction, merge the predictors by taking an average of each pixel uvg_bipred_average(lcu, &px_L0, &px_L1, &im_L0, &im_L1, @@ -578,19 +584,14 @@ void uvg_inter_recon_bipred(const encoder_state_t *const state, * \param predict_luma Enable or disable luma prediction for this call. * \param predict_chroma Enable or disable chroma prediction for this call. */ -void uvg_inter_recon_cu(const encoder_state_t * const state, - lcu_t *lcu, - int32_t x, - int32_t y, - int32_t width, - bool predict_luma, - bool predict_chroma) +void uvg_inter_recon_cu( + const encoder_state_t * const state, + lcu_t *lcu, + bool predict_luma, + bool predict_chroma, + const cu_loc_t* const cu_loc) { - cu_info_t *cu = LCU_GET_CU_AT_PX(lcu, SUB_SCU(x), SUB_SCU(y)); - const int num_pu = uvg_part_mode_num_parts[cu->part_size]; - for (int i = 0; i < num_pu; ++i) { - uvg_inter_pred_pu(state, lcu, x, y, width, predict_luma, predict_chroma, i); - } + uvg_inter_pred_pu(state, lcu, predict_luma, predict_chroma, cu_loc); } static void ibc_recon_cu(const encoder_state_t * const state, @@ -599,8 +600,7 @@ static void ibc_recon_cu(const encoder_state_t * const state, int32_t y, int32_t width, bool predict_luma, - bool predict_chroma, - int i_pu) + bool predict_chroma) { const int x_scu = SUB_SCU(x); const int y_scu = SUB_SCU(y); @@ -668,79 +668,63 @@ static void ibc_recon_cu(const encoder_state_t * const state, * \param predict_chroma 
Enable or disable chroma prediction for this call. * \param i_pu Index of the PU. Always zero for 2Nx2N. Used for SMP+AMP. */ -void uvg_inter_pred_pu(const encoder_state_t * const state, - lcu_t *lcu, - int32_t x, - int32_t y, - int32_t width, - bool predict_luma, - bool predict_chroma, - int i_pu) +void uvg_inter_pred_pu( + const encoder_state_t * const state, + lcu_t *lcu, + bool predict_luma, + bool predict_chroma, + const cu_loc_t* const cu_loc) { - const int x_scu = SUB_SCU(x); - const int y_scu = SUB_SCU(y); - cu_info_t *cu = LCU_GET_CU_AT_PX(lcu, x_scu, y_scu); - const int pu_x = PU_GET_X(cu->part_size, width, x, i_pu); - const int pu_y = PU_GET_Y(cu->part_size, width, y, i_pu); - const int pu_w = PU_GET_W(cu->part_size, width, i_pu); - const int pu_h = PU_GET_H(cu->part_size, width, i_pu); - cu_info_t *pu = LCU_GET_CU_AT_PX(lcu, SUB_SCU(pu_x), SUB_SCU(pu_y)); + const int x_scu = SUB_SCU(cu_loc->x); + const int y_scu = SUB_SCU(cu_loc->y); + cu_info_t *pu = LCU_GET_CU_AT_PX(lcu, x_scu, y_scu); - if (cu->type == CU_IBC) { - ibc_recon_cu(state, lcu, x, y, width, predict_luma, predict_chroma, i_pu); - } else { + if (pu->inter.mv_dir == 3) { + const uvg_picture *const refs[2] = { + state->frame->ref->images[ + state->frame->ref_LX[0][ + pu->inter.mv_ref[0]]], + state->frame->ref->images[ + state->frame->ref_LX[1][ + pu->inter.mv_ref[1]]], + }; + uvg_inter_recon_bipred(state, + refs[0], refs[1], + pu->inter.mv, lcu, + predict_luma, predict_chroma, + cu_loc); + } + else if (pu->type == CU_IBC) { + ibc_recon_cu(state, lcu, cu_loc->x, cu_loc->y, cu_loc->width, predict_luma, predict_chroma); + } else{ + const int mv_idx = pu->inter.mv_dir - 1; + const uvg_picture *const ref = + state->frame->ref->images[ + state->frame->ref_LX[mv_idx][ + pu->inter.mv_ref[mv_idx]]]; - if (pu->inter.mv_dir == 3) { - const uvg_picture * const refs[2] = { - state->frame->ref->images[state->frame->ref_LX[0][pu->inter.mv_ref[0]]], - 
state->frame->ref->images[state->frame->ref_LX[1][pu->inter.mv_ref[1]]], - }; - uvg_inter_recon_bipred( - state, - refs[0], - refs[1], - pu_x, - pu_y, - pu_w, - pu_h, - pu->inter.mv, - lcu, - predict_luma, - predict_chroma); - } else { - const int mv_idx = pu->inter.mv_dir - 1; - const uvg_picture * const ref = - state->frame->ref->images[state->frame->ref_LX[mv_idx][pu->inter.mv_ref[mv_idx]]]; + const unsigned offset_luma = SUB_SCU(cu_loc->y) * LCU_WIDTH + SUB_SCU(cu_loc->x); + const unsigned offset_chroma = SUB_SCU(cu_loc->y) / 2 * LCU_WIDTH_C + SUB_SCU(cu_loc->x) / 2; + yuv_t lcu_adapter; + lcu_adapter.size = cu_loc->width * cu_loc->height; + lcu_adapter.y = lcu->rec.y + offset_luma, + lcu_adapter.u = lcu->rec.u + offset_chroma, + lcu_adapter.v = lcu->rec.v + offset_chroma, - const unsigned offset_luma = SUB_SCU(pu_y) * LCU_WIDTH + SUB_SCU(pu_x); - const unsigned offset_chroma = - SUB_SCU(pu_y) / 2 * LCU_WIDTH_C + SUB_SCU(pu_x) / 2; - yuv_t lcu_adapter; - lcu_adapter.size = pu_w * pu_h; - lcu_adapter.y = lcu->rec.y + offset_luma, - lcu_adapter.u = lcu->rec.u + offset_chroma, - lcu_adapter.v = lcu->rec.v + offset_chroma, - - inter_recon_unipred( - state, - ref, - pu_x, - pu_y, - pu_w, - pu_h, - LCU_WIDTH, - pu->inter.mv[mv_idx], - &lcu_adapter, - NULL, - predict_luma, - predict_chroma); - } + inter_recon_unipred(state, + ref, + LCU_WIDTH, pu->inter.mv[mv_idx], + &lcu_adapter, + NULL, + predict_luma, + predict_chroma, + cu_loc); } if (predict_chroma && state->encoder_control->cfg.jccr) { const int offset = x_scu / 2 + y_scu / 2 * LCU_WIDTH_C; - uvg_pixels_blit(lcu->rec.u + offset, lcu->rec.joint_u + offset, width / 2, width / 2, LCU_WIDTH_C, LCU_WIDTH_C); - uvg_pixels_blit(lcu->rec.v + offset, lcu->rec.joint_v + offset, width / 2, width / 2, LCU_WIDTH_C, LCU_WIDTH_C); + uvg_pixels_blit(lcu->rec.u + offset, lcu->rec.joint_u + offset, cu_loc->chroma_width, cu_loc->chroma_height, LCU_WIDTH_C, LCU_WIDTH_C); + uvg_pixels_blit(lcu->rec.v + offset, lcu->rec.joint_v + 
offset, cu_loc->chroma_width, cu_loc->chroma_height, LCU_WIDTH_C, LCU_WIDTH_C); } } @@ -915,14 +899,12 @@ static bool is_b0_cand_coded(int x, int y, int width, int height) * \param ref_idx index in the reference list * \param cand_out will be filled with C0 and C1 candidates */ -static void get_temporal_merge_candidates(const encoder_state_t * const state, - int32_t x, - int32_t y, - int32_t width, - int32_t height, - uint8_t ref_list, - uint8_t ref_idx, - merge_candidates_t *cand_out) +static void get_temporal_merge_candidates( + const encoder_state_t * const state, + const cu_loc_t* const cu_loc, + uint8_t ref_list, + uint8_t ref_idx, + merge_candidates_t *cand_out) { /* Predictor block locations @@ -951,8 +933,8 @@ static void get_temporal_merge_candidates(const encoder_state_t * const state, cu_array_t *ref_cu_array = state->frame->ref->cu_arrays[colocated_ref]; int cu_per_width = ref_cu_array->width / SCU_WIDTH; - int32_t xColBr = x + width; - int32_t yColBr = y + height; + int32_t xColBr = cu_loc->x + cu_loc->width; + int32_t yColBr = cu_loc->y + cu_loc->height; // C0 must be available if (xColBr < state->encoder_control->in.width && @@ -972,8 +954,8 @@ static void get_temporal_merge_candidates(const encoder_state_t * const state, } } } - int32_t xColCtr = x + (width / 2); - int32_t yColCtr = y + (height / 2); + int32_t xColCtr = cu_loc->x + (cu_loc->width / 2); + int32_t yColCtr = cu_loc->y + (cu_loc->height / 2); // C1 must be inside the LCU, in the center position of current CU if (xColCtr < state->encoder_control->in.width && yColCtr < state->encoder_control->in.height) { @@ -1254,10 +1236,7 @@ static void get_ibc_merge_candidates(const encoder_state_t * const state, * \param lcu current LCU * \param cand_out will be filled with A and B candidates */ -static void get_spatial_merge_candidates(int32_t x, - int32_t y, - int32_t width, - int32_t height, +static void get_spatial_merge_candidates(const cu_loc_t* const cu_loc, int32_t picture_width, int32_t 
picture_height, lcu_t *lcu, @@ -1276,8 +1255,13 @@ static void get_spatial_merge_candidates(int32_t x, |A1|_________| |A0| */ - int32_t x_local = SUB_SCU(x); //!< coordinates from top-left of this LCU - int32_t y_local = SUB_SCU(y); + const int32_t x_local = SUB_SCU(cu_loc->x); //!< coordinates from top-left of this LCU + const int32_t y_local = SUB_SCU(cu_loc->y); + + const int x = cu_loc->x; + const int y = cu_loc->y; + const int width = cu_loc->width; + const int height = cu_loc->height; // A0 and A1 availability testing if (x != 0) { cu_info_t *a1 = LCU_GET_CU_AT_PX(lcu, x_local - 1, y_local + height - 1); @@ -1350,15 +1334,13 @@ static void get_spatial_merge_candidates(int32_t x, * \param picture_height tile height in pixels * \param cand_out will be filled with A and B candidates */ -static void get_spatial_merge_candidates_cua(const cu_array_t *cua, - int32_t x, - int32_t y, - int32_t width, - int32_t height, - int32_t picture_width, - int32_t picture_height, - merge_candidates_t *cand_out, - bool wpp) +static void get_spatial_merge_candidates_cua( + const cu_array_t *cua, + int32_t picture_width, + int32_t picture_height, + merge_candidates_t *cand_out, + bool wpp, + const cu_loc_t* const cu_loc) { /* Predictor block locations @@ -1370,8 +1352,12 @@ static void get_spatial_merge_candidates_cua(const cu_array_t *cua, |A1|_________| |A0| */ - int32_t x_local = SUB_SCU(x); //!< coordinates from top-left of this LCU - int32_t y_local = SUB_SCU(y); + const int x = cu_loc->x; + const int y = cu_loc->y; + const int width = cu_loc->width; + const int height = cu_loc->height; + const int32_t x_local = SUB_SCU(x); //!< coordinates from top-left of this LCU + const int32_t y_local = SUB_SCU(y); // A0 and A1 availability testing if (x != 0) { const cu_info_t *a1 = uvg_cu_array_at_const(cua, x - 1, y + height - 1); @@ -1484,15 +1470,13 @@ static bool add_temporal_candidate(const encoder_state_t *state, /** * \brief Pick two mv candidates from the spatial and temporal 
candidates. */ -static void get_mv_cand_from_candidates(const encoder_state_t * const state, - int32_t x, - int32_t y, - int32_t width, - int32_t height, - const merge_candidates_t *merge_cand, - const cu_info_t * const cur_cu, - int8_t reflist, - mv_t mv_cand[2][2]) +static void get_mv_cand_from_candidates( + const encoder_state_t * const state, + const merge_candidates_t *merge_cand, + const cu_info_t * const cur_cu, + int8_t reflist, + mv_t mv_cand[2][2], + int ctu_row) { const cu_info_t *const *a = merge_cand->a; const cu_info_t *const *b = merge_cand->b; @@ -1552,7 +1536,6 @@ static void get_mv_cand_from_candidates(const encoder_state_t * const state, if (candidates < AMVP_MAX_NUM_CANDS) { - const uint32_t ctu_row = (y >> LOG2_LCU_WIDTH); const uint32_t ctu_row_mul_five = ctu_row * MAX_NUM_HMVP_CANDS; int32_t num_cand = state->tile->frame->hmvp_size[ctu_row]; for (int i = 0; i < MIN(/*MAX_NUM_HMVP_AVMPCANDS*/4,num_cand); i++) { @@ -1595,32 +1578,30 @@ static void get_mv_cand_from_candidates(const encoder_state_t * const state, * \param lcu current LCU * \param reflist reflist index (either 0 or 1) */ -void uvg_inter_get_mv_cand(const encoder_state_t * const state, - int32_t x, - int32_t y, - int32_t width, - int32_t height, - mv_t mv_cand[2][2], - const cu_info_t * const cur_cu, - lcu_t *lcu, - int8_t reflist) +void uvg_inter_get_mv_cand( + const encoder_state_t * const state, + mv_t mv_cand[2][2], + const cu_info_t * const cur_cu, + lcu_t *lcu, + int8_t reflist, + const cu_loc_t* const cu_loc) { merge_candidates_t merge_cand = { 0 }; const uint8_t parallel_merge_level = state->encoder_control->cfg.log2_parallel_merge_level; if (cur_cu->type == CU_IBC) { mv_t ibc_mv_cand[IBC_MRG_MAX_NUM_CANDS][2]; - get_ibc_merge_candidates(state, cur_cu,lcu,NULL, x, y, width, height,ibc_mv_cand); + get_ibc_merge_candidates(state, cur_cu,lcu,NULL, cu_loc->x, cu_loc->y, cu_loc->width, cu_loc->height,ibc_mv_cand); memcpy(mv_cand[0], ibc_mv_cand[0], sizeof(mv_t) * 2); 
memcpy(mv_cand[1], ibc_mv_cand[1], sizeof(mv_t) * 2); - } else { - get_spatial_merge_candidates(x, y, width, height, - state->tile->frame->width, - state->tile->frame->height, - lcu, - &merge_cand, parallel_merge_level,state->encoder_control->cfg.wpp); - get_temporal_merge_candidates(state, x, y, width, height, 1, 0, &merge_cand); - get_mv_cand_from_candidates(state, x, y, width, height, &merge_cand, cur_cu, reflist, mv_cand); + } else { + get_spatial_merge_candidates(cu_loc, state->tile->frame->width, state->tile->frame->height, lcu, + &merge_cand, + parallel_merge_level, + state->encoder_control->cfg.wpp); + get_temporal_merge_candidates(state, cu_loc, 1, 0, &merge_cand); + get_mv_cand_from_candidates(state, &merge_cand, cur_cu, reflist, mv_cand, cu_loc->y >> LOG2_LCU_WIDTH); } + uvg_round_precision(INTERNAL_MV_PREC, 2, &mv_cand[0][0], &mv_cand[0][1]); uvg_round_precision(INTERNAL_MV_PREC, 2, &mv_cand[1][0], &mv_cand[1][1]); } @@ -1637,31 +1618,29 @@ void uvg_inter_get_mv_cand(const encoder_state_t * const state, * \param cur_cu current CU * \param reflist reflist index (either 0 or 1) */ -void uvg_inter_get_mv_cand_cua(const encoder_state_t * const state, - int32_t x, - int32_t y, - int32_t width, - int32_t height, - mv_t mv_cand[2][2], - const cu_info_t* cur_cu, - int8_t reflist) +void uvg_inter_get_mv_cand_cua( + const encoder_state_t * const state, + mv_t mv_cand[2][2], + const cu_info_t* cur_cu, + int8_t reflist, + const cu_loc_t* const cu_loc) { merge_candidates_t merge_cand = { 0 }; const cu_array_t *cua = state->tile->frame->cu_array; if (cur_cu->type == CU_IBC) { mv_t ibc_mv_cand[IBC_MRG_MAX_NUM_CANDS][2]; - get_ibc_merge_candidates(state, cur_cu, NULL,cua,x, y, width, height,ibc_mv_cand); + get_ibc_merge_candidates(state, cur_cu, NULL,cua,cu_loc->x, cu_loc->y, cu_loc->width, cu_loc->height,ibc_mv_cand); memcpy(mv_cand[0], ibc_mv_cand[0], sizeof(mv_t) * 2); memcpy(mv_cand[1], ibc_mv_cand[1], sizeof(mv_t) * 2); } else { 
get_spatial_merge_candidates_cua(cua, - x, y, width, height, - state->tile->frame->width, state->tile->frame->height, - &merge_cand, state->encoder_control->cfg.wpp); - get_temporal_merge_candidates(state, x, y, width, height, 1, 0, &merge_cand); - get_mv_cand_from_candidates(state, x, y, width, height, &merge_cand, cur_cu, reflist, mv_cand); + state->tile->frame->width, state->tile->frame->height, &merge_cand, state->encoder_control->cfg.wpp, + cu_loc); + get_temporal_merge_candidates(state, cu_loc, 1, 0, &merge_cand); + get_mv_cand_from_candidates(state, &merge_cand, cur_cu, reflist, mv_cand, cu_loc->y >> LOG2_LCU_WIDTH); } + uvg_round_precision(INTERNAL_MV_PREC, 2, &mv_cand[0][0], &mv_cand[0][1]); uvg_round_precision(INTERNAL_MV_PREC, 2, &mv_cand[1][0], &mv_cand[1][1]); } @@ -1885,23 +1864,23 @@ void uvg_round_precision_vector2d(int src, int dst, vector2d_t* mv) { * \param lcu lcu containing the block * \return number of merge candidates */ -uint8_t uvg_inter_get_merge_cand(const encoder_state_t * const state, - int32_t x, int32_t y, - int32_t width, int32_t height, - bool use_a1, bool use_b1, - inter_merge_cand_t mv_cand[MRG_MAX_NUM_CANDS], - lcu_t *lcu) +uint8_t uvg_inter_get_merge_cand( + const encoder_state_t * const state, + const cu_loc_t* const cu_loc, + inter_merge_cand_t mv_cand[MRG_MAX_NUM_CANDS], + lcu_t *lcu) { uint8_t candidates = 0; int8_t zero_idx = 0; const uint8_t parallel_merge_level = state->encoder_control->cfg.log2_parallel_merge_level; merge_candidates_t merge_cand = { 0 }; const uint8_t max_num_cands = state->encoder_control->cfg.max_merge; + // Current CU + cu_info_t *cur_cu = LCU_GET_CU_AT_PX(lcu, SUB_SCU(cu_loc->x), SUB_SCU(cu_loc->y)); - cu_info_t *cur_cu = LCU_GET_CU_AT_PX(lcu, SUB_SCU(x), SUB_SCU(y)); if(cur_cu->type == CU_IBC) { mv_t ibc_mv_cand[IBC_MRG_MAX_NUM_CANDS][2]; - get_ibc_merge_candidates(state, cur_cu,lcu,NULL, x, y, width, height,ibc_mv_cand); + get_ibc_merge_candidates(state, cur_cu,lcu,NULL, cu_loc->x, cu_loc->y, 
cu_loc->width, cu_loc->height,ibc_mv_cand); for (int i = 0; i < IBC_MRG_MAX_NUM_CANDS; i++) { mv_cand[i].dir = 1; mv_cand[i].mv[0][0] = ibc_mv_cand[i][0]; @@ -1909,18 +1888,16 @@ uint8_t uvg_inter_get_merge_cand(const encoder_state_t * const state, } return IBC_MRG_MAX_NUM_CANDS; } - - get_spatial_merge_candidates(x, y, width, height, - state->tile->frame->width, - state->tile->frame->height, - lcu, - &merge_cand, parallel_merge_level, state->encoder_control->cfg.wpp); + get_spatial_merge_candidates(cu_loc, state->tile->frame->width, state->tile->frame->height, lcu, + &merge_cand, + parallel_merge_level, + state->encoder_control->cfg.wpp); const cu_info_t **a = merge_cand.a; const cu_info_t **b = merge_cand.b; - if (!use_a1) a[1] = NULL; - if (!use_b1) b[1] = NULL; + const int x = cu_loc->x; + const int y = cu_loc->y; if (different_mer(x, y, x, y - 1, parallel_merge_level) && add_merge_candidate(b[1], NULL, NULL, &mv_cand[candidates])) candidates++; if (different_mer(x, y, x - 1, y, parallel_merge_level) && add_merge_candidate(a[1], b[1], NULL, &mv_cand[candidates])) candidates++; @@ -1941,7 +1918,7 @@ uint8_t uvg_inter_get_merge_cand(const encoder_state_t * const state, for (int reflist = 0; reflist <= max_reflist; reflist++) { // Fetch temporal candidates for the current CU // ToDo: change collocated_from_l0_flag to allow L1 ref - get_temporal_merge_candidates(state, x, y, width, height, 1, 0, &merge_cand); + get_temporal_merge_candidates(state, cu_loc, 1, 0, &merge_cand); // TODO: enable L1 TMVP candidate // get_temporal_merge_candidates(state, x, y, width, height, 2, 0, &merge_cand); @@ -1973,7 +1950,7 @@ uint8_t uvg_inter_get_merge_cand(const encoder_state_t * const state, if (candidates == max_num_cands) return candidates; if (candidates != max_num_cands - 1) { - const uint32_t ctu_row = (y >> LOG2_LCU_WIDTH); + const uint32_t ctu_row = (cu_loc->y >> LOG2_LCU_WIDTH); const uint32_t ctu_row_mul_five = ctu_row * MAX_NUM_HMVP_CANDS; int32_t num_cand = 
state->tile->frame->hmvp_size[ctu_row]; diff --git a/src/inter.h b/src/inter.h index 45f5e5ea..4d5fccd5 100644 --- a/src/inter.h +++ b/src/inter.h @@ -58,61 +58,51 @@ void uvg_change_precision_vector2d(int src, int dst, vector2d_t* mv); void uvg_round_precision(int src, int dst, mv_t* hor, mv_t* ver); void uvg_round_precision_vector2d(int src, int dst, vector2d_t* mv); -void uvg_inter_recon_cu(const encoder_state_t * const state, - lcu_t *lcu, - int32_t x, - int32_t y, - int32_t width, - bool predict_luma, - bool predict_chroma); - -void uvg_inter_pred_pu(const encoder_state_t * const state, +void uvg_inter_recon_cu( + const encoder_state_t * const state, lcu_t *lcu, - int32_t x, - int32_t y, - int32_t width, bool predict_luma, bool predict_chroma, - int i_pu); + const cu_loc_t* const cu_loc); + +void uvg_inter_pred_pu( + const encoder_state_t * const state, + lcu_t *lcu, + bool predict_luma, + bool predict_chroma, + const cu_loc_t* const cu_loc); void uvg_hmvp_add_mv(const encoder_state_t* const state, uint32_t pic_x, uint32_t pic_y, uint32_t block_width, uint32_t block_height, const cu_info_t* cu); -void uvg_inter_recon_bipred(const encoder_state_t * const state, - const uvg_picture * ref1, - const uvg_picture * ref2, - int32_t xpos, - int32_t ypos, - int32_t width, - int32_t height, - mv_t mv_param[2][2], - lcu_t* lcu, - bool predict_luma, - bool predict_chroma); +void uvg_inter_recon_bipred( + const encoder_state_t * const state, + const uvg_picture * ref1, + const uvg_picture * ref2, + mv_t mv_param[2][2], + lcu_t* lcu, + bool predict_luma, + bool predict_chroma, + const cu_loc_t* const cu_loc); -void uvg_inter_get_mv_cand(const encoder_state_t * const state, - int32_t x, - int32_t y, - int32_t width, - int32_t height, - mv_t mv_cand[2][2], - const cu_info_t* cur_cu, - lcu_t *lcu, - int8_t reflist); +void uvg_inter_get_mv_cand( + const encoder_state_t * const state, + mv_t mv_cand[2][2], + const cu_info_t* cur_cu, + lcu_t *lcu, + int8_t reflist, + const 
cu_loc_t* const cu_loc); -void uvg_inter_get_mv_cand_cua(const encoder_state_t * const state, - int32_t x, - int32_t y, - int32_t width, - int32_t height, - mv_t mv_cand[2][2], - const cu_info_t* cur_cu, - int8_t reflist); +void uvg_inter_get_mv_cand_cua( + const encoder_state_t * const state, + mv_t mv_cand[2][2], + const cu_info_t* cur_cu, + int8_t reflist, + const cu_loc_t* const cu_loc); -uint8_t uvg_inter_get_merge_cand(const encoder_state_t * const state, - int32_t x, int32_t y, - int32_t width, int32_t height, - bool use_a1, bool use_b1, - inter_merge_cand_t mv_cand[MRG_MAX_NUM_CANDS], - lcu_t *lcu); +uint8_t uvg_inter_get_merge_cand( + const encoder_state_t * const state, + const cu_loc_t* const cu_loc, + inter_merge_cand_t mv_cand[MRG_MAX_NUM_CANDS], + lcu_t *lcu); #endif diff --git a/src/search.c b/src/search.c index 0b51412b..d61be039 100644 --- a/src/search.c +++ b/src/search.c @@ -166,7 +166,6 @@ static void lcu_fill_cu_info(lcu_t *lcu, int x_local, int y_local, int width, in cu_info_t *to = LCU_GET_CU_AT_PX(lcu, x, y); to->type = cu->type; to->depth = cu->depth; - to->part_size = cu->part_size; to->qp = cu->qp; //to->tr_idx = cu->tr_idx; to->lfnst_idx = cu->lfnst_idx; @@ -191,22 +190,6 @@ static void lcu_fill_cu_info(lcu_t *lcu, int x_local, int y_local, int width, in } } -static void lcu_fill_inter(lcu_t *lcu, int x_local, int y_local, int cu_width, uint8_t type) -{ - const part_mode_t part_mode = LCU_GET_CU_AT_PX(lcu, x_local, y_local)->part_size; - const int num_pu = uvg_part_mode_num_parts[part_mode]; - - for (int i = 0; i < num_pu; ++i) { - const int x_pu = PU_GET_X(part_mode, cu_width, x_local, i); - const int y_pu = PU_GET_Y(part_mode, cu_width, y_local, i); - const int width_pu = PU_GET_W(part_mode, cu_width, i); - const int height_pu = PU_GET_H(part_mode, cu_width, i); - - cu_info_t *pu = LCU_GET_CU_AT_PX(lcu, x_pu, y_pu); - pu->type = type; - lcu_fill_cu_info(lcu, x_pu, y_pu, width_pu, height_pu, pu); - } -} static void lcu_fill_cbf(lcu_t 
*lcu, int x_local, unsigned y_local, unsigned width, const cu_info_t *cur_cu) { @@ -559,7 +542,7 @@ static double cu_rd_cost_tr_split_accurate( int cbf = cbf_is_set_any(pred_cu->cbf, depth); // Only need to signal coded block flag if not skipped or merged // skip = no coded residual, merge = coded residual - if (pred_cu->type != CU_INTRA && (pred_cu->part_size != SIZE_2Nx2N || !pred_cu->merged)) { + if (pred_cu->type != CU_INTRA && (!pred_cu->merged)) { CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.cu_qt_root_cbf_model), cbf, tr_tree_bits, "rqt_root_cbf"); } @@ -876,18 +859,20 @@ void uvg_sort_keys_by_cost(unit_stats_map_t *__restrict map) */ static double search_cu( encoder_state_t* const state, - int x, - int y, - int depth, + const cu_loc_t* const cu_loc, lcu_t* work_tree, enum uvg_tree_type - tree_type) + tree_type, + const split_tree_t split_tree) { + const int depth = split_tree.current_depth; const encoder_control_t* ctrl = state->encoder_control; const videoframe_t * const frame = state->tile->frame; - const int cu_width = tree_type != UVG_CHROMA_T ? LCU_WIDTH >> depth : LCU_WIDTH_C >> depth; - const int cu_height = cu_width; // TODO: height - const int luma_width = LCU_WIDTH >> depth; + const int cu_width = tree_type != UVG_CHROMA_T ? cu_loc->width : cu_loc->chroma_width; + const int cu_height = tree_type != UVG_CHROMA_T ? 
cu_loc->height : cu_loc->chroma_height; + const int x = cu_loc->x; + const int y = cu_loc->y; + const int luma_width = cu_loc->width; assert(cu_width >= 4); double cost = MAX_DOUBLE; double inter_zero_coeff_cost = MAX_DOUBLE; @@ -896,7 +881,7 @@ static double search_cu( cabac_data_t pre_search_cabac; memcpy(&pre_search_cabac, &state->search_cabac, sizeof(pre_search_cabac)); - const uint32_t ctu_row = (y >> LOG2_LCU_WIDTH); + const uint32_t ctu_row = (cu_loc->y >> LOG2_LCU_WIDTH); const uint32_t ctu_row_mul_five = ctu_row * MAX_NUM_HMVP_CANDS; cu_info_t hmvp_lut[MAX_NUM_HMVP_CANDS]; @@ -913,7 +898,7 @@ static double search_cu( int32_t max; } pu_depth_inter, pu_depth_intra; - lcu_t *const lcu = &work_tree[depth]; + lcu_t *const lcu = &work_tree[split_tree.current_depth]; int x_local = SUB_SCU(x) >> (tree_type == UVG_CHROMA_T); int y_local = SUB_SCU(y) >> (tree_type == UVG_CHROMA_T); @@ -947,10 +932,9 @@ static double search_cu( cur_cu = LCU_GET_CU_AT_PX(lcu, x_local, y_local); // Assign correct depth - cur_cu->depth = (depth > MAX_DEPTH) ? MAX_DEPTH : depth; - cur_cu->tr_depth = (depth > 0) ? depth : 1; + cur_cu->depth = (split_tree.current_depth > MAX_DEPTH) ? MAX_DEPTH : split_tree.current_depth; + cur_cu->tr_depth = cu_width > TR_MAX_WIDTH || cu_height > TR_MAX_WIDTH ? 1 : split_tree.current_depth; cur_cu->type = CU_NOTSET; - cur_cu->part_size = SIZE_2Nx2N; cur_cu->qp = state->qp; cur_cu->bdpcmMode = 0; cur_cu->tr_idx = 0; @@ -969,9 +953,9 @@ static double search_cu( int cu_width_inter_min = LCU_WIDTH >> pu_depth_inter.max; bool can_use_inter = state->frame->slicetype != UVG_SLICE_I && - depth <= MAX_DEPTH && + split_tree.current_depth <= MAX_DEPTH && ( - WITHIN(depth, pu_depth_inter.min, pu_depth_inter.max) || + WITHIN(split_tree.current_depth, pu_depth_inter.min, pu_depth_inter.max) || // When the split was forced because the CTU is partially outside the // frame, we permit inter coding even if pu_depth_inter would // otherwise forbid it. 
@@ -983,10 +967,9 @@ static double search_cu( double mode_cost; double mode_bitcost; uvg_search_cu_inter(state, - x, y, - depth, - lcu, - &mode_cost, &mode_bitcost); + cu_loc, lcu, + &mode_cost, + &mode_bitcost); if (mode_cost < cost) { cost = mode_cost; inter_bitcost = mode_bitcost; @@ -1004,7 +987,7 @@ static double search_cu( int32_t cu_width_intra_min = LCU_WIDTH >> pu_depth_intra.max; bool can_use_intra = - (WITHIN(depth, pu_depth_intra.min, pu_depth_intra.max) || + (WITHIN(split_tree.current_depth, pu_depth_intra.min, pu_depth_intra.max) || // When the split was forced because the CTU is partially outside // the frame, we permit intra coding even if pu_depth_intra would // otherwise forbid it. @@ -1048,7 +1031,7 @@ static double search_cu( int8_t intra_mode = intra_search.pred_cu.intra.mode; // TODO: This heavily relies to square CUs - if ((depth != 4 || (x % 8 && y % 8)) && state->encoder_control->chroma_format != UVG_CSP_400 && tree_type != UVG_LUMA_T) { + if ((split_tree.current_depth != 4 || (x % 8 && y % 8)) && state->encoder_control->chroma_format != UVG_CSP_400 && tree_type != UVG_LUMA_T) { intra_search.pred_cu.joint_cb_cr = 0; // There is almost no benefit to doing the chroma mode search for @@ -1097,7 +1080,7 @@ static double search_cu( } intra_search.pred_cu.intra.mode = intra_mode; if(tree_type == UVG_CHROMA_T) { - uvg_lcu_fill_trdepth(lcu, x_local, y_local, depth, depth, tree_type); + uvg_lcu_fill_trdepth(lcu, x_local, y_local, split_tree.current_depth, split_tree.current_depth, tree_type); } } if (intra_cost < cost) { @@ -1120,8 +1103,7 @@ static double search_cu( double mode_cost; double mode_bitcost; uvg_search_cu_ibc(state, - x, y, - depth, + cu_loc, lcu, &mode_cost, &mode_bitcost); if (mode_cost < cost) { @@ -1138,11 +1120,10 @@ static double search_cu( // Reconstruct best mode because we need the reconstructed pixels for // mode search of adjacent CUs. 
if (cur_cu->type == CU_INTRA) { - assert(cur_cu->part_size == SIZE_2Nx2N || cur_cu->part_size == SIZE_NxN); bool recon_chroma = true; bool recon_luma = tree_type != UVG_CHROMA_T; - if ((depth == 4) || state->encoder_control->chroma_format == UVG_CSP_400 || tree_type == UVG_LUMA_T) { + if ((split_tree.current_depth == 4) || state->encoder_control->chroma_format == UVG_CSP_400 || tree_type == UVG_LUMA_T) { recon_chroma = false; } lcu_fill_cu_info(lcu, x_local, y_local, cu_width, cu_width, cur_cu); @@ -1153,7 +1134,7 @@ static double search_cu( lcu, tree_type,recon_luma,recon_chroma); - if(depth == 4 && x % 8 && y % 8 && tree_type != UVG_LUMA_T && state->encoder_control->chroma_format != UVG_CSP_400) { + if(split_tree.current_depth == 4 && x % 8 && y % 8 && tree_type != UVG_LUMA_T && state->encoder_control->chroma_format != UVG_CSP_400) { intra_search.pred_cu.intra.mode_chroma = cur_cu->intra.mode_chroma; uvg_intra_recon_cu(state, x, y, @@ -1168,8 +1149,8 @@ static double search_cu( const int split_type = intra_search.pred_cu.intra.isp_mode; const int split_num = split_type == ISP_MODE_NO_ISP ? 0 : uvg_get_isp_split_num(cu_width, cu_height, split_type, true); - const int cbf_cb = cbf_is_set(cur_cu->cbf, depth, COLOR_U); - const int cbf_cr = cbf_is_set(cur_cu->cbf, depth, COLOR_V); + const int cbf_cb = cbf_is_set(cur_cu->cbf, split_tree.current_depth, COLOR_U); + const int cbf_cr = cbf_is_set(cur_cu->cbf, split_tree.current_depth, COLOR_V); const int jccr = cur_cu->joint_cb_cr; for (int i = 0; i < split_num; ++i) { cu_loc_t isp_loc; @@ -1181,15 +1162,14 @@ static double search_cu( uvg_get_isp_cu_arr_coords(&tmp_x, &tmp_y); cu_info_t* split_cu = LCU_GET_CU_AT_PX(lcu, tmp_x % LCU_WIDTH, tmp_y % LCU_WIDTH); bool cur_cbf = (intra_search.best_isp_cbfs >> i) & 1; - // ISP_TODO: here, cbfs are also set for chroma for all ISP splits, is this behavior wanted? 
- cbf_clear(&split_cu->cbf, depth, COLOR_Y); - cbf_clear(&split_cu->cbf, depth, COLOR_U); - cbf_clear(&split_cu->cbf, depth, COLOR_V); + cbf_clear(&split_cu->cbf, split_tree.current_depth, COLOR_Y); + cbf_clear(&split_cu->cbf, split_tree.current_depth, COLOR_U); + cbf_clear(&split_cu->cbf, split_tree.current_depth, COLOR_V); if (cur_cbf) { - cbf_set(&split_cu->cbf, depth, COLOR_Y); + cbf_set(&split_cu->cbf, split_tree.current_depth, COLOR_Y); } - if(cbf_cb) cbf_set(&split_cu->cbf, depth, COLOR_U); - if(cbf_cr) cbf_set(&split_cu->cbf, depth, COLOR_V); + if(cbf_cb) cbf_set(&split_cu->cbf, split_tree.current_depth, COLOR_U); + if(cbf_cr) cbf_set(&split_cu->cbf, split_tree.current_depth, COLOR_V); split_cu->joint_cb_cr = jccr; } lcu_fill_cu_info(lcu, x_local, y_local, cu_width, cu_width, cur_cu); @@ -1205,24 +1185,20 @@ static double search_cu( } // Reset transform depth because intra messes with them. // This will no longer be necessary if the transform depths are not shared. - int tr_depth = MAX(1, depth); - if (cur_cu->part_size != SIZE_2Nx2N) { - tr_depth = depth + 1; - } + int tr_depth = MAX(1, split_tree.current_depth); + uvg_lcu_fill_trdepth(lcu, x, y, depth, tr_depth, tree_type); const bool has_chroma = state->encoder_control->chroma_format != UVG_CSP_400; - uvg_inter_recon_cu(state, lcu, x, y, cu_width, true, has_chroma); + uvg_inter_recon_cu(state, lcu, true, has_chroma, cu_loc); if (ctrl->cfg.zero_coeff_rdo && !ctrl->cfg.lossless && !ctrl->cfg.rdoq_enable) { //Calculate cost for zero coeffs - inter_zero_coeff_cost = cu_zero_coeff_cost(state, work_tree, x, y, depth) + inter_bitcost * state->lambda; + inter_zero_coeff_cost = cu_zero_coeff_cost(state, work_tree, x, y, split_tree.current_depth) + inter_bitcost * state->lambda; } cu_loc_t loc; - const int width = LCU_WIDTH >> depth; - const int height = width; // TODO: height for non-square blocks - uvg_cu_loc_ctor(&loc, x, y, width, height); + uvg_cu_loc_ctor(&loc, x, y, cu_width, cu_height); 
uvg_quantize_lcu_residual(state, true, has_chroma && !cur_cu->joint_cb_cr, cur_cu->joint_cb_cr, &loc, @@ -1232,9 +1208,9 @@ static double search_cu( false, tree_type); - int cbf = cbf_is_set_any(cur_cu->cbf, depth); + int cbf = cbf_is_set_any(cur_cu->cbf, split_tree.current_depth); - if (cur_cu->merged && !cbf && cur_cu->part_size == SIZE_2Nx2N) { + if (cur_cu->merged && !cbf) { cur_cu->merged = 0; cur_cu->skipped = 1; // Selecting skip reduces bits needed to code the CU @@ -1244,7 +1220,7 @@ static double search_cu( inter_bitcost += cur_cu->merge_idx; } } - lcu_fill_inter(lcu, x_local, y_local, cu_width, cur_cu->type); + lcu_fill_cu_info(lcu, x_local, y_local, cu_width, cu_height, cur_cu); lcu_fill_cbf(lcu, x_local, y_local, cu_width, cur_cu); } } @@ -1253,19 +1229,13 @@ static double search_cu( double bits = 0; cabac_data_t* cabac = &state->search_cabac; cabac->update = 1; + + bits += uvg_mock_encode_coding_unit( + state, + cabac, + cu_loc, lcu, cur_cu, + tree_type); - if(cur_cu->type != CU_INTRA || cur_cu->part_size == SIZE_2Nx2N) { - bits += uvg_mock_encode_coding_unit( - state, - cabac, - x, y, depth, - lcu, - cur_cu, - tree_type); - } - else { - assert(0); - } cost = bits * state->lambda; @@ -1275,15 +1245,15 @@ static double search_cu( cost = inter_zero_coeff_cost; // Restore saved pixels from lower level of the working tree. - copy_cu_pixels(x_local, y_local, cu_width, &work_tree[depth + 1], lcu, tree_type); + copy_cu_pixels(x_local, y_local, cu_width, &work_tree[split_tree.current_depth + 1], lcu, tree_type); - if (cur_cu->merged && cur_cu->part_size == SIZE_2Nx2N) { + if (cur_cu->merged) { cur_cu->merged = 0; cur_cu->skipped = 1; lcu_fill_cu_info(lcu, x_local, y_local, cu_width, cu_width, cur_cu); } - if (cur_cu->tr_depth != depth) { + if (cur_cu->tr_depth != 0) { // Reset transform depth since there are no coefficients. This // ensures that CBF is cleared for the whole area of the CU. 
uvg_lcu_fill_trdepth(lcu, x, y, depth, depth, tree_type); @@ -1299,12 +1269,12 @@ static double search_cu( // If the CU is partially outside the frame, we need to split it even // if pu_depth_intra and pu_depth_inter would not permit it. cur_cu->type == CU_NOTSET || - (depth < pu_depth_intra.max && !(state->encoder_control->cfg.force_inter&& state->frame->slicetype != UVG_SLICE_I)) || + (split_tree.current_depth < pu_depth_intra.max && !(state->encoder_control->cfg.force_inter&& state->frame->slicetype != UVG_SLICE_I)) || (state->frame->slicetype != UVG_SLICE_I && - depth < pu_depth_inter.max); + split_tree.current_depth < pu_depth_inter.max); if(state->encoder_control->cabac_debug_file) { - fprintf(state->encoder_control->cabac_debug_file, "S %4d %4d %d %d", x, y, depth, tree_type); + fprintf(state->encoder_control->cabac_debug_file, "S %4d %4d %d %d", x, y, split_tree.current_depth, tree_type); fwrite(&state->search_cabac.ctx, 1, sizeof(state->search_cabac.ctx), state->encoder_control->cabac_debug_file); } @@ -1312,7 +1282,7 @@ static double search_cu( if (can_split_cu) { int half_cu = cu_width >> (tree_type != UVG_CHROMA_T); double split_cost = 0.0; - int cbf = cbf_is_set_any(cur_cu->cbf, depth); + int cbf = cbf_is_set_any(cur_cu->cbf, split_tree.current_depth); cabac_data_t post_seach_cabac; memcpy(&post_seach_cabac, &state->search_cabac, sizeof(post_seach_cabac)); memcpy(&state->search_cabac, &pre_search_cabac, sizeof(post_seach_cabac)); @@ -1320,7 +1290,7 @@ static double search_cu( double split_bits = 0; - if (depth < MAX_DEPTH) { + if (split_tree.current_depth < MAX_DEPTH) { state->search_cabac.update = 1; // Add cost of cu_split_flag. @@ -1364,10 +1334,24 @@ static double search_cu( // It is ok to interrupt the search as soon as it is known that // the split costs at least as much as not splitting. 
if (cur_cu->type == CU_NOTSET || cbf || state->encoder_control->cfg.cu_split_termination == UVG_CU_SPLIT_TERMINATION_OFF) { - if (split_cost < cost) split_cost += search_cu(state, x, y, depth + 1, work_tree, tree_type); - if (split_cost < cost) split_cost += search_cu(state, x + half_cu, y, depth + 1, work_tree, tree_type); - if (split_cost < cost) split_cost += search_cu(state, x, y + half_cu, depth + 1, work_tree, tree_type); - if (split_cost < cost) split_cost += search_cu(state, x + half_cu, y + half_cu, depth + 1, work_tree, tree_type); + const split_tree_t new_split = { split_tree.split_tree | (QT_SPLIT << (split_tree.current_depth * 3)), split_tree.current_depth + 1}; /* each depth occupies three bits of split_tree, so the QT flag must be shifted by 3 * depth to stay out of the MTT bits tested by CAN_QT_SPLIT */ + cu_loc_t new_cu_loc; + if (split_cost < cost) { + uvg_cu_loc_ctor(&new_cu_loc, x, y, half_cu, half_cu); + split_cost += search_cu(state, &new_cu_loc, work_tree, tree_type, new_split); + } + if (split_cost < cost) { + uvg_cu_loc_ctor(&new_cu_loc, x + half_cu, y, half_cu, half_cu); + split_cost += search_cu(state, &new_cu_loc, work_tree, tree_type, new_split); + } + if (split_cost < cost) { + uvg_cu_loc_ctor(&new_cu_loc, x, y + half_cu, half_cu, half_cu); + split_cost += search_cu(state, &new_cu_loc, work_tree, tree_type, new_split); + } + if (split_cost < cost) { + uvg_cu_loc_ctor(&new_cu_loc, x + half_cu, y + half_cu, half_cu, half_cu); + split_cost += search_cu(state, &new_cu_loc, work_tree, tree_type, new_split); + } } else { split_cost = INT_MAX; } @@ -1401,7 +1385,6 @@ static double search_cu( cur_cu->intra = cu_d1->intra; cur_cu->type = CU_INTRA; - cur_cu->part_size = SIZE_2Nx2N; // Disable MRL in this case cur_cu->intra.multi_ref_idx = 0; @@ -1687,14 +1670,17 @@ void uvg_search_lcu(encoder_state_t * const state, const int x, const int y, con int tree_type = state->frame->slicetype == UVG_SLICE_I && state->encoder_control->cfg.dual_tree ? 
UVG_LUMA_T : UVG_BOTH_T; + + cu_loc_t start; + uvg_cu_loc_ctor(&start, x, y, LCU_WIDTH, LCU_WIDTH); + split_tree_t split_tree = { 0, 0 }; // Start search from depth 0. double cost = search_cu( - state, - x, - y, - 0, + state, + &start, work_tree, - tree_type); + tree_type, + split_tree); // Save squared cost for rate control. if(state->encoder_control->cfg.rc_algorithm == UVG_LAMBDA) { @@ -1710,12 +1696,9 @@ void uvg_search_lcu(encoder_state_t * const state, const int x, const int y, con if(state->frame->slicetype == UVG_SLICE_I && state->encoder_control->cfg.dual_tree) { cost = search_cu( - state, - x, - y, - 0, + state, &start, work_tree, - UVG_CHROMA_T); + UVG_CHROMA_T, split_tree); if (state->encoder_control->cfg.rc_algorithm == UVG_LAMBDA) { uvg_get_lcu_stats(state, x / LCU_WIDTH, y / LCU_WIDTH)->weight += cost * cost; diff --git a/src/search_ibc.c b/src/search_ibc.c index 44f9ac50..b7067c8c 100644 --- a/src/search_ibc.c +++ b/src/search_ibc.c @@ -109,8 +109,10 @@ static INLINE bool fracmv_within_ibc_range(const ibc_search_info_t *info, int x, } -static uint32_t calculate_ibc_cost_satd(const encoder_state_t *state, lcu_t* lcu, int32_t x, int32_t y, int32_t width, int32_t mv_x, int32_t mv_y) +static uint32_t calculate_ibc_cost_satd(const encoder_state_t *state, lcu_t* lcu, const cu_loc_t* loc, int32_t mv_x, int32_t mv_y) { + const uint32_t x = loc->x; + const uint32_t y = loc->y; const int x_scu = SUB_SCU(x); const int y_scu = SUB_SCU(y); @@ -132,9 +134,11 @@ static uint32_t calculate_ibc_cost_satd(const encoder_state_t *state, lcu_t* lcu cur_cu->inter.mv[0][0] = mv_x * (1 << INTERNAL_MV_PREC);; cur_cu->inter.mv[0][1] = mv_y * (1 << INTERNAL_MV_PREC);; - uvg_inter_recon_cu(state, lcu, x, y, width, true, state->encoder_control->chroma_format != UVG_CSP_400); + uvg_inter_recon_cu(state, lcu, true, state->encoder_control->chroma_format != UVG_CSP_400, loc); *cur_cu = cu_backup; + uint32_t width = loc->width; + uint32_t height = loc->height; cost = 
uvg_satd_any_size(width, width, @@ -162,8 +166,10 @@ static uint32_t calculate_ibc_cost_satd(const encoder_state_t *state, lcu_t* lcu } -static uint32_t calculate_ibc_cost_sad(const encoder_state_t *state, optimized_sad_func_ptr_t optimized_sad, lcu_t* lcu, int32_t x, int32_t y, int32_t width, int32_t mv_x, int32_t mv_y) +static uint32_t calculate_ibc_cost_sad(const encoder_state_t *state, optimized_sad_func_ptr_t optimized_sad, lcu_t* lcu, const cu_loc_t* loc, int32_t mv_x, int32_t mv_y) { + const uint32_t x = loc->x; + const uint32_t y = loc->y; cu_info_t *cur_cu = LCU_GET_CU_AT_PX(lcu, SUB_SCU(x), SUB_SCU(y)); cu_info_t cu_backup = *cur_cu; @@ -173,6 +179,8 @@ static uint32_t calculate_ibc_cost_sad(const encoder_state_t *state, optimized_s const int y_scu = SUB_SCU(y); const uint32_t offset = x_scu + y_scu * LCU_WIDTH; const uint32_t offset_c = x_scu / 2 + y_scu / 2 * LCU_WIDTH_C; + const uint32_t width = loc->width; + const uint32_t height = loc->height; cur_cu->type = CU_IBC; cur_cu->inter.mv_dir = 1; @@ -183,7 +191,7 @@ static uint32_t calculate_ibc_cost_sad(const encoder_state_t *state, optimized_s cur_cu->inter.mv[0][0] = mv_x * (1 << INTERNAL_MV_PREC);; cur_cu->inter.mv[0][1] = mv_y * (1 << INTERNAL_MV_PREC);; - uvg_inter_recon_cu(state, lcu, x, y, width, true, state->encoder_control->chroma_format != UVG_CSP_400); + uvg_inter_recon_cu(state, lcu, true, state->encoder_control->chroma_format != UVG_CSP_400, loc); *cur_cu = cu_backup; @@ -235,8 +243,11 @@ static bool check_mv_cost(ibc_search_info_t *info, double bitcost = 0; double cost = MAX_DOUBLE; + cu_loc_t loc; + uvg_cu_loc_ctor(&loc, info->origin.x, info->origin.y, info->width, info->height); - cost = calculate_ibc_cost_sad(info->state, info->optimized_sad, info->lcu, info->origin.x, info->origin.y, info->width, x, y); + + cost = calculate_ibc_cost_sad(info->state, info->optimized_sad, info->lcu, &loc, x, y); if (cost >= *best_cost) return false; @@ -246,7 +257,7 @@ static bool 
check_mv_cost(ibc_search_info_t *info, info->mv_cand, NULL, 0, - NULL, + 0, &bitcost ); @@ -782,63 +793,46 @@ static bool merge_candidate_in_list(inter_merge_cand_t *all_cands, * \param amvp Return searched AMVP PUs sorted by costs * \param merge Return searched Merge PUs sorted by costs */ -static void search_pu_ibc(encoder_state_t * const state, - int x_cu, int y_cu, - int depth, - part_mode_t part_mode, - int i_pu, - unit_stats_map_t *amvp, - unit_stats_map_t *merge, - ibc_search_info_t *info) +static void search_pu_ibc( + encoder_state_t * const state, + const cu_loc_t * const cu_loc, + unit_stats_map_t *amvp, + unit_stats_map_t *merge, + ibc_search_info_t *info) { - const uvg_config *cfg = &state->encoder_control->cfg; - const videoframe_t * const frame = state->tile->frame; - const int width_cu = LCU_WIDTH >> depth; - const int x = PU_GET_X(part_mode, width_cu, x_cu, i_pu); - const int y = PU_GET_Y(part_mode, width_cu, y_cu, i_pu); - const int width = PU_GET_W(part_mode, width_cu, i_pu); - const int height = PU_GET_H(part_mode, width_cu, i_pu); - - // Merge candidate A1 may not be used for the second PU of Nx2N, nLx2N and - // nRx2N partitions. - const bool merge_a1 = i_pu == 0 || width >= height; - // Merge candidate B1 may not be used for the second PU of 2NxN, 2NxnU and - // 2NxnD partitions. 
- const bool merge_b1 = i_pu == 0 || width <= height; - + const uvg_config *cfg = &state->encoder_control->cfg; + const videoframe_t * const frame = state->tile->frame; + const int width_cu = cu_loc->width; + const int height_cu= cu_loc->height; lcu_t *lcu = info->lcu; - const int x_local = SUB_SCU(x); - const int y_local = SUB_SCU(y); - cu_info_t *cur_pu = LCU_GET_CU_AT_PX(lcu, x_local, y_local); - cur_pu->type = CU_IBC; - cur_pu->part_size = part_mode; - cur_pu->depth = depth; - cur_pu->tr_depth = depth; - cur_pu->qp = state->qp; - cur_pu->inter.mv_dir = 1; + const int x_local = SUB_SCU(cu_loc->x); + const int y_local = SUB_SCU(cu_loc->y); + cu_info_t *cur_pu = LCU_GET_CU_AT_PX(lcu, x_local, y_local); + cur_pu->type = CU_IBC; + cur_pu->qp = state->qp; + cur_pu->inter.mv_dir = 1; // Default to candidate 0 CU_SET_MV_CAND(cur_pu, 0, 0); - + FILL(*info, 0); - info->state = state; - info->pic = frame->source; - info->origin.x = x; - info->origin.y = y; - info->width = width; - info->height = height; - info->mvd_cost_func = cfg->mv_rdo ? uvg_calc_ibc_mvd_cost_cabac : calc_ibc_mvd_cost; - info->optimized_sad = uvg_get_optimized_sad(width); - info->lcu = lcu; + info->state = state; + info->pic = frame->source; + info->origin.x = cu_loc->x; + info->origin.y = cu_loc->y; + info->width = width_cu; + info->height = height_cu; + info->mvd_cost_func = + cfg->mv_rdo ? 
uvg_calc_ibc_mvd_cost_cabac : calc_ibc_mvd_cost; + info->optimized_sad = uvg_get_optimized_sad(width_cu); + info->lcu = lcu; // Search for merge mode candidates info->num_merge_cand = uvg_inter_get_merge_cand( state, - x, y, - width, height, - merge_a1, merge_b1, + cu_loc, info->merge_cand, lcu); @@ -853,7 +847,7 @@ static void search_pu_ibc(encoder_state_t * const state, #ifdef COMPLETE_PRED_MODE_BITS // Technically counting these bits would be correct, however counting // them universally degrades quality so this block is disabled by default - const double no_skip_flag = CTX_ENTROPY_FBITS(&state->search_cabac.ctx.cu_skip_flag_model[uvg_get_skip_context(x, y, lcu, NULL)], 0); + const double no_skip_flag = CTX_ENTROPY_FBITS(&state->search_cabac.ctx.cu_skip_flag_model[uvg_get_skip_context(cu_loc->x, cu_loc->y, lcu, NULL)], 0); #else const double no_skip_flag = 0; #endif @@ -875,7 +869,7 @@ static void search_pu_ibc(encoder_state_t * const state, { continue; } - uvg_inter_pred_pu(state, info->lcu, x_cu, y_cu, width_cu, true, false, i_pu); + uvg_inter_pred_pu(state, info->lcu, true, false, cu_loc); merge->unit[merge->size] = *cur_pu; merge->unit[merge->size].type = CU_IBC; merge->unit[merge->size].merge_idx = merge_idx; @@ -883,11 +877,11 @@ static void search_pu_ibc(encoder_state_t * const state, merge->unit[merge->size].skipped = false; double bits = merge_flag_cost + merge_idx + CTX_ENTROPY_FBITS(&(state->search_cabac.ctx.cu_merge_idx_ext_model), merge_idx != 0); - if(state->encoder_control->cfg.rdo >= 2 && cur_pu->part_size == SIZE_2Nx2N) { - uvg_cu_cost_inter_rd2(state, x, y, depth, &merge->unit[merge->size], lcu, &merge->cost[merge->size], &bits); + if(state->encoder_control->cfg.rdo >= 2) { + uvg_cu_cost_inter_rd2(state, &merge->unit[merge->size], lcu, &merge->cost[merge->size], &bits, cu_loc); } else { - merge->cost[merge->size] = uvg_satd_any_size(width, height, + merge->cost[merge->size] = uvg_satd_any_size(width_cu, height_cu, lcu->rec.y + y_local * 
LCU_WIDTH + x_local, LCU_WIDTH, lcu->ref.y + y_local * LCU_WIDTH + x_local, LCU_WIDTH); bits += no_skip_flag; @@ -909,7 +903,7 @@ static void search_pu_ibc(encoder_state_t * const state, // Early Skip Mode Decision bool has_chroma = state->encoder_control->chroma_format != UVG_CSP_400; - if (cfg->early_skip && cur_pu->part_size == SIZE_2Nx2N) { + if (cfg->early_skip) { for (int merge_key = 0; merge_key < num_rdo_cands; ++merge_key) { if(cfg->rdo >= 2 && merge->unit[merge->keys[merge_key]].skipped) { merge->size = 1; @@ -919,6 +913,7 @@ static void search_pu_ibc(encoder_state_t * const state, merge->keys[0] = 0; } else if(cfg->rdo < 2) { + const uint8_t depth = 6 - uvg_g_convert_to_log2[cu_loc->width]; // Reconstruct blocks with merge candidate. // Check luma CBF. Then, check chroma CBFs if luma CBF is not set // and chroma exists. @@ -927,18 +922,18 @@ static void search_pu_ibc(encoder_state_t * const state, cur_pu->inter.mv_dir = info->merge_cand[merge_idx].dir; cur_pu->inter.mv[0][0] = info->merge_cand[merge_idx].mv[0][0]; cur_pu->inter.mv[0][1] = info->merge_cand[merge_idx].mv[0][1]; - uvg_lcu_fill_trdepth(lcu, x, y, depth, MAX(1, depth), UVG_BOTH_T); - uvg_inter_recon_cu(state, lcu, x, y, width, true, false); - uvg_quantize_lcu_residual(state, true, false, false, x, y, depth, cur_pu, lcu, true, UVG_BOTH_T); + uvg_lcu_fill_trdepth(lcu, cu_loc->x, cu_loc->y, depth, MAX(1, depth), UVG_BOTH_T); + uvg_inter_recon_cu(state, lcu, true, false, cu_loc); + uvg_quantize_lcu_residual(state, true, false, false, cu_loc, depth, cur_pu, lcu, true, UVG_BOTH_T); if (cbf_is_set(cur_pu->cbf, depth, COLOR_Y)) { continue; } else if (has_chroma) { - uvg_inter_recon_cu(state, lcu, x, y, width, false, has_chroma); + uvg_inter_recon_cu(state, lcu, false, has_chroma, cu_loc); uvg_quantize_lcu_residual(state, false, has_chroma, false, /*we are only checking for lack of coeffs so no need to check jccr*/ - x, y, depth, cur_pu, lcu, true, UVG_BOTH_T); + cu_loc, depth, cur_pu, lcu, true, 
UVG_BOTH_T); if (!cbf_is_set_any(cur_pu->cbf, depth)) { cur_pu->type = CU_IBC; cur_pu->merge_idx = merge_idx; @@ -964,15 +959,12 @@ static void search_pu_ibc(encoder_state_t * const state, // Do the motion search - uvg_inter_get_mv_cand(info->state, - info->origin.x, - info->origin.y, - info->width, - info->height, + uvg_inter_get_mv_cand(info->state, info->mv_cand, cur_pu, lcu, - NULL); + 0, + cu_loc); vector2d_t best_mv = { 0, 0 }; @@ -1003,9 +995,7 @@ static void search_pu_ibc(encoder_state_t * const state, best_cost = calculate_ibc_cost_satd( info->state, lcu, - info->origin.x, - info->origin.y, - info->width, + cu_loc, (best_mv.x >> INTERNAL_MV_PREC), (best_mv.y >> INTERNAL_MV_PREC)); best_cost += best_bits * info->state->lambda; @@ -1052,16 +1042,16 @@ static void search_pu_ibc(encoder_state_t * const state, }; - if (state->encoder_control->cfg.rdo >= 2 && cur_pu->part_size == SIZE_2Nx2N) { - if (amvp[0].size) uvg_cu_cost_inter_rd2(state, x, y, depth, &amvp[0].unit[best_keys[0]], lcu, &amvp[0].cost[best_keys[0]], &amvp[0].bits[best_keys[0]]); + if (state->encoder_control->cfg.rdo >= 2) { + if (amvp[0].size) uvg_cu_cost_inter_rd2(state, &amvp[0].unit[best_keys[0]], lcu, &amvp[0].cost[best_keys[0]], &amvp[0].bits[best_keys[0]], cu_loc); } if(cfg->rdo < 2) { int predmode_ctx; - const int ibc_flag = CTX_ENTROPY_FBITS(&state->search_cabac.ctx.ibc_flag[0], 1) * 3; - const int skip_contest = uvg_get_skip_context(x, y, lcu, NULL, &predmode_ctx); + const float ibc_flag = CTX_ENTROPY_FBITS(&state->search_cabac.ctx.ibc_flag[0], 1); + const int skip_contest = uvg_get_skip_context(cu_loc->x, cu_loc->y, lcu, NULL, &predmode_ctx); const double no_skip_flag = CTX_ENTROPY_FBITS(&state->search_cabac.ctx.cu_skip_flag_model[skip_contest], 0); const double pred_mode_bits = CTX_ENTROPY_FBITS(&state->search_cabac.ctx.cu_pred_mode_model[predmode_ctx], 0); @@ -1077,33 +1067,29 @@ static void search_pu_ibc(encoder_state_t * const state, #include "threads.h" static int 
uvg_search_hash_cu_ibc(encoder_state_t* const state, - int x, int y, int depth, + const cu_loc_t* cu_loc, lcu_t* lcu, double* inter_cost, double* inter_bitcost) { - const int x_cu = x; - const int y_cu = y; + const int x_cu = cu_loc->x; + const int y_cu = cu_loc->y; const int part_mode = SIZE_2Nx2N; const uvg_config *cfg = &state->encoder_control->cfg; const videoframe_t * const frame = state->tile->frame; - const int width_cu = LCU_WIDTH >> depth; - const int width = PU_GET_W(part_mode, width_cu, 0); - const int height = PU_GET_H(part_mode, width_cu, 0); + const int width_cu = cu_loc->width; + const int height_cu = cu_loc->height; const bool merge_a1 = true; const bool merge_b1 = true; ibc_search_info_t info; - const int x_local = SUB_SCU(x); - const int y_local = SUB_SCU(y); + const int x_local = SUB_SCU(x_cu); + const int y_local = SUB_SCU(y_cu); cu_info_t *cur_pu = LCU_GET_CU_AT_PX(lcu, x_local, y_local); cur_pu->type = CU_IBC; - cur_pu->part_size = part_mode; - cur_pu->depth = depth; - cur_pu->tr_depth = depth; cur_pu->qp = state->qp; // Default to candidate 0 @@ -1113,22 +1099,19 @@ static int uvg_search_hash_cu_ibc(encoder_state_t* const state, info.state = state; info.pic = frame->source; - info.origin.x = x; - info.origin.y = y; - info.width = width; - info.height = height; + info.origin.x = cu_loc->x; + info.origin.y = cu_loc->y; + info.width = width_cu; + info.height = height_cu; info.mvd_cost_func = cfg->mv_rdo ? 
uvg_calc_ibc_mvd_cost_cabac : calc_ibc_mvd_cost; - info.optimized_sad = uvg_get_optimized_sad(width); + info.optimized_sad = uvg_get_optimized_sad(width_cu); info.lcu = lcu; // Search for merge mode candidates info.num_merge_cand = uvg_inter_get_merge_cand( state, - x, - y, - width, - height, + cu_loc, merge_a1, merge_b1, info.merge_cand, @@ -1154,8 +1137,8 @@ static int uvg_search_hash_cu_ibc(encoder_state_t* const state, UVG_CLOCK_T hashmap_end_real_time; UVG_GET_TIME(&hashmap_start_real_time); - int xx = x; - int yy = y; + int xx = x_cu; + int yy = y_cu; int best_mv_x = INT_MAX>>2; int best_mv_y = INT_MAX>>2; @@ -1185,12 +1168,12 @@ static int uvg_search_hash_cu_ibc(encoder_state_t* const state, int pos_y = result->value & 0xffff; int mv_x = pos_x - xx; int mv_y = pos_y - yy; - if (pos_x <= xx - width && pos_y <= yy - height) { + if (pos_x <= xx - width_cu && pos_y <= yy - height_cu) { valid_mv = intmv_within_ibc_range(&info, mv_x, mv_y); if (valid_mv) { bool full_block = true; // Is the full block covered by the IBC? - for (int offset_x = UVG_HASHMAP_BLOCKSIZE; offset_x < width; offset_x+=UVG_HASHMAP_BLOCKSIZE) { - for (int offset_y = 0; offset_y < height; offset_y += UVG_HASHMAP_BLOCKSIZE) { + for (int offset_x = UVG_HASHMAP_BLOCKSIZE; offset_x < width_cu; offset_x+=UVG_HASHMAP_BLOCKSIZE) { + for (int offset_y = 0; offset_y < height_cu; offset_y += UVG_HASHMAP_BLOCKSIZE) { uint32_t crc_other_blocks = state->tile->frame->ibc_hashmap_pos_to_hash[ ((yy+offset_y) / UVG_HASHMAP_BLOCKSIZE)*state->tile->frame->ibc_hashmap_pos_to_hash_stride + (xx+offset_x) / UVG_HASHMAP_BLOCKSIZE]; @@ -1220,7 +1203,7 @@ static int uvg_search_hash_cu_ibc(encoder_state_t* const state, best_mv_y = mv_y; ibc_cost = cost; ibc_bitcost = bits; - fprintf(stderr, "Found best IBC!! %dx%d %dx%d: %d,%d\r\n", x,y, width,width, mv_x, mv_y); + fprintf(stderr, "Found best IBC!! 
%dx%d %dx%d: %d,%d\r\n", x_cu,y_cu, width_cu,height_cu, mv_x, mv_y); found_block = true; //break; } @@ -1274,11 +1257,9 @@ static int uvg_search_hash_cu_ibc(encoder_state_t* const state, uvg_inter_recon_cu( state, lcu, - x, - y, - CU_WIDTH_FROM_DEPTH(depth), true, - state->encoder_control->chroma_format != UVG_CSP_400); + state->encoder_control->chroma_format != UVG_CSP_400, + cu_loc); if (*inter_cost < MAX_DOUBLE) { assert(fracmv_within_ibc_range( @@ -1305,17 +1286,18 @@ static int uvg_search_hash_cu_ibc(encoder_state_t* const state, * \param inter_bitcost Return inter bitcost */ void uvg_search_cu_ibc(encoder_state_t * const state, - int x, int y, int depth, - lcu_t *lcu, - double *inter_cost, - double* inter_bitcost) + const cu_loc_t * const cu_loc, + lcu_t *lcu, + double *inter_cost, + double* inter_bitcost) { *inter_cost = MAX_DOUBLE; *inter_bitcost = MAX_INT; + // Quick hashmap search /* uvg_search_hash_cu_ibc( state, - x, y, depth, + cu_loc, lcu, inter_cost, inter_bitcost); @@ -1330,7 +1312,7 @@ void uvg_search_cu_ibc(encoder_state_t * const state, info.lcu = lcu; search_pu_ibc(state, - x, y, depth, + cu_loc, SIZE_2Nx2N, 0, amvp, &merge, @@ -1374,14 +1356,14 @@ void uvg_search_cu_ibc(encoder_state_t * const state, return; } - const int x_local = SUB_SCU(x); - const int y_local = SUB_SCU(y); + const int x_local = SUB_SCU(cu_loc->x); + const int y_local = SUB_SCU(cu_loc->y); cu_info_t *cur_pu = LCU_GET_CU_AT_PX(lcu, x_local, y_local); *cur_pu = *best_inter_pu; cur_pu->type = CU_IBC; - uvg_inter_recon_cu(state, lcu, x, y, CU_WIDTH_FROM_DEPTH(depth), - true, state->encoder_control->chroma_format != UVG_CSP_400); + uvg_inter_recon_cu(state, lcu, + true, state->encoder_control->chroma_format != UVG_CSP_400, cu_loc); if (*inter_cost < MAX_DOUBLE) { assert(fracmv_within_ibc_range(&info, cur_pu->inter.mv[0][0], cur_pu->inter.mv[0][1])); diff --git a/src/search_ibc.h b/src/search_ibc.h index 14ce3b6f..b3c4e544 100644 --- a/src/search_ibc.h +++ b/src/search_ibc.h @@ 
-46,7 +46,7 @@ void uvg_search_cu_ibc(encoder_state_t * const state, - int x, int y, int depth, + const cu_loc_t * const cu_loc, lcu_t *lcu, double *inter_cost, double* inter_bitcost); diff --git a/src/search_inter.c b/src/search_inter.c index 93598ff2..53587b84 100644 --- a/src/search_inter.c +++ b/src/search_inter.c @@ -1293,8 +1293,8 @@ static void apply_mv_scaling(int32_t current_poc, /** * \brief Perform inter search for a single reference frame. */ -static void search_pu_inter_ref(inter_search_info_t *info, - int depth, +static void search_pu_inter_ref( + inter_search_info_t *info, lcu_t *lcu, cu_info_t *cur_cu, unit_stats_map_t *amvp) @@ -1327,15 +1327,15 @@ static void search_pu_inter_ref(inter_search_info_t *info, // Get MV candidates cur_cu->inter.mv_ref[ref_list] = ref_list_idx[ref_list]; + cu_loc_t cu_loc; + uvg_cu_loc_ctor(&cu_loc, info->origin.x, info->origin.y, info->width, info->height); + uvg_inter_get_mv_cand(info->state, - info->origin.x, - info->origin.y, - info->width, - info->height, - info->mv_cand, - cur_cu, - lcu, - ref_list); + info->mv_cand, + cur_cu, + lcu, + ref_list, + &cu_loc); vector2d_t best_mv = { 0, 0 }; @@ -1498,11 +1498,13 @@ static void search_pu_inter_ref(inter_search_info_t *info, /** * \brief Search bipred modes for a PU. 
*/ -static void search_pu_inter_bipred(inter_search_info_t *info, - int depth, - lcu_t *lcu, - unit_stats_map_t *amvp_bipred) +static void search_pu_inter_bipred( + inter_search_info_t *info, + lcu_t *lcu, + unit_stats_map_t *amvp_bipred) { + cu_loc_t cu_loc; + uvg_cu_loc_ctor(&cu_loc, info->origin.x, info->origin.y, info->width, info->height); const image_list_t *const ref = info->state->frame->ref; uint8_t (*ref_LX)[16] = info->state->frame->ref_LX; const videoframe_t * const frame = info->state->tile->frame; @@ -1551,7 +1553,7 @@ static void search_pu_inter_bipred(inter_search_info_t *info, bipred_pu->skipped = false; for (int reflist = 0; reflist < 2; reflist++) { - uvg_inter_get_mv_cand(info->state, x, y, width, height, info->mv_cand, bipred_pu, lcu, reflist); + uvg_inter_get_mv_cand(info->state, info->mv_cand, bipred_pu, lcu, reflist, &cu_loc); } // Don't try merge candidates that don't satisfy mv constraints. @@ -1564,13 +1566,11 @@ static void search_pu_inter_bipred(inter_search_info_t *info, uvg_inter_recon_bipred(info->state, ref->images[ref_LX[0][merge_cand[i].ref[0]]], ref->images[ref_LX[1][merge_cand[j].ref[1]]], - x, y, - width, - height, mv, lcu, true, - false); + false, + &cu_loc); const uvg_pixel *rec = &lcu->rec.y[SUB_SCU(y) * LCU_WIDTH + SUB_SCU(x)]; const uvg_pixel *src = &frame->source->y[x + y * frame->source->stride]; @@ -1666,11 +1666,9 @@ static bool merge_candidate_in_list(inter_merge_cand_t *all_cands, * \param amvp Return searched AMVP PUs sorted by costs * \param merge Return searched Merge PUs sorted by costs */ -static void search_pu_inter(encoder_state_t * const state, - int x_cu, int y_cu, - int depth, - part_mode_t part_mode, - int i_pu, +static void search_pu_inter( + encoder_state_t * const state, + const cu_loc_t* const cu_loc, lcu_t *lcu, unit_stats_map_t *amvp, unit_stats_map_t *merge, @@ -1678,26 +1676,14 @@ static void search_pu_inter(encoder_state_t * const state, { const uvg_config *cfg = &state->encoder_control->cfg; 
const videoframe_t * const frame = state->tile->frame; - const int width_cu = LCU_WIDTH >> depth; - const int height_cu = width_cu; // TODO: non-square blocks - const int x = PU_GET_X(part_mode, width_cu, x_cu, i_pu); - const int y = PU_GET_Y(part_mode, width_cu, y_cu, i_pu); - const int width = PU_GET_W(part_mode, width_cu, i_pu); - const int height = PU_GET_H(part_mode, width_cu, i_pu); + const int width_cu = cu_loc->width; + const int height_cu = cu_loc->height; - // Merge candidate A1 may not be used for the second PU of Nx2N, nLx2N and - // nRx2N partitions. - const bool merge_a1 = i_pu == 0 || width >= height; - // Merge candidate B1 may not be used for the second PU of 2NxN, 2NxnU and - // 2NxnD partitions. - const bool merge_b1 = i_pu == 0 || width <= height; - const int x_local = SUB_SCU(x); - const int y_local = SUB_SCU(y); + const int x_local = SUB_SCU(cu_loc->x); + const int y_local = SUB_SCU(cu_loc->y); cu_info_t *cur_pu = LCU_GET_CU_AT_PX(lcu, x_local, y_local); cur_pu->type = CU_NOTSET; - cur_pu->part_size = part_mode; - cur_pu->depth = depth; cur_pu->qp = state->qp; // Default to candidate 0 @@ -1708,19 +1694,17 @@ static void search_pu_inter(encoder_state_t * const state, info->state = state; info->pic = frame->source; - info->origin.x = x; - info->origin.y = y; - info->width = width; - info->height = height; + info->origin.x = cu_loc->x; + info->origin.y = cu_loc->y; + info->width = width_cu; + info->height = height_cu; info->mvd_cost_func = cfg->mv_rdo ? uvg_calc_mvd_cost_cabac : calc_mvd_cost; - info->optimized_sad = uvg_get_optimized_sad(width); + info->optimized_sad = uvg_get_optimized_sad(width_cu); // Search for merge mode candidates info->num_merge_cand = uvg_inter_get_merge_cand( state, - x, y, - width, height, - merge_a1, merge_b1, + cu_loc, info->merge_cand, lcu ); @@ -1755,7 +1739,7 @@ static void search_pu_inter(encoder_state_t * const state, // If bipred is not enabled, do not try candidates with mv_dir == 3. 
// Bipred is also forbidden for 4x8 and 8x4 blocks by the standard. if (cur_pu->inter.mv_dir == 3 && !state->encoder_control->cfg.bipred) continue; - if (cur_pu->inter.mv_dir == 3 && !(width + height > 12)) continue; + if (cur_pu->inter.mv_dir == 3 && !(cu_loc->width + cu_loc->height > 12)) continue; bool is_duplicate = merge_candidate_in_list(info->merge_cand, cur_cand, merge); @@ -1769,7 +1753,7 @@ static void search_pu_inter(encoder_state_t * const state, { continue; } - uvg_inter_pred_pu(state, lcu, x_cu, y_cu, width_cu, true, false, i_pu); + uvg_inter_pred_pu(state, lcu, true, false, cu_loc); merge->unit[merge->size] = *cur_pu; merge->unit[merge->size].type = CU_INTER; merge->unit[merge->size].merge_idx = merge_idx; @@ -1777,11 +1761,11 @@ static void search_pu_inter(encoder_state_t * const state, merge->unit[merge->size].skipped = false; double bits = merge_flag_cost + merge_idx + CTX_ENTROPY_FBITS(&(state->search_cabac.ctx.cu_merge_idx_ext_model), merge_idx != 0); - if(state->encoder_control->cfg.rdo >= 2 && cur_pu->part_size == SIZE_2Nx2N) { - uvg_cu_cost_inter_rd2(state, x, y, depth, &merge->unit[merge->size], lcu, &merge->cost[merge->size], &bits); + if(state->encoder_control->cfg.rdo >= 2) { + uvg_cu_cost_inter_rd2(state, &merge->unit[merge->size], lcu, &merge->cost[merge->size], &bits, cu_loc); } else { - merge->cost[merge->size] = uvg_satd_any_size(width, height, + merge->cost[merge->size] = uvg_satd_any_size(cu_loc->width, cu_loc->height, lcu->rec.y + y_local * LCU_WIDTH + x_local, LCU_WIDTH, lcu->ref.y + y_local * LCU_WIDTH + x_local, LCU_WIDTH); bits += no_skip_flag; @@ -1803,7 +1787,7 @@ static void search_pu_inter(encoder_state_t * const state, // Early Skip Mode Decision bool has_chroma = state->encoder_control->chroma_format != UVG_CSP_400; - if (cfg->early_skip && cur_pu->part_size == SIZE_2Nx2N) { + if (cfg->early_skip) { for (int merge_key = 0; merge_key < num_rdo_cands; ++merge_key) { if(cfg->rdo >= 2 && 
merge->unit[merge->keys[merge_key]].skipped) { merge->size = 1; @@ -1813,6 +1797,8 @@ static void search_pu_inter(encoder_state_t * const state, merge->keys[0] = 0; } else if(cfg->rdo < 2) { + + const uint8_t depth = 6 - uvg_g_convert_to_log2[cu_loc->width]; // Reconstruct blocks with merge candidate. // Check luma CBF. Then, check chroma CBFs if luma CBF is not set // and chroma exists. @@ -1825,23 +1811,20 @@ static void search_pu_inter(encoder_state_t * const state, cur_pu->inter.mv[0][1] = info->merge_cand[merge_idx].mv[0][1]; cur_pu->inter.mv[1][0] = info->merge_cand[merge_idx].mv[1][0]; cur_pu->inter.mv[1][1] = info->merge_cand[merge_idx].mv[1][1]; - uvg_lcu_fill_trdepth(lcu, x, y, depth, MAX(1, depth), UVG_BOTH_T); - uvg_inter_recon_cu(state, lcu, x, y, width, true, false); + uvg_lcu_fill_trdepth(lcu, cu_loc->x, cu_loc->y, depth, MAX(1, depth), UVG_BOTH_T); + uvg_inter_recon_cu(state, lcu, true, false, cu_loc); - cu_loc_t loc; - uvg_cu_loc_ctor(&loc, x, y, width_cu, height_cu); - - uvg_quantize_lcu_residual(state, true, false, false, &loc, depth, cur_pu, lcu, true, UVG_BOTH_T); + uvg_quantize_lcu_residual(state, true, false, false, cu_loc, depth, cur_pu, lcu, true, UVG_BOTH_T); if (cbf_is_set(cur_pu->cbf, depth, COLOR_Y)) { continue; } else if (has_chroma) { - uvg_inter_recon_cu(state, lcu, x, y, width, false, has_chroma); + uvg_inter_recon_cu(state, lcu, false, has_chroma, cu_loc); uvg_quantize_lcu_residual(state, false, has_chroma, false, /*we are only checking for lack of coeffs so no need to check jccr*/ - &loc, depth, cur_pu, lcu, + cu_loc, depth, cur_pu, lcu, true, UVG_BOTH_T); if (!cbf_is_set_any(cur_pu->cbf, depth)) { @@ -1876,7 +1859,7 @@ static void search_pu_inter(encoder_state_t * const state, info->ref_idx = ref_idx; info->ref = state->frame->ref->images[ref_idx]; - search_pu_inter_ref(info, depth, lcu, cur_pu, amvp); + search_pu_inter_ref(info, lcu, cur_pu, amvp); } assert(amvp[0].size <= MAX_UNIT_STATS_MAP_SIZE); @@ -1941,14 +1924,11 @@ static 
void search_pu_inter(encoder_state_t * const state, info->ref = ref->images[info->ref_idx]; uvg_inter_get_mv_cand(info->state, - info->origin.x, - info->origin.y, - info->width, - info->height, - info->mv_cand, - unipred_pu, - lcu, - list); + info->mv_cand, + unipred_pu, + lcu, + list, + cu_loc); double frac_cost = MAX_DOUBLE; double frac_bits = MAX_INT; @@ -1969,8 +1949,8 @@ static void search_pu_inter(encoder_state_t * const state, unipred_pu->inter.mv[list][1] = frac_mv.y; CU_SET_MV_CAND(unipred_pu, list, cu_mv_cand); - if (state->encoder_control->cfg.rdo >= 2 && cur_pu->part_size == SIZE_2Nx2N) { - uvg_cu_cost_inter_rd2(state, x, y, depth, unipred_pu, lcu, &frac_cost, &frac_bits); + if (state->encoder_control->cfg.rdo >= 2) { + uvg_cu_cost_inter_rd2(state, unipred_pu, lcu, &frac_cost, &frac_bits, cu_loc); } amvp[list].cost[key] = frac_cost; @@ -1992,15 +1972,15 @@ static void search_pu_inter(encoder_state_t * const state, amvp[list].size = n_best; } - if (state->encoder_control->cfg.rdo >= 2 && cur_pu->part_size == SIZE_2Nx2N && cfg->fme_level == 0) { - if (amvp[0].size) uvg_cu_cost_inter_rd2(state, x, y, depth, &amvp[0].unit[best_keys[0]], lcu, &amvp[0].cost[best_keys[0]], &amvp[0].bits[best_keys[0]]); - if (amvp[1].size) uvg_cu_cost_inter_rd2(state, x, y, depth, &amvp[1].unit[best_keys[1]], lcu, &amvp[1].cost[best_keys[1]], &amvp[1].bits[best_keys[1]]); + if (state->encoder_control->cfg.rdo >= 2 && cfg->fme_level == 0) { + if (amvp[0].size) uvg_cu_cost_inter_rd2(state, &amvp[0].unit[best_keys[0]], lcu, &amvp[0].cost[best_keys[0]], &amvp[0].bits[best_keys[0]], cu_loc); + if (amvp[1].size) uvg_cu_cost_inter_rd2(state, &amvp[1].unit[best_keys[1]], lcu, &amvp[1].cost[best_keys[1]], &amvp[1].bits[best_keys[1]], cu_loc); } // Search bi-pred positions bool can_use_bipred = state->frame->slicetype == UVG_SLICE_B && cfg->bipred - && width + height >= 16; // 4x8 and 8x4 PBs are restricted to unipred + && cu_loc->width + cu_loc->height >= 16; // 4x8 and 8x4 PBs are 
restricted to unipred if (can_use_bipred) { @@ -2031,25 +2011,23 @@ static void search_pu_inter(encoder_state_t * const state, bipred_pu->skipped = false; for (int reflist = 0; reflist < 2; reflist++) { - uvg_inter_get_mv_cand(info->state, x, y, width, height, info->mv_cand, bipred_pu, lcu, reflist); + uvg_inter_get_mv_cand(info->state, info->mv_cand, bipred_pu, lcu, reflist, cu_loc); } uvg_inter_recon_bipred(info->state, - ref->images[ref_LX[0][bipred_pu->inter.mv_ref[0]]], - ref->images[ref_LX[1][bipred_pu->inter.mv_ref[1]]], - x, y, - width, - height, - mv, - lcu, - true, - false); + ref->images[ref_LX[0][bipred_pu->inter.mv_ref[0]]], + ref->images[ref_LX[1][bipred_pu->inter.mv_ref[1]]], + mv, lcu, + true, + false, + cu_loc + ); - const uvg_pixel *rec = &lcu->rec.y[SUB_SCU(y) * LCU_WIDTH + SUB_SCU(x)]; - const uvg_pixel *src = &lcu->ref.y[SUB_SCU(y) * LCU_WIDTH + SUB_SCU(x)]; + const uvg_pixel *rec = &lcu->rec.y[SUB_SCU(cu_loc->y) * LCU_WIDTH + SUB_SCU(cu_loc->x)]; + const uvg_pixel *src = &lcu->ref.y[SUB_SCU(cu_loc->y) * LCU_WIDTH + SUB_SCU(cu_loc->x)]; best_bipred_cost = - uvg_satd_any_size(width, height, rec, LCU_WIDTH, src, LCU_WIDTH); + uvg_satd_any_size(cu_loc->width, cu_loc->height, rec, LCU_WIDTH, src, LCU_WIDTH); double bitcost[2] = { 0, 0 }; @@ -2096,17 +2074,17 @@ static void search_pu_inter(encoder_state_t * const state, } // TODO: this probably should have a separate command line option - if (cfg->rdo >= 3) search_pu_inter_bipred(info, depth, lcu, &amvp[2]); + if (cfg->rdo >= 3) search_pu_inter_bipred(info, lcu, &amvp[2]); assert(amvp[2].size <= MAX_UNIT_STATS_MAP_SIZE); uvg_sort_keys_by_cost(&amvp[2]); - if (amvp[2].size > 0 && state->encoder_control->cfg.rdo >= 2 && cur_pu->part_size == SIZE_2Nx2N) { - uvg_cu_cost_inter_rd2(state, x, y, depth, &amvp[2].unit[amvp[2].keys[0]], lcu, &amvp[2].cost[amvp[2].keys[0]], &amvp[2].bits[amvp[2].keys[0]]); + if (amvp[2].size > 0 && state->encoder_control->cfg.rdo >= 2) { + uvg_cu_cost_inter_rd2(state, 
&amvp[2].unit[amvp[2].keys[0]], lcu, &amvp[2].cost[amvp[2].keys[0]], &amvp[2].bits[amvp[2].keys[0]], cu_loc); } } if(cfg->rdo < 2) { int predmode_ctx; - const int skip_contest = uvg_get_skip_context(x, y, lcu, NULL, &predmode_ctx); + const int skip_contest = uvg_get_skip_context(cu_loc->x, cu_loc->y, lcu, NULL, &predmode_ctx); const double no_skip_flag = CTX_ENTROPY_FBITS(&state->search_cabac.ctx.cu_skip_flag_model[skip_contest], 0); const double pred_mode_bits = CTX_ENTROPY_FBITS(&state->search_cabac.ctx.cu_pred_mode_model[predmode_ctx], 0); @@ -2140,25 +2118,23 @@ static void search_pu_inter(encoder_state_t * const state, * \param inter_cost Return inter cost * \param inter_bitcost Return inter bitcost */ -void uvg_cu_cost_inter_rd2(encoder_state_t * const state, - int x, int y, int depth, - cu_info_t* cur_cu, - lcu_t *lcu, - double *inter_cost, - double* inter_bitcost){ - - int tr_depth = MAX(1, depth); - if (cur_cu->part_size != SIZE_2Nx2N) { - tr_depth = depth + 1; - } - uvg_lcu_fill_trdepth(lcu, x, y, depth, tr_depth, UVG_BOTH_T); +void uvg_cu_cost_inter_rd2( + encoder_state_t * const state, + cu_info_t* cur_cu, + lcu_t *lcu, + double *inter_cost, + double* inter_bitcost, + const cu_loc_t* const cu_loc){ - const int x_px = SUB_SCU(x); - const int y_px = SUB_SCU(y); + const uint8_t depth = 6 - uvg_g_convert_to_log2[cu_loc->width]; + int tr_depth = MAX(1, depth); + + uvg_lcu_fill_trdepth(lcu, cu_loc->x, cu_loc->y, depth, tr_depth, UVG_BOTH_T); + + const int x_px = SUB_SCU(cu_loc->x); + const int y_px = SUB_SCU(cu_loc->y); const int width = LCU_WIDTH >> depth; const int height = width; // TODO: non-square blocks - cu_loc_t loc; - uvg_cu_loc_ctor(&loc, x, y, width, height); cabac_data_t cabac_copy; memcpy(&cabac_copy, &state->search_cabac, sizeof(cabac_copy)); @@ -2169,7 +2145,7 @@ void uvg_cu_cost_inter_rd2(encoder_state_t * const state, *cur_pu = *cur_cu; const bool reconstruct_chroma = state->encoder_control->chroma_format != UVG_CSP_400; - 
uvg_inter_recon_cu(state, lcu, x, y, CU_WIDTH_FROM_DEPTH(depth), true, reconstruct_chroma); + uvg_inter_recon_cu(state, lcu, true, reconstruct_chroma, cu_loc); int index = y_px * LCU_WIDTH + x_px; double ssd = uvg_pixels_calc_ssd(&lcu->ref.y[index], &lcu->rec.y[index], @@ -2187,13 +2163,13 @@ void uvg_cu_cost_inter_rd2(encoder_state_t * const state, } double no_cbf_bits; double bits = 0; - const int skip_context = uvg_get_skip_context(x, y, lcu, NULL, NULL); - if (cur_cu->merged && cur_cu->part_size == SIZE_2Nx2N) { + const int skip_context = uvg_get_skip_context(cu_loc->x, cu_loc->y, lcu, NULL, NULL); + if (cur_cu->merged) { no_cbf_bits = CTX_ENTROPY_FBITS(&state->cabac.ctx.cu_skip_flag_model[skip_context], 1) + *inter_bitcost; - bits += uvg_mock_encode_coding_unit(state, cabac, x, y, depth, lcu, cur_cu, UVG_BOTH_T); + bits += uvg_mock_encode_coding_unit(state, cabac, cu_loc, lcu, cur_cu, UVG_BOTH_T); } else { - no_cbf_bits = uvg_mock_encode_coding_unit(state, cabac, x, y, depth, lcu, cur_cu, UVG_BOTH_T); + no_cbf_bits = uvg_mock_encode_coding_unit(state, cabac, cu_loc, lcu, cur_cu, UVG_BOTH_T); bits += no_cbf_bits - CTX_ENTROPY_FBITS(&cabac->ctx.cu_qt_root_cbf_model, 0) + CTX_ENTROPY_FBITS(&cabac->ctx.cu_qt_root_cbf_model, 1); } double no_cbf_cost = ssd + no_cbf_bits * state->lambda; @@ -2207,7 +2183,8 @@ void uvg_cu_cost_inter_rd2(encoder_state_t * const state, uvg_quantize_lcu_residual(state, true, false, - false, &loc, + false, + cu_loc, depth, cur_cu, lcu, @@ -2243,7 +2220,7 @@ void uvg_cu_cost_inter_rd2(encoder_state_t * const state, depth, lcu, &cabac_copy, - &loc, + cu_loc, index, 0, cur_cu, @@ -2274,7 +2251,7 @@ void uvg_cu_cost_inter_rd2(encoder_state_t * const state, uvg_quantize_lcu_residual(state, true, reconstruct_chroma, reconstruct_chroma && state->encoder_control->cfg.jccr, - &loc, + cu_loc, depth, cur_cu, lcu, @@ -2308,7 +2285,7 @@ void uvg_cu_cost_inter_rd2(encoder_state_t * const state, if(no_cbf_cost < *inter_cost) { cur_cu->cbf = 0; - if 
(cur_cu->merged && cur_cu->part_size == SIZE_2Nx2N) { + if (cur_cu->merged) { cur_cu->skipped = 1; } *inter_cost = no_cbf_cost; @@ -2332,11 +2309,12 @@ void uvg_cu_cost_inter_rd2(encoder_state_t * const state, * \param inter_cost Return inter cost * \param inter_bitcost Return inter bitcost */ -void uvg_search_cu_inter(encoder_state_t * const state, - int x, int y, int depth, - lcu_t *lcu, - double *inter_cost, - double* inter_bitcost) +void uvg_search_cu_inter( + encoder_state_t * const state, + const cu_loc_t* const cu_loc, + lcu_t *lcu, + double *inter_cost, + double* inter_bitcost) { *inter_cost = MAX_DOUBLE; *inter_bitcost = MAX_INT; @@ -2349,12 +2327,8 @@ void uvg_search_cu_inter(encoder_state_t * const state, inter_search_info_t info; search_pu_inter(state, - x, y, depth, - SIZE_2Nx2N, 0, - lcu, - amvp, - &merge, - &info); + cu_loc, lcu, amvp, + &merge, &info); // Early Skip CU decision if (merge.size == 1 && merge.unit[0].skipped) { @@ -2396,13 +2370,14 @@ void uvg_search_cu_inter(encoder_state_t * const state, return; } - const int x_local = SUB_SCU(x); - const int y_local = SUB_SCU(y); + const int x_local = SUB_SCU(cu_loc->x); + const int y_local = SUB_SCU(cu_loc->y); cu_info_t *cur_pu = LCU_GET_CU_AT_PX(lcu, x_local, y_local); *cur_pu = *best_inter_pu; - uvg_inter_recon_cu(state, lcu, x, y, CU_WIDTH_FROM_DEPTH(depth), - true, state->encoder_control->chroma_format != UVG_CSP_400); + uvg_inter_recon_cu(state, lcu, + true, state->encoder_control->chroma_format != UVG_CSP_400, + cu_loc); if (*inter_cost < MAX_DOUBLE && cur_pu->inter.mv_dir & 1) { assert(fracmv_within_tile(&info, cur_pu->inter.mv[0][0], cur_pu->inter.mv[0][1])); diff --git a/src/search_inter.h b/src/search_inter.h index d76dd927..cdabd15a 100644 --- a/src/search_inter.h +++ b/src/search_inter.h @@ -73,11 +73,12 @@ typedef double uvg_mvd_cost_func(const encoder_state_t *state, int32_t ref_idx, double *bitcost); -void uvg_search_cu_inter(encoder_state_t * const state, - int x, int y, int depth, 
- lcu_t *lcu, - double *inter_cost, - double* inter_bitcost); +void uvg_search_cu_inter( + encoder_state_t * const state, + const cu_loc_t* const cu_loc, + lcu_t *lcu, + double *inter_cost, + double* inter_bitcost); @@ -85,12 +86,13 @@ unsigned uvg_inter_satd_cost(const encoder_state_t* state, const lcu_t *lcu, int x, int y); -void uvg_cu_cost_inter_rd2(encoder_state_t* const state, - int x, int y, int depth, +void uvg_cu_cost_inter_rd2( + encoder_state_t* const state, cu_info_t* cur_cu, lcu_t* lcu, double* inter_cost, - double* inter_bitcost); + double* inter_bitcost, + const cu_loc_t* const cu_loc); int uvg_get_skip_context(int x, int y, lcu_t* const lcu, cu_array_t* const cu_a, int* predmode_ctx); diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 2783454d..1d3c117f 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -294,13 +294,6 @@ static void uvg_angular_pred_avx2( f[yy][2] = 16 + offset; f[yy][3] = offset; } - // Cubic must be used if ref line != 0 or if isp mode != 0 - if (multi_ref_index || isp) { - use_cubic = true; - } - const int16_t filter_coeff[4] = { 16 - (delta_fract[yy] >> 1), 32 - (delta_fract[yy] >> 1), 16 + (delta_fract[yy] >> 1), delta_fract[yy] >> 1 }; - const int16_t *temp_f = use_cubic ? cubic_filter[delta_fract[yy]] : filter_coeff; - memcpy(f[yy], temp_f, 4 * sizeof(*temp_f)); } // Do 4-tap intra interpolation filtering diff --git a/src/strategies/avx2/quant-avx2.c b/src/strategies/avx2/quant-avx2.c index b6d062b0..bc70daab 100644 --- a/src/strategies/avx2/quant-avx2.c +++ b/src/strategies/avx2/quant-avx2.c @@ -708,7 +708,6 @@ int uvg_quantize_residual_avx2(encoder_state_t *const state, (width > 4 || !state->encoder_control->cfg.rdoq_skip) && !use_trskip) { int8_t tr_depth = cur_cu->tr_depth - cur_cu->depth; - tr_depth += (cur_cu->part_size == SIZE_NxN ? 
1 : 0); uvg_rdoq(state, coeff, coeff_out, width, height, color, scan_order, cur_cu->type, tr_depth, cur_cu->cbf, lfnst_index); } diff --git a/src/strategies/generic/quant-generic.c b/src/strategies/generic/quant-generic.c index 4215fc81..be396a8b 100644 --- a/src/strategies/generic/quant-generic.c +++ b/src/strategies/generic/quant-generic.c @@ -316,7 +316,6 @@ int uvg_quant_cbcr_residual_generic( (width > 4 || !state->encoder_control->cfg.rdoq_skip)) { int8_t tr_depth = cur_cu->tr_depth - cur_cu->depth; - tr_depth += (cur_cu->part_size == SIZE_NxN ? 1 : 0); uvg_rdoq(state, coeff, coeff_out, width, width, cur_cu->joint_cb_cr == 1 ? COLOR_V : COLOR_U, scan_order, cur_cu->type, tr_depth, cur_cu->cbf, cur_cu->cr_lfnst_idx); @@ -499,7 +498,6 @@ int uvg_quantize_residual_generic(encoder_state_t *const state, (width > 4 || !state->encoder_control->cfg.rdoq_skip) && !use_trskip) { int8_t tr_depth = cur_cu->tr_depth - cur_cu->depth; - tr_depth += (cur_cu->part_size == SIZE_NxN ? 1 : 0); uvg_rdoq(state, coeff, coeff_out, width, height, color, scan_order, cur_cu->type, tr_depth, cur_cu->cbf, lfnst_index); diff --git a/src/transform.c b/src/transform.c index b260eea1..a497003b 100644 --- a/src/transform.c +++ b/src/transform.c @@ -490,7 +490,7 @@ void uvg_chroma_transform_search( int depth, lcu_t* const lcu, cabac_data_t* temp_cabac, - cu_loc_t *cu_loc, + const cu_loc_t* const cu_loc, const int offset, const uint8_t mode, cu_info_t* pred_cu, diff --git a/src/transform.h b/src/transform.h index 6fdef411..a7427ea0 100644 --- a/src/transform.h +++ b/src/transform.h @@ -108,7 +108,7 @@ void uvg_chroma_transform_search( int depth, lcu_t* const lcu, cabac_data_t* temp_cabac, - cu_loc_t *cu_loc, + const cu_loc_t* const cu_loc, const int offset, const uint8_t mode, cu_info_t* pred_cu, diff --git a/tests/mv_cand_tests.c b/tests/mv_cand_tests.c index 84ab9328..849fec2d 100644 --- a/tests/mv_cand_tests.c +++ b/tests/mv_cand_tests.c @@ -46,8 +46,11 @@ TEST 
test_get_spatial_merge_cand(void) merge_candidates_t cand = { 0 }; - get_spatial_merge_candidates(64 + 32, 64, // x, y - 32, 24, // width, height + cu_loc_t cu_loc; + uvg_cu_loc_ctor(&cu_loc, 64 + 32, 64, // x, y + 32, 24); // width, height) + + get_spatial_merge_candidates(&cu_loc, 1920, 1080, // picture size &lcu, &cand,