From ba0d43d846f6d1b90aa56696878f0f1b14807bcc Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Wed, 7 Dec 2022 14:56:40 +0200 Subject: [PATCH] [mtt] Fill chroma data for the whole area covered by the local separate tree chroma cu --- src/cu.h | 2 +- src/encode_coding_tree.c | 6 +- src/intra.c | 15 +++- src/search.c | 145 ++++++++++++++++++++++++++------------- src/search_intra.c | 5 +- src/transform.c | 4 +- 6 files changed, 116 insertions(+), 61 deletions(-) diff --git a/src/cu.h b/src/cu.h index 11325719..751a483c 100644 --- a/src/cu.h +++ b/src/cu.h @@ -150,7 +150,7 @@ typedef struct uint8_t mts_last_scan_pos : 1; uint8_t violates_lfnst_constrained_luma : 1; - uint8_t violates_lfnst_constrained_chroma : 1; + uint8_t violates_lfnst_constrained_chroma; uint8_t lfnst_last_scan_pos : 1; uint8_t lfnst_idx : 2; uint8_t cr_lfnst_idx : 2; diff --git a/src/encode_coding_tree.c b/src/encode_coding_tree.c index e5981eca..c3400524 100644 --- a/src/encode_coding_tree.c +++ b/src/encode_coding_tree.c @@ -1438,7 +1438,7 @@ void uvg_encode_coding_tree( DBG_YUVIEW_VALUE(state->frame->poc, DBG_YUVIEW_CU_TYPE, abs_x, abs_y, cu_width, cu_height, cur_cu->type-1); - //fprintf(stderr, "%4d %4d %2d %2d %d\n", x, y, cu_width, cu_height, has_chroma); + //fprintf(stderr, "%4d %4d %2d %2d %d %d\n", x, y, cu_width, cu_height, has_chroma, cur_cu->split_tree); if (ctrl->cfg.lossless) { cabac->cur_ctx = &cabac->ctx.cu_transquant_bypass; @@ -1668,11 +1668,11 @@ void uvg_encode_coding_tree( int8_t luma_dir = uvg_get_co_located_luma_mode(tree_type != UVG_CHROMA_T ? chroma_loc : cu_loc, cu_loc, cur_cu, NULL, frame->cu_array, UVG_CHROMA_T); encode_chroma_intra_cu(cabac, cur_cu, state->encoder_control->cfg.cclm && uvg_cclm_is_allowed(state, cu_loc, cur_cu, tree_type), luma_dir,NULL); // LFNST constraints must be reset here. 
Otherwise the left over values will interfere when calculating new constraints - cu_info_t* tmp = (cu_info_t*)cur_cu; + cu_info_t* tmp = uvg_cu_array_at((cu_array_t *)used_array, chroma_loc->x, chroma_loc->y); tmp->violates_lfnst_constrained_luma = false; tmp->violates_lfnst_constrained_chroma = false; tmp->lfnst_last_scan_pos = false; - encode_transform_coeff(state, chroma_loc, 1, coeff, cur_cu, tree_type, true, false, &luma_cbf_ctx, chroma_loc, chroma_loc); + encode_transform_coeff(state, chroma_loc, 1, coeff, NULL, tree_type, true, false, &luma_cbf_ctx, chroma_loc, chroma_loc); // Write LFNST only once for single tree structure encode_lfnst_idx(state, cabac, tmp, is_local_dual_tree ? UVG_CHROMA_T : tree_type, COLOR_UV, chroma_loc); } diff --git a/src/intra.c b/src/intra.c index 429254c1..e39878df 100644 --- a/src/intra.c +++ b/src/intra.c @@ -1884,7 +1884,14 @@ void uvg_intra_recon_cu( bool recon_chroma) { const uint8_t depth = 6 - uvg_g_convert_to_log2[cu_loc->width]; - const vector2d_t lcu_px = { cu_loc->local_x >> (tree_type == UVG_CHROMA_T), cu_loc->local_y >> (tree_type == UVG_CHROMA_T) }; + const vector2d_t lcu_px = { + cu_loc->local_x >> + (tree_type == UVG_CHROMA_T && state->encoder_control->cfg.dual_tree && + state->frame->slicetype == UVG_SLICE_I), + cu_loc->local_y >> + (tree_type == UVG_CHROMA_T && state->encoder_control->cfg.dual_tree && + state->frame->slicetype == UVG_SLICE_I), + }; const int8_t width = cu_loc->width; const int8_t height = cu_loc->height; if (cur_cu == NULL) { @@ -1917,7 +1924,11 @@ void uvg_intra_recon_cu( cu_loc_t split_cu_loc[4]; const int split_count = uvg_get_split_locs(cu_loc, split, split_cu_loc,NULL); for (int i = 0; i < split_count; ++i) { - uvg_intra_recon_cu(state, search_data, &split_cu_loc[i], NULL, lcu, tree_type, recon_luma, recon_chroma); + uvg_intra_recon_cu( + state, search_data, &split_cu_loc[i], + NULL, lcu, + state->encoder_control->cfg.dual_tree && state->frame->slicetype == UVG_SLICE_I ? 
tree_type : UVG_BOTH_T, + recon_luma, recon_chroma); } return; diff --git a/src/search.c b/src/search.c index c23540c9..5f141d4d 100644 --- a/src/search.c +++ b/src/search.c @@ -134,7 +134,15 @@ static INLINE void initialize_partial_work_tree(lcu_t* from, lcu_t *to, const cu const int offset = chroma_loc->local_x / 2 + chroma_loc->local_y / 2 * LCU_WIDTH_C; uvg_pixels_blit(&from->ref.u[offset], &to->ref.u[offset], chroma_loc->chroma_width, chroma_loc->chroma_height, LCU_WIDTH_C, LCU_WIDTH_C); uvg_pixels_blit(&from->ref.v[offset], &to->ref.v[offset], chroma_loc->chroma_width, chroma_loc->chroma_height, LCU_WIDTH_C, LCU_WIDTH_C); - } + } + if((chroma_loc->local_y != cu_loc->local_y || chroma_loc->local_x != cu_loc->local_x) && tree_type == UVG_BOTH_T) { /* grouped explicitly: '&&' binds tighter than '||', so the tree_type guard must wrap both coordinate checks */ + for (int y = chroma_loc->local_y; y < chroma_loc->local_y + chroma_loc->height; y += SCU_WIDTH) { + for (int x = chroma_loc->local_x; x < chroma_loc->local_x + chroma_loc->width; x += SCU_WIDTH) { + memset(LCU_GET_CU_AT_PX(to, x, y), 0, sizeof(cu_info_t)); + } + } + + } const int y_start = (cu_loc->local_y >> (tree_type == UVG_CHROMA_T)) - 4; const int x_start = (cu_loc->local_x >> (tree_type == UVG_CHROMA_T)) - 4; @@ -217,6 +225,8 @@ static INLINE void copy_cu_coeffs(const cu_loc_t *cu_loc, lcu_t *from, lcu_t *to } } + +static void lcu_fill_chroma_cu_info(lcu_t* lcu, const cu_loc_t* const cu_loc); /** * Copy all non-reference CU data from next level to current level. 
*/ @@ -235,7 +245,20 @@ static void work_tree_copy_up( if (chroma_loc && tree_type != UVG_LUMA_T) { copy_cu_pixels(from, to, chroma_loc, UVG_CHROMA_T); copy_cu_coeffs(chroma_loc, from, to, joint, UVG_CHROMA_T); + + for (int y = chroma_loc->local_y; y < chroma_loc->local_y + chroma_loc->height; y += 4) { + for (int x = chroma_loc->local_x; x < chroma_loc->local_x + chroma_loc->width; x += 4) { + cu_info_t* to_cu = LCU_GET_CU_AT_PX(to, x, y); + cu_info_t* from_cu = LCU_GET_CU_AT_PX(from, x, y); + to_cu->intra.mode_chroma = from_cu->intra.mode_chroma; + to_cu->joint_cb_cr = from_cu->joint_cb_cr; + to_cu->cr_lfnst_idx = from_cu->cr_lfnst_idx; + cbf_copy(&to_cu->cbf, from_cu->cbf, COLOR_U); + cbf_copy(&to_cu->cbf, from_cu->cbf, COLOR_V); + } + } } + } @@ -250,6 +273,8 @@ static void lcu_fill_cu_info(lcu_t *lcu, int x_local, int y_local, int width, in to->split_tree = cu->split_tree; //to->tr_idx = cu->tr_idx; to->lfnst_idx = cu->lfnst_idx; + to->cr_lfnst_idx = cu->cr_lfnst_idx; + to->joint_cb_cr = cu->joint_cb_cr; to->lfnst_last_scan_pos = cu->lfnst_last_scan_pos; to->violates_lfnst_constrained_luma = cu->violates_lfnst_constrained_luma; to->violates_lfnst_constrained_chroma = cu->violates_lfnst_constrained_chroma; @@ -274,23 +299,42 @@ static void lcu_fill_cu_info(lcu_t *lcu, int x_local, int y_local, int width, in } } - -static void lcu_fill_cbf(lcu_t *lcu, int x_local, unsigned y_local, unsigned width, unsigned height, const cu_info_t *cur_cu) +static void lcu_fill_chroma_cu_info(lcu_t *lcu, const cu_loc_t * const cu_loc) { - const uint32_t x_mask = ~((MIN(width, TR_MAX_WIDTH))-1); - const uint32_t y_mask = ~((MIN(height, TR_MAX_WIDTH))-1); + // The bottom right cu will always have the chroma info + cu_info_t *bottom_right = LCU_GET_CU_AT_PX( + lcu, + cu_loc->local_x + cu_loc->width - 1, + cu_loc->local_y + cu_loc->height - 1); + if(bottom_right->type != CU_INTRA) return; + + for(int y = cu_loc->local_y; y < cu_loc->local_y + cu_loc->height; y += 4 ) { + for (int x = 
cu_loc->local_x; x < cu_loc->local_x + cu_loc->width; x += 4) { + cu_info_t *cu = LCU_GET_CU_AT_PX(lcu, x, y); + cu->intra.mode_chroma = bottom_right->intra.mode_chroma; + cu->joint_cb_cr = bottom_right->joint_cb_cr; + cu->cr_lfnst_idx = bottom_right->cr_lfnst_idx; + } + } +} + + + +static void lcu_fill_cbf(lcu_t *lcu, int x_local, unsigned y_local, unsigned width, unsigned height, const cu_info_t *cur_cu, enum + uvg_tree_type tree_type) +{ // Set coeff flags in every CU covered by part_mode in this depth. - for (uint32_t y = y_local; y < y_local + height; y += SCU_WIDTH) { - for (uint32_t x = x_local; x < x_local + width; x += SCU_WIDTH) { + for (uint32_t y = 0; y < height; y += SCU_WIDTH) { + for (uint32_t x = 0; x < width; x += SCU_WIDTH) { // Use TU top-left CU to propagate coeff flags - cu_info_t *cu_from = LCU_GET_CU_AT_PX(lcu, x & x_mask, y & y_mask); - cu_info_t *cu_to = LCU_GET_CU_AT_PX(lcu, x, y); + cu_info_t *cu_from = LCU_GET_CU_AT_PX(lcu, x_local + (x & ~(TR_MAX_WIDTH - 1)), y_local + (y & ~(TR_MAX_WIDTH - 1))); + cu_info_t *cu_to = LCU_GET_CU_AT_PX(lcu, x_local + x, y_local + y); if (cu_from != cu_to) { // Chroma and luma coeff data is needed for deblocking - cbf_copy(&cu_to->cbf, cu_from->cbf, COLOR_Y); - cbf_copy(&cu_to->cbf, cu_from->cbf, COLOR_U); - cbf_copy(&cu_to->cbf, cu_from->cbf, COLOR_V); + if(tree_type != UVG_CHROMA_T) cbf_copy(&cu_to->cbf, cu_from->cbf, COLOR_Y); + if(tree_type != UVG_LUMA_T) cbf_copy(&cu_to->cbf, cu_from->cbf, COLOR_U); + if (tree_type != UVG_LUMA_T)cbf_copy(&cu_to->cbf, cu_from->cbf, COLOR_V); } } } @@ -1090,7 +1134,6 @@ static double search_cu( if (can_use_intra && !skip_intra) { intra_search.pred_cu = *cur_cu; if(tree_type != UVG_CHROMA_T) { - intra_search.pred_cu.joint_cb_cr = 4; uvg_search_cu_intra(state, &intra_search, lcu, tree_type, cu_loc); } #ifdef COMPLETE_PRED_MODE_BITS @@ -1136,11 +1179,6 @@ static double search_cu( intra_search.pred_cu.intra.mode_chroma = intra_mode; if (ctrl->cfg.rdo >= 2 || ctrl->cfg.jccr 
|| ctrl->cfg.lfnst) { uvg_search_cu_intra_chroma(state, chroma_loc, lcu, &intra_search, intra_mode, tree_type, is_separate_tree); - - if (intra_search.pred_cu.joint_cb_cr == 0) { - intra_search.pred_cu.joint_cb_cr = 4; - } - } else if (!intra_search.pred_cu.intra.mip_flag) { intra_search.pred_cu.intra.mode_chroma = intra_mode; @@ -1221,16 +1259,26 @@ static double search_cu( if((!recon_chroma && state->encoder_control->chroma_format != UVG_CSP_400 && tree_type != UVG_LUMA_T) || tree_type == UVG_CHROMA_T) { intra_search.pred_cu.intra.mode_chroma = cur_cu->intra.mode_chroma; + lcu_fill_chroma_cu_info( + lcu, + chroma_loc); uvg_intra_recon_cu(state, &intra_search, chroma_loc, - cur_cu, lcu, + NULL, lcu, UVG_CHROMA_T, false, true); + lcu_fill_cbf( + lcu, + chroma_loc->local_x, + chroma_loc->local_y, + chroma_loc->width, + chroma_loc->height, + cur_cu, + UVG_CHROMA_T); } else { assert(cur_cu->cr_lfnst_idx == 0 && "If we don't have separate tree chroma lfnst index must be 0"); } - if (cur_cu->joint_cb_cr == 4) cur_cu->joint_cb_cr = 0; // Set isp split cbfs here const int split_type = intra_search.pred_cu.intra.isp_mode; @@ -1302,7 +1350,7 @@ static double search_cu( } } lcu_fill_cu_info(lcu, x_local, y_local, cu_width, cu_height, cur_cu); - lcu_fill_cbf(lcu, x_local, y_local, cu_width, cu_height, cur_cu); + lcu_fill_cbf(lcu, x_local, y_local, cu_width, cu_height, cur_cu, UVG_BOTH_T); } } @@ -1369,21 +1417,27 @@ static double search_cu( uvg_get_possible_splits(state, cu_loc, split_tree, tree_type, can_split); can_split_cu &= can_split[1] || can_split[2] || can_split[3] || can_split[4] || can_split[5]; - // Recursively split all the way to max search depth. - if (can_split_cu) { + + // If skip mode was selected for the block, skip further search. + // Skip mode means there's no coefficients in the block, so splitting + // might not give any better results but takes more time to do. 
+ // It is ok to interrupt the search as soon as it is known that + // the split costs at least as much as not splitting. + int cbf = cbf_is_set_any(cur_cu->cbf); + if (can_split_cu && (cur_cu->type == CU_NOTSET || cbf || state->encoder_control->cfg.cu_split_termination == UVG_CU_SPLIT_TERMINATION_OFF)) { lcu_t * split_lcu = MALLOC(lcu_t, 5); enum split_type best_split = 0; double best_split_cost = MAX_DOUBLE; cabac_data_t post_seach_cabac; cabac_data_t best_split_cabac; memcpy(&post_seach_cabac, &state->search_cabac, sizeof(post_seach_cabac)); + // Recursively split all the way to max search depth. for (int split_type = QT_SPLIT; split_type <= TT_VER_SPLIT; ++split_type) { if (!can_split[split_type] || (tree_type == UVG_CHROMA_T && split_type == TT_HOR_SPLIT && cu_loc->chroma_height == 8) || (tree_type == UVG_CHROMA_T && split_type == BT_HOR_SPLIT && cu_loc->chroma_height == 4)) continue; double split_cost = 0.0; - int cbf = cbf_is_set_any(cur_cu->cbf); memcpy(&state->search_cabac, &pre_search_cabac, sizeof(post_seach_cabac)); @@ -1438,32 +1492,24 @@ static double search_cu( state->search_cabac.update = 0; split_cost += split_bits * state->lambda; - // If skip mode was selected for the block, skip further search. - // Skip mode means there's no coefficients in the block, so splitting - // might not give any better results but takes more time to do. - // It is ok to interrupt the search as soon as it is known that - // the split costs at least as much as not splitting. - if (cur_cu->type == CU_NOTSET || cbf || state->encoder_control->cfg.cu_split_termination == UVG_CU_SPLIT_TERMINATION_OFF) { - cu_loc_t new_cu_loc[4]; - uint8_t separate_chroma = 0; - const int splits = uvg_get_split_locs(cu_loc, split_type, new_cu_loc, &separate_chroma); - initialize_partial_work_tree(lcu, &split_lcu[split_type - 1], cu_loc, separate_chroma ? 
chroma_loc : cu_loc , tree_type); - for (int split = 0; split < splits; ++split) { - new_split.part_index = split; - split_cost += search_cu(state, - &new_cu_loc[split], separate_chroma ? chroma_loc : &new_cu_loc[split], - &split_lcu[split_type -1], - tree_type, new_split, - !separate_chroma || (split == splits - 1 && has_chroma)); - // If there is no separate chroma the block will always have chroma, otherwise it is the last block of the split that has the chroma - if (split_cost > cost || split_cost > best_split_cost) { - break; - } + cu_loc_t new_cu_loc[4]; + uint8_t separate_chroma = 0; + const int splits = uvg_get_split_locs(cu_loc, split_type, new_cu_loc, &separate_chroma); + initialize_partial_work_tree(lcu, &split_lcu[split_type - 1], cu_loc, separate_chroma ? chroma_loc : cu_loc , tree_type); + for (int split = 0; split < splits; ++split) { + new_split.part_index = split; + split_cost += search_cu(state, + &new_cu_loc[split], separate_chroma ? chroma_loc : &new_cu_loc[split], + &split_lcu[split_type -1], + tree_type, new_split, + !separate_chroma || (split == splits - 1 && has_chroma)); + // If there is no separate chroma the block will always have chroma, otherwise it is the last block of the split that has the chroma + if (split_cost > cost || split_cost > best_split_cost) { + break; } - - } else { - split_cost = INT_MAX; } + + if (split_cost < best_split_cost) { best_split_cost = split_cost; best_split = split_type; @@ -1492,9 +1538,10 @@ static double search_cu( memcpy(&state->search_cabac, &pre_search_cabac, sizeof(pre_search_cabac)); cost = 0; double bits = 0; + bool is_implicit = false; uvg_write_split_flag(state, &state->search_cabac, x > 0 ? LCU_GET_CU_AT_PX(lcu, SUB_SCU(x) - 1, SUB_SCU(y)) : NULL, - y > 0 ? LCU_GET_CU_AT_PX(lcu, SUB_SCU(x), SUB_SCU(y) - 1) : NULL, cu_loc, split_tree, tree_type, NULL, + y > 0 ? 
LCU_GET_CU_AT_PX(lcu, SUB_SCU(x), SUB_SCU(y) - 1) : NULL, cu_loc, split_tree, tree_type, &is_implicit, &bits); cur_cu->intra = cu_d1->intra; diff --git a/src/search_intra.c b/src/search_intra.c index 557dff4e..30110927 100644 --- a/src/search_intra.c +++ b/src/search_intra.c @@ -338,7 +338,6 @@ static double search_intra_trdepth( num_transforms = MAX(num_transforms, 2); } pred_cu->intra.mode_chroma = -1; - pred_cu->joint_cb_cr = 4; const int max_tb_size = TR_MAX_WIDTH; // LFNST search params @@ -489,7 +488,6 @@ static double search_intra_trdepth( if (reconstruct_chroma) { int8_t luma_mode = pred_cu->intra.mode; pred_cu->intra.mode_chroma = chroma_mode; - pred_cu->joint_cb_cr = 4; // TODO: Maybe check the jccr mode here also but holy shit is the interface of search_intra_rdo bad currently uvg_intra_recon_cu( state, @@ -544,7 +542,6 @@ static double search_intra_trdepth( if(reconstruct_chroma) { int8_t luma_mode = pred_cu->intra.mode; pred_cu->intra.mode_chroma = chroma_mode; - pred_cu->joint_cb_cr= 4; // TODO: Maybe check the jccr mode here also but holy shit is the interface of search_intra_rdo bad currently uvg_intra_recon_cu(state, search_data, cu_loc, pred_cu, lcu, @@ -1623,7 +1620,7 @@ int8_t uvg_search_cu_intra_chroma( chroma_data[i].pred_cu = *cur_pu; chroma_data[i].pred_cu.intra.mode_chroma = num_modes == 1 ? luma_mode : modes[i]; chroma_data[i].cost = 0; - if(cu_loc->width != 4 && tree_type == UVG_BOTH_T) { + if(!is_separate && tree_type == UVG_BOTH_T) { memcpy(chroma_data[i].lfnst_costs, search_data->lfnst_costs, sizeof(double) * 3); } } diff --git a/src/transform.c b/src/transform.c index 4d953454..34e246ce 100644 --- a/src/transform.c +++ b/src/transform.c @@ -863,7 +863,7 @@ void uvg_fwd_lfnst( const uint32_t log2_width = uvg_g_convert_to_log2[width]; const uint32_t log2_height = uvg_g_convert_to_log2[height]; int8_t intra_mode = (color == COLOR_Y) ? 
cur_cu->intra.mode : cur_cu->intra.mode_chroma; - bool mts_skip = cur_cu->tr_idx == MTS_SKIP; + bool mts_skip = cur_cu->tr_idx == MTS_SKIP && color == COLOR_Y; // This check is safe for 8x16 cus split with TT, since it is checking the dimensions of the // last luma CU which will be 8x4, i.e., 3 + 2 < 6 bool is_separate_tree = cur_cu->log2_height + cur_cu->log2_width < 6 || tree_type != UVG_BOTH_T; @@ -1005,7 +1005,7 @@ void uvg_inv_lfnst( const uint32_t log2_width = uvg_g_convert_to_log2[width]; const uint32_t log2_height = uvg_g_convert_to_log2[height]; int8_t intra_mode = (color == COLOR_Y) ? cur_cu->intra.mode : cur_cu->intra.mode_chroma; - bool mts_skip = cur_cu->tr_idx == MTS_SKIP; + bool mts_skip = cur_cu->tr_idx == MTS_SKIP && color == COLOR_Y; bool is_separate_tree = cur_cu->log2_height + cur_cu->log2_width < 6 || tree_type != UVG_BOTH_T; bool is_cclm_mode = (intra_mode >= 81 && intra_mode <= 83); // CCLM modes are in [81, 83]