From b413aa5c438022c93ca9d8445ce99f407d31f648 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Wed, 20 Apr 2022 08:12:42 +0300 Subject: [PATCH] Improve jccr search --- src/encode_coding_tree.c | 4 +- src/encoder_state-bitstream.c | 2 +- src/encoderstate.c | 33 ++++++++++++ src/encoderstate.h | 1 + src/search.c | 25 +++++++--- src/search_intra.c | 1 + src/strategies/generic/quant-generic.c | 69 +++++++++++++------------- 7 files changed, 90 insertions(+), 45 deletions(-) diff --git a/src/encode_coding_tree.c b/src/encode_coding_tree.c index e6f39926..4884e3ba 100644 --- a/src/encode_coding_tree.c +++ b/src/encode_coding_tree.c @@ -507,8 +507,8 @@ static void encode_transform_coeff(encoder_state_t * const state, const int cb_flag_y = cbf_is_set(cur_pu->cbf, depth, COLOR_Y); - const int cb_flag_u = cur_pu->joint_cb_cr ? cur_pu->joint_cb_cr & 1 : cbf_is_set(cur_cu->cbf, depth, COLOR_U); - const int cb_flag_v = cur_pu->joint_cb_cr ? ((cur_pu->joint_cb_cr & 2) >> 1) : cbf_is_set(cur_cu->cbf, depth, COLOR_V); + const int cb_flag_u = cur_pu->joint_cb_cr ? (cur_pu->joint_cb_cr >> 1) & 1 : cbf_is_set(cur_cu->cbf, depth, COLOR_U); + const int cb_flag_v = cur_pu->joint_cb_cr ? cur_pu->joint_cb_cr & 1 : cbf_is_set(cur_cu->cbf, depth, COLOR_V); // The split_transform_flag is not signaled when: // - transform size is greater than 32 (depth == 0) diff --git a/src/encoder_state-bitstream.c b/src/encoder_state-bitstream.c index 2f24894e..3c0b6b15 100644 --- a/src/encoder_state-bitstream.c +++ b/src/encoder_state-bitstream.c @@ -1125,7 +1125,7 @@ static void kvz_encoder_state_write_bitstream_picture_header( } if (encoder->cfg.jccr) { - WRITE_U(stream, 0, 1, "ph_joint_cbcr_sign_flag"); + WRITE_U(stream, state->frame->jccr_sign, 1, "ph_joint_cbcr_sign_flag"); } // END PICTURE HEADER diff --git a/src/encoderstate.c b/src/encoderstate.c index db5b93f3..05be79ea 100644 --- a/src/encoderstate.c +++ b/src/encoderstate.c @@ -634,6 +634,38 @@ static void set_cu_qps(encoder_state_t *state, int x, int y, int depth, int *las } } + +static void set_joint_cb_cr_modes(encoder_state_t* state, kvz_picture* pic) +{ + bool sgnFlag = true; + + if (state->encoder_control->chroma_format != KVZ_CSP_400) + { + const int x1 = pic->width / 2 - 1; + const int y1 = pic->height / 2 - 1; + const int cbs = pic->stride / 2; + const int crs = pic->stride / 2; + const kvz_pixel* p_cb = pic->u + 1 * cbs; + const kvz_pixel* p_cr = pic->v + 1 * crs; + int64_t sum_cb_cr = 0; + + // determine inter-chroma transform sign from correlation between high-pass filtered (i.e., zero-mean) Cb and Cr planes + for (int y = 1; y < y1; y++, p_cb += cbs, p_cr += crs) + { + for (int x = 1; x < x1; x++) + { + int cb = (12 * (int)p_cb[x] - 2 * ((int)p_cb[x - 1] + (int)p_cb[x + 1] + (int)p_cb[x - cbs] + (int)p_cb[x + cbs]) - ((int)p_cb[x - 1 - cbs] + (int)p_cb[x + 1 - cbs] + (int)p_cb[x - 1 + cbs] + (int)p_cb[x + 1 + cbs])); + int cr = (12 * (int)p_cr[x] - 2 * ((int)p_cr[x - 1] + (int)p_cr[x + 1] + (int)p_cr[x - crs] + (int)p_cr[x + crs]) - ((int)p_cr[x - 1 - crs] + (int)p_cr[x + 1 - crs] + (int)p_cr[x - 1 + crs] + (int)p_cr[x + 1 + crs])); + sum_cb_cr += cb * cr; + } + } + + sgnFlag = (sum_cb_cr < 0); + } + + state->frame->jccr_sign = sgnFlag; +} + static void encoder_state_worker_encode_lcu_bitstream(void* opaque); static void encoder_state_worker_encode_lcu_search(void * opaque) @@ -1870,6 +1902,7 @@ void kvz_encode_one_frame(encoder_state_t * const state, kvz_picture* frame) encoder_state_init_new_frame(state, frame); + if(state->encoder_control->cfg.jccr) set_joint_cb_cr_modes(state, frame); // Create a separate job for ALF done after everything else, and only then do final bitstream writing (for ALF parameters) if (state->encoder_control->cfg.alf_type && state->encoder_control->cfg.wpp) { diff --git a/src/encoderstate.h b/src/encoderstate.h index 19c0d196..edfc6a38 100644 --- a/src/encoderstate.h +++ b/src/encoderstate.h @@ -195,6 +195,7 @@ typedef struct encoder_state_config_frame_t { cu_info_t* hmvp_lut; //!< \brief Look-up table for HMVP, one for each LCU row uint8_t* hmvp_size; //!< \brief HMVP LUT size + bool jccr_sign; } encoder_state_config_frame_t; diff --git a/src/search.c b/src/search.c index 3686da07..a474d4c5 100644 --- a/src/search.c +++ b/src/search.c @@ -637,16 +637,17 @@ void kvz_select_jccr_mode( int v_is_set = cbf_is_set(pred_cu->cbf, depth, COLOR_V); CABAC_FBITS_UPDATE(cabac, ctx, v_is_set, tr_tree_bits, "cbf_cr_search"); - int cbf_mask = cbf_is_set(pred_cu->cbf, depth, COLOR_U) * 2 + cbf_is_set(pred_cu->cbf, depth, COLOR_V) - 1; + int cbf_mask = u_is_set * 2 + v_is_set - 1; if((cbf_mask != -1 && pred_cu->type == CU_INTRA) || cbf_mask == 2) CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.joint_cb_cr[cbf_mask]), 0, tr_tree_bits, "jccr_flag"); if(pred_cu->joint_cb_cr) { + const int u_jccr = (pred_cu->joint_cb_cr >> 1) & 1; ctx = &(cabac->ctx.qt_cbf_model_cb[0]); - CABAC_FBITS_UPDATE(cabac, ctx, pred_cu->joint_cb_cr & 1, joint_cbcr_tr_tree_bits, "cbf_cb_search"); - ctx = &(cabac->ctx.qt_cbf_model_cr[pred_cu->joint_cb_cr & 1]); - CABAC_FBITS_UPDATE(cabac, ctx, (pred_cu->joint_cb_cr & 2) >> 1, joint_cbcr_tr_tree_bits, "cbf_cr_search"); - cbf_mask = (pred_cu->joint_cb_cr & 1) * 2 + ((pred_cu->joint_cb_cr & 2) >> 1) - 1; + CABAC_FBITS_UPDATE(cabac, ctx, u_jccr, joint_cbcr_tr_tree_bits, "cbf_cb_search"); + ctx = &(cabac->ctx.qt_cbf_model_cr[u_jccr]); + CABAC_FBITS_UPDATE(cabac, ctx, pred_cu->joint_cb_cr & 1, joint_cbcr_tr_tree_bits, "cbf_cr_search"); + cbf_mask = pred_cu->joint_cb_cr - 1; CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.joint_cb_cr[cbf_mask]), 1, joint_cbcr_tr_tree_bits, "jccr_flag"); } int ssd = 0; @@ -695,10 +696,10 @@ void kvz_select_jccr_mode( } cbf_clear(&pred_cu->cbf, depth, COLOR_U); cbf_clear(&pred_cu->cbf, depth, COLOR_V); - if (pred_cu->joint_cb_cr & 1) { + if (pred_cu->joint_cb_cr & 2) { cbf_set(&pred_cu->cbf, depth, COLOR_U); } - if (pred_cu->joint_cb_cr & 2) { + if (pred_cu->joint_cb_cr & 1) { cbf_set(&pred_cu->cbf, depth, COLOR_V); } int lcu_width = LCU_WIDTH_C; @@ -989,6 +990,7 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, downsample_cclm_rec( state, x, y, cu_width / 2, cu_width / 2, lcu->rec.y, lcu->left_ref.y[64] ); + cur_cu->joint_cb_cr = 0; // TODO: This heavily relies to square CUs if ((depth != 4 || (x % 8 && y % 8)) && state->encoder_control->chroma_format != KVZ_CSP_400) { @@ -996,7 +998,6 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, // rd2. Possibly because the luma mode search already takes chroma // into account, so there is less of a chanse of luma mode being // really bad for chroma. - cur_cu->joint_cb_cr = 0; intra_search.pred_cu.intra.mode_chroma = cur_cu->intra.mode_chroma; // skip luma if (ctrl->cfg.rdo >= 3 && !cur_cu->intra.mip_flag) { cur_cu->intra.mode_chroma = kvz_search_cu_intra_chroma(state, x, y, depth, lcu, &intra_search); @@ -1022,6 +1023,14 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth, } else if(depth != 0 && state->encoder_control->cfg.jccr && cur_cu->joint_cb_cr & 3) { assert(cur_cu->joint_cb_cr < 4); + cbf_clear(&cur_cu->cbf, depth, COLOR_U); + cbf_clear(&cur_cu->cbf, depth, COLOR_V); + if (cur_cu->joint_cb_cr & 2) { + cbf_set(&cur_cu->cbf, depth, COLOR_U); + } + if (cur_cu->joint_cb_cr & 1) { + cbf_set(&cur_cu->cbf, depth, COLOR_V); + } const vector2d_t lcu_px = { (x_local & ~7) / 2, (y_local & ~7) / 2 }; int lcu_width = LCU_WIDTH_C; const int index = lcu_px.x + lcu_px.y * lcu_width; diff --git a/src/search_intra.c b/src/search_intra.c index 1aa0f361..7a8eb41b 100644 --- a/src/search_intra.c +++ b/src/search_intra.c @@ -320,6 +320,7 @@ static double search_intra_trdepth( num_transforms = MAX(num_transforms, 2); } pred_cu->intra.mode_chroma = -1; + pred_cu->joint_cb_cr = 4; for (; trafo < num_transforms; trafo++) { pred_cu->tr_idx = trafo; if (mts_enabled) diff --git a/src/strategies/generic/quant-generic.c b/src/strategies/generic/quant-generic.c index 41ce1d58..5601106f 100644 --- a/src/strategies/generic/quant-generic.c +++ b/src/strategies/generic/quant-generic.c @@ -225,39 +225,40 @@ int kvz_quant_cbcr_residual_generic( int64_t best_cost = INT64_MAX; // This changes the order of the cbf_masks so 2 and 3 are swapped compared with VTM - for(int cbf_mask = cur_cu->type == CU_INTRA ? 1 : 3; cbf_mask < 4; cbf_mask++) { + for(int i = cur_cu->type == CU_INTRA ? 1 : 3; i < 4; i++) { int64_t d1 = 0; + const int cbf_mask = i * (state->frame->jccr_sign ? -1 : 1); for (int y = 0; y < width; y++) { for (int x = 0; x < width; x++) { int cbx = u_residual[x + y * width], crx = v_residual[x + y * width]; - if (cbf_mask == 1) + if (cbf_mask == 2) { - u1_residual[cbf_mask / 2][x + y * width] = ((4 * cbx + 2 * crx) / 5); - d1 += square(cbx - u1_residual[cbf_mask / 2][x + y * width]) + square(crx - (u1_residual[cbf_mask / 2][x + y * width] >> 1)); + u1_residual[i - 2][x + y * width] = ((4 * cbx + 2 * crx) / 5); + d1 += square(cbx - u1_residual[i - 2][x + y * width]) + square(crx - (u1_residual[i - 2][x + y * width] >> 1)); } - else if (cbf_mask == -1) + else if (cbf_mask == -2) { - u1_residual[cbf_mask / 2][x + y * width] = ((4 * cbx - 2 * crx) / 5); - d1 += square(cbx - u1_residual[cbf_mask / 2][x + y * width]) + square(crx - (-u1_residual[cbf_mask / 2][x + y * width] >> 1)); + u1_residual[i - 2][x + y * width] = ((4 * cbx - 2 * crx) / 5); + d1 += square(cbx - u1_residual[i - 2][x + y * width]) + square(crx - (-u1_residual[i - 2][x + y * width] >> 1)); } else if (cbf_mask == 3) { - u1_residual[cbf_mask / 2][x + y * width] = ((cbx + crx) / 2); - d1 += square(cbx - u1_residual[cbf_mask / 2][x + y * width]) + square(crx - u1_residual[cbf_mask / 2][x + y * width]); + u1_residual[i - 2][x + y * width] = ((cbx + crx) / 2); + d1 += square(cbx - u1_residual[i - 2][x + y * width]) + square(crx - u1_residual[i - 2][x + y * width]); } else if (cbf_mask == -3) { - u1_residual[cbf_mask / 2][x + y * width] = ((cbx - crx) / 2); - d1 += square(cbx - u1_residual[cbf_mask / 2][x + y * width]) + square(crx + u1_residual[cbf_mask / 2][x + y * width]); + u1_residual[i - 2][x + y * width] = ((cbx - crx) / 2); + d1 += square(cbx - u1_residual[i - 2][x + y * width]) + square(crx + u1_residual[i - 2][x + y * width]); } - else if (cbf_mask == 2) + else if (cbf_mask == 1) { v1_residual[x + y * width] = ((4 * crx + 2 * cbx) / 5); d1 += square(cbx - (v1_residual[x + y * width] >> 1)) + square(crx - v1_residual[x + y * width]); } - else if (cbf_mask == -2) + else if (cbf_mask == -1) { v1_residual[x + y * width] = ((4 * crx - 2 * cbx) / 5); d1 += square(cbx - (-v1_residual[x + y * width] >> 1)) + square(crx - v1_residual[x + y * width]); @@ -270,19 +271,19 @@ int kvz_quant_cbcr_residual_generic( } } if (d1 < best_cost) { - best_cbf_mask = cbf_mask; + best_cbf_mask = i; best_cost = d1; } } - kvz_transform2d(state->encoder_control, best_cbf_mask == 2 ? v1_residual : u1_residual[best_cbf_mask / 2], coeff, width, best_cbf_mask == 2 ? COLOR_V : COLOR_U, cur_cu); + kvz_transform2d(state->encoder_control, best_cbf_mask == 1 ? v1_residual : u1_residual[best_cbf_mask - 2], coeff, width, best_cbf_mask == 1 ? COLOR_V : COLOR_U, cur_cu); if (state->encoder_control->cfg.rdoq_enable && (width > 4 || !state->encoder_control->cfg.rdoq_skip)) { int8_t tr_depth = cur_cu->tr_depth - cur_cu->depth; tr_depth += (cur_cu->part_size == SIZE_NxN ? 1 : 0); - kvz_rdoq(state, coeff, coeff_out, width, width, best_cbf_mask == 2 ? COLOR_V : COLOR_U, + kvz_rdoq(state, coeff, coeff_out, width, width, best_cbf_mask == 1 ? COLOR_V : COLOR_U, scan_order, cur_cu->type, tr_depth, cur_cu->cbf); } else if (state->encoder_control->cfg.rdoq_enable && false) { @@ -290,7 +291,7 @@ int kvz_quant_cbcr_residual_generic( scan_order); } else { - kvz_quant(state, coeff, coeff_out, width, width, best_cbf_mask == 2 ? COLOR_V : COLOR_U, + kvz_quant(state, coeff, coeff_out, width, width, best_cbf_mask == 1 ? COLOR_V : COLOR_U, scan_order, cur_cu->type, cur_cu->tr_idx == MTS_SKIP && false); } @@ -309,10 +310,10 @@ int kvz_quant_cbcr_residual_generic( int y, x; // Get quantized residual. (coeff_out -> coeff -> residual) - kvz_dequant(state, coeff_out, coeff, width, width, best_cbf_mask == 2 ? COLOR_V : COLOR_U, + kvz_dequant(state, coeff_out, coeff, width, width, best_cbf_mask == 1 ? COLOR_V : COLOR_U, cur_cu->type, cur_cu->tr_idx == MTS_SKIP && false); - kvz_itransform2d(state->encoder_control, best_cbf_mask == 2 ? v1_residual : u1_residual[best_cbf_mask / 2], coeff, width, best_cbf_mask == 2 ? COLOR_V : COLOR_U, cur_cu); + kvz_itransform2d(state->encoder_control, best_cbf_mask == 1 ? v1_residual : u1_residual[best_cbf_mask - 2], coeff, width, best_cbf_mask == 1 ? COLOR_V : COLOR_U, cur_cu); //if (state->tile->frame->lmcs_aps->m_sliceReshapeInfo.enableChromaAdj && color != COLOR_Y) { @@ -333,32 +334,32 @@ int kvz_quant_cbcr_residual_generic( // } // } //} - + const int temp = best_cbf_mask * (state->frame->jccr_sign ? -1 : 1); // Get quantized reconstruction. (residual + pred_in -> rec_out) for (int y = 0; y < width; y++) { for (int x = 0; x < width; x++) { - if (best_cbf_mask == 1) { - u_residual[x + y * width] = u1_residual[best_cbf_mask / 2][x + y * width]; - v_residual[x + y * width] = u1_residual[best_cbf_mask / 2][x + y * width] >> 1; + if (temp == 2) { + u_residual[x + y * width] = u1_residual[best_cbf_mask - 2][x + y * width]; + v_residual[x + y * width] = u1_residual[best_cbf_mask - 2][x + y * width] >> 1; } - else if (best_cbf_mask == -1) { - u_residual[x + y * width] = u1_residual[best_cbf_mask / 2][x + y * width]; - v_residual[x + y * width] = -u1_residual[best_cbf_mask / 2][x + y * width] >> 1; + else if (temp == -2) { + u_residual[x + y * width] = u1_residual[best_cbf_mask - 2][x + y * width]; + v_residual[x + y * width] = -u1_residual[best_cbf_mask - 2][x + y * width] >> 1; } - else if (best_cbf_mask == 3) { - u_residual[x + y * width] = u1_residual[best_cbf_mask / 2][x + y * width]; - v_residual[x + y * width] = u1_residual[best_cbf_mask / 2][x + y * width]; + else if (temp == 3) { + u_residual[x + y * width] = u1_residual[best_cbf_mask - 2][x + y * width]; + v_residual[x + y * width] = u1_residual[best_cbf_mask - 2][x + y * width]; } - else if (best_cbf_mask == -3) { + else if (temp == -3) { // non-normative clipping to prevent 16-bit overflow - u_residual[x + y * width] = u1_residual[best_cbf_mask / 2][x + y * width]; // == -32768 && sizeof(Pel) == 2) ? 32767 : -v1_residual[best_cbf_mask][x]; - v_residual[x + y * width] = -u1_residual[best_cbf_mask / 2][x + y * width]; + u_residual[x + y * width] = u1_residual[best_cbf_mask - 2][x + y * width]; // == -32768 && sizeof(Pel) == 2) ? 32767 : -v1_residual[best_cbf_mask][x]; + v_residual[x + y * width] = -u1_residual[best_cbf_mask - 2][x + y * width]; } - else if (best_cbf_mask == 2) { + else if (temp == 1) { u_residual[x + y * width] = v1_residual[x + y * width] >> 1; v_residual[x + y * width] = v1_residual[x + y * width]; } - else if (best_cbf_mask == -2) { + else if (temp == -1) { u_residual[x + y * width] = v1_residual[x + y * width] >> 1; v_residual[x + y * width] = -v1_residual[x + y * width]; }