WIP

2024-11-23 18:14:06 +00:00 · 2022-11-16 12:27:28 +02:00 · 2022-11-16 12:27:28 +02:00 · f19084569d
parent bbbd391b9e
commit f19084569d
16 changed files with 185 additions and 107 deletions
--- a/src/cabac.h
+++ b/src/cabac.h
@ -77,6 +77,8 @@ typedef struct
    cabac_ctx_t mts_idx_model[4];
    cabac_ctx_t split_flag_model[9]; //!< \brief split flag context models
    cabac_ctx_t qt_split_flag_model[6]; //!< \brief qt split flag context models
+    cabac_ctx_t mtt_vertical_model[5]; //!< \brief mtt vertical flag context models
+    cabac_ctx_t mtt_binary_model[4]; //!< \brief mtt binary flag context models
    cabac_ctx_t intra_luma_mpm_flag_model;    //!< \brief intra mode context models
    cabac_ctx_t intra_subpart_model[2];    //!< \brief intra sub part context models
    cabac_ctx_t chroma_pred_model;
--- a/src/cfg.c
+++ b/src/cfg.c
@ -222,6 +222,22 @@ int uvg_config_init(uvg_config *cfg)
  cfg->cabac_debug_file_name = NULL;

  cfg->dual_tree = 0;
+
+  cfg->min_qt_size[0] = 4;
+  cfg->min_qt_size[1] = 4;
+  cfg->min_qt_size[2] = 4;
+
+  cfg->max_btt_depth[0] = 1;
+  cfg->max_btt_depth[1] = 0;
+  cfg->max_btt_depth[2] = 0;
+
+  cfg->max_tt_size[0] = 64;
+  cfg->max_bt_size[0] = 64;
+  cfg->max_tt_size[1] = 64;
+  cfg->max_bt_size[1] = 64;
+  cfg->max_tt_size[2] = 64;
+  cfg->max_bt_size[2] = 64;
+
  cfg->intra_rough_search_levels = 2;

  cfg->ibc = 0;
--- a/src/context.c
+++ b/src/context.c
@ -50,6 +50,21 @@ static const uint8_t  INIT_QT_SPLIT_FLAG[4][6] = {
  {   0,   8,   8,  12,  12,   8, },
 };

+
+static const uint8_t INIT_VERTICAL_SPLIT_FLAG[4][5] = { // init values for ctx.mtt_vertical_model[0..4]
+  {  43,  42,  37,  42,  44, }, // a row is picked with the slice argument of uvg_init_contexts
+  {  43,  35,  37,  34,  52, },
+  {  43,  42,  29,  27,  44, },
+  {   9,   8,   9,   8,   5, }, // row 3 is always handed to uvg_ctx_init as its last argument
+};
+
+static const uint8_t INIT_BINARY_SPLIT_FLAG[4][4] = { // init values for ctx.mtt_binary_model[0..3]
+  {  28,  29,  28,  29, }, // a row is picked with the slice argument of uvg_init_contexts
+  {  43,  37,  21,  22, },
+  {  36,  45,  36,  45, },
+  {  12,  13,  12,  13, }, // row 3 is always handed to uvg_ctx_init as its last argument
+  };
+
 static const uint8_t INIT_SKIP_FLAG[4][3] = {
  {  57,  60,  46, },
  {  57,  59,  45, },
@ -574,6 +589,11 @@ void uvg_init_contexts(encoder_state_t *state, int8_t QP, int8_t slice)
    uvg_ctx_init(&cabac->ctx.part_size_model[i], QP, INIT_PART_SIZE[slice][i], INIT_PART_SIZE[3][i]);
    uvg_ctx_init(&cabac->ctx.bdpcm_mode[i], QP, BDPCM_MODE_INIT[slice][i], BDPCM_MODE_INIT[3][i]);
    uvg_ctx_init(&cabac->ctx.qt_cbf_model_luma[i], QP, INIT_QT_CBF[slice][i], INIT_QT_CBF[3][i]);
+    uvg_ctx_init(&cabac->ctx.mtt_binary_model[i], QP, INIT_BINARY_SPLIT_FLAG[slice][i], INIT_BINARY_SPLIT_FLAG[3][i]);
+  }
+
+  for (i = 0; i < 5; i++) {
+    uvg_ctx_init(&cabac->ctx.mtt_vertical_model[i], QP, INIT_VERTICAL_SPLIT_FLAG[slice][i], INIT_VERTICAL_SPLIT_FLAG[3][i]);
  }

  for (i = 0; i < 6; i++) {  
--- a/src/cu.h
+++ b/src/cu.h
@ -105,6 +105,7 @@ enum split_type {
 typedef struct  { // describes the chain of splits leading to a block
  uint32_t split_tree; // split type per level, 3 bits each; level d is read as (split_tree >> (d * 3)) & 7
  uint8_t current_depth; // starts at 0 for the LCU and grows by one for every split taken
+  uint8_t mtt_depth; // grows by one for every taken split that is not QT_SPLIT
 } split_tree_t;


--- a/src/encode_coding_tree.c
+++ b/src/encode_coding_tree.c
@ -1199,14 +1199,13 @@ void uvg_encode_intra_luma_coding_unit(
 }


-bool uvg_write_split_flag(
-  const encoder_state_t * const state,
+uint8_t uvg_write_split_flag(
+  const encoder_state_t* const state,
  cabac_data_t* cabac,
-  const cu_info_t * left_cu,
-  const cu_info_t * above_cu,
+  const cu_info_t* left_cu,
+  const cu_info_t* above_cu,
  const cu_loc_t* const cu_loc,
-  const uint32_t split_tree,
-  int depth,
+  split_tree_t split_tree,
  enum uvg_tree_type tree_type,
  double* bits_out)
 {
@ -1217,15 +1216,15 @@ bool uvg_write_split_flag(
  // Implisit split flag when on border
  // Exception made in VVC with flag not being implicit if the BT can be used for
  // horizontal or vertical split, then this flag tells if QT or BT is used
+  const int slice_type = state->frame->is_irap ? (tree_type == UVG_CHROMA_T ? 2 : 0) : 1;

  bool no_split, allow_qt, bh_split, bv_split, th_split, tv_split;
  no_split = allow_qt = bh_split = bv_split = th_split = tv_split = true;
-  if (depth > MAX_DEPTH) allow_qt = false;
-  // ToDo: update this when btt is actually used
-  bool allow_btt = true;// when mt_depth < MAX_BT_DEPTH
  
  const int cu_width = tree_type != UVG_CHROMA_T ? cu_loc->width : cu_loc->chroma_width;
  const int cu_height = tree_type != UVG_CHROMA_T ? cu_loc->height : cu_loc->chroma_height;
+  if (cu_width == state->encoder_control->cfg.min_qt_size[slice_type] || split_tree.mtt_depth > 0) allow_qt = false;
+  bool allow_btt = state->encoder_control->cfg.max_btt_depth[slice_type] > split_tree.mtt_depth && cu_width <= 64;

  uint8_t implicit_split_mode = UVG_NO_SPLIT;
  //bool implicit_split = border;
@ -1255,10 +1254,16 @@ bool uvg_write_split_flag(
  if (!allow_btt) {
    bh_split = bv_split = th_split = tv_split = false;
  }
+  else {
+    bv_split &= cu_width <= state->encoder_control->cfg.max_bt_size[slice_type];
+    tv_split &= cu_width <= state->encoder_control->cfg.max_tt_size[slice_type];
+    bh_split &= cu_height <= state->encoder_control->cfg.max_bt_size[slice_type];
+    th_split &= cu_height <= state->encoder_control->cfg.max_tt_size[slice_type];
+  }

  bool allow_split = allow_qt | bh_split | bv_split | th_split | tv_split;

-  int split_flag = (split_tree >> (depth * 3)) & 7;
+  int split_flag = (split_tree.split_tree >> (split_tree.current_depth * 3)) & 7;

  split_flag = implicit_split_mode != UVG_NO_SPLIT ? implicit_split_mode : split_flag;

@ -1286,33 +1291,41 @@ bool uvg_write_split_flag(

    cabac->cur_ctx = &(cabac->ctx.split_flag_model[split_model]);

-    CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.split_flag_model[split_model]), split_flag != 0, bits, "split_flag");
+    CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.split_flag_model[split_model]), split_flag != NO_SPLIT, bits, "split_cu_flag");
  }

+
+  if (implicit_split_mode == UVG_NO_SPLIT && allow_qt && (bh_split || bv_split || th_split || tv_split) && split_flag != NO_SPLIT) {
    bool qt_split = split_flag == QT_SPLIT;
-
-  if (!(implicit_split_mode == UVG_NO_SPLIT) && (allow_qt && allow_btt)) {
-    split_model = (left_cu && GET_SPLITDATA(left_cu, depth)) + (above_cu && GET_SPLITDATA(above_cu, depth)) + (depth < 2 ? 0 : 3);
-    CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.qt_split_flag_model[split_model]), qt_split, bits, "QT_split_flag");
+    if((bv_split || bh_split || tv_split || th_split) && allow_qt) {
+      split_model = (left_cu && GET_SPLITDATA(left_cu, split_tree.current_depth)) + (above_cu && GET_SPLITDATA(above_cu, split_tree.current_depth)) + (split_tree.current_depth < 2 ? 0 : 3);
+      CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.qt_split_flag_model[split_model]), qt_split, bits, "qt_split_flag");
    }
-
-  // Only signal split when it is not implicit, currently only Qt split supported
-  if (!(implicit_split_mode == UVG_NO_SPLIT) && !qt_split && (bh_split | bv_split | th_split | tv_split)) {
-
+    if (!qt_split) {
+      const bool is_vertical = split_flag == BT_VER_SPLIT || split_flag == TT_VER_SPLIT;
+      if((bh_split || th_split) && (bv_split || tv_split)) {
        split_model = 0;
-
-    // TODO: These are incorrect
-    if (left_cu && (1 << left_cu->log2_height) > cu_height) {
-      split_model++;
+        if(bv_split + tv_split > bh_split + th_split) {
+          split_model = 4;
+        } else if(bv_split + tv_split < bh_split + th_split) {
+          split_model = 3;
+        } else {
+          const int d_a = cu_width / (above_cu ? (1 << above_cu->log2_width) : 1);
+          const int d_l = cu_height / (left_cu ? (1 << left_cu->log2_height) : 1);
+          if(d_a != d_l && above_cu && left_cu) {
+            split_model = d_a < d_l ? 1 : 2;
+          }
+        }
+        CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.mtt_vertical_model[split_model]), is_vertical, bits, "mtt_vertical_flag");
+      }
+      if ((bv_split && tv_split && is_vertical) || (bh_split && th_split && !is_vertical)) { // binary/ternary choice is only coded when both are allowed in the chosen direction
+        split_model = 2 * is_vertical + (split_tree.mtt_depth <= 1); // parentheses required: `<=` binds looser than `+`, without them the index was only ever 0 or 1
+        CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.mtt_binary_model[split_model]), 
+          split_flag == BT_VER_SPLIT || split_flag == BT_HOR_SPLIT, bits, "mtt_binary_flag");
+      }
+    }
  }

-    if (above_cu && (1 << above_cu->log2_width) > cu_width) {
-      split_model++;
-    }
-
-    split_model += (depth > 2 ? 0 : 3);
-    CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.qt_split_flag_model[split_model]), qt_split, bits, "split_cu_mode");
-  }
  if (bits_out) *bits_out += bits;
  return split_flag;
 }
@ -1322,7 +1335,7 @@ void uvg_encode_coding_tree(
  lcu_coeff_t *coeff,
  enum uvg_tree_type tree_type,
  const cu_loc_t* const cu_loc,
-  const split_tree_t split_tree)
+  split_tree_t split_tree)
 {
  cabac_data_t * const cabac = &state->cabac;
  const encoder_control_t * const ctrl = state->encoder_control;
@ -1332,7 +1345,6 @@ void uvg_encode_coding_tree(
  
  const int cu_width  = tree_type != UVG_CHROMA_T ? cu_loc->width : cu_loc->chroma_width;
  const int cu_height = tree_type != UVG_CHROMA_T ? cu_loc->height : cu_loc->chroma_height;
-  const int half_cu  = cu_width >> 1;
 
  const int x = cu_loc->x;
  const int y = cu_loc->y;
@ -1357,9 +1369,9 @@ void uvg_encode_coding_tree(
  int32_t frame_height = tree_type != UVG_CHROMA_T ? ctrl->in.height : ctrl->in.height / 2;
  // Check for slice border
  bool border_x = frame_width  < abs_x + cu_width;
-  bool border_y = frame_height < abs_y + cu_width;
-  bool border_split_x = frame_width  >= abs_x + (LCU_WIDTH >> MAX_DEPTH) + half_cu;
-  bool border_split_y = frame_height >= abs_y + (LCU_WIDTH >> MAX_DEPTH) + half_cu;
+  bool border_y = frame_height < abs_y + cu_height;
+  bool border_split_x = frame_width  >= abs_x + (LCU_WIDTH >> MAX_DEPTH) + cu_width / 2;
+  bool border_split_y = frame_height >= abs_y + (LCU_WIDTH >> MAX_DEPTH) + cu_height / 2;
  bool border = border_x || border_y; /*!< are we in any border CU */

  if (depth <= state->frame->max_qp_delta_depth) {
@ -1368,21 +1380,20 @@ void uvg_encode_coding_tree(

  // When not in MAX_DEPTH, insert split flag and split the blocks if needed
  if (cu_width + cu_height > 8) {
-
+    split_tree.split_tree = cur_cu->split_tree;
    const int split_flag = uvg_write_split_flag(
      state,
      cabac,
      left_cu,
      above_cu, 
      cu_loc,
-      cur_cu->split_tree,
-      depth,
+      split_tree,
      tree_type,
      NULL);
    
    if (split_flag || border) {
      const int half_luma = cu_loc->width / 2;
-      split_tree_t new_split_tree = { cur_cu->split_tree, split_tree.current_depth + 1 };
+      const split_tree_t new_split_tree = { cur_cu->split_tree, split_tree.current_depth + 1, split_tree.mtt_depth + (split_flag != QT_SPLIT)};

      cu_loc_t new_cu_loc[4];
      const int splits = uvg_get_split_locs(cu_loc, split_flag, new_cu_loc);
@ -1650,7 +1661,8 @@ double uvg_mock_encode_coding_unit(
  const cu_loc_t* const cu_loc,
  lcu_t* lcu,
  cu_info_t* cur_cu,
-  enum uvg_tree_type tree_type) {
+  enum uvg_tree_type tree_type,
+  const split_tree_t split_tree) {
  double bits = 0;
  const encoder_control_t* const ctrl = state->encoder_control;

@ -1692,8 +1704,7 @@ double uvg_mock_encode_coding_unit(
      left_cu,
      above_cu,
      cu_loc,
-      cur_cu->split_tree,
-      depth,
+      split_tree,
      tree_type,
      &bits);
  }
--- a/src/encode_coding_tree.h
+++ b/src/encode_coding_tree.h
@ -54,7 +54,7 @@ void uvg_encode_coding_tree(
  lcu_coeff_t *coeff,
  enum uvg_tree_type tree_type,
  const cu_loc_t* const cu_loc,
-  const split_tree_t split_tree);
+  split_tree_t split_tree);

 void uvg_encode_ts_residual(encoder_state_t* const state,
  cabac_data_t* const cabac,
@ -77,7 +77,8 @@ double uvg_mock_encode_coding_unit(
  const cu_loc_t* const cu_loc,
  lcu_t* lcu,
  cu_info_t* cur_cu,
-  enum uvg_tree_type tree_type);
+  enum uvg_tree_type tree_type,
+  const split_tree_t split_tree);

 int uvg_encode_inter_prediction_unit(
  encoder_state_t* const state,
@ -96,14 +97,13 @@ void uvg_encode_intra_luma_coding_unit(
  double* bits_out);


-bool uvg_write_split_flag(
+uint8_t uvg_write_split_flag( // returns the split mode that applies to the block (implicit mode if any, else the one stored in split_tree)
   const encoder_state_t* const state,
   cabac_data_t* cabac,
   const cu_info_t* left_cu,
   const cu_info_t* above_cu,
   const cu_loc_t* const cu_loc,
-  const uint32_t split_tree,
-  int depth,
+  split_tree_t split_tree, // named to match the definition; split type is read at split_tree.current_depth
   enum uvg_tree_type tree_type,
   double* bits_out); // when non-NULL, the bits spent on the flags are added to *bits_out

--- a/src/encoder_state-bitstream.c
+++ b/src/encoder_state-bitstream.c
@ -529,10 +529,10 @@ static void encoder_state_write_bitstream_seq_parameter_set(bitstream_t* stream,
  // if(!no_partition_constraints_override_constraint_flag)
    WRITE_U(stream, 0, 1, "partition_constraints_override_enabled_flag");
  WRITE_UE(stream, uvg_g_convert_to_log2[encoder->cfg.min_qt_size[0]] - MIN_SIZE, "sps_log2_diff_min_qt_min_cb_intra_slice_luma");
-  WRITE_UE(stream, encoder->cfg.max_intra_slice_btt_depth, "sps_max_mtt_hierarchy_depth_intra_slice_luma");
-  if (encoder->cfg.max_intra_slice_btt_depth) {
-    WRITE_UE(stream, uvg_g_convert_to_log2[encoder->cfg.min_qt_size[0]] - uvg_g_convert_to_log2[encoder->cfg.max_bt_size[0]], "sps_log2_diff_max_bt_min_qt_intra_slice_luma");
-    WRITE_UE(stream, uvg_g_convert_to_log2[encoder->cfg.min_qt_size[0]] - uvg_g_convert_to_log2[encoder->cfg.max_tt_size[0]], "sps_log2_diff_max_tt_min_qt_intra_slice_luma");
+  WRITE_UE(stream, encoder->cfg.max_btt_depth[0], "sps_max_mtt_hierarchy_depth_intra_slice_luma");
+  if (encoder->cfg.max_btt_depth[0]) {
+    WRITE_UE(stream, uvg_g_convert_to_log2[encoder->cfg.max_bt_size[0]] - uvg_g_convert_to_log2[encoder->cfg.min_qt_size[0]], "sps_log2_diff_max_bt_min_qt_intra_slice_luma");
+    WRITE_UE(stream, uvg_g_convert_to_log2[encoder->cfg.max_tt_size[0]] - uvg_g_convert_to_log2[encoder->cfg.min_qt_size[0]], "sps_log2_diff_max_tt_min_qt_intra_slice_luma");
  }
  
  if (encoder->chroma_format != UVG_CSP_400)
@ -541,17 +541,17 @@ static void encoder_state_write_bitstream_seq_parameter_set(bitstream_t* stream,
  }
  if (encoder->cfg.dual_tree) {
    WRITE_UE(stream, uvg_g_convert_to_log2[encoder->cfg.min_qt_size[2]] - MIN_SIZE, "sps_log2_diff_min_qt_min_cb_intra_slice_chroma");
-    WRITE_UE(stream, encoder->cfg.max_intra_slice_btt_depth_chroma, "sps_max_mtt_hierarchy_depth_intra_slice_chroma");
-    if (encoder->cfg.max_intra_slice_btt_depth_chroma) {
-      WRITE_UE(stream, uvg_g_convert_to_log2[encoder->cfg.min_qt_size[2]] - uvg_g_convert_to_log2[encoder->cfg.max_bt_size[2]], "sps_log2_diff_max_bt_min_qt_intra_slice_chroma");
-      WRITE_UE(stream, uvg_g_convert_to_log2[encoder->cfg.min_qt_size[2]] - uvg_g_convert_to_log2[encoder->cfg.max_tt_size[2]], "sps_log2_diff_max_tt_min_qt_intra_slice_chroma");
+    WRITE_UE(stream, encoder->cfg.max_btt_depth[2], "sps_max_mtt_hierarchy_depth_intra_slice_chroma");
+    if (encoder->cfg.max_btt_depth[2]) {
+      WRITE_UE(stream, uvg_g_convert_to_log2[encoder->cfg.max_bt_size[2]] - uvg_g_convert_to_log2[encoder->cfg.min_qt_size[2]], "sps_log2_diff_max_bt_min_qt_intra_slice_chroma");
+      WRITE_UE(stream, uvg_g_convert_to_log2[encoder->cfg.max_tt_size[2]] - uvg_g_convert_to_log2[encoder->cfg.min_qt_size[2]], "sps_log2_diff_max_tt_min_qt_intra_slice_chroma");
    }
  }
  WRITE_UE(stream, uvg_g_convert_to_log2[encoder->cfg.min_qt_size[1]] - MIN_SIZE, "sps_log2_diff_min_qt_min_cb_inter_slice");
-  WRITE_UE(stream, encoder->cfg.max_inter_slice_btt_depth, "sps_max_mtt_hierarchy_depth_inter_slice");
-  if (encoder->cfg.max_inter_slice_btt_depth != 0) {
-    WRITE_UE(stream, uvg_g_convert_to_log2[encoder->cfg.min_qt_size[1]] - uvg_g_convert_to_log2[encoder->cfg.max_bt_size[1]], "sps_log2_diff_max_bt_min_qt_inter_tile_group");
-    WRITE_UE(stream, uvg_g_convert_to_log2[encoder->cfg.min_qt_size[1]] - uvg_g_convert_to_log2[encoder->cfg.max_tt_size[1]], "sps_log2_diff_max_tt_min_qt_inter_tile_group");
+  WRITE_UE(stream, encoder->cfg.max_btt_depth[1], "sps_max_mtt_hierarchy_depth_inter_slice");
+  if (encoder->cfg.max_btt_depth[1] != 0) {
+    WRITE_UE(stream, uvg_g_convert_to_log2[encoder->cfg.max_bt_size[1]] - uvg_g_convert_to_log2[encoder->cfg.min_qt_size[1]], "sps_log2_diff_max_bt_min_qt_inter_tile_group");
+    WRITE_UE(stream, uvg_g_convert_to_log2[encoder->cfg.max_tt_size[1]] - uvg_g_convert_to_log2[encoder->cfg.min_qt_size[1]], "sps_log2_diff_max_tt_min_qt_inter_tile_group");
  }

  if (LCU_WIDTH > 32)
--- a/src/encoderstate.c
+++ b/src/encoderstate.c
@ -883,7 +883,7 @@ static void encoder_state_worker_encode_lcu_bitstream(void * opaque)
  //Encode coding tree
  cu_loc_t start;
  uvg_cu_loc_ctor(&start, lcu->position.x * LCU_WIDTH, lcu->position.y * LCU_WIDTH, LCU_WIDTH, LCU_WIDTH);
-  split_tree_t split_tree = { 0, 0 };
+  split_tree_t split_tree = { 0, 0, 0 };

  uvg_encode_coding_tree(state, lcu->coeff, tree_type, &start, split_tree);

--- a/src/rdo.c
+++ b/src/rdo.c
@ -712,12 +712,13 @@ void uvg_rdoq_sign_hiding(
  const int32_t last_pos,
  const coeff_t *const coeffs,
  coeff_t *const quant_coeffs,
-    const int8_t color)
+  const int8_t color,
+  const bool need_sqrt_adjust)
 {
  const encoder_control_t * const ctrl = state->encoder_control;
  const double lambda = color ? state->c_lambda : state->lambda;

-  int inv_quant = uvg_g_inv_quant_scales[qp_scaled % 6];
+  int inv_quant = uvg_g_inv_quant_scales[need_sqrt_adjust][qp_scaled % 6];
  // This somehow scales quant_delta into fractional bits. Instead of the bits
  // being multiplied by lambda, the residual is divided by it, or something
  // like that.
@ -1203,7 +1204,7 @@ int uvg_ts_rdoq(encoder_state_t* const state, coeff_t* src_coeff, coeff_t* dest_

  const bool   needs_sqrt2_scale = false; // from VTM: should always be false - transform-skipped blocks don't require sqrt(2) compensation.
  const int    q_bits = QUANT_SHIFT + qp_scaled / 6  + (needs_sqrt2_scale ? -1 : 0);  // Right shift of non-RDOQ quantizer;  level = (coeff*uiQ + offset)>>q_bits
-  const int32_t quant_coeff = uvg_g_quant_scales[qp_scaled % 6];
+  const int32_t quant_coeff = uvg_g_quant_scales[needs_sqrt2_scale][qp_scaled % 6];
 
  const double error_scale = (double)(1 << CTX_FRAC_BITS) / quant_coeff / quant_coeff;

@ -1416,8 +1417,10 @@ void uvg_rdoq(
  cabac_data_t * const cabac = &state->cabac;
  const uint32_t log2_block_width = uvg_g_convert_to_log2[width];
  const uint32_t log2_block_height = uvg_g_convert_to_log2[height];
+  bool needs_block_size_trafo_scale = !false && ((log2_block_width + log2_block_height) % 2 == 1);
+  needs_block_size_trafo_scale |= 1; // Non log2 block size

-  int32_t  transform_shift   = MAX_TR_DYNAMIC_RANGE - encoder->bitdepth - ((log2_block_width + log2_block_height) >> 1);  // Represents scaling through forward transform
+  int32_t  transform_shift   = MAX_TR_DYNAMIC_RANGE - encoder->bitdepth - ((log2_block_width + log2_block_height) >> 1) + needs_block_size_trafo_scale;  // Represents scaling through forward transform
  uint16_t go_rice_param     = 0;
  uint32_t reg_bins = (width * height * 28) >> 4;
  
@ -1789,7 +1792,7 @@ void uvg_rdoq(
  }

  if (encoder->cfg.signhide_enable && abs_sum >= 2) {
-    uvg_rdoq_sign_hiding(state, qp_scaled, scan, &sh_rates, best_last_idx_p1, coef, dest_coeff, color);
+    uvg_rdoq_sign_hiding(state, qp_scaled, scan, &sh_rates, best_last_idx_p1, coef, dest_coeff, color, needs_block_size_trafo_scale);
  }
 }

--- a/src/scalinglist.c
+++ b/src/scalinglist.c
@ -88,8 +88,14 @@ static const int32_t g_quant_inter_default_8x8[64] =
  24, 25, 28, 33, 41, 54, 71, 91
 };

-const int16_t uvg_g_quant_scales[6] = {26214, 23302, 20560, 18396, 16384, 14564};
-const int16_t uvg_g_inv_quant_scales[6] = {40, 45, 51, 57, 64, 72};
+const int16_t uvg_g_quant_scales[2][6] = { // indexed [sqrt(2) scaling flag][qp % 6]
+  {26214, 23302, 20560, 18396, 16384, 14564}, // row 0: used by the scaling list setup and when the flag is false
+    { 18396,16384,14564,13107,11651,10280 } // row 1: used when the caller's sqrt(2) scaling flag is set
+};
+const int16_t uvg_g_inv_quant_scales[2][6] = { // indexed [sqrt(2) scaling flag][qp % 6]
+  {40, 45, 51, 57, 64, 72}, // row 0: used by the scaling list setup and when the flag is false
+  { 57,64,72,80,90,102 } // row 1: used when the caller's sqrt(2) scaling flag is set
+};


 /**
@ -406,11 +412,11 @@ void uvg_scalinglist_set(scaling_list_t* const scaling_list, const int32_t* cons
  int32_t* quantcoeff = (int32_t*)scaling_list->quant_coeff[size_id_x][size_id_y][listId][qp];
  int32_t* dequantcoeff = (int32_t*)scaling_list->de_quant_coeff[size_id_x][size_id_y][listId][qp];

-  // Encoder list
-  uvg_scalinglist_process_enc(coeff, quantcoeff, uvg_g_quant_scales[qp] << 4, height, width, ratio,
+  // Encoder list TODO: the sqrt adjusted lists
+  uvg_scalinglist_process_enc(coeff, quantcoeff, uvg_g_quant_scales[0][qp] << 4, height, width, ratio,
                              MIN(8, g_scaling_list_size_x[size_id_x]), dc, !scaling_list->enable);
  // Decoder list
-  scalinglist_process_dec(coeff, dequantcoeff, uvg_g_inv_quant_scales[qp], height, width, ratio,
+  scalinglist_process_dec(coeff, dequantcoeff, uvg_g_inv_quant_scales[0][qp], height, width, ratio,
                          MIN(8, g_scaling_list_size_x[size_id_x]), dc, !scaling_list->enable);


--- a/src/search.c
+++ b/src/search.c
@ -1294,7 +1294,8 @@ static double search_cu(
      tree_type != UVG_CHROMA_T ? cu_loc : &chroma_loc, 
      lcu,
      cur_cu,
-      tree_type);
+      tree_type,
+      split_tree);

    
    cost = bits * state->lambda;
@ -1335,7 +1336,11 @@ static double search_cu(
  // Recursively split all the way to max search depth.
  if (can_split_cu) {
    const int split_type = depth == 0 ? QT_SPLIT : BT_HOR_SPLIT;
-    const split_tree_t new_split = { split_tree.split_tree | split_type << (split_tree.current_depth * 3), split_tree.current_depth + 1 };
+    const split_tree_t new_split = {
+      split_tree.split_tree | split_type << (split_tree.current_depth * 3),
+      split_tree.current_depth + 1,
+      split_tree.mtt_depth + (split_type != QT_SPLIT),
+    };
    
    double split_cost = 0.0;
    int cbf = cbf_is_set_any(cur_cu->cbf);
@ -1374,8 +1379,7 @@ static double search_cu(
        left_cu,
        above_cu, 
        tree_type != UVG_CHROMA_T ? cu_loc : &chroma_loc,
-        new_split.split_tree,
-        depth,
+        split_tree,
        tree_type,
        &split_bits);
    }
@ -1394,7 +1398,7 @@ static double search_cu(
      const int splits = uvg_get_split_locs(cu_loc, split_type, new_cu_loc);
      for (int split = 0; split < splits; ++split) {
        split_cost += search_cu(state, &new_cu_loc[split], &split_lcu, tree_type, new_split);
-        if (split_cost < cost) {
+        if (split_cost > cost) {
          break;
        }
      }
@ -1426,7 +1430,7 @@ static double search_cu(
        double bits = 0;
        uvg_write_split_flag(state, &state->search_cabac,
                             x > 0 ? LCU_GET_CU_AT_PX(lcu, SUB_SCU(x) - 1, SUB_SCU(y)) : NULL,
-                             y > 0 ? LCU_GET_CU_AT_PX(lcu, SUB_SCU(x), SUB_SCU(y) - 1) : NULL, cu_loc, cur_cu->split_tree, depth, tree_type, &bits);
+                             y > 0 ? LCU_GET_CU_AT_PX(lcu, SUB_SCU(x), SUB_SCU(y) - 1) : NULL, cu_loc, split_tree, tree_type, &bits);

        cur_cu->intra = cu_d1->intra;
        cur_cu->type = CU_INTRA;
@ -1715,7 +1719,7 @@ void uvg_search_lcu(encoder_state_t * const state, const int x, const int y, con

  cu_loc_t start;
  uvg_cu_loc_ctor(&start, x, y, LCU_WIDTH, LCU_WIDTH);
-  split_tree_t split_tree = { 0, 0 };
+  split_tree_t split_tree = { 0, 0, 0 };
  // Start search from depth 0.
  double cost = search_cu(
    state, 
--- a/src/search_inter.c
+++ b/src/search_inter.c
@ -2125,8 +2125,6 @@ void uvg_cu_cost_inter_rd2(
  double* inter_bitcost,
  const cu_loc_t* const cu_loc){
  
-  const uint8_t depth = 6 - uvg_g_convert_to_log2[cu_loc->width];
-
  const int x_px = SUB_SCU(cu_loc->x);
  const int y_px = SUB_SCU(cu_loc->y);
  const int width = cu_loc->width;
@ -2160,12 +2158,24 @@ void uvg_cu_cost_inter_rd2(
  double no_cbf_bits;
  double bits = 0;
  const int skip_context = uvg_get_skip_context(cu_loc->x, cu_loc->y, lcu, NULL, NULL);
+
+  int8_t depth = 0;
+  int8_t mtt_depth = 0;
+  uint32_t splits = cur_cu->split_tree;
+  while (splits & 7) {
+    if ((splits & 7) != QT_SPLIT) {
+      mtt_depth++;
+    }
+    depth++;
+    splits >>= 3;
+  }
+  const split_tree_t splitt_tree = { cur_cu->split_tree, depth, mtt_depth };
  if (cur_cu->merged) {
    no_cbf_bits = CTX_ENTROPY_FBITS(&state->cabac.ctx.cu_skip_flag_model[skip_context], 1) + *inter_bitcost;
-    bits += uvg_mock_encode_coding_unit(state, cabac, cu_loc, lcu, cur_cu, UVG_BOTH_T);
+    bits += uvg_mock_encode_coding_unit(state, cabac, cu_loc, lcu, cur_cu, UVG_BOTH_T, splitt_tree);
  }
  else {
-    no_cbf_bits = uvg_mock_encode_coding_unit(state, cabac, cu_loc, lcu, cur_cu, UVG_BOTH_T);
+    no_cbf_bits = uvg_mock_encode_coding_unit(state, cabac, cu_loc, lcu, cur_cu, UVG_BOTH_T, splitt_tree);
    bits += no_cbf_bits - CTX_ENTROPY_FBITS(&cabac->ctx.cu_qt_root_cbf_model, 0) + CTX_ENTROPY_FBITS(&cabac->ctx.cu_qt_root_cbf_model, 1);
  }
  double no_cbf_cost = ssd + no_cbf_bits * state->lambda;
--- a/src/strategies/avx2/quant-avx2.c
+++ b/src/strategies/avx2/quant-avx2.c
@ -386,11 +386,13 @@ void uvg_quant_avx2(const encoder_state_t * const state, const coeff_t * __restr

  int32_t qp_scaled = uvg_get_scaled_qp(color, state->qp, (encoder->bitdepth - 8) * 6, encoder->qp_map[0]);
  qp_scaled = transform_skip ? MAX(qp_scaled, 4 + 6 * MIN_QP_PRIME_TS) : qp_scaled;
+  bool needs_block_size_trafo_scale = !transform_skip && ((log2_tr_height + log2_tr_width) % 2 == 1);
+  needs_block_size_trafo_scale |= 1; // Non log2 block size
  
  const int32_t scalinglist_type = (block_type == CU_INTRA ? 0 : 3) + (int8_t)color;
  const int32_t *quant_coeff = encoder->scaling_list.quant_coeff[log2_tr_width][log2_tr_height][scalinglist_type][qp_scaled % 6];
  const int32_t transform_shift = MAX_TR_DYNAMIC_RANGE - encoder->bitdepth - ((log2_tr_width + log2_tr_height) >> 1); //!< Represents scaling through forward transform
-  const int64_t q_bits = QUANT_SHIFT + qp_scaled / 6 + (transform_skip ? 0 : transform_shift);
+  const int64_t q_bits = QUANT_SHIFT + qp_scaled / 6 + (transform_skip ? 0 : transform_shift - needs_block_size_trafo_scale);
  const int32_t add = ((state->frame->slicetype == UVG_SLICE_I) ? 171 : 85) << (q_bits - 9);
  const int32_t q_bits8 = q_bits - 8;

@ -792,13 +794,15 @@ void uvg_dequant_avx2(const encoder_state_t * const state, coeff_t *q_coef, coef
  int32_t n;
  const uint32_t log2_tr_width =  uvg_g_convert_to_log2[width];
  const uint32_t log2_tr_height = uvg_g_convert_to_log2[height];
-  int32_t transform_shift = MAX_TR_DYNAMIC_RANGE - encoder->bitdepth - ((log2_tr_width + log2_tr_height) >> 1); // Represents scaling through forward transform
+  int32_t transform_shift = MAX_TR_DYNAMIC_RANGE - encoder->bitdepth - ((log2_tr_width + log2_tr_height) >> 1);
+  bool needs_block_size_trafo_scale = !transform_skip && ((log2_tr_height + log2_tr_width) % 2 == 1);
+  needs_block_size_trafo_scale |= 1; // Non log2 block size// Represents scaling through forward transform


  int32_t qp_scaled = uvg_get_scaled_qp(color, state->qp, (encoder->bitdepth-8)*6, encoder->qp_map[0]);
  qp_scaled = transform_skip ? MAX(qp_scaled, 4 + 6 * MIN_QP_PRIME_TS) : qp_scaled;

-  shift = 20 - QUANT_SHIFT - (transform_skip ? 0 : transform_shift);
+  shift = 20 - QUANT_SHIFT - (transform_skip ? 0 : transform_shift - needs_block_size_trafo_scale);

  if (encoder->scaling_list.enable)
  {
@ -822,7 +826,7 @@ void uvg_dequant_avx2(const encoder_state_t * const state, coeff_t *q_coef, coef
      }
    }
  } else {
-    int32_t scale = uvg_g_inv_quant_scales[qp_scaled%6] << (qp_scaled/6);
+    int32_t scale = uvg_g_inv_quant_scales[needs_block_size_trafo_scale][qp_scaled%6] << (qp_scaled/6);
    add = 1 << (shift-1);

    __m256i v_scale = _mm256_set1_epi32(scale);
--- a/src/strategies/generic/quant-generic.c
+++ b/src/strategies/generic/quant-generic.c
@ -68,12 +68,13 @@ void uvg_quant_generic(

  int32_t qp_scaled = uvg_get_scaled_qp(color, state->qp, (encoder->bitdepth - 8) * 6, encoder->qp_map[0]);
  qp_scaled = transform_skip ? MAX(qp_scaled, 4 + 6 * MIN_QP_PRIME_TS) : qp_scaled;
-  
+  bool needs_block_size_trafo_scale = !transform_skip && ((log2_tr_height + log2_tr_width) % 2 == 1);
+  needs_block_size_trafo_scale |= 1; // Non log2 block size
    
  const int32_t scalinglist_type = (block_type == CU_INTRA ? 0 : 3) + (int8_t)color;
  const int32_t *quant_coeff = encoder->scaling_list.quant_coeff[log2_tr_width][log2_tr_height][scalinglist_type][qp_scaled % 6];
-  const int32_t transform_shift = MAX_TR_DYNAMIC_RANGE - encoder->bitdepth - ((log2_tr_height + log2_tr_width) >> 1); //!< Represents scaling through forward transform
-  const int64_t q_bits = QUANT_SHIFT + qp_scaled / 6 + (transform_skip ? 0 : transform_shift);
+  const int32_t transform_shift = MAX_TR_DYNAMIC_RANGE - encoder->bitdepth - ((log2_tr_height + log2_tr_width) >> 1) - needs_block_size_trafo_scale; //!< Represents scaling through forward transform
+  const int64_t q_bits = QUANT_SHIFT + qp_scaled / 6 + (transform_skip ? 0 : transform_shift );
  const int32_t add = ((state->frame->slicetype == UVG_SLICE_I) ? 171 : 85) << (q_bits - 9);
  const int32_t q_bits8 = q_bits - 8;

@ -592,11 +593,13 @@ void uvg_dequant_generic(const encoder_state_t * const state, coeff_t *q_coef, c
  const uint32_t log2_tr_height = uvg_g_convert_to_log2[height];
  int32_t transform_shift = MAX_TR_DYNAMIC_RANGE - encoder->bitdepth - ((log2_tr_width + log2_tr_height) >> 1); // Represents scaling through forward transform

+  bool needs_block_size_trafo_scale = !transform_skip && ((log2_tr_height + log2_tr_width) % 2 == 1);
+  needs_block_size_trafo_scale |= 1; // Non log2 block size

  int32_t qp_scaled = uvg_get_scaled_qp(color, state->qp, (encoder->bitdepth-8)*6, encoder->qp_map[0]);
  qp_scaled = transform_skip ? MAX(qp_scaled, 4 + 6 * MIN_QP_PRIME_TS) : qp_scaled;

-  shift = 20 - QUANT_SHIFT - (transform_skip ? 0 : transform_shift);
+  shift = 20 - QUANT_SHIFT - (transform_skip ? 0 : transform_shift - needs_block_size_trafo_scale);

  if (encoder->scaling_list.enable)
  {
@ -620,7 +623,7 @@ void uvg_dequant_generic(const encoder_state_t * const state, coeff_t *q_coef, c
      }
    }
  } else {
-    int32_t scale = uvg_g_inv_quant_scales[qp_scaled%6] << (qp_scaled/6);
+    int32_t scale = uvg_g_inv_quant_scales[needs_block_size_trafo_scale][qp_scaled%6] << (qp_scaled/6);
    add = 1 << (shift-1);

    for (n = 0; n < width * height; n++) {
--- a/src/transform.h
+++ b/src/transform.h
@ -44,8 +44,8 @@
 #include "global.h" // IWYU pragma: keep

 extern const uint8_t uvg_g_chroma_scale[58];
-extern const int16_t uvg_g_inv_quant_scales[6];
-extern const int16_t uvg_g_quant_scales[6];
+extern const int16_t uvg_g_inv_quant_scales[2][6];
+extern const int16_t uvg_g_quant_scales[2][6];

 #define COEFF_ORDER_LINEAR 0
 #define COEFF_ORDER_CU 1
--- a/src/uvg266.h
+++ b/src/uvg266.h
@ -543,13 +543,11 @@ typedef struct uvg_config

  uint8_t dual_tree;

-  uint8_t min_qt_size[3];
+  uint8_t min_qt_size[3];  /* intra, inter, dual tree chroma*/
  uint8_t max_bt_size[3];
  uint8_t max_tt_size[3];

-  uint8_t max_intra_slice_btt_depth;
-  uint8_t max_intra_slice_btt_depth_chroma;
-  uint8_t max_inter_slice_btt_depth;
+  uint8_t max_btt_depth[3];

  uint8_t intra_rough_search_levels;