diff --git a/src/cabac.h b/src/cabac.h
index be249ba2..f38030a9 100644
--- a/src/cabac.h
+++ b/src/cabac.h
@@ -77,6 +77,8 @@ typedef struct
     cabac_ctx_t mts_idx_model[4];
     cabac_ctx_t split_flag_model[9]; //!< \brief split flag context models
     cabac_ctx_t qt_split_flag_model[6]; //!< \brief qt split flag context models
+    cabac_ctx_t mtt_vertical_model[5]; //!< \brief mtt vertical flag context models
+    cabac_ctx_t mtt_binary_model[4]; //!< \brief mtt binary flag context models
     cabac_ctx_t intra_luma_mpm_flag_model;    //!< \brief intra mode context models
     cabac_ctx_t intra_subpart_model[2];    //!< \brief intra sub part context models
     cabac_ctx_t chroma_pred_model;
diff --git a/src/cfg.c b/src/cfg.c
index f2073da5..39643e9f 100644
--- a/src/cfg.c
+++ b/src/cfg.c
@@ -222,6 +222,22 @@ int uvg_config_init(uvg_config *cfg)
   cfg->cabac_debug_file_name = NULL;
 
   cfg->dual_tree = 0;
+
+  cfg->min_qt_size[0] = 4; // Index for the partitioning arrays below: 0 = intra, 1 = inter, 2 = dual tree chroma
+  cfg->min_qt_size[1] = 4;
+  cfg->min_qt_size[2] = 4;
+
+  cfg->max_btt_depth[0] = 1; // A depth of 0 (indices 1 and 2) keeps BT/TT splits disallowed in uvg_write_split_flag
+  cfg->max_btt_depth[1] = 0;
+  cfg->max_btt_depth[2] = 0;
+
+  cfg->max_tt_size[0] = 64;
+  cfg->max_bt_size[0] = 64;
+  cfg->max_tt_size[1] = 64;
+  cfg->max_bt_size[1] = 64;
+  cfg->max_tt_size[2] = 64;
+  cfg->max_bt_size[2] = 64;
+
   cfg->intra_rough_search_levels = 2;
 
   cfg->ibc = 0;
diff --git a/src/context.c b/src/context.c
index 708b9da4..30861849 100644
--- a/src/context.c
+++ b/src/context.c
@@ -50,6 +50,21 @@ static const uint8_t  INIT_QT_SPLIT_FLAG[4][6] = {
   {   0,   8,   8,  12,  12,   8, },
 };
 
+// Init tables for the mtt vertical/binary flag contexts; uvg_init_contexts passes row [slice] and row [3] to uvg_ctx_init.
+static const uint8_t INIT_VERTICAL_SPLIT_FLAG[4][5] = {
+  {  43,  42,  37,  42,  44, },
+  {  43,  35,  37,  34,  52, },
+  {  43,  42,  29,  27,  44, },
+  {   9,   8,   9,   8,   5, },
+};
+
+static const uint8_t INIT_BINARY_SPLIT_FLAG[4][4] = {
+  {  28,  29,  28,  29, },
+  {  43,  37,  21,  22, },
+  {  36,  45,  36,  45, },
+  {  12,  13,  12,  13, },
+  };
+
 static const uint8_t INIT_SKIP_FLAG[4][3] = {
   {  57,  60,  46, },
   {  57,  59,  45, },
@@ -574,6 +589,11 @@ void uvg_init_contexts(encoder_state_t *state, int8_t QP, int8_t slice)
     uvg_ctx_init(&cabac->ctx.part_size_model[i], QP, INIT_PART_SIZE[slice][i], INIT_PART_SIZE[3][i]);
     uvg_ctx_init(&cabac->ctx.bdpcm_mode[i], QP, BDPCM_MODE_INIT[slice][i], BDPCM_MODE_INIT[3][i]);
     uvg_ctx_init(&cabac->ctx.qt_cbf_model_luma[i], QP, INIT_QT_CBF[slice][i], INIT_QT_CBF[3][i]);
+    uvg_ctx_init(&cabac->ctx.mtt_binary_model[i], QP, INIT_BINARY_SPLIT_FLAG[slice][i], INIT_BINARY_SPLIT_FLAG[3][i]);
+  }
+
+  for (i = 0; i < 5; i++) {
+    uvg_ctx_init(&cabac->ctx.mtt_vertical_model[i], QP, INIT_VERTICAL_SPLIT_FLAG[slice][i], INIT_VERTICAL_SPLIT_FLAG[3][i]);
   }
 
   for (i = 0; i < 6; i++) {  
diff --git a/src/cu.h b/src/cu.h
index 7f1bd0e3..cc2f6925 100644
--- a/src/cu.h
+++ b/src/cu.h
@@ -105,6 +105,7 @@ enum split_type {
 typedef struct  {
   uint32_t split_tree;
   uint8_t current_depth;
+  uint8_t mtt_depth; //!< Incremented for every split on the path that is not QT_SPLIT
 } split_tree_t;
 
 
diff --git a/src/encode_coding_tree.c b/src/encode_coding_tree.c
index 4468390c..ac8d206e 100644
--- a/src/encode_coding_tree.c
+++ b/src/encode_coding_tree.c
@@ -1199,14 +1199,13 @@ void uvg_encode_intra_luma_coding_unit(
 }
 
 
-bool uvg_write_split_flag(
-  const encoder_state_t * const state,
+uint8_t uvg_write_split_flag(
+  const encoder_state_t* const state,
   cabac_data_t* cabac,
-  const cu_info_t * left_cu,
-  const cu_info_t * above_cu,
+  const cu_info_t* left_cu,
+  const cu_info_t* above_cu,
   const cu_loc_t* const cu_loc,
-  const uint32_t split_tree,
-  int depth,
+  split_tree_t split_tree,
   enum uvg_tree_type tree_type,
   double* bits_out)
 {
@@ -1217,15 +1216,15 @@ bool uvg_write_split_flag(
   // Implisit split flag when on border
   // Exception made in VVC with flag not being implicit if the BT can be used for
   // horizontal or vertical split, then this flag tells if QT or BT is used
+  const int slice_type = state->frame->is_irap ? (tree_type == UVG_CHROMA_T ? 2 : 0) : 1; // Index into cfg.min_qt_size / max_btt_depth / max_bt_size / max_tt_size
 
   bool no_split, allow_qt, bh_split, bv_split, th_split, tv_split;
   no_split = allow_qt = bh_split = bv_split = th_split = tv_split = true;
-  if (depth > MAX_DEPTH) allow_qt = false;
-  // ToDo: update this when btt is actually used
-  bool allow_btt = true;// when mt_depth < MAX_BT_DEPTH
   
   const int cu_width = tree_type != UVG_CHROMA_T ? cu_loc->width : cu_loc->chroma_width;
   const int cu_height = tree_type != UVG_CHROMA_T ? cu_loc->height : cu_loc->chroma_height;
+  if (cu_width == state->encoder_control->cfg.min_qt_size[slice_type] || split_tree.mtt_depth > 0) allow_qt = false; // No QT split at the minimum QT size or below any non-QT split
+  bool allow_btt = state->encoder_control->cfg.max_btt_depth[slice_type] > split_tree.mtt_depth && cu_width <= 64; // BT/TT only while the configured depth budget remains and the CU is at most 64 wide
 
   uint8_t implicit_split_mode = UVG_NO_SPLIT;
   //bool implicit_split = border;
@@ -1255,10 +1254,16 @@ bool uvg_write_split_flag(
   if (!allow_btt) {
     bh_split = bv_split = th_split = tv_split = false;
   }
+  else {
+    bv_split &= cu_width <= state->encoder_control->cfg.max_bt_size[slice_type];
+    tv_split &= cu_width <= state->encoder_control->cfg.max_tt_size[slice_type];
+    bh_split &= cu_height <= state->encoder_control->cfg.max_bt_size[slice_type];
+    th_split &= cu_height <= state->encoder_control->cfg.max_tt_size[slice_type];
+  }
 
   bool allow_split = allow_qt | bh_split | bv_split | th_split | tv_split;
 
-  int split_flag = (split_tree >> (depth * 3)) & 7;
+  int split_flag = (split_tree.split_tree >> (split_tree.current_depth * 3)) & 7;
 
   split_flag = implicit_split_mode != UVG_NO_SPLIT ? implicit_split_mode : split_flag;
 
@@ -1286,33 +1291,41 @@ bool uvg_write_split_flag(
 
     cabac->cur_ctx = &(cabac->ctx.split_flag_model[split_model]);
 
-    CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.split_flag_model[split_model]), split_flag != 0, bits, "split_flag");
+    CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.split_flag_model[split_model]), split_flag != NO_SPLIT, bits, "split_cu_flag");
   }
 
-  bool qt_split = split_flag == QT_SPLIT;
 
-  if (!(implicit_split_mode == UVG_NO_SPLIT) && (allow_qt && allow_btt)) {
-    split_model = (left_cu && GET_SPLITDATA(left_cu, depth)) + (above_cu && GET_SPLITDATA(above_cu, depth)) + (depth < 2 ? 0 : 3);
-    CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.qt_split_flag_model[split_model]), qt_split, bits, "QT_split_flag");
-  }
-
-  // Only signal split when it is not implicit, currently only Qt split supported
-  if (!(implicit_split_mode == UVG_NO_SPLIT) && !qt_split && (bh_split | bv_split | th_split | tv_split)) {
-
-    split_model = 0;
-
-    // TODO: These are incorrect
-    if (left_cu && (1 << left_cu->log2_height) > cu_height) {
-      split_model++;
+  if (implicit_split_mode == UVG_NO_SPLIT && allow_qt && (bh_split || bv_split || th_split || tv_split) && split_flag != NO_SPLIT) {
+    bool qt_split = split_flag == QT_SPLIT;
+    if((bv_split || bh_split || tv_split || th_split) && allow_qt) {
+      split_model = (left_cu && GET_SPLITDATA(left_cu, split_tree.current_depth)) + (above_cu && GET_SPLITDATA(above_cu, split_tree.current_depth)) + (split_tree.current_depth < 2 ? 0 : 3);
+      CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.qt_split_flag_model[split_model]), qt_split, bits, "qt_split_flag");
     }
-
-    if (above_cu && (1 << above_cu->log2_width) > cu_width) {
-      split_model++;
+    if (!qt_split) {
+      const bool is_vertical = split_flag == BT_VER_SPLIT || split_flag == TT_VER_SPLIT;
+      if((bh_split || th_split) && (bv_split || tv_split)) {
+        split_model = 0;
+        if(bv_split + tv_split > bh_split + th_split) {
+          split_model = 4;
+        } else if(bv_split + tv_split < bh_split + th_split) {
+          split_model = 3;
+        } else {
+          const int d_a = cu_width / (above_cu ? (1 << above_cu->log2_width) : 1);
+          const int d_l = cu_height / (left_cu ? (1 << left_cu->log2_height) : 1);
+          if(d_a != d_l && above_cu && left_cu) {
+            split_model = d_a < d_l ? 1 : 2;
+          }
+        }
+        CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.mtt_vertical_model[split_model]), is_vertical, bits, "mtt_vertical_flag");
+      }
+      if ((bv_split && tv_split && is_vertical) || (bh_split && th_split && !is_vertical)) { // Binary vs ternary is only coded when both are allowed in the chosen direction
+        split_model = 2 * is_vertical + (split_tree.mtt_depth <= 1); // Parentheses required: '+' binds tighter than '<=', otherwise only contexts 0 and 1 are ever selected
+        CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.mtt_binary_model[split_model]), 
+          split_flag == BT_VER_SPLIT || split_flag == BT_HOR_SPLIT, bits, "mtt_binary_flag");
+      }
     }
-
-    split_model += (depth > 2 ? 0 : 3);
-    CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.qt_split_flag_model[split_model]), qt_split, bits, "split_cu_mode");
   }
+
   if (bits_out) *bits_out += bits;
   return split_flag;
 }
@@ -1322,7 +1335,7 @@ void uvg_encode_coding_tree(
   lcu_coeff_t *coeff,
   enum uvg_tree_type tree_type,
   const cu_loc_t* const cu_loc,
-  const split_tree_t split_tree)
+  split_tree_t split_tree)
 {
   cabac_data_t * const cabac = &state->cabac;
   const encoder_control_t * const ctrl = state->encoder_control;
@@ -1332,8 +1345,7 @@ void uvg_encode_coding_tree(
   
   const int cu_width  = tree_type != UVG_CHROMA_T ? cu_loc->width : cu_loc->chroma_width;
   const int cu_height = tree_type != UVG_CHROMA_T ? cu_loc->height : cu_loc->chroma_height;
-  const int half_cu  = cu_width >> 1;
-
+ 
   const int x = cu_loc->x;
   const int y = cu_loc->y;
 
@@ -1357,9 +1369,9 @@ void uvg_encode_coding_tree(
   int32_t frame_height = tree_type != UVG_CHROMA_T ? ctrl->in.height : ctrl->in.height / 2;
   // Check for slice border
   bool border_x = frame_width  < abs_x + cu_width;
-  bool border_y = frame_height < abs_y + cu_width;
-  bool border_split_x = frame_width  >= abs_x + (LCU_WIDTH >> MAX_DEPTH) + half_cu;
-  bool border_split_y = frame_height >= abs_y + (LCU_WIDTH >> MAX_DEPTH) + half_cu;
+  bool border_y = frame_height < abs_y + cu_height;
+  bool border_split_x = frame_width  >= abs_x + (LCU_WIDTH >> MAX_DEPTH) + cu_width / 2;
+  bool border_split_y = frame_height >= abs_y + (LCU_WIDTH >> MAX_DEPTH) + cu_height / 2;
   bool border = border_x || border_y; /*!< are we in any border CU */
 
   if (depth <= state->frame->max_qp_delta_depth) {
@@ -1368,21 +1380,20 @@ void uvg_encode_coding_tree(
 
   // When not in MAX_DEPTH, insert split flag and split the blocks if needed
   if (cu_width + cu_height > 8) {
-
+    split_tree.split_tree = cur_cu->split_tree;
     const int split_flag = uvg_write_split_flag(
       state,
       cabac,
       left_cu,
       above_cu, 
       cu_loc,
-      cur_cu->split_tree,
-      depth,
+      split_tree,
       tree_type,
       NULL);
     
     if (split_flag || border) {
       const int half_luma = cu_loc->width / 2;
-      split_tree_t new_split_tree = { cur_cu->split_tree, split_tree.current_depth + 1 };
+      const split_tree_t new_split_tree = { cur_cu->split_tree, split_tree.current_depth + 1, split_tree.mtt_depth + (split_flag != QT_SPLIT)};
 
       cu_loc_t new_cu_loc[4];
       const int splits = uvg_get_split_locs(cu_loc, split_flag, new_cu_loc);
@@ -1650,7 +1661,8 @@ double uvg_mock_encode_coding_unit(
   const cu_loc_t* const cu_loc,
   lcu_t* lcu,
   cu_info_t* cur_cu,
-  enum uvg_tree_type tree_type) {
+  enum uvg_tree_type tree_type,
+  const split_tree_t split_tree) {
   double bits = 0;
   const encoder_control_t* const ctrl = state->encoder_control;
 
@@ -1692,8 +1704,7 @@ double uvg_mock_encode_coding_unit(
       left_cu,
       above_cu,
       cu_loc,
-      cur_cu->split_tree,
-      depth,
+      split_tree,
       tree_type,
       &bits);
   }
diff --git a/src/encode_coding_tree.h b/src/encode_coding_tree.h
index 86605e4d..357e059a 100644
--- a/src/encode_coding_tree.h
+++ b/src/encode_coding_tree.h
@@ -54,7 +54,7 @@ void uvg_encode_coding_tree(
   lcu_coeff_t *coeff,
   enum uvg_tree_type tree_type,
   const cu_loc_t* const cu_loc,
-  const split_tree_t split_tree);
+  split_tree_t split_tree);
 
 void uvg_encode_ts_residual(encoder_state_t* const state,
   cabac_data_t* const cabac,
@@ -77,7 +77,8 @@ double uvg_mock_encode_coding_unit(
   const cu_loc_t* const cu_loc,
   lcu_t* lcu,
   cu_info_t* cur_cu,
-  enum uvg_tree_type tree_type);
+  enum uvg_tree_type tree_type,
+  const split_tree_t split_tree);
 
 int uvg_encode_inter_prediction_unit(
   encoder_state_t* const state,
@@ -96,14 +97,13 @@ void uvg_encode_intra_luma_coding_unit(
   double* bits_out);
 
 
-bool uvg_write_split_flag(
+uint8_t uvg_write_split_flag(
   const encoder_state_t* const state,
   cabac_data_t* cabac,
   const cu_info_t* left_cu,
   const cu_info_t* above_cu,
   const cu_loc_t* const cu_loc,
-  const uint32_t split_tree,
-  int depth,
+  split_tree_t split_tree,
   enum uvg_tree_type tree_type,
   double* bits_out);
 
diff --git a/src/encoder_state-bitstream.c b/src/encoder_state-bitstream.c
index 1649d944..8e9f7c52 100644
--- a/src/encoder_state-bitstream.c
+++ b/src/encoder_state-bitstream.c
@@ -529,10 +529,10 @@ static void encoder_state_write_bitstream_seq_parameter_set(bitstream_t* stream,
   // if(!no_partition_constraints_override_constraint_flag)
     WRITE_U(stream, 0, 1, "partition_constraints_override_enabled_flag");
   WRITE_UE(stream, uvg_g_convert_to_log2[encoder->cfg.min_qt_size[0]] - MIN_SIZE, "sps_log2_diff_min_qt_min_cb_intra_slice_luma");
-  WRITE_UE(stream, encoder->cfg.max_intra_slice_btt_depth, "sps_max_mtt_hierarchy_depth_intra_slice_luma");
-  if (encoder->cfg.max_intra_slice_btt_depth) {
-    WRITE_UE(stream, uvg_g_convert_to_log2[encoder->cfg.min_qt_size[0]] - uvg_g_convert_to_log2[encoder->cfg.max_bt_size[0]], "sps_log2_diff_max_bt_min_qt_intra_slice_luma");
-    WRITE_UE(stream, uvg_g_convert_to_log2[encoder->cfg.min_qt_size[0]] - uvg_g_convert_to_log2[encoder->cfg.max_tt_size[0]], "sps_log2_diff_max_tt_min_qt_intra_slice_luma");
+  WRITE_UE(stream, encoder->cfg.max_btt_depth[0], "sps_max_mtt_hierarchy_depth_intra_slice_luma");
+  if (encoder->cfg.max_btt_depth[0]) {
+    WRITE_UE(stream, uvg_g_convert_to_log2[encoder->cfg.max_bt_size[0]] - uvg_g_convert_to_log2[encoder->cfg.min_qt_size[0]], "sps_log2_diff_max_bt_min_qt_intra_slice_luma");
+    WRITE_UE(stream, uvg_g_convert_to_log2[encoder->cfg.max_tt_size[0]] - uvg_g_convert_to_log2[encoder->cfg.min_qt_size[0]], "sps_log2_diff_max_tt_min_qt_intra_slice_luma");
   }
   
   if (encoder->chroma_format != UVG_CSP_400)
@@ -541,17 +541,17 @@ static void encoder_state_write_bitstream_seq_parameter_set(bitstream_t* stream,
   }
   if (encoder->cfg.dual_tree) {
     WRITE_UE(stream, uvg_g_convert_to_log2[encoder->cfg.min_qt_size[2]] - MIN_SIZE, "sps_log2_diff_min_qt_min_cb_intra_slice_chroma");
-    WRITE_UE(stream, encoder->cfg.max_intra_slice_btt_depth_chroma, "sps_max_mtt_hierarchy_depth_intra_slice_chroma");
-    if (encoder->cfg.max_intra_slice_btt_depth_chroma) {
-      WRITE_UE(stream, uvg_g_convert_to_log2[encoder->cfg.min_qt_size[2]] - uvg_g_convert_to_log2[encoder->cfg.max_bt_size[2]], "sps_log2_diff_max_bt_min_qt_intra_slice_chroma");
-      WRITE_UE(stream, uvg_g_convert_to_log2[encoder->cfg.min_qt_size[2]] - uvg_g_convert_to_log2[encoder->cfg.max_tt_size[2]], "sps_log2_diff_max_tt_min_qt_intra_slice_chroma");
+    WRITE_UE(stream, encoder->cfg.max_btt_depth[2], "sps_max_mtt_hierarchy_depth_intra_slice_chroma");
+    if (encoder->cfg.max_btt_depth[2]) {
+      WRITE_UE(stream, uvg_g_convert_to_log2[encoder->cfg.max_bt_size[2]] - uvg_g_convert_to_log2[encoder->cfg.min_qt_size[2]], "sps_log2_diff_max_bt_min_qt_intra_slice_chroma");
+      WRITE_UE(stream, uvg_g_convert_to_log2[encoder->cfg.max_tt_size[2]] - uvg_g_convert_to_log2[encoder->cfg.min_qt_size[2]], "sps_log2_diff_max_tt_min_qt_intra_slice_chroma");
     }
   }
   WRITE_UE(stream, uvg_g_convert_to_log2[encoder->cfg.min_qt_size[1]] - MIN_SIZE, "sps_log2_diff_min_qt_min_cb_inter_slice");
-  WRITE_UE(stream, encoder->cfg.max_inter_slice_btt_depth, "sps_max_mtt_hierarchy_depth_inter_slice");
-  if (encoder->cfg.max_inter_slice_btt_depth != 0) {
-    WRITE_UE(stream, uvg_g_convert_to_log2[encoder->cfg.min_qt_size[1]] - uvg_g_convert_to_log2[encoder->cfg.max_bt_size[1]], "sps_log2_diff_max_bt_min_qt_inter_tile_group");
-    WRITE_UE(stream, uvg_g_convert_to_log2[encoder->cfg.min_qt_size[1]] - uvg_g_convert_to_log2[encoder->cfg.max_tt_size[1]], "sps_log2_diff_max_tt_min_qt_inter_tile_group");
+  WRITE_UE(stream, encoder->cfg.max_btt_depth[1], "sps_max_mtt_hierarchy_depth_inter_slice");
+  if (encoder->cfg.max_btt_depth[1] != 0) {
+    WRITE_UE(stream, uvg_g_convert_to_log2[encoder->cfg.max_bt_size[1]] - uvg_g_convert_to_log2[encoder->cfg.min_qt_size[1]], "sps_log2_diff_max_bt_min_qt_inter_tile_group");
+    WRITE_UE(stream, uvg_g_convert_to_log2[encoder->cfg.max_tt_size[1]] - uvg_g_convert_to_log2[encoder->cfg.min_qt_size[1]], "sps_log2_diff_max_tt_min_qt_inter_tile_group");
   }
 
   if (LCU_WIDTH > 32)
diff --git a/src/encoderstate.c b/src/encoderstate.c
index eb529b2b..e8af6add 100644
--- a/src/encoderstate.c
+++ b/src/encoderstate.c
@@ -883,7 +883,7 @@ static void encoder_state_worker_encode_lcu_bitstream(void * opaque)
   //Encode coding tree
   cu_loc_t start;
   uvg_cu_loc_ctor(&start, lcu->position.x * LCU_WIDTH, lcu->position.y * LCU_WIDTH, LCU_WIDTH, LCU_WIDTH);
-  split_tree_t split_tree = { 0, 0 };
+  split_tree_t split_tree = { 0, 0, 0 };
 
   uvg_encode_coding_tree(state, lcu->coeff, tree_type, &start, split_tree);
 
diff --git a/src/rdo.c b/src/rdo.c
index 26f31634..5fef3b3c 100644
--- a/src/rdo.c
+++ b/src/rdo.c
@@ -705,19 +705,20 @@ static void calc_last_bits(encoder_state_t * const state, int32_t width, int32_t
  * tables generated during RDOQ to select the best coefficient to change.
  */
 void uvg_rdoq_sign_hiding(
-    const encoder_state_t *const state,
-    const int32_t qp_scaled,
-    const uint32_t *const scan2raster,
-    const struct sh_rates_t *const sh_rates,
-    const int32_t last_pos,
-    const coeff_t *const coeffs,
-    coeff_t *const quant_coeffs, 
-    const int8_t color)
+  const encoder_state_t *const state,
+  const int32_t qp_scaled,
+  const uint32_t *const scan2raster,
+  const struct sh_rates_t *const sh_rates,
+  const int32_t last_pos,
+  const coeff_t *const coeffs,
+  coeff_t *const quant_coeffs,
+  const int8_t color,
+  const bool need_sqrt_adjust)
 {
   const encoder_control_t * const ctrl = state->encoder_control;
   const double lambda = color ? state->c_lambda : state->lambda;
 
-  int inv_quant = uvg_g_inv_quant_scales[qp_scaled % 6];
+  int inv_quant = uvg_g_inv_quant_scales[need_sqrt_adjust][qp_scaled % 6];
   // This somehow scales quant_delta into fractional bits. Instead of the bits
   // being multiplied by lambda, the residual is divided by it, or something
   // like that.
@@ -1203,7 +1204,7 @@ int uvg_ts_rdoq(encoder_state_t* const state, coeff_t* src_coeff, coeff_t* dest_
 
   const bool   needs_sqrt2_scale = false; // from VTM: should always be false - transform-skipped blocks don't require sqrt(2) compensation.
   const int    q_bits = QUANT_SHIFT + qp_scaled / 6  + (needs_sqrt2_scale ? -1 : 0);  // Right shift of non-RDOQ quantizer;  level = (coeff*uiQ + offset)>>q_bits
-  const int32_t quant_coeff = uvg_g_quant_scales[qp_scaled % 6];
+  const int32_t quant_coeff = uvg_g_quant_scales[needs_sqrt2_scale][qp_scaled % 6];
  
   const double error_scale = (double)(1 << CTX_FRAC_BITS) / quant_coeff / quant_coeff;
 
@@ -1416,8 +1417,10 @@ void uvg_rdoq(
   cabac_data_t * const cabac = &state->cabac;
   const uint32_t log2_block_width = uvg_g_convert_to_log2[width];
   const uint32_t log2_block_height = uvg_g_convert_to_log2[height];
+  bool needs_block_size_trafo_scale = !false && ((log2_block_width + log2_block_height) % 2 == 1); // True when the summed log2 dimensions are odd
+  needs_block_size_trafo_scale |= 1; // Non log2 block size. NOTE(review): OR with the constant 1 makes this flag always true, so the parity test above has no effect - confirm intended
 
-  int32_t  transform_shift   = MAX_TR_DYNAMIC_RANGE - encoder->bitdepth - ((log2_block_width + log2_block_height) >> 1);  // Represents scaling through forward transform
+  int32_t  transform_shift   = MAX_TR_DYNAMIC_RANGE - encoder->bitdepth - ((log2_block_width + log2_block_height) >> 1) + needs_block_size_trafo_scale;  // Represents scaling through forward transform
   uint16_t go_rice_param     = 0;
   uint32_t reg_bins = (width * height * 28) >> 4;
   
@@ -1789,7 +1792,7 @@ void uvg_rdoq(
   }
 
   if (encoder->cfg.signhide_enable && abs_sum >= 2) {
-    uvg_rdoq_sign_hiding(state, qp_scaled, scan, &sh_rates, best_last_idx_p1, coef, dest_coeff, color);
+    uvg_rdoq_sign_hiding(state, qp_scaled, scan, &sh_rates, best_last_idx_p1, coef, dest_coeff, color, needs_block_size_trafo_scale);
   }
 }
 
diff --git a/src/scalinglist.c b/src/scalinglist.c
index 5c32ac4c..01edfa27 100644
--- a/src/scalinglist.c
+++ b/src/scalinglist.c
@@ -88,8 +88,14 @@ static const int32_t g_quant_inter_default_8x8[64] =
   24, 25, 28, 33, 41, 54, 71, 91
 };
 
-const int16_t uvg_g_quant_scales[6] = {26214, 23302, 20560, 18396, 16384, 14564};
-const int16_t uvg_g_inv_quant_scales[6] = {40, 45, 51, 57, 64, 72};
+const int16_t uvg_g_quant_scales[2][6] = { // First index: 0 = unadjusted, 1 = sqrt adjusted; second index: qp_scaled % 6 at the rdo.c call site
+  {26214, 23302, 20560, 18396, 16384, 14564},
+    { 18396,16384,14564,13107,11651,10280 }
+};
+const int16_t uvg_g_inv_quant_scales[2][6] = { // Same indexing as uvg_g_quant_scales; used by the dequantizers and sign hiding
+  {40, 45, 51, 57, 64, 72},
+  { 57,64,72,80,90,102 }
+};
 
 
 /**
@@ -406,11 +412,11 @@ void uvg_scalinglist_set(scaling_list_t* const scaling_list, const int32_t* cons
   int32_t* quantcoeff = (int32_t*)scaling_list->quant_coeff[size_id_x][size_id_y][listId][qp];
   int32_t* dequantcoeff = (int32_t*)scaling_list->de_quant_coeff[size_id_x][size_id_y][listId][qp];
 
-  // Encoder list
-  uvg_scalinglist_process_enc(coeff, quantcoeff, uvg_g_quant_scales[qp] << 4, height, width, ratio,
+  // Encoder list TODO: the sqrt adjusted lists
+  uvg_scalinglist_process_enc(coeff, quantcoeff, uvg_g_quant_scales[0][qp] << 4, height, width, ratio,
                               MIN(8, g_scaling_list_size_x[size_id_x]), dc, !scaling_list->enable);
   // Decoder list
-  scalinglist_process_dec(coeff, dequantcoeff, uvg_g_inv_quant_scales[qp], height, width, ratio,
+  scalinglist_process_dec(coeff, dequantcoeff, uvg_g_inv_quant_scales[0][qp], height, width, ratio,
                           MIN(8, g_scaling_list_size_x[size_id_x]), dc, !scaling_list->enable);
 
 
diff --git a/src/search.c b/src/search.c
index 56f8f566..f61ce721 100644
--- a/src/search.c
+++ b/src/search.c
@@ -1294,7 +1294,8 @@ static double search_cu(
       tree_type != UVG_CHROMA_T ? cu_loc : &chroma_loc, 
       lcu,
       cur_cu,
-      tree_type);
+      tree_type,
+      split_tree);
 
     
     cost = bits * state->lambda;
@@ -1335,7 +1336,11 @@ static double search_cu(
   // Recursively split all the way to max search depth.
   if (can_split_cu) {
     const int split_type = depth == 0 ? QT_SPLIT : BT_HOR_SPLIT;
-    const split_tree_t new_split = { split_tree.split_tree | split_type << (split_tree.current_depth * 3), split_tree.current_depth + 1 };
+    const split_tree_t new_split = {
+      split_tree.split_tree | split_type << (split_tree.current_depth * 3),
+      split_tree.current_depth + 1,
+      split_tree.mtt_depth + (split_type != QT_SPLIT),
+    };
     
     double split_cost = 0.0;
     int cbf = cbf_is_set_any(cur_cu->cbf);
@@ -1374,8 +1379,7 @@ static double search_cu(
         left_cu,
         above_cu, 
         tree_type != UVG_CHROMA_T ? cu_loc : &chroma_loc,
-        new_split.split_tree,
-        depth,
+        split_tree,
         tree_type,
         &split_bits);
     }
@@ -1394,7 +1398,7 @@ static double search_cu(
       const int splits = uvg_get_split_locs(cu_loc, split_type, new_cu_loc);
       for (int split = 0; split < splits; ++split) {
         split_cost += search_cu(state, &new_cu_loc[split], &split_lcu, tree_type, new_split);
-        if (split_cost < cost) {
+        if (split_cost > cost) {
           break;
         }
       }
@@ -1426,7 +1430,7 @@ static double search_cu(
         double bits = 0;
         uvg_write_split_flag(state, &state->search_cabac,
                              x > 0 ? LCU_GET_CU_AT_PX(lcu, SUB_SCU(x) - 1, SUB_SCU(y)) : NULL,
-                             y > 0 ? LCU_GET_CU_AT_PX(lcu, SUB_SCU(x), SUB_SCU(y) - 1) : NULL, cu_loc, cur_cu->split_tree, depth, tree_type, &bits);
+                             y > 0 ? LCU_GET_CU_AT_PX(lcu, SUB_SCU(x), SUB_SCU(y) - 1) : NULL, cu_loc, split_tree, tree_type, &bits);
 
         cur_cu->intra = cu_d1->intra;
         cur_cu->type = CU_INTRA;
@@ -1715,7 +1719,7 @@ void uvg_search_lcu(encoder_state_t * const state, const int x, const int y, con
 
   cu_loc_t start;
   uvg_cu_loc_ctor(&start, x, y, LCU_WIDTH, LCU_WIDTH);
-  split_tree_t split_tree = { 0, 0 };
+  split_tree_t split_tree = { 0, 0, 0 };
   // Start search from depth 0.
   double cost = search_cu(
     state, 
diff --git a/src/search_inter.c b/src/search_inter.c
index 37adaf27..4703152a 100644
--- a/src/search_inter.c
+++ b/src/search_inter.c
@@ -2124,9 +2124,7 @@ void uvg_cu_cost_inter_rd2(
   double   *inter_cost,
   double* inter_bitcost,
   const cu_loc_t* const cu_loc){
-
-  const uint8_t depth = 6 - uvg_g_convert_to_log2[cu_loc->width];
-
+  
   const int x_px = SUB_SCU(cu_loc->x);
   const int y_px = SUB_SCU(cu_loc->y);
   const int width = cu_loc->width;
@@ -2160,12 +2158,24 @@ void uvg_cu_cost_inter_rd2(
   double no_cbf_bits;
   double bits = 0;
   const int skip_context = uvg_get_skip_context(cu_loc->x, cu_loc->y, lcu, NULL, NULL);
+
+  int8_t depth = 0; // Rebuild the split_tree_t expected by uvg_mock_encode_coding_unit from the CU's packed split history
+  int8_t mtt_depth = 0;
+  uint32_t splits = cur_cu->split_tree;
+  while (splits & 7) { // Each split occupies 3 bits; stop at the first empty (zero) entry
+    if ((splits & 7) != QT_SPLIT) {
+      mtt_depth++; // Counts only the non-QT splits
+    }
+    depth++;
+    splits >>= 3;
+  }
+  const split_tree_t splitt_tree = { cur_cu->split_tree, depth, mtt_depth };
   if (cur_cu->merged) {
     no_cbf_bits = CTX_ENTROPY_FBITS(&state->cabac.ctx.cu_skip_flag_model[skip_context], 1) + *inter_bitcost;
-    bits += uvg_mock_encode_coding_unit(state, cabac, cu_loc, lcu, cur_cu, UVG_BOTH_T);
+    bits += uvg_mock_encode_coding_unit(state, cabac, cu_loc, lcu, cur_cu, UVG_BOTH_T, splitt_tree);
   }
   else {
-    no_cbf_bits = uvg_mock_encode_coding_unit(state, cabac, cu_loc, lcu, cur_cu, UVG_BOTH_T);
+    no_cbf_bits = uvg_mock_encode_coding_unit(state, cabac, cu_loc, lcu, cur_cu, UVG_BOTH_T, splitt_tree);
     bits += no_cbf_bits - CTX_ENTROPY_FBITS(&cabac->ctx.cu_qt_root_cbf_model, 0) + CTX_ENTROPY_FBITS(&cabac->ctx.cu_qt_root_cbf_model, 1);
   }
   double no_cbf_cost = ssd + no_cbf_bits * state->lambda;
diff --git a/src/strategies/avx2/quant-avx2.c b/src/strategies/avx2/quant-avx2.c
index 2d45166c..8c967bdb 100644
--- a/src/strategies/avx2/quant-avx2.c
+++ b/src/strategies/avx2/quant-avx2.c
@@ -386,11 +386,13 @@ void uvg_quant_avx2(const encoder_state_t * const state, const coeff_t * __restr
 
   int32_t qp_scaled = uvg_get_scaled_qp(color, state->qp, (encoder->bitdepth - 8) * 6, encoder->qp_map[0]);
   qp_scaled = transform_skip ? MAX(qp_scaled, 4 + 6 * MIN_QP_PRIME_TS) : qp_scaled;
+  bool needs_block_size_trafo_scale = !transform_skip && ((log2_tr_height + log2_tr_width) % 2 == 1); // True for non transform-skip blocks whose summed log2 dimensions are odd
+  needs_block_size_trafo_scale |= 1; // Non log2 block size. NOTE(review): OR with the constant 1 makes this flag always true, so the test above has no effect - confirm intended
   
   const int32_t scalinglist_type = (block_type == CU_INTRA ? 0 : 3) + (int8_t)color;
   const int32_t *quant_coeff = encoder->scaling_list.quant_coeff[log2_tr_width][log2_tr_height][scalinglist_type][qp_scaled % 6];
   const int32_t transform_shift = MAX_TR_DYNAMIC_RANGE - encoder->bitdepth - ((log2_tr_width + log2_tr_height) >> 1); //!< Represents scaling through forward transform
-  const int64_t q_bits = QUANT_SHIFT + qp_scaled / 6 + (transform_skip ? 0 : transform_shift);
+  const int64_t q_bits = QUANT_SHIFT + qp_scaled / 6 + (transform_skip ? 0 : transform_shift - needs_block_size_trafo_scale);
   const int32_t add = ((state->frame->slicetype == UVG_SLICE_I) ? 171 : 85) << (q_bits - 9);
   const int32_t q_bits8 = q_bits - 8;
 
@@ -792,13 +794,15 @@ void uvg_dequant_avx2(const encoder_state_t * const state, coeff_t *q_coef, coef
   int32_t n;
   const uint32_t log2_tr_width =  uvg_g_convert_to_log2[width];
   const uint32_t log2_tr_height = uvg_g_convert_to_log2[height];
-  int32_t transform_shift = MAX_TR_DYNAMIC_RANGE - encoder->bitdepth - ((log2_tr_width + log2_tr_height) >> 1); // Represents scaling through forward transform
+  int32_t transform_shift = MAX_TR_DYNAMIC_RANGE - encoder->bitdepth - ((log2_tr_width + log2_tr_height) >> 1); // Represents scaling through forward transform
+  bool needs_block_size_trafo_scale = !transform_skip && ((log2_tr_height + log2_tr_width) % 2 == 1);
+  needs_block_size_trafo_scale |= 1; // Non log2 block size. NOTE(review): OR with the constant 1 makes this flag always true, so the test above has no effect - confirm intended
 
 
   int32_t qp_scaled = uvg_get_scaled_qp(color, state->qp, (encoder->bitdepth-8)*6, encoder->qp_map[0]);
   qp_scaled = transform_skip ? MAX(qp_scaled, 4 + 6 * MIN_QP_PRIME_TS) : qp_scaled;
 
-  shift = 20 - QUANT_SHIFT - (transform_skip ? 0 : transform_shift);
+  shift = 20 - QUANT_SHIFT - (transform_skip ? 0 : transform_shift - needs_block_size_trafo_scale);
 
   if (encoder->scaling_list.enable)
   {
@@ -822,7 +826,7 @@ void uvg_dequant_avx2(const encoder_state_t * const state, coeff_t *q_coef, coef
       }
     }
   } else {
-    int32_t scale = uvg_g_inv_quant_scales[qp_scaled%6] << (qp_scaled/6);
+    int32_t scale = uvg_g_inv_quant_scales[needs_block_size_trafo_scale][qp_scaled%6] << (qp_scaled/6);
     add = 1 << (shift-1);
 
     __m256i v_scale = _mm256_set1_epi32(scale);
diff --git a/src/strategies/generic/quant-generic.c b/src/strategies/generic/quant-generic.c
index eed95e59..13e08f3a 100644
--- a/src/strategies/generic/quant-generic.c
+++ b/src/strategies/generic/quant-generic.c
@@ -68,12 +68,13 @@ void uvg_quant_generic(
 
   int32_t qp_scaled = uvg_get_scaled_qp(color, state->qp, (encoder->bitdepth - 8) * 6, encoder->qp_map[0]);
   qp_scaled = transform_skip ? MAX(qp_scaled, 4 + 6 * MIN_QP_PRIME_TS) : qp_scaled;
-  
-  
+  bool needs_block_size_trafo_scale = !transform_skip && ((log2_tr_height + log2_tr_width) % 2 == 1); // True for non transform-skip blocks whose summed log2 dimensions are odd
+  needs_block_size_trafo_scale |= 1; // Non log2 block size. NOTE(review): OR with the constant 1 makes this flag always true, so the test above has no effect - confirm intended
+    
   const int32_t scalinglist_type = (block_type == CU_INTRA ? 0 : 3) + (int8_t)color;
   const int32_t *quant_coeff = encoder->scaling_list.quant_coeff[log2_tr_width][log2_tr_height][scalinglist_type][qp_scaled % 6];
-  const int32_t transform_shift = MAX_TR_DYNAMIC_RANGE - encoder->bitdepth - ((log2_tr_height + log2_tr_width) >> 1); //!< Represents scaling through forward transform
-  const int64_t q_bits = QUANT_SHIFT + qp_scaled / 6 + (transform_skip ? 0 : transform_shift);
+  const int32_t transform_shift = MAX_TR_DYNAMIC_RANGE - encoder->bitdepth - ((log2_tr_height + log2_tr_width) >> 1) - needs_block_size_trafo_scale; //!< Represents scaling through forward transform
+  const int64_t q_bits = QUANT_SHIFT + qp_scaled / 6 + (transform_skip ? 0 : transform_shift );
   const int32_t add = ((state->frame->slicetype == UVG_SLICE_I) ? 171 : 85) << (q_bits - 9);
   const int32_t q_bits8 = q_bits - 8;
 
@@ -592,11 +593,13 @@ void uvg_dequant_generic(const encoder_state_t * const state, coeff_t *q_coef, c
   const uint32_t log2_tr_height = uvg_g_convert_to_log2[height];
   int32_t transform_shift = MAX_TR_DYNAMIC_RANGE - encoder->bitdepth - ((log2_tr_width + log2_tr_height) >> 1); // Represents scaling through forward transform
 
+  bool needs_block_size_trafo_scale = !transform_skip && ((log2_tr_height + log2_tr_width) % 2 == 1); // True for non transform-skip blocks whose summed log2 dimensions are odd
+  needs_block_size_trafo_scale |= 1; // Non log2 block size. NOTE(review): OR with the constant 1 makes this flag always true, so the test above has no effect - confirm intended
 
   int32_t qp_scaled = uvg_get_scaled_qp(color, state->qp, (encoder->bitdepth-8)*6, encoder->qp_map[0]);
   qp_scaled = transform_skip ? MAX(qp_scaled, 4 + 6 * MIN_QP_PRIME_TS) : qp_scaled;
 
-  shift = 20 - QUANT_SHIFT - (transform_skip ? 0 : transform_shift);
+  shift = 20 - QUANT_SHIFT - (transform_skip ? 0 : transform_shift - needs_block_size_trafo_scale);
 
   if (encoder->scaling_list.enable)
   {
@@ -620,7 +623,7 @@ void uvg_dequant_generic(const encoder_state_t * const state, coeff_t *q_coef, c
       }
     }
   } else {
-    int32_t scale = uvg_g_inv_quant_scales[qp_scaled%6] << (qp_scaled/6);
+    int32_t scale = uvg_g_inv_quant_scales[needs_block_size_trafo_scale][qp_scaled%6] << (qp_scaled/6);
     add = 1 << (shift-1);
 
     for (n = 0; n < width * height; n++) {
diff --git a/src/transform.h b/src/transform.h
index ebe31109..e96a2893 100644
--- a/src/transform.h
+++ b/src/transform.h
@@ -44,8 +44,8 @@
 #include "global.h" // IWYU pragma: keep
 
 extern const uint8_t uvg_g_chroma_scale[58];
-extern const int16_t uvg_g_inv_quant_scales[6];
-extern const int16_t uvg_g_quant_scales[6];
+extern const int16_t uvg_g_inv_quant_scales[2][6];
+extern const int16_t uvg_g_quant_scales[2][6];
 
 #define COEFF_ORDER_LINEAR 0
 #define COEFF_ORDER_CU 1
diff --git a/src/uvg266.h b/src/uvg266.h
index d2726655..7d772780 100644
--- a/src/uvg266.h
+++ b/src/uvg266.h
@@ -543,13 +543,11 @@ typedef struct uvg_config
 
   uint8_t dual_tree;
 
-  uint8_t min_qt_size[3];
+  uint8_t min_qt_size[3];  /* intra, inter, dual tree chroma*/
   uint8_t max_bt_size[3];
   uint8_t max_tt_size[3];
 
-  uint8_t max_intra_slice_btt_depth;
-  uint8_t max_intra_slice_btt_depth_chroma;
-  uint8_t max_inter_slice_btt_depth;
+  uint8_t max_btt_depth[3]; /* intra, inter, dual tree chroma */
 
   uint8_t intra_rough_search_levels;