[mtt] Actually remove the last dependency of width on depth

2024-11-23 18:14:06 +00:00 · 2022-09-08 15:10:54 +03:00 · 2022-09-08 15:10:54 +03:00 · 6a0864839c
parent dcf879e5ed
commit 6a0864839c
22 changed files with 360 additions and 347 deletions
--- a/src/encode_coding_tree.c
+++ b/src/encode_coding_tree.c
@ -47,12 +47,13 @@
 #include "tables.h"
 #include "videoframe.h"

-bool uvg_is_mts_allowed(const encoder_state_t * const state, cu_info_t *const pred_cu)
+bool uvg_is_mts_allowed(const encoder_state_t * const state, cu_info_t *const pred_cu, const cu_loc_t*
+                        const cu_loc)
 {
  uint32_t ts_max_size = 1 << state->encoder_control->cfg.trskip_max_size; 
  const uint32_t max_size = 32; // CU::isIntra(cu) ? MTS_INTRA_MAX_CU_SIZE : MTS_INTER_MAX_CU_SIZE;
-  const uint32_t cu_width    = LCU_WIDTH >> pred_cu->depth;
-  const uint32_t cu_height   = LCU_WIDTH >> pred_cu->depth;
+  const uint32_t cu_width    = cu_loc->width;
+  const uint32_t cu_height   = cu_loc->height;
  //bool mts_allowed = cu.chType == CHANNEL_TYPE_LUMA && compID == COMPONENT_Y;

  uint8_t mts_type = state->encoder_control->cfg.mts;
@ -66,14 +67,16 @@ bool uvg_is_mts_allowed(const encoder_state_t * const state, cu_info_t *const pr
  return mts_allowed;
 }

-static void encode_mts_idx(encoder_state_t * const state,
+static void encode_mts_idx(
+  encoder_state_t * const state,
  cabac_data_t * const cabac,
-  const cu_info_t *const pred_cu)
+  const cu_info_t *const pred_cu,
+  const cu_loc_t* const cu_loc)
 {
  //TransformUnit &tu = *cu.firstTU;
  int mts_idx = pred_cu->tr_idx;

-  if (uvg_is_mts_allowed(state, (cu_info_t* const )pred_cu) && mts_idx != MTS_SKIP
+  if (uvg_is_mts_allowed(state, (cu_info_t* const )pred_cu, cu_loc) && mts_idx != MTS_SKIP
       && !pred_cu->violates_mts_coeff_constraint
       && pred_cu->mts_last_scan_pos       
    )
@ -498,7 +501,7 @@ void uvg_encode_last_significant_xy(cabac_data_t * const cabac,

 static void encode_chroma_tu(
  encoder_state_t* const state,
-  const cu_loc_t *cu_loc,
+  const cu_loc_t * const cu_loc,
  int depth,
  cu_info_t* cur_pu,
  int8_t* scan_idx,
@ -541,8 +544,7 @@ static void encode_chroma_tu(
    }
  }
  else {
-    // const coeff_t *coeff_uv = &coeff->joint_uv[xy_to_zorder(LCU_WIDTH_C, x_local, y_local)];
-    const coeff_t coeff_uv[TR_MAX_WIDTH * TR_MAX_WIDTH];
+    coeff_t coeff_uv[TR_MAX_WIDTH * TR_MAX_WIDTH];
    uvg_get_sub_coeff(coeff_uv, coeff->joint_uv, x_local, y_local, cu_loc->chroma_width, cu_loc->chroma_height, LCU_WIDTH_C);
    if (state->encoder_control->cfg.trskip_enable && width_c <= (1 << state->encoder_control->cfg.trskip_max_size)) {
      cabac->cur_ctx = &cabac->ctx.transform_skip_model_chroma;
@ -700,7 +702,7 @@ static void encode_transform_coeff(
  }
  */

-  int8_t split = (LCU_WIDTH >> depth > TR_MAX_WIDTH);
+  int8_t split = (cu_loc->width > TR_MAX_WIDTH || cu_loc->height > TR_MAX_WIDTH);

  const int cb_flag_y = tree_type != UVG_CHROMA_T ? cbf_is_set(cur_pu->cbf, depth, COLOR_Y) : 0;
  const int cb_flag_u = tree_type != UVG_LUMA_T ?( cur_pu->joint_cb_cr ? (cur_pu->joint_cb_cr >> 1) & 1 : cbf_is_set(cur_cu->cbf, depth, COLOR_U)) : 0;
@ -1290,15 +1292,13 @@ bool uvg_write_split_flag(
  const cu_info_t * left_cu,
  const cu_info_t * above_cu,
  uint8_t split_flag,
+  const cu_loc_t* const cu_loc,
  int depth,
-  int cu_width,
-  int x,
-  int y,
  enum uvg_tree_type tree_type,
  double* bits_out)
 {
-  uint16_t abs_x = x + (state->tile->offset_x >> (tree_type == UVG_CHROMA_T));
-  uint16_t abs_y = y + (state->tile->offset_y >> (tree_type == UVG_CHROMA_T));
+  uint16_t abs_x = (cu_loc->x + state->tile->offset_x) >> (tree_type == UVG_CHROMA_T);
+  uint16_t abs_y = (cu_loc->y + state->tile->offset_y) >> (tree_type == UVG_CHROMA_T);
  double bits = 0;
  const encoder_control_t* const ctrl = state->encoder_control;
  // Implicit split flag when on border
@ -1311,10 +1311,12 @@ bool uvg_write_split_flag(
  // ToDo: update this when btt is actually used
  bool allow_btt = false;// when mt_depth < MAX_BT_DEPTH
  
+  const int cu_width = tree_type != UVG_CHROMA_T ? cu_loc->width : cu_loc->chroma_width;
+  const int cu_height = tree_type != UVG_CHROMA_T ? cu_loc->height : cu_loc->chroma_height;

  uint8_t implicit_split_mode = UVG_NO_SPLIT;
  //bool implicit_split = border;
-  bool bottom_left_available = ((abs_y + cu_width - 1) < (ctrl->in.height >> (tree_type == UVG_CHROMA_T)));
+  bool bottom_left_available = ((abs_y + cu_height - 1) < (ctrl->in.height >> (tree_type == UVG_CHROMA_T)));
  bool top_right_available = ((abs_x + cu_width - 1) < (ctrl->in.width >> (tree_type == UVG_CHROMA_T)));

  if (!bottom_left_available && !top_right_available && allow_qt) {
@ -1349,11 +1351,11 @@ bool uvg_write_split_flag(
  if (no_split && allow_split) {
    // Get left and top block split_flags and if they are present and true, increase model number
    // ToDo: should use height and width to increase model, PU_GET_W() ?
-    if (left_cu && LCU_WIDTH >> left_cu->depth < LCU_WIDTH >> depth) {
+    if (left_cu && left_cu->depth > depth) {
      split_model++;
    }

-    if (above_cu && LCU_WIDTH >> above_cu->depth < LCU_WIDTH >> depth) {
+    if (above_cu && above_cu->depth > depth) {
      split_model++;
    }

@ -1457,7 +1459,16 @@ void uvg_encode_coding_tree(
  // When not in MAX_DEPTH, insert split flag and split the blocks if needed
  if (depth != MAX_DEPTH && !(tree_type == UVG_CHROMA_T && depth == MAX_DEPTH -1)) {

-    const int split_flag = uvg_write_split_flag(state, cabac, left_cu, above_cu, (cur_cu->split_tree >> (split_tree.current_depth * 3)) & 7, depth, cu_width, x, y, tree_type,NULL);
+    const int split_flag = uvg_write_split_flag(
+      state,
+      cabac,
+      left_cu,
+      above_cu,
+      (cur_cu->split_tree >> (split_tree.current_depth * 3)) & 7,
+      cu_loc,
+      depth,
+      tree_type,
+      NULL);
    
    if (split_flag || border) {
      const int half_luma = cu_loc->width / 2;
@ -1597,8 +1608,8 @@ void uvg_encode_coding_tree(
    uvg_pixel *rec_base_v = &frame->rec->v[x / 2 + y / 2 * ctrl->in.width / 2];

    // Luma
-    for (unsigned y_px = 0; y_px < LCU_WIDTH >> depth; y_px++) {
-      for (unsigned x_px = 0; x_px < LCU_WIDTH >> depth; x_px++) {
+    for (unsigned y_px = 0; y_px < cu_height; y_px++) {
+      for (unsigned x_px = 0; x_px < cu_width; x_px++) {
        uvg_bitstream_put(cabac->stream, base_y[x_px + y_px * ctrl->in.width], 8);
        rec_base_y[x_px + y_px * ctrl->in.width] = base_y[x_px + y_px * ctrl->in.width];
      }
@ -1606,14 +1617,14 @@ void uvg_encode_coding_tree(

    // Chroma
    if (ctrl->chroma_format != UVG_CSP_400) {
-      for (unsigned y_px = 0; y_px < LCU_WIDTH >> (depth + 1); y_px++) {
-        for (unsigned x_px = 0; x_px < LCU_WIDTH >> (depth + 1); x_px++) {
+      for (unsigned y_px = 0; y_px < cu_loc->chroma_height; y_px++) {
+        for (unsigned x_px = 0; x_px < cu_loc->chroma_width; x_px++) {
          uvg_bitstream_put(cabac->stream, base_u[x_px + y_px * (ctrl->in.width >> 1)], 8);
          rec_base_u[x_px + y_px * (ctrl->in.width >> 1)] = base_u[x_px + y_px * (ctrl->in.width >> 1)];
        }
      }
-      for (unsigned y_px = 0; y_px < LCU_WIDTH >> (depth + 1); y_px++) {
-        for (unsigned x_px = 0; x_px < LCU_WIDTH >> (depth + 1); x_px++) {
+      for (unsigned y_px = 0; y_px < cu_loc->chroma_height; y_px++) {
+        for (unsigned x_px = 0; x_px < cu_loc->chroma_width; x_px++) {
          uvg_bitstream_put(cabac->stream, base_v[x_px + y_px * (ctrl->in.width >> 1)], 8);
          rec_base_v[x_px + y_px * (ctrl->in.width >> 1)] = base_v[x_px + y_px * (ctrl->in.width >> 1)];
        }
@ -1664,7 +1675,7 @@ void uvg_encode_coding_tree(
        encode_transform_coeff(state, &cu_loc, depth, 0, 0, 0, 0, coeff, tree_type, true, false, &luma_cbf_ctx, cu_loc);
      }

-      encode_mts_idx(state, cabac, cur_cu);
+      encode_mts_idx(state, cabac, cur_cu, cu_loc);

    }
  } else if (cur_cu->type == CU_INTRA) {
@ -1701,7 +1712,7 @@ void uvg_encode_coding_tree(
    if (tree_type != UVG_CHROMA_T) {
      bool lfnst_written = encode_lfnst_idx(state, cabac, cur_cu, x, y, depth, cu_width, cu_height, tree_type, COLOR_Y);
    }
-    encode_mts_idx(state, cabac, cur_cu);
+    encode_mts_idx(state, cabac, cur_cu, cu_loc);

    // For 4x4 the chroma PU/TU is coded after the last 
    if (state->encoder_control->chroma_format != UVG_CSP_400 && 
@ -1731,7 +1742,7 @@ void uvg_encode_coding_tree(

 end:

-  if (is_last_cu_in_qg(state, x, y, depth)) {
+  if (is_last_cu_in_qg(state, cu_loc)) {
    state->last_qp = cur_cu->qp;
  }

@ -1752,11 +1763,9 @@ double uvg_mock_encode_coding_unit(

  const uint8_t depth = 6 - uvg_g_convert_to_log2[cu_loc->width];

-  int x_local = SUB_SCU(x) >> (tree_type == UVG_CHROMA_T);
-  int y_local = SUB_SCU(y) >> (tree_type == UVG_CHROMA_T);
-
-  const int cu_width = LCU_WIDTH >> depth;
-  
+  int x_local = cu_loc->local_x >> (tree_type == UVG_CHROMA_T);
+  int y_local = cu_loc->local_y >> (tree_type == UVG_CHROMA_T);
+    
  const cu_info_t* left_cu = NULL, *above_cu = NULL;
  if (x) {
    if(x_local || tree_type != UVG_CHROMA_T) {
@ -1787,16 +1796,14 @@ double uvg_mock_encode_coding_unit(
      left_cu,
      above_cu,
      0,
+      cu_loc,
      depth,
-      cu_width >> (tree_type == UVG_CHROMA_T),
-      x >> (tree_type == UVG_CHROMA_T),
-      y >> (tree_type == UVG_CHROMA_T),
      tree_type,
      &bits);
  }

  // Encode skip flag
-  if (state->frame->slicetype != UVG_SLICE_I && cu_width != 4) {
+  if (state->frame->slicetype != UVG_SLICE_I && (cu_loc->width != 4 || cu_loc->height != 4)) {
    int8_t ctx_skip = 0;

    if (left_cu && left_cu->skipped) {
@ -1829,7 +1836,7 @@ double uvg_mock_encode_coding_unit(
    }
  }
  // Prediction mode
-  if (state->frame->slicetype != UVG_SLICE_I && cu_width != 4) {
+  if (state->frame->slicetype != UVG_SLICE_I && (cu_loc->width != 4 || cu_loc->height != 4)) {

    int8_t ctx_predmode = 0;

--- a/src/encode_coding_tree.h
+++ b/src/encode_coding_tree.h
@ -40,7 +40,8 @@
 #include "encoderstate.h"
 #include "global.h"

-bool uvg_is_mts_allowed(const encoder_state_t* const state, cu_info_t* const pred_cu);
+bool uvg_is_mts_allowed(const encoder_state_t* const state, cu_info_t* const pred_cu, const cu_loc_t*
+                        const cu_loc);
 bool uvg_is_lfnst_allowed(
  const encoder_state_t* const state,
  const cu_info_t* const pred_cu,
@ -105,10 +106,8 @@ bool uvg_write_split_flag(
  const cu_info_t* left_cu,
  const cu_info_t* above_cu,
  uint8_t split_flag,
+  const cu_loc_t* const cu_loc,
  int depth,
-  int cu_width,
-  int x,
-  int y,
  enum uvg_tree_type tree_type,
  double* bits_out);

--- a/src/encoderstate.c
+++ b/src/encoderstate.c
@ -627,36 +627,45 @@ static void encode_sao(encoder_state_t * const state,
 * \param prev_qp         -1 if QP delta has not been coded in current QG,
 *                        otherwise the QP of the current QG
 */
-static void set_cu_qps(encoder_state_t *state, int x, int y, int depth, int *last_qp, int *prev_qp)
+static void set_cu_qps(encoder_state_t *state, const cu_loc_t* const cu_loc, int *last_qp, int *prev_qp, const
+                       int depth)
 {

  // Stop recursion if the CU is completely outside the frame.
-  if (x >= state->tile->frame->width || y >= state->tile->frame->height) return;
+  if (cu_loc->x >= state->tile->frame->width || cu_loc->y >= state->tile->frame->height) return;

-  cu_info_t *cu = uvg_cu_array_at(state->tile->frame->cu_array, x, y);
-  const int cu_width = LCU_WIDTH >> depth;
+  cu_info_t *cu = uvg_cu_array_at(state->tile->frame->cu_array, cu_loc->x, cu_loc->y);
+  const int width = LCU_WIDTH >> cu->depth;

  if (depth <= state->frame->max_qp_delta_depth) {
    *prev_qp = -1;
  }

-  if (cu->depth > depth) {
+  if (cu_loc->width > width) {
    // Recursively process sub-CUs.
-    const int d = cu_width >> 1;
-    set_cu_qps(state, x,     y,     depth + 1, last_qp, prev_qp);
-    set_cu_qps(state, x + d, y,     depth + 1, last_qp, prev_qp);
-    set_cu_qps(state, x,     y + d, depth + 1, last_qp, prev_qp);
-    set_cu_qps(state, x + d, y + d, depth + 1, last_qp, prev_qp);
+    const int half_width = cu_loc->width >> 1;
+    const int half_height = cu_loc->height >> 1;
+    cu_loc_t split_cu_loc;
+    uvg_cu_loc_ctor(&split_cu_loc, cu_loc->x, cu_loc->y, half_width, half_height);
+    set_cu_qps(state, &split_cu_loc,     last_qp,     prev_qp, depth + 1);
+    uvg_cu_loc_ctor(&split_cu_loc, cu_loc->x + half_width, cu_loc->y, half_width, half_height);
+    set_cu_qps(state, &split_cu_loc, last_qp,     prev_qp, depth + 1);
+    uvg_cu_loc_ctor(&split_cu_loc, cu_loc->x, cu_loc->y + half_height, half_width, half_height);
+    set_cu_qps(state, &split_cu_loc,     last_qp, prev_qp, depth + 1);
+    uvg_cu_loc_ctor(&split_cu_loc, cu_loc->x + half_width, cu_loc->y + half_height, half_width, half_height);
+    set_cu_qps(state, &split_cu_loc, last_qp, prev_qp, depth + 1);

  } else {
    bool cbf_found = *prev_qp >= 0;

+    int y_limit = cu_loc->y + cu_loc->height;
+    int x_limit = cu_loc->x + cu_loc->width;
    if (cu->tr_depth > depth) {
      // The CU is split into smaller transform units. Check whether coded
      // block flag is set for any of the TUs.
      const int tu_width = LCU_WIDTH >> cu->tr_depth;
-      for (int y_scu = y; !cbf_found && y_scu < y + cu_width; y_scu += tu_width) {
-        for (int x_scu = x; !cbf_found && x_scu < x + cu_width; x_scu += tu_width) {
+      for (int y_scu = cu_loc->y; !cbf_found && y_scu < y_limit; y_scu += tu_width) {
+        for (int x_scu = cu_loc->x; !cbf_found && x_scu < x_limit; x_scu += tu_width) {
          cu_info_t *tu = uvg_cu_array_at(state->tile->frame->cu_array, x_scu, y_scu);
          if (cbf_is_set_any(tu->cbf, cu->depth)) {
            cbf_found = true;
@ -671,18 +680,18 @@ static void set_cu_qps(encoder_state_t *state, int x, int y, int depth, int *las
    if (cbf_found) {
      *prev_qp = qp = cu->qp;
    } else {
-      qp = uvg_get_cu_ref_qp(state, x, y, *last_qp);
+      qp = uvg_get_cu_ref_qp(state, cu_loc->x, cu_loc->y, *last_qp);
    }

    // Set the correct QP for all state->tile->frame->cu_array elements in
    // the area covered by the CU.
-    for (int y_scu = y; y_scu < y + cu_width; y_scu += SCU_WIDTH) {
-      for (int x_scu = x; x_scu < x + cu_width; x_scu += SCU_WIDTH) {
+    for (int y_scu = cu_loc->y; y_scu < y_limit; y_scu += SCU_WIDTH) {
+      for (int x_scu = cu_loc->x; x_scu < x_limit; x_scu += SCU_WIDTH) {
        uvg_cu_array_at(state->tile->frame->cu_array, x_scu, y_scu)->qp = qp;
      }
    }

-    if (is_last_cu_in_qg(state, x, y, depth)) {
+    if (is_last_cu_in_qg(state, cu_loc)) {
      *last_qp = cu->qp;
    }
  }
@ -812,7 +821,9 @@ static void encoder_state_worker_encode_lcu_search(void * opaque)
  if (state->frame->max_qp_delta_depth >= 0) {
    int last_qp = state->last_qp;
    int prev_qp = -1;
-    set_cu_qps(state, lcu->position_px.x, lcu->position_px.y, 0, &last_qp, &prev_qp);
+    cu_loc_t cu_loc;
+    uvg_cu_loc_ctor(&cu_loc, lcu->position_px.x, lcu->position_px.y, LCU_WIDTH, LCU_WIDTH);
+    set_cu_qps(state, &cu_loc, &last_qp, &prev_qp, 0);
  }

  if (state->tile->frame->lmcs_aps->m_sliceReshapeInfo.sliceReshaperEnableFlag) {
--- a/src/encoderstate.h
+++ b/src/encoderstate.h
@ -401,14 +401,13 @@ static INLINE bool encoder_state_must_write_vps(const encoder_state_t *state)
 * \param cu_loc  position (x, y) and size (width, height) of the CU
 * \return true, if it's the last CU in its QG, otherwise false
 */
-static INLINE bool is_last_cu_in_qg(const encoder_state_t *state, int x, int y, int depth)
+static INLINE bool is_last_cu_in_qg(const encoder_state_t *state, const cu_loc_t* const cu_loc)
 {
  if (state->frame->max_qp_delta_depth < 0) return false;
-
-  const int cu_width = LCU_WIDTH >> depth;
+  
  const int qg_width = LCU_WIDTH >> state->frame->max_qp_delta_depth;
-  const int right  = x + cu_width;
-  const int bottom = y + cu_width;
+  const int right  = cu_loc->x + cu_loc->width;
+  const int bottom = cu_loc->y + cu_loc->height;
  return (right % qg_width == 0 || right >= state->tile->frame->width) &&
         (bottom % qg_width == 0 || bottom >= state->tile->frame->height);
 }
--- a/src/filter.c
+++ b/src/filter.c
@ -856,8 +856,7 @@ static void filter_deblock_edge_luma(encoder_state_t * const state,
      uint8_t max_filter_length_Q = 0;
      const int cu_size = LCU_WIDTH >> cu_q->depth;
      // TODO: NON square
-      const int pu_size = dir == EDGE_HOR ? cu_size
-                                          : cu_size;
+      const int pu_size = dir == EDGE_HOR ? cu_size : cu_size;
      const int pu_pos = dir == EDGE_HOR ? y_coord 
                                         : x_coord;
      get_max_filter_length(&max_filter_length_P, &max_filter_length_Q, state, x_coord, y_coord,
--- a/src/global.h
+++ b/src/global.h
@ -273,7 +273,6 @@ typedef int32_t mv_t;
 #define CLIP_TO_PIXEL(value) CLIP(0, PIXEL_MAX, (value))
 #define CLIP_TO_QP(value) CLIP(0, 51, (value))
 #define SWAP(a,b,swaptype) { swaptype tempval; tempval = a; a = b; b = tempval; }
-#define CU_WIDTH_FROM_DEPTH(depth) (LCU_WIDTH >> depth)
 #define WITHIN(val, min_val, max_val) ((min_val) <= (val) && (val) <= (max_val))
 #define CEILDIV(x,y) (((x) + (y) - 1) / (y))

--- a/src/intra.c
+++ b/src/intra.c
@ -1555,7 +1555,7 @@ void uvg_intra_predict(
    uvg_pixels_blit(&state->tile->frame->cclm_luma_rec[x / 2 + (y * stride) / 4], dst, width, width, stride / 2, width);
    if (data->pred_cu.depth != data->pred_cu.tr_depth || data->cclm_parameters[color == COLOR_U ? 0 : 1].b <= 0) {
      predict_cclm(
-        state, color, width, width, x, y, stride, intra_mode, lcu, refs, dst, 
+        state, color, width, height, x, y, stride, intra_mode, lcu, refs, dst, 
        (cclm_parameters_t*)&data->cclm_parameters[color == COLOR_U ? 0 : 1],
        tree_type);
    }
--- a/src/rdo.c
+++ b/src/rdo.c
@ -297,7 +297,7 @@ out:
 static INLINE double get_coeff_cabac_cost(
  const encoder_state_t * const state,
  const coeff_t *coeff,
-  cu_loc_t *cu_loc,
+  const cu_loc_t* const cu_loc,
  color_t color,
  int8_t scan_mode,
  int8_t tr_skip,
@ -415,7 +415,7 @@ double uvg_get_coeff_cost(
  const encoder_state_t * const state,
  const coeff_t *coeff,
  cu_info_t* cur_tu,
-  cu_loc_t *cu_loc,
+  const cu_loc_t* const cu_loc,
  color_t color,
  int8_t scan_mode,
  int8_t tr_skip,
@ -1409,7 +1409,6 @@ void uvg_rdoq(
  int8_t color,
  int8_t scan_mode,
  int8_t block_type,
-  int8_t tr_depth,
  uint16_t cbf,
  uint8_t lfnst_idx)
 {
--- a/src/rdo.h
+++ b/src/rdo.h
@ -60,7 +60,6 @@ void  uvg_rdoq(
  int8_t type,
  int8_t scan_mode,
  int8_t block_type,
-  int8_t tr_depth,
  uint16_t cbf,
  uint8_t lfnst_idx);

@ -73,7 +72,7 @@ double uvg_get_coeff_cost(
  const encoder_state_t * const state,
  const coeff_t *coeff,
  cu_info_t* cur_tu,
-  cu_loc_t *cu_loc,
+  const cu_loc_t* const cu_loc,
  color_t color,
  int8_t scan_mode,
  int8_t tr_skip,
--- a/src/search.c
+++ b/src/search.c
@ -63,30 +63,39 @@
 static const int INTRA_THRESHOLD = 8;


-static INLINE void copy_cu_info(int x_local, int y_local, int width, lcu_t *from, lcu_t *to)
+static INLINE void copy_cu_info(lcu_t *from, lcu_t *to, const cu_loc_t* const cu_loc, enum uvg_tree_type
+                                tree_type)
 {
-  for   (int y = y_local; y < y_local + width; y += SCU_WIDTH) {
-    for (int x = x_local; x < x_local + width; x += SCU_WIDTH) {
+  const int y_limit = (cu_loc->local_y + cu_loc->height) >> (tree_type == UVG_CHROMA_T);
+  const int x_limit = (cu_loc->local_x + cu_loc->width) >> (tree_type == UVG_CHROMA_T);
+  for   (int y = cu_loc->local_y >> (tree_type == UVG_CHROMA_T); y < y_limit; y += SCU_WIDTH) {
+    for (int x = cu_loc->local_x >> (tree_type == UVG_CHROMA_T); x < x_limit; x += SCU_WIDTH) {
      *LCU_GET_CU_AT_PX(to, x, y) = *LCU_GET_CU_AT_PX(from, x, y);
    }
  }
 }

-static INLINE void copy_cu_pixels(int x_local, int y_local, int width, lcu_t *from, lcu_t *to, enum uvg_tree_type
-                                  tree_type)
+static INLINE void copy_cu_pixels(
+  lcu_t *from,
+  lcu_t *to,
+  const cu_loc_t* const cu_loc,
+  enum uvg_tree_type
+  tree_type)
 {
+  const int x_local = cu_loc->local_x >> (tree_type == UVG_CHROMA_T);
+  const int y_local = cu_loc->local_y >> (tree_type == UVG_CHROMA_T);
  const int luma_index = x_local + y_local * LCU_WIDTH;
  const int chroma_index = tree_type == UVG_CHROMA_T ? x_local + y_local * LCU_WIDTH_C : (x_local / 2) + (y_local / 2) * LCU_WIDTH_C;

  if(tree_type != UVG_CHROMA_T) {
    uvg_pixels_blit(&from->rec.y[luma_index], &to->rec.y[luma_index],
-                    width, width, LCU_WIDTH, LCU_WIDTH);
+                    cu_loc->width, cu_loc->height, LCU_WIDTH, LCU_WIDTH);
  }
  if (from->rec.chroma_format != UVG_CSP_400 && tree_type != UVG_LUMA_T) {
    uvg_pixels_blit(&from->rec.u[chroma_index], &to->rec.u[chroma_index],
-                    width / 2, width / 2, LCU_WIDTH / 2, LCU_WIDTH / 2);
+                    cu_loc->chroma_width, cu_loc->chroma_height, LCU_WIDTH_C, LCU_WIDTH_C);
    uvg_pixels_blit(&from->rec.v[chroma_index], &to->rec.v[chroma_index],
-                    width / 2, width / 2, LCU_WIDTH / 2, LCU_WIDTH / 2);
+                    cu_loc->chroma_width, cu_loc->chroma_height, LCU_WIDTH_C, LCU_WIDTH_C);
  }
 }

@ -103,8 +112,8 @@ static INLINE void copy_cu_coeffs(const cu_loc_t *cu_loc, lcu_t *from, lcu_t *to

  if (from->rec.chroma_format != UVG_CSP_400 && tree_type != UVG_LUMA_T) {
    //const int chroma_z = xy_to_zorder(LCU_WIDTH_C, cu_loc->x >> (tree_type != UVG_CHROMA_T), cu_loc->y >> (tree_type != UVG_CHROMA_T));
-    const int chroma_x = cu_loc->x >> (tree_type != UVG_CHROMA_T);
-    const int chroma_y = cu_loc->y >> (tree_type != UVG_CHROMA_T);
+    const int chroma_x = (cu_loc->x >> 1) & ~3;
+    const int chroma_y = (cu_loc->y >> 1) & ~3;

    const int idx = (chroma_x % LCU_WIDTH_C) + ((chroma_y % LCU_WIDTH_C) * LCU_WIDTH_C);
    copy_coeffs(&from->coeff.u[idx], &to->coeff.u[idx], cu_loc->chroma_width, cu_loc->chroma_height, LCU_WIDTH_C);
@ -118,15 +127,17 @@ static INLINE void copy_cu_coeffs(const cu_loc_t *cu_loc, lcu_t *from, lcu_t *to
 /**
 * Copy all non-reference CU data from next level to current level.
 */
-static void work_tree_copy_up(int x_local, int y_local, int depth, lcu_t *work_tree, bool joint, enum
-                              uvg_tree_type tree_type)
+static void work_tree_copy_up(
+  lcu_t *work_tree,
+  bool joint,
+  enum
+  uvg_tree_type tree_type,
+  const cu_loc_t* const cu_loc,
+  const int depth)
 {
-  const int width = LCU_WIDTH >> depth;
-  cu_loc_t loc;
-  uvg_cu_loc_ctor(&loc, x_local, y_local, width, width);
-  copy_cu_info  (x_local, y_local, width, &work_tree[depth + 1], &work_tree[depth]);
-  copy_cu_pixels(x_local, y_local, width, &work_tree[depth + 1], &work_tree[depth], tree_type);
-  copy_cu_coeffs(&loc, &work_tree[depth + 1], &work_tree[depth], joint, tree_type);
+  copy_cu_info  (&work_tree[depth + 1], &work_tree[depth], cu_loc, tree_type);
+  copy_cu_pixels(&work_tree[depth + 1], &work_tree[depth], cu_loc, tree_type);
+  copy_cu_coeffs(cu_loc, &work_tree[depth + 1], &work_tree[depth], joint, tree_type);
  
 }

@ -134,24 +145,32 @@ static void work_tree_copy_up(int x_local, int y_local, int depth, lcu_t *work_t
 /**
 * Copy all non-reference CU data from current level to all lower levels.
 */
-static void work_tree_copy_down(int x_local, int y_local, int depth, lcu_t *work_tree, enum uvg_tree_type
-                                tree_type)
+static void work_tree_copy_down(
+  int depth,
+  lcu_t *work_tree,
+  enum uvg_tree_type
+  tree_type,
+  const cu_loc_t* const cu_loc)
 {
-  const int width = tree_type != UVG_CHROMA_T ? LCU_WIDTH >> depth : LCU_WIDTH_C >> 1;
  for (int i = depth + 1; i <= MAX_PU_DEPTH; i++) {
-    copy_cu_info  (x_local, y_local, width, &work_tree[depth], &work_tree[i]);
-    copy_cu_pixels(x_local, y_local, LCU_WIDTH >> depth, &work_tree[depth], &work_tree[i], tree_type);
+    copy_cu_info  (&work_tree[depth], &work_tree[i], cu_loc, tree_type);
+    copy_cu_pixels(&work_tree[depth], &work_tree[i], cu_loc, tree_type);
  }
 }

-void uvg_lcu_fill_trdepth(lcu_t *lcu, int x_px, int y_px, int depth, uint8_t tr_depth, enum uvg_tree_type
-                          tree_type)
+void uvg_lcu_fill_trdepth(
+  lcu_t *lcu,
+  const cu_loc_t* const cu_loc,
+  uint8_t tr_depth,
+  enum uvg_tree_type
+  tree_type)
 {
-  const int x_local = SUB_SCU(x_px);
-  const int y_local = SUB_SCU(y_px);
-  const unsigned width = (tree_type != UVG_CHROMA_T ? LCU_WIDTH  : LCU_WIDTH_C) >> depth;
+  const int x_local = cu_loc->local_x >> (tree_type == UVG_CHROMA_T);
+  const int y_local = cu_loc->local_y >> (tree_type == UVG_CHROMA_T);
+  const unsigned width = tree_type != UVG_CHROMA_T ? cu_loc->width  : cu_loc->chroma_width;
+  const unsigned height = tree_type != UVG_CHROMA_T ? cu_loc->height : cu_loc->chroma_height;

-  for (unsigned y = 0; y < width; y += SCU_WIDTH) {
+  for (unsigned y = 0; y < height; y += SCU_WIDTH) {
    for (unsigned x = 0; x < width; x += SCU_WIDTH) {
      LCU_GET_CU_AT_PX(lcu, x_local + x, y_local + y)->tr_depth = tr_depth;
    }
@ -167,6 +186,7 @@ static void lcu_fill_cu_info(lcu_t *lcu, int x_local, int y_local, int width, in
      to->type      = cu->type;
      to->depth     = cu->depth;
      to->qp        = cu->qp;
+      to->split_tree = cu->split_tree;
      //to->tr_idx    = cu->tr_idx;
      to->lfnst_idx = cu->lfnst_idx;
      to->lfnst_last_scan_pos = cu->lfnst_last_scan_pos;
@ -214,34 +234,37 @@ static void lcu_fill_cbf(lcu_t *lcu, int x_local, unsigned y_local, unsigned wid


 //Calculates cost for all zero coeffs
-static double cu_zero_coeff_cost(const encoder_state_t *state, lcu_t *work_tree, const int x, const int y,
+static double cu_zero_coeff_cost(
+  const encoder_state_t *state,
+  lcu_t *work_tree,
+  const cu_loc_t* const cu_loc,
  const int depth)
 {
-  int x_local = SUB_SCU(x);
-  int y_local = SUB_SCU(y);
-  int cu_width = LCU_WIDTH >> depth;
  lcu_t *const lcu = &work_tree[depth];

+  const int y_local = cu_loc->local_y;
+  const int x_local = cu_loc->local_x;
+
  const int luma_index = y_local * LCU_WIDTH + x_local;
  const int chroma_index = (y_local / 2) * LCU_WIDTH_C + (x_local / 2);

  double ssd = 0.0;
  ssd += UVG_LUMA_MULT * uvg_pixels_calc_ssd(
    &lcu->ref.y[luma_index], &lcu->rec.y[luma_index],
-    LCU_WIDTH, LCU_WIDTH, cu_width
+    LCU_WIDTH, LCU_WIDTH, cu_loc->width
    );
-  if (x % 8 == 0 && y % 8 == 0 && state->encoder_control->chroma_format != UVG_CSP_400) {
+  if (y_local % 8 == 0 && x_local % 8 == 0 && state->encoder_control->chroma_format != UVG_CSP_400) {
    ssd += UVG_CHROMA_MULT * uvg_pixels_calc_ssd(
      &lcu->ref.u[chroma_index], &lcu->rec.u[chroma_index],
-      LCU_WIDTH_C, LCU_WIDTH_C, cu_width / 2
+      LCU_WIDTH_C, LCU_WIDTH_C, cu_loc->chroma_width
      );
    ssd += UVG_CHROMA_MULT * uvg_pixels_calc_ssd(
      &lcu->ref.v[chroma_index], &lcu->rec.v[chroma_index],
-      LCU_WIDTH_C, LCU_WIDTH_C, cu_width / 2
+      LCU_WIDTH_C, LCU_WIDTH_C, cu_loc->chroma_width
      );
  }
  // Save the pixels at a lower level of the working tree.
-  copy_cu_pixels(x_local, y_local, cu_width, lcu, &work_tree[depth + 1], UVG_BOTH_T);
+  copy_cu_pixels(lcu, &work_tree[depth + 1], cu_loc, UVG_BOTH_T);

  return ssd;
 }
@ -295,46 +318,45 @@ static void downsample_cclm_rec(encoder_state_t *state, int x, int y, int width,
 * Takes into account SSD of reconstruction and the cost of encoding whatever
 * prediction unit data needs to be coded.
 */
-double uvg_cu_rd_cost_luma(const encoder_state_t *const state,
-                           const int x_px, const int y_px, const int depth,
-                           const cu_info_t *const pred_cu,
-                           lcu_t *const lcu,
-                           uint8_t isp_cbf)
+double uvg_cu_rd_cost_luma(
+  const encoder_state_t *const state,
+  const cu_loc_t* const cu_loc,
+  const cu_info_t *const pred_cu,
+  lcu_t *const lcu,
+  uint8_t isp_cbf)
 {
-  const int width  = LCU_WIDTH >> depth;
-  const int height = width; // TODO: height for non-square blocks
  const int skip_residual_coding = pred_cu->skipped || (pred_cu->type != CU_INTRA && pred_cu->cbf == 0);
  cabac_data_t* cabac = (cabac_data_t *)&state->search_cabac;
-
-  cu_loc_t loc;
-  uvg_cu_loc_ctor(&loc, x_px, y_px, width, height);
-
+  
  // cur_cu is used for TU parameters.
-  cu_info_t *const tr_cu = LCU_GET_CU_AT_PX(lcu, x_px, y_px);
+  cu_info_t *const tr_cu = LCU_GET_CU_AT_PX(lcu, cu_loc->local_x, cu_loc->local_y);

  double coeff_bits = 0;
  double tr_tree_bits = 0;

-  // Check that lcu is not in 
-  assert(x_px >= 0 && x_px < LCU_WIDTH);
-  assert(y_px >= 0 && y_px < LCU_WIDTH);
+  // Check that lcu is not in   

-  const uint8_t tr_depth = tr_cu->tr_depth - depth;
-
-  if (tr_depth > 0) {
-    int offset = width / 2;
+  if (cu_loc->width > TR_MAX_WIDTH || cu_loc->height > TR_MAX_WIDTH) {
    double sum = 0;
+    const int half_width = cu_loc->width >> 1;
+    const int half_height = cu_loc->height >> 1;
+    cu_loc_t split_cu_loc;

-    sum += uvg_cu_rd_cost_luma(state, x_px, y_px, depth + 1, pred_cu, lcu, isp_cbf);
-    sum += uvg_cu_rd_cost_luma(state, x_px + offset, y_px, depth + 1, pred_cu, lcu, isp_cbf);
-    sum += uvg_cu_rd_cost_luma(state, x_px, y_px + offset, depth + 1, pred_cu, lcu, isp_cbf);
-    sum += uvg_cu_rd_cost_luma(state, x_px + offset, y_px + offset, depth + 1, pred_cu, lcu, isp_cbf);
+    uvg_cu_loc_ctor(&split_cu_loc, cu_loc->x, cu_loc->y, half_width, half_height);
+    sum += uvg_cu_rd_cost_luma(state, &split_cu_loc, pred_cu, lcu, isp_cbf);
+    uvg_cu_loc_ctor(&split_cu_loc, cu_loc->x + half_width, cu_loc->y, half_width, half_height);
+    sum += uvg_cu_rd_cost_luma(state, &split_cu_loc, pred_cu, lcu, isp_cbf);
+    uvg_cu_loc_ctor(&split_cu_loc, cu_loc->x, cu_loc->y+ half_height, half_width, half_height);
+    sum += uvg_cu_rd_cost_luma(state, &split_cu_loc, pred_cu, lcu, isp_cbf);
+    uvg_cu_loc_ctor(&split_cu_loc, cu_loc->x + half_width, cu_loc->y + half_height, half_width, half_height);
+    sum += uvg_cu_rd_cost_luma(state, &split_cu_loc, pred_cu, lcu, isp_cbf);

    return sum + tr_tree_bits * state->lambda;
  }

  // Add transform_tree cbf_luma bit cost.
  if (pred_cu->type == CU_INTER || pred_cu->intra.isp_mode == ISP_MODE_NO_ISP) {
+    const int depth = 6 - uvg_g_convert_to_log2[cu_loc->width];
    const int is_tr_split = tr_cu->tr_depth - tr_cu->depth;
    int is_set = cbf_is_set(pred_cu->cbf, depth, COLOR_Y);
    if (pred_cu->type == CU_INTRA ||
@ -347,7 +369,9 @@ double uvg_cu_rd_cost_luma(const encoder_state_t *const state,
      CABAC_FBITS_UPDATE(cabac, ctx, is_set, tr_tree_bits, "cbf_y_search");
    }

-    if (is_set && state->encoder_control->cfg.trskip_enable && width <= (1 << state->encoder_control->cfg.trskip_max_size)) {
+    if (is_set && state->encoder_control->cfg.trskip_enable 
+      && cu_loc->width <= (1 << state->encoder_control->cfg.trskip_max_size)
+      && cu_loc->height <= (1 << state->encoder_control->cfg.trskip_max_size)) {
      CABAC_FBITS_UPDATE(cabac, &cabac->ctx.transform_skip_model_luma, pred_cu->tr_idx == MTS_SKIP, tr_tree_bits, "transform_skip_flag");
    }
  }
@ -367,28 +391,28 @@ double uvg_cu_rd_cost_luma(const encoder_state_t *const state,
  // SSD between reconstruction and original
  int ssd = 0;
  if (!state->encoder_control->cfg.lossless) {
-    int index = y_px * LCU_WIDTH + x_px;
+    int index = cu_loc->local_y * LCU_WIDTH + cu_loc->local_x;
    ssd = uvg_pixels_calc_ssd(&lcu->ref.y[index], &lcu->rec.y[index],
                                        LCU_WIDTH,          LCU_WIDTH,
-                                        width);
+                                        cu_loc->width);
  }


  if (!skip_residual_coding) {
-    int8_t luma_scan_mode = uvg_get_scan_order(pred_cu->type, pred_cu->intra.mode, depth);
+    int8_t luma_scan_mode = SCAN_DIAG;
    if (pred_cu->type == CU_INTER || pred_cu->intra.isp_mode == ISP_MODE_NO_ISP) {
      //const coeff_t* coeffs = &lcu->coeff.y[xy_to_zorder(LCU_WIDTH, x_px, y_px)];
      const coeff_t* coeffs = lcu->coeff.y;

-      coeff_bits += uvg_get_coeff_cost(state, coeffs, NULL, &loc, 0, luma_scan_mode, pred_cu->tr_idx == MTS_SKIP, COEFF_ORDER_CU);
+      coeff_bits += uvg_get_coeff_cost(state, coeffs, NULL, cu_loc, 0, luma_scan_mode, pred_cu->tr_idx == MTS_SKIP, COEFF_ORDER_CU);
    }
    else {
      int split_type = pred_cu->intra.isp_mode;
-      int split_limit = uvg_get_isp_split_num(width, height, split_type, true);
+      int split_limit = uvg_get_isp_split_num(cu_loc->width, cu_loc->height, split_type, true);

      for (int i = 0; i < split_limit; ++i) {
        cu_loc_t split_loc;
-        uvg_get_isp_split_loc(&split_loc, x_px, y_px, width, height, i, split_type, true);
+        uvg_get_isp_split_loc(&split_loc, cu_loc->x, cu_loc->y,  cu_loc->width, cu_loc->height, i, split_type, true);
        const int part_x = split_loc.x;
        const int part_y = split_loc.y;

@ -406,34 +430,32 @@ double uvg_cu_rd_cost_luma(const encoder_state_t *const state,
 }


-double uvg_cu_rd_cost_chroma(const encoder_state_t *const state,
-                             const int x_px, const int y_px, const int depth,
-                             cu_info_t *const pred_cu,
-                             lcu_t *const lcu)
+double uvg_cu_rd_cost_chroma(
+  const encoder_state_t *const state,
+  cu_info_t *const pred_cu,
+  lcu_t *const lcu,
+  const cu_loc_t * const cu_loc)
 {
-  const vector2d_t lcu_px = { (x_px & ~7) / 2, (y_px & ~7) / 2 };
-  const int width  = (depth < MAX_DEPTH) ? LCU_WIDTH >> (depth + 1) : LCU_WIDTH >> depth;
-  const int height = width; // TODO: height for non-square blocks
-  cu_info_t *const tr_cu = LCU_GET_CU_AT_PX(lcu, x_px, y_px);
+  const vector2d_t lcu_px = { (cu_loc->local_x & ~7) / 2, (cu_loc->local_y & ~7) / 2 };
+  const int width = cu_loc->chroma_width;
+  const int height = cu_loc->chroma_height;
+  cu_info_t *const tr_cu = LCU_GET_CU_AT_PX(lcu, lcu_px.x, lcu_px.y);
  const int skip_residual_coding = pred_cu->skipped || (pred_cu->type != CU_INTRA && pred_cu->cbf == 0);
-
-  cu_loc_t loc;
-  uvg_cu_loc_ctor(&loc, x_px, y_px, width, height);
-
+  
  double tr_tree_bits = 0;
  double coeff_bits = 0;
-
-  assert(x_px >= 0 && x_px < LCU_WIDTH);
-  assert(y_px >= 0 && y_px < LCU_WIDTH);
-
-  if (depth == 4 && (x_px % 8 == 0 || y_px % 8 == 0)) {
+  
+  if (cu_loc->width == 4 && cu_loc->height == 4 && (cu_loc->x % 8 == 0 || cu_loc->y % 8 == 0)) {
    // For MAX_PU_DEPTH calculate chroma for previous depth for the first
    // block and return 0 cost for all others.
    return 0;
  }
+
+  const int depth = 6 - uvg_g_convert_to_log2[cu_loc->width];
  int u_is_set = pred_cu->joint_cb_cr ? (pred_cu->joint_cb_cr & 2) >> 1 : cbf_is_set(pred_cu->cbf, depth, COLOR_U);
  int v_is_set = pred_cu->joint_cb_cr ? (pred_cu->joint_cb_cr & 1) : cbf_is_set(pred_cu->cbf, depth, COLOR_V);

+
  // See luma for why the second condition
  if (!skip_residual_coding) {
    const int tr_depth = depth - pred_cu->depth;
@ -450,14 +472,21 @@ double uvg_cu_rd_cost_chroma(const encoder_state_t *const state,
  }


-  if (tr_cu->tr_depth > depth) {
-    int offset = LCU_WIDTH >> (depth + 1);
+  if (cu_loc->width > TR_MAX_WIDTH || cu_loc->height > TR_MAX_WIDTH) {
    double sum = 0;
+    // Recursively process sub-CUs.
+    const int half_width = cu_loc->width >> 1;
+    const int half_height = cu_loc->height >> 1;
+    cu_loc_t split_cu_loc;

-    sum += uvg_cu_rd_cost_chroma(state, x_px, y_px, depth + 1, pred_cu, lcu);
-    sum += uvg_cu_rd_cost_chroma(state, x_px + offset, y_px, depth + 1, pred_cu, lcu);
-    sum += uvg_cu_rd_cost_chroma(state, x_px, y_px + offset, depth + 1, pred_cu, lcu);
-    sum += uvg_cu_rd_cost_chroma(state, x_px + offset, y_px + offset, depth + 1, pred_cu, lcu);
+    uvg_cu_loc_ctor(&split_cu_loc, cu_loc->x, cu_loc->y, half_width, half_height);
+    sum += uvg_cu_rd_cost_chroma(state, pred_cu, lcu, &split_cu_loc);
+    uvg_cu_loc_ctor(&split_cu_loc, cu_loc->x + half_width, cu_loc->y, half_width, half_height);
+    sum += uvg_cu_rd_cost_chroma(state, pred_cu, lcu, &split_cu_loc);
+    uvg_cu_loc_ctor(&split_cu_loc, cu_loc->x, cu_loc->y + half_height, half_width, half_height);
+    sum += uvg_cu_rd_cost_chroma(state, pred_cu, lcu, &split_cu_loc);
+    uvg_cu_loc_ctor(&split_cu_loc, cu_loc->x + half_width, cu_loc->y + half_height, half_width, half_height);
+    sum += uvg_cu_rd_cost_chroma(state, pred_cu, lcu, &split_cu_loc);

    return sum + tr_tree_bits * state->lambda;
  }
@ -487,14 +516,17 @@ double uvg_cu_rd_cost_chroma(const encoder_state_t *const state,

  if (!skip_residual_coding) {
    int8_t scan_order = uvg_get_scan_order(pred_cu->type, pred_cu->intra.mode_chroma, depth);
-    //const int index = xy_to_zorder(LCU_WIDTH_C, lcu_px.x, lcu_px.y);
+
+    // We need the rounded & shifted coordinates for the chroma coeff calculation
+    cu_loc_t chroma_loc;
+    uvg_cu_loc_ctor(&chroma_loc, lcu_px.x, lcu_px.y, cu_loc->width, cu_loc->height);

    if((pred_cu->joint_cb_cr & 3) == 0){
-      coeff_bits += uvg_get_coeff_cost(state, lcu->coeff.u, NULL, &loc, 2, scan_order, 0, COEFF_ORDER_CU);
-      coeff_bits += uvg_get_coeff_cost(state, lcu->coeff.v, NULL, &loc, 2, scan_order, 0, COEFF_ORDER_CU);
+      coeff_bits += uvg_get_coeff_cost(state, lcu->coeff.u, NULL, cu_loc, 2, scan_order, 0, COEFF_ORDER_CU);
+      coeff_bits += uvg_get_coeff_cost(state, lcu->coeff.v, NULL, cu_loc, 2, scan_order, 0, COEFF_ORDER_CU);
    }
    else {
-      coeff_bits += uvg_get_coeff_cost(state, lcu->coeff.joint_uv, NULL, &loc, 2, scan_order, 0, COEFF_ORDER_CU);
+      coeff_bits += uvg_get_coeff_cost(state, lcu->coeff.joint_uv, NULL, cu_loc, 2, scan_order, 0, COEFF_ORDER_CU);
      
    }
  }
@ -507,39 +539,30 @@ double uvg_cu_rd_cost_chroma(const encoder_state_t *const state,

 static double cu_rd_cost_tr_split_accurate(
  const encoder_state_t* const state,
-  const int x_px,
-  const int y_px,
-  const int depth,
  const cu_info_t* const pred_cu,
  lcu_t* const lcu,
  enum uvg_tree_type tree_type,
-  uint8_t isp_cbf) {
-  const int width = LCU_WIDTH >> depth;
-  const int height = width; // TODO: height for non-square blocks
-
-  cu_loc_t loc;
-  uvg_cu_loc_ctor(&loc, x_px, y_px, width, height);
-
+  uint8_t isp_cbf,
+  const cu_loc_t* const cu_loc) {
+  const int width = cu_loc->width;
+  const int height = cu_loc->height; // TODO: height for non-square blocks
+  
  const int skip_residual_coding = pred_cu->skipped || (pred_cu->type != CU_INTRA && pred_cu->cbf == 0);
  // cur_cu is used for TU parameters.
-  cu_info_t* const tr_cu = LCU_GET_CU_AT_PX(lcu, x_px, y_px);
+  cu_info_t* const tr_cu = LCU_GET_CU_AT_PX(lcu, cu_loc->local_x >> (tree_type == UVG_CHROMA_T), cu_loc->local_y >> (tree_type == UVG_CHROMA_T));

  double coeff_bits = 0;
  double tr_tree_bits = 0;

-  // Check that lcu is not in 
-  assert(x_px >= 0 && x_px < LCU_WIDTH);
-  assert(y_px >= 0 && y_px < LCU_WIDTH);
-
-  const uint8_t tr_depth = tr_cu->tr_depth - depth;
-
+  const int depth = 6 - uvg_g_convert_to_log2[cu_loc->width];
+  
  const int cb_flag_u = tr_cu->joint_cb_cr ? tr_cu->joint_cb_cr >> 1 : cbf_is_set(tr_cu->cbf, depth, COLOR_U);
  const int cb_flag_v = tr_cu->joint_cb_cr ? tr_cu->joint_cb_cr & 1 : cbf_is_set(tr_cu->cbf, depth, COLOR_V);

  cabac_data_t* cabac = (cabac_data_t*)&state->search_cabac;

  {
-    int cbf = cbf_is_set_any(pred_cu->cbf, depth);
+    int cbf = cbf_is_set_any(tr_cu->cbf, depth);
    // Only need to signal coded block flag if not skipped or merged
    // skip = no coded residual, merge = coded residual
    if (pred_cu->type != CU_INTRA && (!pred_cu->merged)) {
@ -548,24 +571,30 @@ static double cu_rd_cost_tr_split_accurate(

  }

-  bool has_chroma = state->encoder_control->chroma_format != UVG_CSP_400 && (depth != 4 || (x_px % 8 && y_px % 8)) && tree_type != UVG_LUMA_T;
+  bool has_chroma = state->encoder_control->chroma_format != UVG_CSP_400 && (depth != 4 || (cu_loc->x % 8 && cu_loc->y % 8)) && tree_type != UVG_LUMA_T;
  if( !skip_residual_coding && has_chroma) {
-    if(tr_cu->depth == depth || cbf_is_set(pred_cu->cbf, depth - 1, COLOR_U)) {
+    if(tr_cu->tr_depth == depth) {
      CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.qt_cbf_model_cb[0]), cb_flag_u, tr_tree_bits, "cbf_cb");
    } 
-    if(tr_cu->depth == depth || cbf_is_set(pred_cu->cbf, depth - 1, COLOR_V)) {
+    if(tr_cu->tr_depth == depth) {
      CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.qt_cbf_model_cr[cb_flag_u]), cb_flag_v, tr_tree_bits, "cbf_cr");
    } 
  }

-  if (tr_depth > 0) {
-    int offset = LCU_WIDTH >> (depth + 1);
+  if (cu_loc->width > TR_MAX_WIDTH || cu_loc->height > TR_MAX_WIDTH) {
    double sum = 0;

-    sum += cu_rd_cost_tr_split_accurate(state, x_px, y_px, depth + 1, pred_cu, lcu, tree_type, isp_cbf);
-    sum += cu_rd_cost_tr_split_accurate(state, x_px + offset, y_px, depth + 1, pred_cu, lcu, tree_type, isp_cbf);
-    sum += cu_rd_cost_tr_split_accurate(state, x_px, y_px + offset, depth + 1, pred_cu, lcu, tree_type, isp_cbf);
-    sum += cu_rd_cost_tr_split_accurate(state, x_px + offset, y_px + offset, depth + 1, pred_cu, lcu, tree_type, isp_cbf);
+    const int half_width = cu_loc->width >> 1;
+    const int half_height = cu_loc->height >> 1;
+    cu_loc_t split_cu_loc;
+    uvg_cu_loc_ctor(&split_cu_loc, cu_loc->x, cu_loc->y, half_width, half_height);
+    sum += cu_rd_cost_tr_split_accurate(state, pred_cu, lcu, tree_type, isp_cbf, &split_cu_loc);
+    uvg_cu_loc_ctor(&split_cu_loc, cu_loc->x + half_width, cu_loc->y, half_width, half_height);
+    sum += cu_rd_cost_tr_split_accurate(state, pred_cu, lcu, tree_type, isp_cbf, &split_cu_loc);
+    uvg_cu_loc_ctor(&split_cu_loc, cu_loc->x, cu_loc->y + half_height, half_width, half_height);
+    sum += cu_rd_cost_tr_split_accurate(state, pred_cu, lcu, tree_type, isp_cbf, &split_cu_loc);
+    uvg_cu_loc_ctor(&split_cu_loc, cu_loc->x + half_width, cu_loc->y + half_height, half_width, half_height);
+    sum += cu_rd_cost_tr_split_accurate(state, pred_cu, lcu, tree_type, isp_cbf, &split_cu_loc);
    return sum + tr_tree_bits * state->lambda;
  }
  const int cb_flag_y = cbf_is_set(tr_cu->cbf, depth, COLOR_Y) && tree_type != UVG_CHROMA_T;
@ -573,7 +602,7 @@ static double cu_rd_cost_tr_split_accurate(
  const bool is_isp = !(pred_cu->type == CU_INTER || pred_cu->intra.isp_mode == ISP_MODE_NO_ISP);
  // Add transform_tree cbf_luma bit cost.
  if (!is_isp) {
-    const int is_tr_split = depth - tr_cu->depth;
+    const int is_tr_split = cu_loc->width > TR_MAX_WIDTH || cu_loc->height > TR_MAX_WIDTH;
    if ((pred_cu->type == CU_INTRA ||
      is_tr_split ||
      cb_flag_u ||
@ -610,7 +639,7 @@ static double cu_rd_cost_tr_split_accurate(
  // SSD between reconstruction and original
  unsigned luma_ssd = 0;
  if (!state->encoder_control->cfg.lossless && tree_type != UVG_CHROMA_T) {
-    int index = y_px * LCU_WIDTH + x_px;
+    int index = cu_loc->local_x + LCU_WIDTH * cu_loc->local_y;
    luma_ssd = uvg_pixels_calc_ssd(&lcu->ref.y[index], &lcu->rec.y[index],
      LCU_WIDTH, LCU_WIDTH,
      width);
@ -623,12 +652,12 @@ static double cu_rd_cost_tr_split_accurate(
    if (can_use_tr_skip) {
      CABAC_FBITS_UPDATE(cabac, &cabac->ctx.transform_skip_model_luma, tr_cu->tr_idx == MTS_SKIP, tr_tree_bits, "transform_skip_flag");
    }
-    int8_t luma_scan_mode = uvg_get_scan_order(pred_cu->type, pred_cu->intra.mode, depth);
+    int8_t luma_scan_mode = SCAN_DIAG;
    if (pred_cu->type == CU_INTER || pred_cu->intra.isp_mode == ISP_MODE_NO_ISP) {
      //const coeff_t* coeffs = &lcu->coeff.y[xy_to_zorder(LCU_WIDTH, x_px, y_px)];
      const coeff_t* coeffs = lcu->coeff.y;

-      coeff_bits += uvg_get_coeff_cost(state, coeffs, tr_cu, &loc, 0, luma_scan_mode, pred_cu->tr_idx == MTS_SKIP, COEFF_ORDER_CU);
+      coeff_bits += uvg_get_coeff_cost(state, coeffs, tr_cu, cu_loc, 0, luma_scan_mode, pred_cu->tr_idx == MTS_SKIP, COEFF_ORDER_CU);
    }
    else {
      int split_type = pred_cu->intra.isp_mode;
@ -636,7 +665,7 @@ static double cu_rd_cost_tr_split_accurate(

      for (int i = 0; i < split_limit; ++i) {
        cu_loc_t split_loc;
-        uvg_get_isp_split_loc(&split_loc, x_px, y_px, width, height, i, split_type, true);
+        uvg_get_isp_split_loc(&split_loc, cu_loc->x, cu_loc->y, width, height, i, split_type, true);
        const int part_x = split_loc.x;
        const int part_y = split_loc.y;

@ -649,8 +678,8 @@ static double cu_rd_cost_tr_split_accurate(
    }
  }

-  if(depth == 4 || tree_type == UVG_LUMA_T) {
-    if (uvg_is_lfnst_allowed(state, tr_cu, width, width, x_px, y_px, tree_type, COLOR_Y, lcu)) {
+  if(cu_loc->width == 4 || tree_type == UVG_LUMA_T) {
+    if (uvg_is_lfnst_allowed(state, tr_cu, width, height, cu_loc->local_x, cu_loc->local_y, tree_type, COLOR_Y, lcu)) {
      const int lfnst_idx = tr_cu->lfnst_idx;
      CABAC_FBITS_UPDATE(
        cabac,
@ -672,14 +701,17 @@ static double cu_rd_cost_tr_split_accurate(

  unsigned chroma_ssd = 0;
  if(has_chroma) {
-    const vector2d_t lcu_px = { (x_px >> (tree_type != UVG_CHROMA_T)) & ~3, (y_px >> (tree_type != UVG_CHROMA_T)) &~3  };
-    uvg_cu_loc_ctor(&loc, lcu_px.x, lcu_px.y, width, height);
-    const int chroma_width  = MAX(4, LCU_WIDTH >> (depth + 1));
-    const int chroma_height = chroma_width; // TODO: height for non-square blocks
-    int8_t scan_order = uvg_get_scan_order(pred_cu->type, pred_cu->intra.mode_chroma, depth);
+    cu_loc_t chroma_loc;
+    const vector2d_t lcu_px = { (cu_loc->local_x >> 1) & ~3, (cu_loc->local_y >> 1) &~3  };
+    uvg_cu_loc_ctor(&chroma_loc, lcu_px.x, lcu_px.y, width, height);
+    const int chroma_width  = cu_loc->chroma_width;
+    const int chroma_height = cu_loc->chroma_height; // TODO: height for non-square blocks
+    int8_t scan_order = SCAN_DIAG;
    //const unsigned index = xy_to_zorder(LCU_WIDTH_C, lcu_px.x, lcu_px.y);

-    const bool chroma_can_use_tr_skip = state->encoder_control->cfg.trskip_enable && chroma_width <= (1 << state->encoder_control->cfg.trskip_max_size);
+    const bool chroma_can_use_tr_skip = state->encoder_control->cfg.trskip_enable
+      && chroma_width <= (1 << state->encoder_control->cfg.trskip_max_size)
+      && chroma_height <= (1 << state->encoder_control->cfg.trskip_max_size);
    if(pred_cu->joint_cb_cr == 0) {
      if (!state->encoder_control->cfg.lossless) {
        int index = lcu_px.y * LCU_WIDTH_C + lcu_px.x;
@ -697,8 +729,8 @@ static double cu_rd_cost_tr_split_accurate(
      if(chroma_can_use_tr_skip && cb_flag_v) {
        CABAC_FBITS_UPDATE(cabac, &cabac->ctx.transform_skip_model_chroma, tr_cu->tr_skip & 4, tr_tree_bits, "transform_skip_flag");        
      }
-      coeff_bits += uvg_get_coeff_cost(state, lcu->coeff.u, tr_cu, &loc, COLOR_U, scan_order, tr_cu->tr_skip & 2, COEFF_ORDER_CU);
-      coeff_bits += uvg_get_coeff_cost(state, lcu->coeff.v, tr_cu, &loc, COLOR_V, scan_order, tr_cu->tr_skip & 4, COEFF_ORDER_CU);
+      coeff_bits += uvg_get_coeff_cost(state, lcu->coeff.u, tr_cu, &chroma_loc, COLOR_U, scan_order, tr_cu->tr_skip & 2, COEFF_ORDER_CU);
+      coeff_bits += uvg_get_coeff_cost(state, lcu->coeff.v, tr_cu, &chroma_loc, COLOR_V, scan_order, tr_cu->tr_skip & 4, COEFF_ORDER_CU);
      
    }
    else {
@ -715,12 +747,12 @@ static double cu_rd_cost_tr_split_accurate(
      if (chroma_can_use_tr_skip) {
        CABAC_FBITS_UPDATE(cabac, &cabac->ctx.transform_skip_model_chroma, tr_cu->tr_skip & 2, tr_tree_bits, "transform_skip_flag");
      }
-      coeff_bits += uvg_get_coeff_cost(state, lcu->coeff.joint_uv, tr_cu, &loc, COLOR_U, scan_order, 0, COEFF_ORDER_CU);
+      coeff_bits += uvg_get_coeff_cost(state, lcu->coeff.joint_uv, tr_cu, &chroma_loc, COLOR_U, scan_order, 0, COEFF_ORDER_CU);
    }
  }

-  if (uvg_is_lfnst_allowed(state, tr_cu, width, height, x_px, y_px, tree_type, depth == 4 || tree_type == UVG_CHROMA_T ? COLOR_UV : COLOR_Y, lcu)) {
-    const int lfnst_idx = (depth != 4 && tree_type != UVG_CHROMA_T) ? tr_cu->lfnst_idx : tr_cu->cr_lfnst_idx;
+  if (uvg_is_lfnst_allowed(state, tr_cu, width, height, cu_loc->local_x, cu_loc->local_y, tree_type, cu_loc->width == 4 || tree_type == UVG_CHROMA_T ? COLOR_UV : COLOR_Y, lcu)) {
+    const int lfnst_idx = (cu_loc->width != 4 && tree_type != UVG_CHROMA_T) ? tr_cu->lfnst_idx : tr_cu->cr_lfnst_idx;
    CABAC_FBITS_UPDATE(
      cabac,
      &cabac->ctx.lfnst_idx_model[tr_cu->depth == 4 || tree_type != UVG_BOTH_T],
@ -739,7 +771,7 @@ static double cu_rd_cost_tr_split_accurate(
  tr_cu->lfnst_last_scan_pos = false;
  tr_cu->violates_lfnst_constrained_luma = false;
  tr_cu->violates_lfnst_constrained_chroma = false;
-  if (uvg_is_mts_allowed(state, tr_cu) && tree_type != UVG_CHROMA_T) {
+  if (uvg_is_mts_allowed(state, tr_cu, cu_loc) && tree_type != UVG_CHROMA_T) {

    bool symbol = tr_cu->tr_idx != 0;
    int ctx_idx = 0;
@ -1035,10 +1067,6 @@ static double search_cu(
        if ((split_tree.current_depth != 4 || (x % 8 && y % 8)) && state->encoder_control->chroma_format != UVG_CSP_400 && tree_type != UVG_LUMA_T) {

          intra_search.pred_cu.joint_cb_cr = 0;
-          // There is almost no benefit to doing the chroma mode search for
-          // rd2. Possibly because the luma mode search already takes chroma
-          // into account, so there is less of a chanse of luma mode being
-          // really bad for chroma.
          if(tree_type == UVG_CHROMA_T) {
            intra_search.pred_cu.intra = uvg_get_co_located_luma_cu(x, y, luma_width, luma_width, NULL, state->tile->frame->cu_array, UVG_CHROMA_T)->intra;
            intra_mode = intra_search.pred_cu.intra.mode;
@ -1046,7 +1074,7 @@ static double search_cu(
          }
          intra_search.pred_cu.intra.mode_chroma = intra_search.pred_cu.intra.mode;
          if (ctrl->cfg.rdo >= 2 || ctrl->cfg.jccr || ctrl->cfg.lfnst) {
-            uvg_search_cu_intra_chroma(state, x, y, depth, lcu, &intra_search, tree_type);
+            uvg_search_cu_intra_chroma(state, cu_loc, lcu, &intra_search, tree_type);

            if (intra_search.pred_cu.joint_cb_cr == 0) {
              intra_search.pred_cu.joint_cb_cr = 4;
@ -1066,7 +1094,7 @@ static double search_cu(
                             false,
                             true);
          if(tree_type != UVG_CHROMA_T) {
-            intra_cost += uvg_cu_rd_cost_chroma(state, x_local, y_local, depth, &intra_search.pred_cu, lcu);
+            intra_cost += uvg_cu_rd_cost_chroma(state, &intra_search.pred_cu, lcu, cu_loc);
          }
          else {
            intra_cost = intra_search.cost;
@ -1080,7 +1108,7 @@ static double search_cu(
        }
        intra_search.pred_cu.intra.mode = intra_mode;
        if(tree_type == UVG_CHROMA_T) {
-          uvg_lcu_fill_trdepth(lcu, x_local, y_local, split_tree.current_depth, split_tree.current_depth, tree_type);
+          uvg_lcu_fill_trdepth(lcu, cu_loc, split_tree.current_depth, tree_type);
        }
      }
      if (intra_cost < cost) {
@ -1187,14 +1215,14 @@ static double search_cu(
        // This will no longer be necessary if the transform depths are not shared.
        int tr_depth = MAX(1, split_tree.current_depth);

-        uvg_lcu_fill_trdepth(lcu, x, y, depth, tr_depth, tree_type);
+        uvg_lcu_fill_trdepth(lcu, cu_loc, tr_depth, tree_type);

        const bool has_chroma = state->encoder_control->chroma_format != UVG_CSP_400;
        uvg_inter_recon_cu(state, lcu, true, has_chroma, cu_loc);

        if (ctrl->cfg.zero_coeff_rdo && !ctrl->cfg.lossless && !ctrl->cfg.rdoq_enable) {
          //Calculate cost for zero coeffs
-          inter_zero_coeff_cost = cu_zero_coeff_cost(state, work_tree, x, y, split_tree.current_depth) + inter_bitcost * state->lambda;
+          inter_zero_coeff_cost = cu_zero_coeff_cost(state, work_tree, cu_loc, split_tree.current_depth) + inter_bitcost * state->lambda;

        }
        cu_loc_t loc;
@ -1239,13 +1267,13 @@ static double search_cu(
    
    cost = bits * state->lambda;

-    cost += cu_rd_cost_tr_split_accurate(state, x_local, y_local, depth, cur_cu, lcu, tree_type, 0);
+    cost += cu_rd_cost_tr_split_accurate(state, cur_cu, lcu, tree_type, 0, cu_loc);
    
    if (ctrl->cfg.zero_coeff_rdo && inter_zero_coeff_cost <= cost) {
      cost = inter_zero_coeff_cost;

      // Restore saved pixels from lower level of the working tree.
-      copy_cu_pixels(x_local, y_local, cu_width, &work_tree[split_tree.current_depth + 1], lcu, tree_type);
+      copy_cu_pixels(&work_tree[split_tree.current_depth + 1], lcu, cu_loc, tree_type);

      if (cur_cu->merged) {
        cur_cu->merged = 0;
@ -1256,7 +1284,7 @@ static double search_cu(
      if (cur_cu->tr_depth != 0) {
        // Reset transform depth since there are no coefficients. This
        // ensures that CBF is cleared for the whole area of the CU.
-        uvg_lcu_fill_trdepth(lcu, x, y, depth, depth, tree_type);
+        uvg_lcu_fill_trdepth(lcu, cu_loc, depth, tree_type);
      }

      cur_cu->cbf = 0;
@ -1317,10 +1345,8 @@ static double search_cu(
        left_cu,
        above_cu,
        1,
+        cu_loc,
        depth,
-        cu_width,
-        x >> (tree_type == UVG_CHROMA_T),
-        y >> (tree_type == UVG_CHROMA_T),
        tree_type,
        &split_bits);
    }
@ -1380,8 +1406,7 @@ static double search_cu(
        uvg_write_split_flag(state, &state->search_cabac,
                             x > 0 ? LCU_GET_CU_AT_PX(lcu, SUB_SCU(x) - 1, SUB_SCU(y)) : NULL,
                             y > 0 ? LCU_GET_CU_AT_PX(lcu, SUB_SCU(x), SUB_SCU(y) - 1) : NULL,
-                             0, depth, cu_width, x, y, tree_type,
-                             &bits);
+                             0, cu_loc, depth, tree_type, &bits);

        cur_cu->intra = cu_d1->intra;
        cur_cu->type = CU_INTRA;
@ -1391,7 +1416,7 @@ static double search_cu(
        cur_cu->lfnst_idx = 0;
        cur_cu->cr_lfnst_idx = 0;

-        uvg_lcu_fill_trdepth(lcu, x, y, depth, cur_cu->tr_depth, tree_type);
+        uvg_lcu_fill_trdepth(lcu, cu_loc, cur_cu->tr_depth, tree_type);
        lcu_fill_cu_info(lcu, x_local, y_local, cu_width, cu_width, cur_cu);
        
        intra_search_data_t proxy;
@ -1404,12 +1429,12 @@ static double search_cu(
                           lcu,
                           tree_type,
                           true,
-                           state->encoder_control->chroma_format == UVG_CSP_400);
+                           state->encoder_control->chroma_format != UVG_CSP_400);

        double mode_bits = calc_mode_bits(state, lcu, cur_cu, cu_loc) + bits;
        cost += mode_bits * state->lambda;

-        cost += cu_rd_cost_tr_split_accurate(state, x_local, y_local, depth, cur_cu, lcu, tree_type, 0);
+        cost += cu_rd_cost_tr_split_accurate(state, cur_cu, lcu, tree_type, 0, cu_loc);

        memcpy(&post_seach_cabac, &state->search_cabac, sizeof(post_seach_cabac));
        memcpy(&state->search_cabac, &temp_cabac, sizeof(temp_cabac));
@ -1419,7 +1444,7 @@ static double search_cu(
    if (split_cost < cost) {
      // Copy split modes to this depth.
      cost = split_cost;
-      work_tree_copy_up(x_local, y_local, depth, work_tree, state->encoder_control->cfg.jccr, tree_type);
+      work_tree_copy_up(work_tree, state->encoder_control->cfg.jccr, tree_type, cu_loc, depth);
 #if UVG_DEBUG
      //debug_split = 1;
 #endif
@ -1427,7 +1452,7 @@ static double search_cu(
      // Copy this CU's mode all the way down for use in adjacent CUs mode
      // search.
      memcpy(&state->search_cabac, &post_seach_cabac, sizeof(post_seach_cabac));
-      work_tree_copy_down(x_local, y_local, depth, work_tree, tree_type);
+      work_tree_copy_down(depth, work_tree, tree_type, cu_loc);
      downsample_cclm_rec(
        state, x, y, cu_width / 2, cu_width / 2, lcu->rec.y, lcu->left_ref.y[64]
      );
@ -1454,7 +1479,7 @@ static double search_cu(
  } else if (depth >= 0 && depth < MAX_PU_DEPTH) {
    // Need to copy modes down since the lower level of the work tree is used
    // when searching SMP and AMP blocks.
-    work_tree_copy_down(x_local, y_local, depth, work_tree, tree_type);
+    work_tree_copy_down(depth, work_tree, tree_type, cu_loc);
    if(tree_type != UVG_CHROMA_T) {
      downsample_cclm_rec(
        state, x, y, cu_width / 2, cu_width / 2, lcu->rec.y, lcu->left_ref.y[64]
--- a/src/search.h
+++ b/src/search.h
@ -84,18 +84,24 @@ void uvg_sort_keys_by_cost(unit_stats_map_t *__restrict map);

 void uvg_search_lcu(encoder_state_t *state, int x, int y, const yuv_t *hor_buf, const yuv_t *ver_buf, lcu_coeff_t *coeff);

-double uvg_cu_rd_cost_luma(const encoder_state_t *const state,
-                           const int x_px, const int y_px, const int depth,
-                           const cu_info_t *const pred_cu,
-                           lcu_t *const lcu,
-                           uint8_t isp_cbf);
-double uvg_cu_rd_cost_chroma(const encoder_state_t *const state,
-                             const int x_px, const int y_px, const int depth,
-                             cu_info_t *const pred_cu,
-                             lcu_t *const lcu);
+double uvg_cu_rd_cost_luma(
+  const encoder_state_t *const state,
+  const cu_loc_t* const cu_loc,
+  const cu_info_t *const pred_cu,
+  lcu_t *const lcu,
+  uint8_t isp_cbf);
+double uvg_cu_rd_cost_chroma(
+  const encoder_state_t *const state,
+  cu_info_t *const pred_cu,
+  lcu_t *const lcu,
+  const cu_loc_t * const);

-void uvg_lcu_fill_trdepth(lcu_t *lcu, int x_px, int y_px, int depth, uint8_t tr_depth, enum uvg_tree_type
-                          tree_type);
+void uvg_lcu_fill_trdepth(
+  lcu_t *lcu,
+  const cu_loc_t* const cu_loc,
+  uint8_t tr_depth,
+  enum uvg_tree_type
+  tree_type);

 void uvg_intra_recon_lcu_luma(encoder_state_t * const state, int x, int y, int depth, int8_t intra_mode, cu_info_t *cur_cu, lcu_t *lcu);
 void uvg_intra_recon_lcu_chroma(encoder_state_t * const state, int x, int y, int depth, int8_t intra_mode, cu_info_t *cur_cu, lcu_t *lcu);
--- a/src/search_inter.c
+++ b/src/search_inter.c
@ -1811,7 +1811,7 @@ static void search_pu_inter(
        cur_pu->inter.mv[0][1]  = info->merge_cand[merge_idx].mv[0][1];
        cur_pu->inter.mv[1][0]  = info->merge_cand[merge_idx].mv[1][0];
        cur_pu->inter.mv[1][1]  = info->merge_cand[merge_idx].mv[1][1];
-        uvg_lcu_fill_trdepth(lcu, cu_loc->x, cu_loc->y, depth, MAX(1, depth), UVG_BOTH_T);
+        uvg_lcu_fill_trdepth(lcu, cu_loc, MAX(1, depth), UVG_BOTH_T);
        uvg_inter_recon_cu(state, lcu, true, false, cu_loc);

        uvg_quantize_lcu_residual(state, true, false, false, cu_loc, depth, cur_pu, lcu, true, UVG_BOTH_T);
@ -2129,12 +2129,12 @@ void uvg_cu_cost_inter_rd2(
  const uint8_t depth = 6 - uvg_g_convert_to_log2[cu_loc->width];
  int tr_depth = MAX(1, depth);

-  uvg_lcu_fill_trdepth(lcu, cu_loc->x, cu_loc->y, depth, tr_depth, UVG_BOTH_T);
+  uvg_lcu_fill_trdepth(lcu, cu_loc, tr_depth, UVG_BOTH_T);

  const int x_px = SUB_SCU(cu_loc->x);
  const int y_px = SUB_SCU(cu_loc->y);
-  const int width = LCU_WIDTH >> depth;
-  const int height = width; // TODO: non-square blocks
+  const int width = cu_loc->width;
+  const int height = cu_loc->height;

  cabac_data_t cabac_copy;
  memcpy(&cabac_copy, &state->search_cabac, sizeof(cabac_copy));
@ -2155,10 +2155,10 @@ void uvg_cu_cost_inter_rd2(
    int index = y_px / 2 * LCU_WIDTH_C + x_px / 2;
    double ssd_u = uvg_pixels_calc_ssd(&lcu->ref.u[index], &lcu->rec.u[index],
                                       LCU_WIDTH_C, LCU_WIDTH_C,
-                                       width / 2);
+                                       cu_loc->chroma_width);
    double ssd_v = uvg_pixels_calc_ssd(&lcu->ref.v[index], &lcu->rec.v[index],
                                       LCU_WIDTH_C, LCU_WIDTH_C,
-                                       width / 2);
+                                       cu_loc->chroma_width);
    ssd += (ssd_u + ssd_v) * UVG_CHROMA_MULT;
  }
  double no_cbf_bits;
@ -2217,12 +2217,10 @@ void uvg_cu_cost_inter_rd2(
    uvg_chorma_ts_out_t chorma_ts_out;
    uvg_chroma_transform_search(
      state,
-      depth,
      lcu,
      &cabac_copy,
      cu_loc,
      index,
-      0,
      cur_cu,
      u_pred,
      v_pred,
@ -2262,10 +2260,10 @@ void uvg_cu_cost_inter_rd2(
  int cbf = cbf_is_set_any(cur_cu->cbf, depth);
  
  if(cbf) {
-    *inter_cost = uvg_cu_rd_cost_luma(state, x_px, y_px, depth, cur_cu, lcu, 0);
+    *inter_cost = uvg_cu_rd_cost_luma(state, cu_loc, cur_cu, lcu, 0);
    if (reconstruct_chroma) {
      if (cur_cu->depth != cur_cu->tr_depth || !state->encoder_control->cfg.jccr) {
-        *inter_cost += uvg_cu_rd_cost_chroma(state, x_px, y_px, depth, cur_cu, lcu);
+        *inter_cost += uvg_cu_rd_cost_chroma(state, cur_cu, lcu, cu_loc);
      }
      else {
        *inter_cost += chroma_cost;
--- a/src/search_intra.c
+++ b/src/search_intra.c
@ -431,9 +431,7 @@ static double search_intra_trdepth(
        }
        double rd_cost = uvg_cu_rd_cost_luma(
          state,
-          lcu_px.x,
-          lcu_px.y,
-          depth,
+          cu_loc,
          pred_cu,
          lcu,
          search_data->best_isp_cbfs);
@ -502,11 +500,9 @@ static double search_intra_trdepth(
          );
        best_rd_cost += uvg_cu_rd_cost_chroma(
          state,
-          lcu_px.x,
-          lcu_px.y,
-          depth,
          pred_cu,
-          lcu);
+          lcu,
+          cu_loc);
        pred_cu->intra.mode = luma_mode;

        // Check lfnst constraints for chroma
@ -552,7 +548,7 @@ static double search_intra_trdepth(
                         UVG_BOTH_T,
                         false,
                         true);
-      best_rd_cost += uvg_cu_rd_cost_chroma(state, lcu_px.x, lcu_px.y, depth, pred_cu, lcu);
+      best_rd_cost += uvg_cu_rd_cost_chroma(state, pred_cu, lcu, cu_loc);
      pred_cu->intra.mode = luma_mode;
    }
    pred_cu->tr_skip = best_tr_idx == MTS_SKIP;
@ -655,7 +651,7 @@ static double search_intra_trdepth(
  if (depth == 0 || split_cost < nosplit_cost) {
    return split_cost;
  } else {
-    uvg_lcu_fill_trdepth(lcu, cu_loc->x, cu_loc->y, depth, depth, tree_type);
+    uvg_lcu_fill_trdepth(lcu, cu_loc, depth, tree_type);

    pred_cu->cbf = nosplit_cbf;

@ -690,19 +686,15 @@ static void sort_modes(intra_search_data_t* __restrict modes, uint8_t length)

 static int search_intra_chroma_rough(
  encoder_state_t * const state,
-  int x_px,
-  int y_px,
-  int depth,
-  const vector2d_t* const lcu_px,
  intra_search_data_t* chroma_data,
  lcu_t* lcu,
  int8_t luma_mode,
-  enum uvg_tree_type tree_type)
+  enum uvg_tree_type tree_type,
+  const cu_loc_t* const cu_loc)
 {
-  assert(depth != 4 || (x_px & 4 && y_px & 4));
-  const int_fast8_t log2_width_c = MAX(LOG2_LCU_WIDTH - depth - 1, 2);
+  const int_fast8_t log2_width_c = uvg_g_convert_to_log2[cu_loc->chroma_width];
  const vector2d_t pic_px = { state->tile->frame->width, state->tile->frame->height };
-  const vector2d_t luma_px = { x_px & ~7, y_px & ~7 };
+  const vector2d_t luma_px = { cu_loc->x & ~7, cu_loc->y & ~7 };
  const int width = 1 << log2_width_c;
  const int height = width; // TODO: height for non-square blocks

@ -714,7 +706,7 @@ static int search_intra_chroma_rough(
  uvg_intra_references refs_v;
  uvg_intra_build_reference(&loc, &loc, COLOR_V, &luma_px, &pic_px, lcu, &refs_v, state->encoder_control->cfg.wpp, NULL, 0, 0);

-  vector2d_t lcu_cpx = { (lcu_px->x & ~7) / 2, (lcu_px->y & ~7) / 2 };
+  vector2d_t lcu_cpx = { (cu_loc->local_x & ~7) / 2, (cu_loc->local_y & ~7) / 2 };
  uvg_pixel* orig_u = &lcu->ref.u[lcu_cpx.x + lcu_cpx.y * LCU_WIDTH_C];
  uvg_pixel* orig_v = &lcu->ref.v[lcu_cpx.x + lcu_cpx.y * LCU_WIDTH_C];
  
@ -1494,29 +1486,19 @@ double uvg_chroma_mode_bits(const encoder_state_t *state, int8_t chroma_mode, in

 int8_t uvg_search_intra_chroma_rdo(
  encoder_state_t * const state,
-  int x_px,
-  int y_px,
-  int depth,
  int8_t num_modes,
  lcu_t *const lcu,
  intra_search_data_t* chroma_data,
  int8_t luma_mode,
-  enum uvg_tree_type tree_type)
+  enum uvg_tree_type tree_type,
+  const cu_loc_t* const cu_loc)
 {
-  const bool reconstruct_chroma = (depth != 4) || (x_px & 4 && y_px & 4);
-
-  const int luma_width  = LCU_WIDTH >> depth;
-  const int luma_height = LCU_WIDTH >> depth; // TODO: height
-
-  int log2_width = MAX(LOG2_LCU_WIDTH - depth - 1, 2);
+  const bool reconstruct_chroma = true;
  
-  cu_loc_t loc;
-  uvg_cu_loc_ctor(&loc, x_px & ~7, y_px & ~7, luma_width, luma_height);
-
-  const int chroma_width  = loc.chroma_width;
-  const int chroma_height = loc.chroma_height;
+  const int chroma_width  = cu_loc->chroma_width;
+  const int chroma_height = cu_loc->chroma_height;
  uvg_intra_references refs[2];
-  const vector2d_t luma_px = { x_px & ~7, y_px & ~7 };
+  const vector2d_t luma_px = { cu_loc->x & ~7, cu_loc->y & ~7 };
  const vector2d_t pic_px = {
    state->tile->frame->width,
    state->tile->frame->height,
@ -1524,17 +1506,17 @@ int8_t uvg_search_intra_chroma_rdo(


  if (reconstruct_chroma) {
-    uvg_intra_build_reference(&loc, &loc, COLOR_U, &luma_px, &pic_px, lcu, &refs[0], state->encoder_control->cfg.wpp, NULL, 0, 0);
-    uvg_intra_build_reference(&loc, &loc, COLOR_V, &luma_px, &pic_px, lcu, &refs[1], state->encoder_control->cfg.wpp, NULL, 0, 0);
+    uvg_intra_build_reference(cu_loc, cu_loc, COLOR_U, &luma_px, &pic_px, lcu, &refs[0], state->encoder_control->cfg.wpp, NULL, 0, 0);
+    uvg_intra_build_reference(cu_loc, cu_loc, COLOR_V, &luma_px, &pic_px, lcu, &refs[1], state->encoder_control->cfg.wpp, NULL, 0, 0);
    
-    const vector2d_t lcu_px = { SUB_SCU(x_px), SUB_SCU(y_px) };
+    const vector2d_t lcu_px = { cu_loc->local_x, cu_loc->local_y };
    cabac_data_t temp_cabac;
    memcpy(&temp_cabac, &state->search_cabac, sizeof(cabac_data_t));
    
-    const int offset = ((lcu_px.x & ~7) >> 1) + ((lcu_px.y & ~7) >> 1)* LCU_WIDTH_C;
+    const int offset = ((cu_loc->local_x & ~7) >> 1) + ((cu_loc->local_y & ~7) >> 1)* LCU_WIDTH_C;

    int lfnst_modes_to_check[3];
-    if((depth == 4 || tree_type == UVG_CHROMA_T) && state->encoder_control->cfg.lfnst) {
+    if((cu_loc->width == 4 || tree_type == UVG_CHROMA_T) && state->encoder_control->cfg.lfnst) {
      for (int i = 0; i < 3; ++i) {
        lfnst_modes_to_check[i] = i;
      }
@ -1572,7 +1554,7 @@ int8_t uvg_search_intra_chroma_rdo(
          uvg_intra_predict(
            state,
            &refs[COLOR_U - 1],
-            &loc,
+            cu_loc,
            COLOR_U,
            u_pred,
            &chroma_data[mode_i],
@ -1581,7 +1563,7 @@ int8_t uvg_search_intra_chroma_rdo(
          uvg_intra_predict(
            state,
            &refs[COLOR_V - 1],
-            &loc,
+            cu_loc,
            COLOR_V,
            v_pred,
            &chroma_data[mode_i],
@ -1606,12 +1588,10 @@ int8_t uvg_search_intra_chroma_rdo(
          uvg_chorma_ts_out_t chorma_ts_out;
          uvg_chroma_transform_search(
            state,
-            depth,
            lcu,
            &temp_cabac,
-            &loc,
+            cu_loc,
            offset,
-            mode,
            pred_cu,
            u_pred,
            v_pred,
@ -1653,12 +1633,12 @@ int8_t uvg_search_intra_chroma_rdo(
          state->search_cabac.update = 1;
          chroma_data[mode_i].cost = mode_bits * state->lambda;
          uvg_intra_recon_cu(state,
-                             &chroma_data[mode_i], &loc,
+                             &chroma_data[mode_i], cu_loc,
                             pred_cu, lcu,
                             tree_type,
                             false,
                             true);
-          chroma_data[mode_i].cost += uvg_cu_rd_cost_chroma(state, lcu_px.x, lcu_px.y, depth, pred_cu, lcu);
+          chroma_data[mode_i].cost += uvg_cu_rd_cost_chroma(state, pred_cu, lcu, cu_loc);
          memcpy(&state->search_cabac, &temp_cabac, sizeof(cabac_data_t));
        }
      }
@ -1677,14 +1657,11 @@ int8_t uvg_search_intra_chroma_rdo(

 int8_t uvg_search_cu_intra_chroma(
  encoder_state_t * const state,
-  const int x_px,
-  const int y_px,
-  const int depth,
+  const cu_loc_t* const cu_loc,
  lcu_t *lcu,
  intra_search_data_t *search_data,
  enum uvg_tree_type tree_type)
 {
-  const vector2d_t lcu_px = { SUB_SCU(x_px), SUB_SCU(y_px) };

  const cu_info_t *cur_pu = &search_data->pred_cu;
  int8_t intra_mode = !cur_pu->intra.mip_flag ? cur_pu->intra.mode : 0;
@ -1698,6 +1675,9 @@ int8_t uvg_search_cu_intra_chroma(
    }
  }

+  cu_loc_t chroma_loc;
+  uvg_cu_loc_ctor(&chroma_loc, cu_loc->x & ~7, cu_loc->y & ~7, cu_loc->width, cu_loc->height);
+
  // The number of modes to select for slower chroma search. Luma mode
  // is always one of the modes, so 2 means the final decision is made
  // between luma mode and one other mode that looks the best
@ -1715,7 +1695,7 @@ int8_t uvg_search_cu_intra_chroma(
    chroma_data[i].pred_cu = *cur_pu;
    chroma_data[i].pred_cu.intra.mode_chroma = num_modes == 1 ? intra_mode : modes[i];
    chroma_data[i].cost = 0;
-    if(depth != 4 && tree_type == UVG_BOTH_T) {
+    if(cu_loc->width != 4 && tree_type == UVG_BOTH_T) {
      memcpy(chroma_data[i].lfnst_costs, search_data->lfnst_costs, sizeof(double) * 3);
    }
  }
@ -1726,16 +1706,13 @@ int8_t uvg_search_cu_intra_chroma(
  if(state->encoder_control->cfg.cclm && 0){
    

-    num_modes = search_intra_chroma_rough(state, x_px, y_px, depth,
-                                          &lcu_px,
-                                          chroma_data,
-                                          lcu,
-                                          intra_mode,
-                                          tree_type);
+    num_modes = search_intra_chroma_rough(state, chroma_data, lcu, intra_mode,
+                                          tree_type,
+                                          &chroma_loc);
  }
  
  if (num_modes > 1 || state->encoder_control->cfg.jccr) {
-    uvg_search_intra_chroma_rdo(state, x_px, y_px, depth, num_modes, lcu, chroma_data, intra_mode, tree_type);
+    uvg_search_intra_chroma_rdo(state, num_modes, lcu, chroma_data, intra_mode, tree_type, &chroma_loc);
  }
  else if(cur_pu->lfnst_idx) {
    chroma_data[0].pred_cu.cr_lfnst_idx = cur_pu->lfnst_idx;
@ -1983,7 +1960,7 @@ void uvg_search_cu_intra(
  // Set transform depth to current depth, meaning no transform splits.
  {
    const int8_t depth = 6 - uvg_g_convert_to_log2[cu_loc->width];
-    uvg_lcu_fill_trdepth(lcu, cu_loc->x, cu_loc->y, depth, depth, tree_type);
+    uvg_lcu_fill_trdepth(lcu, cu_loc, depth, tree_type);
  }
  // Refine results with slower search or get some results if rough search was skipped.
  const int32_t rdo_level = state->encoder_control->cfg.rdo;
--- a/src/search_intra.h
+++ b/src/search_intra.h
@ -52,9 +52,7 @@ double uvg_chroma_mode_bits(const encoder_state_t *state,

 int8_t uvg_search_cu_intra_chroma(
  encoder_state_t * const state,
-  const int x_px,
-  const int y_px,
-  const int depth,
+  const cu_loc_t* const cu_loc,
  lcu_t *lcu,
  intra_search_data_t* best_cclm,
  enum uvg_tree_type tree_type);
--- a/src/strategies/avx2/quant-avx2.c
+++ b/src/strategies/avx2/quant-avx2.c
@ -709,7 +709,7 @@ int uvg_quantize_residual_avx2(encoder_state_t *const state,
  {
    int8_t tr_depth = cur_cu->tr_depth - cur_cu->depth;
    uvg_rdoq(state, coeff, coeff_out, width, height, color,
-      scan_order, cur_cu->type, tr_depth, cur_cu->cbf, lfnst_index);
+      scan_order, cur_cu->type, cur_cu->cbf, lfnst_index);
  }
  else if (state->encoder_control->cfg.rdoq_enable && use_trskip) {
    uvg_ts_rdoq(state, coeff, coeff_out, width, height, color,
--- a/src/strategies/generic/encode_coding_tree-generic.c
+++ b/src/strategies/generic/encode_coding_tree-generic.c
@ -54,7 +54,7 @@
 void uvg_encode_coeff_nxn_generic(encoder_state_t * const state,
  cabac_data_t * const cabac,
  const coeff_t *coeff,
-  const cu_loc_t *cu_loc,
+  const cu_loc_t * const cu_loc,
  uint8_t color,
  int8_t scan_mode,
  cu_info_t* cur_cu,
@ -80,8 +80,8 @@ void uvg_encode_coeff_nxn_generic(encoder_state_t * const state,

  // CONSTANTS

-  const uint32_t log2_block_width =  uvg_g_convert_to_log2[width];
-  const uint32_t log2_block_height = uvg_g_convert_to_log2[height];
+  const uint8_t log2_block_width =  uvg_g_convert_to_log2[width];
+  const uint8_t log2_block_height = uvg_g_convert_to_log2[height];
  
  const uint32_t log2_cg_size = uvg_g_log2_sbb_size[log2_block_width][log2_block_height][0] + uvg_g_log2_sbb_size[log2_block_width][log2_block_height][1];
  const uint32_t* const scan = uvg_get_scan_order_table(SCAN_GROUP_4X4, scan_mode, log2_block_width, log2_block_height);
--- a/src/strategies/generic/encode_coding_tree-generic.h
+++ b/src/strategies/generic/encode_coding_tree-generic.h
@ -44,7 +44,7 @@
 void uvg_encode_coeff_nxn_generic(encoder_state_t * const state,
                                  cabac_data_t * const cabac,
                                  const coeff_t *coeff,
-                                  const cu_loc_t *loc,
+                                  const cu_loc_t * const loc,
                                  uint8_t color,
                                  int8_t scan_mode,
                                  cu_info_t* cur_cu,
--- a/src/strategies/generic/quant-generic.c
+++ b/src/strategies/generic/quant-generic.c
@ -317,8 +317,7 @@ int uvg_quant_cbcr_residual_generic(
  {
    int8_t tr_depth = cur_cu->tr_depth - cur_cu->depth;
    uvg_rdoq(state, coeff, coeff_out, width, width, cur_cu->joint_cb_cr == 1 ? COLOR_V : COLOR_U,
-             scan_order, cur_cu->type, tr_depth, cur_cu->cbf,
-      cur_cu->cr_lfnst_idx);
+             scan_order, cur_cu->type, cur_cu->cbf, cur_cu->cr_lfnst_idx);
  }
  else if (state->encoder_control->cfg.rdoq_enable && false) {
    uvg_ts_rdoq(state, coeff, coeff_out, width, width, cur_cu->joint_cb_cr == 2 ? COLOR_V : COLOR_U,
@ -499,8 +498,7 @@ int uvg_quantize_residual_generic(encoder_state_t *const state,
  {
    int8_t tr_depth = cur_cu->tr_depth - cur_cu->depth;
    uvg_rdoq(state, coeff, coeff_out, width, height, color,
-             scan_order, cur_cu->type, tr_depth, cur_cu->cbf,
-      lfnst_index);
+             scan_order, cur_cu->type, cur_cu->cbf, lfnst_index);
  } else if(state->encoder_control->cfg.rdoq_enable && use_trskip) {
    uvg_ts_rdoq(state, coeff, coeff_out, width, height, color,
      scan_order);
--- a/src/strategies/strategies-encode.h
+++ b/src/strategies/strategies-encode.h
@ -49,7 +49,7 @@
 typedef unsigned (encode_coeff_nxn_func)(encoder_state_t * const state,
                                         cabac_data_t * const cabac,
                                         const coeff_t *coeff,
-                                         const cu_loc_t *loc,
+                                         const cu_loc_t * const loc,
                                         uint8_t color,
                                         int8_t scan_mode,
                                         cu_info_t* cur_cu,
--- a/src/transform.c
+++ b/src/transform.c
@ -434,8 +434,7 @@ static void quantize_chroma(
    (transforms[i] != CHROMA_TS || !state->encoder_control->cfg.rdoq_skip))
  {
    uvg_rdoq(state, &u_coeff[i * trans_offset], u_quant_coeff, width, height, transforms[i] != JCCR_1 ? COLOR_U : COLOR_V,
-             scan_order, CU_INTRA, depth, 0,
-             lfnst_idx);
+             scan_order, CU_INTRA, 0, lfnst_idx);

    int j;
    for (j = 0; j < width * height; ++j) {
@ -449,8 +448,7 @@ static void quantize_chroma(
      uint16_t temp_cbf = 0;
      if (*u_has_coeffs)cbf_set(&temp_cbf, depth, COLOR_U);
      uvg_rdoq(state, &v_coeff[i * trans_offset], v_quant_coeff, width, height, COLOR_V,
-               scan_order, CU_INTRA, depth, temp_cbf,
-               lfnst_idx);
+               scan_order, CU_INTRA, temp_cbf, lfnst_idx);

    }
  }
@ -486,12 +484,10 @@ static void quantize_chroma(

 void uvg_chroma_transform_search(
  encoder_state_t* const state,
-  int depth,
  lcu_t* const lcu,
  cabac_data_t* temp_cabac,
  const cu_loc_t* const cu_loc,
  const int offset,
-  const uint8_t mode,
  cu_info_t* pred_cu,
  uvg_pixel u_pred[1024],
  uvg_pixel v_pred[1024],
@ -507,6 +503,8 @@ void uvg_chroma_transform_search(
  const int width  = cu_loc->chroma_width;
  const int height = cu_loc->chroma_height;

+  const int depth = 6 - uvg_g_convert_to_log2[cu_loc->width];
+
  uvg_transform2d(
    state->encoder_control, u_resi, u_coeff, width, height, COLOR_U, pred_cu
  );
@ -553,8 +551,6 @@ void uvg_chroma_transform_search(
    coeff_t v_quant_coeff[LCU_WIDTH_C * LCU_WIDTH_C];
    int16_t u_recon_resi[LCU_WIDTH_C * LCU_WIDTH_C];
    int16_t v_recon_resi[LCU_WIDTH_C * LCU_WIDTH_C];
-    const coeff_scan_order_t scan_order =
-      uvg_get_scan_order(pred_cu->type, mode, depth);
    bool u_has_coeffs = false;
    bool v_has_coeffs = false;
    if(pred_cu->cr_lfnst_idx) {
@ -575,13 +571,13 @@ void uvg_chroma_transform_search(
      i,
      u_quant_coeff,
      v_quant_coeff,
-      scan_order,
+      SCAN_DIAG,
      &u_has_coeffs,
      &v_has_coeffs,
      pred_cu->cr_lfnst_idx);
      if(pred_cu->cr_lfnst_idx !=0 && !u_has_coeffs && !v_has_coeffs) continue;
    
-    if(pred_cu->type == CU_INTRA && transforms[i] != CHROMA_TS && (depth == 4 || tree_type == UVG_CHROMA_T)) {
+    if(pred_cu->type == CU_INTRA && transforms[i] != CHROMA_TS && (cu_loc->width == 4 || tree_type == UVG_CHROMA_T)) {
      bool constraints[2] = { false, false };
      uvg_derive_lfnst_constraints(pred_cu, constraints, u_quant_coeff, width, height, NULL, COLOR_U);
      if(!IS_JCCR_MODE(transforms[i])) {
@ -593,9 +589,9 @@ void uvg_chroma_transform_search(
    if (IS_JCCR_MODE(transforms[i]) && !u_has_coeffs) continue;

    if (u_has_coeffs) {
-
      uvg_dequant(state, u_quant_coeff, &u_coeff[i * trans_offset], width, width, transforms[i] != JCCR_1 ? COLOR_U : COLOR_V,
        pred_cu->type, transforms[i] == CHROMA_TS);
+
      if (transforms[i] != CHROMA_TS) {
        if (pred_cu->cr_lfnst_idx) {
          uvg_inv_lfnst(pred_cu, width, height, COLOR_U, pred_cu->cr_lfnst_idx, &u_coeff[i * trans_offset], tree_type);
@ -606,6 +602,7 @@ void uvg_chroma_transform_search(
      else {
        uvg_itransformskip(state->encoder_control, u_recon_resi, &u_coeff[i * trans_offset], width, height);
      }
+
      if (transforms[i] != JCCR_1) {
        for (int j = 0; j < width * height; j++) {
          u_recon[trans_offset * i + j] = CLIP_TO_PIXEL((uvg_pixel)(u_pred[j] + u_recon_resi[j]));
@ -620,9 +617,12 @@ void uvg_chroma_transform_search(
    else {
      uvg_pixels_blit(u_pred, &u_recon[trans_offset * i], width, height, width, width);
    }
+
+
    if (v_has_coeffs && !(IS_JCCR_MODE(transforms[i]))) {
      uvg_dequant(state, v_quant_coeff, &v_coeff[i * trans_offset], width, width, COLOR_V,
        pred_cu->type, transforms[i] == CHROMA_TS);
+
      if (transforms[i] != CHROMA_TS) {
        if (pred_cu->cr_lfnst_idx) {
          uvg_inv_lfnst(pred_cu, width, height, COLOR_V, pred_cu->cr_lfnst_idx, &v_coeff[i * trans_offset], tree_type);
@ -633,6 +633,7 @@ void uvg_chroma_transform_search(
      else {
        uvg_itransformskip(state->encoder_control, v_recon_resi, &v_coeff[i * trans_offset], width, height);
      }
+
      for (int j = 0; j < width * height; j++) {
        v_recon[trans_offset * i + j] = CLIP_TO_PIXEL(v_pred[j] + v_recon_resi[j]);
      }
@ -700,7 +701,7 @@ void uvg_chroma_transform_search(
        pred_cu,
        cu_loc,
        COLOR_U,
-        scan_order,
+        SCAN_DIAG,
        transforms[i] == CHROMA_TS,
        COEFF_ORDER_LINEAR);
      u_bits += coeff_cost;
@ -717,7 +718,7 @@ void uvg_chroma_transform_search(
        pred_cu,
        cu_loc,
        COLOR_V,
-        scan_order,
+        SCAN_DIAG,
        transforms[i] == CHROMA_TS,
        COEFF_ORDER_LINEAR);
    }
--- a/src/transform.h
+++ b/src/transform.h
@ -104,12 +104,10 @@ void uvg_quantize_lcu_residual(

 void uvg_chroma_transform_search(
  encoder_state_t* const state,
-  int depth,
  lcu_t* const lcu,
  cabac_data_t* temp_cabac,
  const cu_loc_t* const cu_loc,
  const int offset,
-  const uint8_t mode,
  cu_info_t* pred_cu,
  uvg_pixel u_pred[1024],
  uvg_pixel v_pred[1024],
--- a/tests/test_cabac_state.sh
+++ b/tests/test_cabac_state.sh
@ -6,10 +6,10 @@ set -eu

 cabacfile="$(mktemp)"

-valgrind_test 256x128 10 yuv420p --preset veryslow --rd 3 --mip --jccr --mrl --lfnst -p 1 --owf 0 --no-wpp --cabac-debug-file="${cabacfile}"
+valgrind_test 256x128 10 yuv420p --preset veryslow --pu-depth-intra 0-4 --cclm --rd 3 --mip --jccr --mrl --lfnst -p 1 --owf 0 --no-wpp --cabac-debug-file="${cabacfile}"
 python3 check_cabac_state_consistency.py "${cabacfile}"

-valgrind_test 256x128 10 yuv420p --preset veryslow --rd 3 --mip --jccr --mrl --lfnst --dual-tree -p 1 --owf 0 --no-wpp --cabac-debug-file="${cabacfile}"
+valgrind_test 256x128 10 yuv420p --preset veryslow --pu-depth-intra 0-4 --cclm --rd 3 --mip --jccr --mrl --lfnst --dual-tree -p 1 --owf 0 --no-wpp --cabac-debug-file="${cabacfile}"
 python3 check_cabac_state_consistency.py "${cabacfile}"

 rm -rf "${cabacfile}"