[jccr] Implement chroma transform search (work in progress)

This commit is contained in:
Joose Sainio 2022-05-25 13:47:02 +03:00
parent 27b730c2e9
commit f056178e80
12 changed files with 461 additions and 315 deletions

View file

@ -146,7 +146,7 @@ typedef struct
uint8_t skipped : 1; //!< \brief flag to indicate this block is skipped
uint8_t merged : 1; //!< \brief flag to indicate this block is merged
uint8_t merge_idx : 3; //!< \brief merge index
uint8_t tr_skip : 1; //!< \brief transform skip flag
uint8_t tr_skip : 3; //!< \brief transform skip flag
uint8_t tr_idx : 3; //!< \brief transform index
uint8_t joint_cb_cr : 3; //!< \brief joint chroma residual coding

View file

@ -514,7 +514,7 @@ static void encode_chroma_tu(encoder_state_t* const state, int x, int y, int dep
cabac->cur_ctx = &cabac->ctx.transform_skip_model_chroma;
// HEVC only supports transform_skip for Luma
// TODO: transform skip for chroma blocks
CABAC_BIN(cabac, 0, "transform_skip_flag");
CABAC_BIN(cabac, (cur_pu->tr_skip >> COLOR_U) & 1, "transform_skip_flag");
}
uvg_encode_coeff_nxn(state, &state->cabac, coeff_u, width_c, COLOR_U, *scan_idx, NULL, cur_pu);
}
@ -522,7 +522,7 @@ static void encode_chroma_tu(encoder_state_t* const state, int x, int y, int dep
if (cbf_is_set(cur_pu->cbf, depth, COLOR_V)) {
if (state->encoder_control->cfg.trskip_enable && width_c <= (1 << state->encoder_control->cfg.trskip_max_size)) {
cabac->cur_ctx = &cabac->ctx.transform_skip_model_chroma;
CABAC_BIN(cabac, 0, "transform_skip_flag");
CABAC_BIN(cabac, (cur_pu->tr_skip >> COLOR_V) & 1, "transform_skip_flag");
}
uvg_encode_coeff_nxn(state, &state->cabac, coeff_v, width_c, COLOR_V, *scan_idx, NULL, cur_pu);
}

View file

@ -233,10 +233,10 @@ int uvg_init_rdcost_outfiles(const char *dir_path)
// As long as QP is a two-digit number, template and produced string should
// be equal in length ("%i" -> "22")
assert(RD_SAMPLING_MAX_LAST_QP <= 99);
assert(strlen(fn_template) <= RD_SAMPLING_MAX_FN_LENGTH);
strncpy(fn_template, dir_path, RD_SAMPLING_MAX_FN_LENGTH);
strncat(fn_template, basename_tmpl, RD_SAMPLING_MAX_FN_LENGTH - strlen(dir_path));
assert(strlen(fn_template) <= RD_SAMPLING_MAX_FN_LENGTH);
for (qp = 0; qp <= RD_SAMPLING_MAX_LAST_QP; qp++) {
pthread_mutex_t *curr = outfile_mutex + qp;
@ -290,7 +290,7 @@ out:
*
* \param coeff coefficient array
* \param width coeff block width
* \param type data type (0 == luma)
* \param color data type (0 == luma)
*
* \returns bits needed to code input coefficients
*/
@ -298,7 +298,7 @@ static INLINE double get_coeff_cabac_cost(
const encoder_state_t * const state,
const coeff_t *coeff,
int32_t width,
int32_t type,
color_t color,
int8_t scan_mode,
int8_t tr_skip,
cu_info_t* cur_tu)
@ -331,7 +331,7 @@ static INLINE double get_coeff_cabac_cost(
&cabac_copy,
coeff,
width,
type,
color,
scan_mode,
cur_tu,
&bits);
@ -341,7 +341,7 @@ static INLINE double get_coeff_cabac_cost(
&cabac_copy,
coeff,
width,
type,
color,
scan_mode,
&bits);
}
@ -383,7 +383,7 @@ static INLINE void save_accuracy(int qp, double ccc, uint32_t fast_cost)
*
* \param coeff coefficient array
* \param width coeff block width
* \param type data type (0 == luma)
* \param color data type (0 == luma)
*
* \returns number of bits needed to code coefficients
*/
@ -392,7 +392,7 @@ double uvg_get_coeff_cost(
const coeff_t *coeff,
cu_info_t* cur_tu,
int32_t width,
int32_t type,
color_t color,
int8_t scan_mode,
int8_t tr_skip)
{
@ -411,13 +411,13 @@ double uvg_get_coeff_cost(
uint64_t weights = uvg_fast_coeff_get_weights(state);
uint32_t fast_cost = uvg_fast_coeff_cost(coeff, width, weights);
if (check_accuracy) {
double ccc = get_coeff_cabac_cost(state, coeff, width, type, scan_mode, tr_skip, cur_tu);
double ccc = get_coeff_cabac_cost(state, coeff, width, color, scan_mode, tr_skip, cur_tu);
save_accuracy(state->qp, ccc, fast_cost);
}
return fast_cost;
}
} else {
double ccc = get_coeff_cabac_cost(state, coeff, width, type, scan_mode, tr_skip, cur_tu);
double ccc = get_coeff_cabac_cost(state, coeff, width, color, scan_mode, tr_skip, cur_tu);
if (save_cccs) {
save_ccc(state->qp, coeff, width * width, ccc);
}

View file

@ -64,7 +64,7 @@ double uvg_get_coeff_cost(
const coeff_t *coeff,
cu_info_t* cur_tu,
int32_t width,
int32_t type,
color_t color,
int8_t scan_mode,
int8_t tr_skip);

View file

@ -473,8 +473,8 @@ static double cu_rd_cost_tr_split_accurate(const encoder_state_t* const state,
const uint8_t tr_depth = tr_cu->tr_depth - depth;
const int cb_flag_u = cbf_is_set(tr_cu->cbf, depth, COLOR_U);
const int cb_flag_v = cbf_is_set(tr_cu->cbf, depth, COLOR_V);
const int cb_flag_u = tr_cu->joint_cb_cr ? tr_cu->joint_cb_cr >> 1 : cbf_is_set(tr_cu->cbf, depth, COLOR_U);
const int cb_flag_v = tr_cu->joint_cb_cr ? tr_cu->joint_cb_cr & 1 : cbf_is_set(tr_cu->cbf, depth, COLOR_V);
cabac_data_t* cabac = (cabac_data_t*)&state->search_cabac;
@ -488,7 +488,8 @@ static double cu_rd_cost_tr_split_accurate(const encoder_state_t* const state,
}
if(state->encoder_control->chroma_format != UVG_CSP_400 && !skip_residual_coding && (depth != 4 || (x_px % 8 && y_px % 8))) {
bool has_chroma = state->encoder_control->chroma_format != UVG_CSP_400 && (depth != 4 || (x_px % 8 && y_px % 8));
if( !skip_residual_coding && has_chroma) {
if(tr_cu->depth == depth || cbf_is_set(pred_cu->cbf, depth - 1, COLOR_U)) {
CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.qt_cbf_model_cb[0]), cb_flag_u, tr_tree_bits, "cbf_cb");
}
@ -522,10 +523,10 @@ static double cu_rd_cost_tr_split_accurate(const encoder_state_t* const state,
CABAC_FBITS_UPDATE(cabac, ctx, cb_flag_y, tr_tree_bits, "cbf_y_search");
}
if (cb_flag_y | cb_flag_u | cb_flag_v) {
if (cb_flag_y || cb_flag_u || cb_flag_v) {
// TODO qp_delta_sign_flag
if ((cb_flag_u | cb_flag_v) && x_px % 8 == 0 && y_px % 8 == 0 && state->encoder_control->cfg.jccr) {
if ((cb_flag_u || cb_flag_v) && has_chroma && state->encoder_control->cfg.jccr) {
CABAC_FBITS_UPDATE(cabac, &cabac->ctx.joint_cb_cr[cb_flag_u * 2 + cb_flag_v - 1], tr_cu->joint_cb_cr != 0, tr_tree_bits, "tu_joint_cbcr_residual_flag");
}
}
@ -547,11 +548,11 @@ static double cu_rd_cost_tr_split_accurate(const encoder_state_t* const state,
int8_t luma_scan_mode = uvg_get_scan_order(pred_cu->type, pred_cu->intra.mode, depth);
const coeff_t* coeffs = &lcu->coeff.y[xy_to_zorder(LCU_WIDTH, x_px, y_px)];
coeff_bits += uvg_get_coeff_cost(state, coeffs, tr_cu, width, 0, luma_scan_mode, tr_cu->tr_skip);
coeff_bits += uvg_get_coeff_cost(state, coeffs, tr_cu, width, 0, luma_scan_mode, tr_cu->tr_skip & 1);
}
unsigned chroma_ssd = 0;
if(state->encoder_control->chroma_format != UVG_CSP_400 && (depth != 4 || (x_px % 8 != 0 && y_px % 8 != 0))) {
if(has_chroma) {
const vector2d_t lcu_px = { (x_px & ~7 ) / 2, (y_px & ~7) / 2 };
const int chroma_width = MAX(4, LCU_WIDTH >> (depth + 1));
int8_t scan_order = uvg_get_scan_order(pred_cu->type, pred_cu->intra.mode_chroma, depth);
@ -567,21 +568,22 @@ static double cu_rd_cost_tr_split_accurate(const encoder_state_t* const state,
chroma_width);
chroma_ssd = ssd_u + ssd_v;
}
coeff_bits += kvz_get_coeff_cost(state, &lcu->coeff.u[index], NULL, chroma_width, COLOR_U, scan_order, tr_cu->tr_skip & 2);
coeff_bits += kvz_get_coeff_cost(state, &lcu->coeff.v[index], NULL, chroma_width, COLOR_V, scan_order, tr_cu->tr_skip & 4);
{
coeff_bits += uvg_get_coeff_cost(state, &lcu->coeff.u[index], NULL, chroma_width, 2, scan_order, 0);
coeff_bits += uvg_get_coeff_cost(state, &lcu->coeff.v[index], NULL, chroma_width, 2, scan_order, 0);
}
} else {
else {
{
int index = lcu_px.y * LCU_WIDTH_C + lcu_px.x;
int ssd_u_joint = uvg_pixels_calc_ssd(&lcu->ref.u[index], &lcu->rec.joint_u[index],
LCU_WIDTH_C, LCU_WIDTH_C,
width);
chroma_width);
int ssd_v_joint = uvg_pixels_calc_ssd(&lcu->ref.v[index], &lcu->rec.joint_v[index],
LCU_WIDTH_C, LCU_WIDTH_C,
chroma_width);
chroma_ssd = ssd_u_joint + ssd_v_joint;
coeff_bits += uvg_get_coeff_cost(state, &lcu->coeff.joint_uv[index], NULL, width, 2, scan_order, 0);
}
coeff_bits += uvg_get_coeff_cost(state, &lcu->coeff.joint_uv[index], NULL, chroma_width, COLOR_U, scan_order, 0);
}
}
if (kvz_is_mts_allowed(state, tr_cu)) {
@ -986,7 +988,10 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,
if (ctrl->cfg.rdo >= 3) {
cur_cu->intra.mode_chroma = uvg_search_cu_intra_chroma(state, x, y, depth, lcu, &intra_search);
if (intra_search.pred_cu.joint_cb_cr == 0) intra_search.pred_cu.joint_cb_cr = 4;
if (intra_search.pred_cu.joint_cb_cr == 0) {
intra_search.pred_cu.joint_cb_cr = 4;
cur_cu->tr_skip |= intra_search.pred_cu.tr_skip;
}
else cur_cu->joint_cb_cr = intra_search.pred_cu.joint_cb_cr;
lcu_fill_cu_info(lcu, x_local, y_local, cu_width, cu_width, cur_cu);

View file

@ -388,6 +388,14 @@ static double search_intra_trdepth(
}
pred_cu->intra.mode_chroma = -1;
pred_cu->joint_cb_cr = 4;
for (; trafo < num_transforms; trafo++) {
pred_cu->tr_idx = trafo;
if (trafo == MTS_SKIP) pred_cu->tr_skip |= 1;
else pred_cu->tr_skip &= 6; // Keep chroma tr_skip untouched allthough it probably won't matter here
if (mts_enabled)
{
pred_cu->mts_last_scan_pos = 0;
pred_cu->violates_mts_coeff_constraint = 0;
const int max_tb_size = TR_MAX_WIDTH;
// LFNST search params
@ -1350,6 +1358,7 @@ double uvg_chroma_mode_bits(const encoder_state_t *state, int8_t chroma_mode, in
}
#define IS_JCCR_MODE(t) ((t) != DCT7_CHROMA && (t) != CHROMA_TS)
/* Square an int, widening to 64 bits first so the product cannot overflow
 * signed int arithmetic for large residual values. */
static INLINE int64_t square(int x) {
  const int64_t wide = x;
  return wide * wide;
}
@ -1362,6 +1371,181 @@ enum chroma_transforms {
JCCR_3 = 3,
};
/**
 * \brief Evaluate joint Cb-Cr (JCCR) residual coding candidates for one chroma mode.
 *
 * For each JCCR cbf mask (1..3, sign-adjusted by the frame-level jccr_sign),
 * derives the joint residual from the Cb/Cr residuals, accumulates the squared
 * reconstruction error, then picks the best mask (and optionally the
 * second-best, if close enough in distortion), forward-transforms the chosen
 * joint residual(s) into u_coeff and appends the mask to transforms[].
 *
 * \param state        encoder state (provides jccr_sign and transform config)
 * \param chroma_data  per-mode search data; pred_cu of entries is read for
 *                     CU type and passed to the transform
 * \param width        chroma block width (block assumed square)
 * \param mode_i       index of the chroma mode being evaluated
 * \param u_resi       Cb residual, width*width samples
 * \param v_resi       Cr residual, width*width samples
 * \param u_coeff      output coefficient storage, one trans_offset slot per
 *                     candidate transform
 * \param transforms   candidate transform list, appended to in place
 * \param trans_offset stride (in coefficients/samples) between candidates
 * \param num_transforms in/out count of candidates in transforms[]/u_coeff
 */
static void generate_jccr_transforms(encoder_state_t* const state,
intra_search_data_t* chroma_data, int8_t width, int8_t mode_i,
int16_t u_resi[1024], int16_t v_resi[1024], coeff_t u_coeff[5120],
enum chroma_transforms transforms[5], const int trans_offset, int* num_transforms)
{
ALIGNED(64) int16_t temp_resi[LCU_WIDTH_C * LCU_WIDTH_C * 3];
int64_t costs[4];
// costs[0] is the "no JCCR" baseline; INT64_MAX when the jccr==0 pass is
// skipped (non-intra CUs start the loop at jccr == 3).
costs[0] = INT64_MAX;
for (int jccr = chroma_data[mode_i].pred_cu.type == CU_INTRA ? 0 : 3; jccr < 4; jccr++) {
int64_t d1 = 0;
int64_t d2 = 0;
// Negative masks mirror the derivation with inverted Cr sign.
const int cbf_mask = jccr * (state->frame->jccr_sign ? -1 : 1);
int16_t *current_resi = &temp_resi[(jccr - 1) * trans_offset];
for (int y = 0; y < width; y++)
{
for (int x = 0; x < width; x++)
{
const int16_t cbx = u_resi[x + y * width], crx = v_resi[x + y * width];
if (cbf_mask == 2)
{
// Cb-dominant joint residual; Cr is reconstructed as resi >> 1.
const int16_t resi = ((4 * cbx + 2 * crx) / 5);
current_resi[x + y * width] = resi;
d1 += square(cbx - resi) + square(crx - (resi >> 1));
}
else if (cbf_mask == -2)
{
const int16_t resi = ((4 * cbx - 2 * crx) / 5);
current_resi[x + y * width] = resi;
d1 += square(cbx - resi) + square(crx - (-resi >> 1));
}
else if (cbf_mask == 3)
{
// Equal-weight joint residual: Cb = Cr = resi.
const int16_t resi = ((cbx + crx) / 2);
current_resi[x + y * width] = resi;
d1 += square(cbx - resi) + square(crx - resi);
}
else if (cbf_mask == -3)
{
const int16_t resi = ((cbx - crx) / 2);
current_resi[x + y * width] = resi;
d1 += square(cbx - resi) + square(crx + resi);
}
else if (cbf_mask == 1)
{
// Cr-dominant joint residual; Cb is reconstructed as resi >> 1.
const int16_t resi = ((4 * crx + 2 * cbx) / 5);
current_resi[x + y * width] = resi;
d1 += square(cbx - (resi >> 1)) + square(crx - resi);
}
else if (cbf_mask == -1)
{
const int16_t resi = ((4 * crx - 2 * cbx) / 5);
current_resi[x + y * width] = resi;
d1 += square(cbx - (-resi >> 1)) + square(crx - resi);
}
else
{
// cbf_mask == 0: distortion of coding Cb and Cr separately.
d1 += square(cbx);
d2 += square(crx);
}
}
}
costs[jccr] = d2 != 0 ? MIN(d1, d2) : d1;
}
// Select the lowest-distortion mask and track the runner-up.
int64_t min_dist1 = costs[0];
int64_t min_dist2 = INT64_MAX;
int cbf_mask1 = 0;
int cbf_mask2 = 0;
for (int cbfMask = 1; cbfMask < 4; cbfMask++)
{
if (costs[cbfMask] < min_dist1)
{
cbf_mask2 = cbf_mask1; min_dist2 = min_dist1;
cbf_mask1 = cbfMask; min_dist1 = costs[cbf_mask1];
}
else if (costs[cbfMask] < min_dist2)
{
cbf_mask2 = cbfMask; min_dist2 = costs[cbf_mask2];
}
}
if (cbf_mask1)
{
// NOTE(review): kvz_transform2d does not match the uvg_ prefix used by the
// rest of this file — presumably should be uvg_transform2d; verify.
// NOTE(review): indexing chroma_data by cbf_mask1 rather than mode_i looks
// suspicious — confirm the intended pred_cu is passed here.
kvz_transform2d(
state->encoder_control,
&temp_resi[(cbf_mask1 - 1) * trans_offset],
&u_coeff[*num_transforms * trans_offset],
width,
COLOR_U,
&chroma_data[cbf_mask1].pred_cu
);
transforms[(*num_transforms)] = cbf_mask1;
(*num_transforms)++;
}
// Keep the runner-up only if it is within 9/8 of the best (or within 3/2 of
// the separate-coding baseline when no best mask was chosen).
if (cbf_mask2 && ((min_dist2 < (9 * min_dist1) / 8) || (!cbf_mask1 && min_dist2 < (3 * min_dist1) / 2)))
{
kvz_transform2d(
state->encoder_control,
&temp_resi[(cbf_mask2 - 1) * trans_offset],
&u_coeff[*num_transforms * trans_offset],
width,
COLOR_U,
&chroma_data[cbf_mask2].pred_cu
);
transforms[(*num_transforms)] = cbf_mask2;
(*num_transforms)++;
}
}
/**
 * \brief Quantize the i-th candidate chroma transform's coefficients.
 *
 * Dispatches between RDOQ, transform-skip RDOQ, and plain quantization based
 * on encoder configuration and the candidate's transform type, then scans the
 * quantized coefficients to report whether any are nonzero. For JCCR modes
 * only the joint (u) coefficients exist; the v plane is quantized only for
 * DCT7_CHROMA / CHROMA_TS.
 *
 * \param state        encoder state (cfg.rdoq_enable, cfg.rdoq_skip)
 * \param depth        CU depth, forwarded to RDOQ / cbf_set
 * \param width        block width
 * \param height       block height
 * \param u_coeff      candidate u (or joint) coefficients, slot i
 * \param v_coeff      candidate v coefficients, slot i
 * \param transforms   candidate transform types
 * \param trans_offset stride between candidate slots
 * \param i            index of the candidate to quantize
 * \param u_quant_coeff output quantized u coefficients
 * \param v_quant_coeff output quantized v coefficients
 * \param scan_order   coefficient scan order
 * \param u_has_coeffs out: set to 1 if any quantized u coefficient is nonzero
 * \param v_has_coeffs out: set to 1 if any quantized v coefficient is nonzero
 *                     (only evaluated for non-JCCR candidates)
 */
static void quantize_chroma(
encoder_state_t* const state,
int depth,
int8_t width,
int8_t height,
coeff_t u_coeff[5120],
coeff_t v_coeff[2048],
enum chroma_transforms transforms[5],
const int trans_offset,
int i,
coeff_t u_quant_coeff[1024],
coeff_t v_quant_coeff[1024],
const coeff_scan_order_t scan_order,
bool* u_has_coeffs,
bool* v_has_coeffs)
{
if (state->encoder_control->cfg.rdoq_enable &&
(transforms[i] != CHROMA_TS || !state->encoder_control->cfg.rdoq_skip))
{
// JCCR_1 is the Cr-dominant joint mode, so its single residual is
// quantized with the Cr (COLOR_V) parameters.
uvg_rdoq(state, &u_coeff[i * trans_offset], u_quant_coeff, width, height, transforms[i] != JCCR_1 ? COLOR_U : COLOR_V,
scan_order, CU_INTRA, depth, 0);
int j;
// Early scan: the u cbf is needed as RDOQ context for the v plane below.
// NOTE(review): this duplicates the unconditional scan at the end of the
// function — presumably harmless but redundant.
for (j = 0; j < width * height; ++j) {
if (u_quant_coeff[j]) {
*u_has_coeffs = 1;
break;
}
}
if(transforms[i] == DCT7_CHROMA) {
uint16_t temp_cbf = 0;
if (*u_has_coeffs)cbf_set(&temp_cbf, depth, COLOR_U);
uvg_rdoq(state, &v_coeff[i * trans_offset], v_quant_coeff, width, height, COLOR_V,
scan_order, CU_INTRA, depth, temp_cbf);
}
}
else if (state->encoder_control->cfg.rdoq_enable && transforms[i] == CHROMA_TS) {
// Transform-skip path uses the dedicated TS RDOQ for both planes.
uvg_ts_rdoq(state, &u_coeff[i * trans_offset], u_quant_coeff, width, height, COLOR_U,scan_order);
uvg_ts_rdoq(state, &v_coeff[i * trans_offset], v_quant_coeff, width, height, COLOR_V,scan_order);
}
else {
// Plain (non-RDOQ) quantization.
uvg_quant(state, &u_coeff[i * trans_offset], u_quant_coeff, width, height, transforms[i] != JCCR_1 ? COLOR_U : COLOR_V,
scan_order, CU_INTRA, transforms[i] == CHROMA_TS);
if(!IS_JCCR_MODE(transforms[i])) {
uvg_quant(state, &v_coeff[i * trans_offset], v_quant_coeff, width, height, COLOR_V,
scan_order, CU_INTRA, transforms[i] == CHROMA_TS);
}
}
// Report whether quantization left any nonzero coefficients.
for (int j = 0; j < width * height; ++j) {
if (u_quant_coeff[j]) {
*u_has_coeffs = 1;
break;
}
}
if (!IS_JCCR_MODE(transforms[i])) {
for (int j = 0; j < width * height; ++j) {
if (v_quant_coeff[j]) {
*v_has_coeffs = 1;
break;
}
}
}
}
int8_t uvg_search_intra_chroma_rdo(
encoder_state_t * const state,
int x_px,
@ -1384,159 +1568,103 @@ int8_t uvg_search_intra_chroma_rdo(
if (reconstruct_chroma) {
uvg_intra_build_reference(MAX(LOG2_LCU_WIDTH - depth - 1, 2), COLOR_U, &luma_px, &pic_px, lcu, &refs[0], state->encoder_control->cfg.wpp, NULL, 0);
uvg_intra_build_reference(MAX(LOG2_LCU_WIDTH - depth - 1, 2), COLOR_V, &luma_px, &pic_px, lcu, &refs[1], state->encoder_control->cfg.wpp, NULL, 0);
int log2_width = MAX(LOG2_LCU_WIDTH - depth - 1, 2);
uvg_intra_build_reference(log2_width, COLOR_U, &luma_px, &pic_px, lcu, &refs[0], state->encoder_control->cfg.wpp, NULL, 0);
uvg_intra_build_reference(log2_width, COLOR_V, &luma_px, &pic_px, lcu, &refs[1], state->encoder_control->cfg.wpp, NULL, 0);
const vector2d_t lcu_px = { SUB_SCU(x_px), SUB_SCU(y_px) };
cu_info_t *const tr_cu = LCU_GET_CU_AT_PX(lcu, lcu_px.x, lcu_px.y);
cabac_data_t temp_cabac;
memcpy(&temp_cabac, &state->search_cabac, sizeof(cabac_data_t));
int8_t width = MAX(4, LCU_CU_WIDTH >> (depth - 1));
int8_t height = MAX(4, LCU_CU_WIDTH >> (depth - 1));
const cu_loc_t loc = { x_px, y_px, width, height, width, height};
const int offset = (lcu_px.x >> 1) + (lcu_px.y >> 1)* LCU_WIDTH_C;
int8_t width = 1 << log2_width;
int8_t height = 1 << log2_width;
const cu_loc_t loc = { x_px &~7, y_px & ~7, width, height, width, height};
const int offset = ((lcu_px.x & ~7) >> 1) + ((lcu_px.y & ~7) >> 1)* LCU_WIDTH_C;
for (int8_t i = 0; i < num_modes; ++i) {
const uint8_t mode = chroma_data[i].pred_cu.intra.mode_chroma;
for (int8_t mode_i = 0; mode_i < num_modes; ++mode_i) {
const uint8_t mode = chroma_data[mode_i].pred_cu.intra.mode_chroma;
double mode_bits = kvz_chroma_mode_bits(state, mode, luma_mode);
chroma_data[mode_i].cost = mode_bits * state->lambda;
if ((state->encoder_control->cfg.jccr ||
(state->encoder_control->cfg.trskip_enable &&
(1 << state->encoder_control->cfg.trskip_max_size) >= width)) &&
chroma_data[i].pred_cu.tr_depth == chroma_data[i].pred_cu.depth) {
chroma_data[mode_i].pred_cu.tr_depth == chroma_data[mode_i].pred_cu.depth) {
ALIGNED(64) kvz_pixel u_pred[LCU_WIDTH_C * LCU_WIDTH_C];
ALIGNED(64) kvz_pixel v_pred[LCU_WIDTH_C * LCU_WIDTH_C];
ALIGNED(64) int16_t u_resi[LCU_WIDTH_C * LCU_WIDTH_C];
ALIGNED(64) int16_t v_resi[LCU_WIDTH_C * LCU_WIDTH_C];
uvg_intra_predict(
state,
&refs[COLOR_U],
&refs[COLOR_U - 1],
&loc,
COLOR_U,
u_pred,
&chroma_data[i],
&chroma_data[mode_i],
lcu);
uvg_intra_predict(
state,
&refs[COLOR_V],
&refs[COLOR_V - 1],
&loc,
COLOR_V,
v_pred,
&chroma_data[i],
&chroma_data[mode_i],
lcu);
uvg_generate_residual(
&lcu->ref.u[offset],
u_pred,
u_resi,
width,
LCU_WIDTH_C,
width);
uvg_generate_residual(
&lcu->ref.v[offset],
v_pred,
v_resi,
width,
LCU_WIDTH_C,
width);
ALIGNED(64) coeff_t u_coeff[LCU_WIDTH_C * LCU_WIDTH_C * 5];
ALIGNED(64) uint8_t u_recon[LCU_WIDTH_C * LCU_WIDTH_C * 5];
ALIGNED(64) coeff_t v_coeff[LCU_WIDTH_C * LCU_WIDTH_C * 2];
ALIGNED(64) uint8_t v_recon[LCU_WIDTH_C * LCU_WIDTH_C * 5];
uvg_transform2d(
state->encoder_control, u_resi, u_coeff, width, COLOR_U, &chroma_data[i].pred_cu
state->encoder_control, u_resi, u_coeff, width, COLOR_U, &chroma_data[mode_i].pred_cu
);
uvg_transform2d(
state->encoder_control, v_resi, v_coeff, width, COLOR_V, &chroma_data[i].pred_cu
state->encoder_control, v_resi, v_coeff, width, COLOR_V, &chroma_data[mode_i].pred_cu
);
enum chroma_transforms transforms[5];
transforms[0] = DCT7_CHROMA;
const int trans_offset = width * height;
int num_transforms = 1;
if(state->encoder_control->cfg.trskip_enable &&
(1 << state->encoder_control->cfg.trskip_max_size) >= width) {
const int can_use_tr_skip = state->encoder_control->cfg.trskip_enable &&
(1 << state->encoder_control->cfg.trskip_max_size) >= width;
if(can_use_tr_skip) {
uvg_transformskip(state->encoder_control, u_resi, u_coeff + num_transforms * trans_offset, width);
uvg_transformskip(state->encoder_control, v_resi, v_coeff + num_transforms * trans_offset, width);
transforms[num_transforms] = CHROMA_TS;
num_transforms++;
}
if(state->encoder_control->cfg.jccr) {
ALIGNED(64) int16_t temp_resi[LCU_WIDTH_C * LCU_WIDTH_C * 3];
int64_t costs[4];
costs[0] = INT64_MAX;
for (int jccr = chroma_data[i].pred_cu.type == CU_INTRA ? 0 : 3; jccr < 4; jccr++) {
int64_t d1 = 0;
int64_t d2 = 0;
const int cbf_mask = jccr * (state->frame->jccr_sign ? -1 : 1);
int16_t *current_resi = &temp_resi[(jccr - 1) + trans_offset];
for (int y = 0; y < width; y++)
{
for (int x = 0; x < width; x++)
{
int cbx = u_resi[x + y * width], crx = v_resi[x + y * width];
if (cbf_mask == 2)
{
const int resi = ((4 * cbx + 2 * crx) / 5);
current_resi[x + y * width] = resi;
d1 += square(cbx - resi) + square(crx - (resi >> 1));
}
else if (cbf_mask == -2)
{
const int resi = ((4 * cbx - 2 * crx) / 5);
current_resi[x + y * width] = resi;
d1 += square(cbx - resi) + square(crx - (resi >> 1));
}
else if (cbf_mask == 3)
{
const int resi = ((cbx + crx) / 2);
current_resi[x + y * width] = resi;
d1 += square(cbx - resi) + square(crx - resi);
}
else if (cbf_mask == -3)
{
const int resi = ((cbx - crx) / 2);
current_resi[x + y * width] = resi;
d1 += square(cbx - resi) + square(crx + resi);
}
else if (cbf_mask == 1)
{
const int resi = ((4 * crx + 2 * cbx) / 5);
current_resi[x + y * width] = resi;
d1 += square(cbx - (resi >> 1)) + square(crx - resi);
}
else if (cbf_mask == -1)
{
const int resi = ((4 * crx - 2 * cbx) / 5);
current_resi[x + y * width] = resi;
d1 += square(cbx - (resi >> 1)) + square(crx - resi);
}
else
{
d1 += square(cbx);
d2 += square(crx);
}
}
}
costs[jccr] = d2 != 0 ? MIN(d1, d2) : d1;
}
for(int jccr = chroma_data[i].pred_cu.type == CU_INTRA ? 1 : 3; jccr < 4; jccr++) {
if(costs[jccr] < costs[0]) {
uvg_transform2d(
state->encoder_control,
&temp_resi[(jccr - 1) + trans_offset],
&u_coeff[num_transforms * trans_offset],
generate_jccr_transforms(
state,
chroma_data,
width,
COLOR_U,
&chroma_data[jccr].pred_cu
);
transforms[num_transforms] = jccr;
num_transforms++;
mode_i,
u_resi,
v_resi,
u_coeff,
transforms,
trans_offset,
&num_transforms);
}
}
}
double best_u_cost = MAX_INT64;
double best_v_cost = MAX_INT64;
double best_combined_cost = MAX_INT64;
int best_u_index = -1;
int best_v_index = -1;
int best_combined_index = -1;
for(int trans = 0; trans < num_transforms; trans++) {
for(int i = 0; i < num_transforms; i++) {
coeff_t u_quant_coeff[LCU_WIDTH_C * LCU_WIDTH_C];
coeff_t v_quant_coeff[LCU_WIDTH_C * LCU_WIDTH_C];
int16_t u_recon_resi[LCU_WIDTH_C * LCU_WIDTH_C];
@ -1545,56 +1673,24 @@ int8_t uvg_search_intra_chroma_rdo(
uvg_get_scan_order(CU_INTRA, mode, depth);
bool u_has_coeffs = false;
bool v_has_coeffs = false;
if (state->encoder_control->cfg.rdoq_enable &&
(transforms[i] != CHROMA_TS || !state->encoder_control->cfg.rdoq_skip))
{
uvg_rdoq(state, &u_coeff[i * trans_offset], u_quant_coeff, width, height, transforms[i] != JCCR_1 ? COLOR_U : COLOR_V,
scan_order, CU_INTRA, depth, 0);
quantize_chroma(
state,
depth,
width,
height,
u_coeff,
v_coeff,
transforms,
trans_offset,
i,
u_quant_coeff,
v_quant_coeff,
scan_order,
&u_has_coeffs,
&v_has_coeffs);
int j;
for (j = 0; i < width * height; ++j) {
if (u_quant_coeff[num_transforms * trans_offset + j]) {
u_has_coeffs = 1;
break;
}
}
if(IS_JCCR_MODE(transforms[i]) && !u_has_coeffs) continue;
if(transforms[i] == DCT7_CHROMA) {
int16_t temp_cbf = 0;
if (u_has_coeffs)cbf_set(&temp_cbf, depth, COLOR_U);
uvg_rdoq(state, &v_coeff[i * trans_offset], v_quant_coeff, width, height, COLOR_V,
scan_order, CU_INTRA, depth, temp_cbf);
}
}
else if (state->encoder_control->cfg.rdoq_enable && transforms[i] == CHROMA_TS) {
uvg_ts_rdoq(state, &u_coeff[i * trans_offset], u_quant_coeff, width, height, COLOR_U,scan_order);
uvg_ts_rdoq(state, &v_coeff[i * trans_offset], v_quant_coeff, width, height, COLOR_V,scan_order);
}
else {
uvg_quant(state, &u_coeff[i * trans_offset], u_quant_coeff, width, height, transforms[i] != JCCR_1 ? COLOR_U : COLOR_V,
scan_order, CU_INTRA, transforms[i] == CHROMA_TS);
if(transforms[i] != CHROMA_TS && transforms[i] != DCT7_CHROMA) {
uvg_quant(state, &v_coeff[i * trans_offset], v_quant_coeff, width, height, COLOR_V,
scan_order, CU_INTRA, transforms[i] == CHROMA_TS);
}
}
for (int j = 0; i < width * height; ++j) {
if (u_quant_coeff[num_transforms * trans_offset + j]) {
u_has_coeffs = 1;
break;
}
}
if (transforms[i] != CHROMA_TS && transforms[i] != DCT7_CHROMA) {
for (int j = 0; i < width * height; ++j) {
if (v_quant_coeff[num_transforms * trans_offset + j]) {
v_has_coeffs = 1;
break;
}
}
}
if(u_has_coeffs) {
uvg_dequant(state, u_quant_coeff, &u_coeff[i * trans_offset], width, width, transforms[i] != JCCR_1 ? COLOR_U : COLOR_V,
CU_INTRA, transforms[i] == CHROMA_TS);
@ -1607,19 +1703,19 @@ int8_t uvg_search_intra_chroma_rdo(
}
if(transforms[i] != JCCR_1) {
for (int j = 0; j < width * height; j++) {
u_recon[offset * i + j] = CLIP_TO_PIXEL((int16_t)u_pred[j] + u_recon_resi[j]);
u_recon[trans_offset * i + j] = CLIP_TO_PIXEL((kvz_pixel)(u_pred[j] + u_recon_resi[j]));
}
}
else {
for (int j = 0; j < width * height; j++) {
u_recon[offset * i + j] = CLIP_TO_PIXEL((int16_t)u_pred[j] + (u_recon_resi[j] >> 1));
u_recon[trans_offset * i + j] = CLIP_TO_PIXEL(u_pred[j] + ((state->frame->jccr_sign ? -u_recon_resi[j] : u_recon_resi[j]) >> 1));
}
}
}
else {
uvg_pixels_blit(u_pred, &u_recon[offset * i], width, height, width, width);
uvg_pixels_blit(u_pred, &u_recon[trans_offset * i], width, height, width, width);
}
if(v_has_coeffs && (transforms[i] == DCT7_CHROMA || transforms[i] == CHROMA_TS)) {
if(v_has_coeffs && !(IS_JCCR_MODE(transforms[i]))) {
uvg_dequant(state, v_quant_coeff, &v_coeff[i * trans_offset], width, width, COLOR_V,
CU_INTRA, transforms[i] == CHROMA_TS);
if (transforms[i] != CHROMA_TS) {
@ -1630,70 +1726,135 @@ int8_t uvg_search_intra_chroma_rdo(
uvg_itransformskip(state->encoder_control, v_recon_resi, &v_coeff[i * trans_offset], width);
}
for (int j = 0; j < width * height; j++) {
v_recon[offset * i + j] = CLIP_TO_PIXEL((int16_t)u_pred[j] + v_recon_resi[j]);
v_recon[trans_offset * i + j] = CLIP_TO_PIXEL(v_pred[j] + v_recon_resi[j]);
}
}
else if(u_has_coeffs && (transforms[i] != DCT7_CHROMA && transforms[i] != CHROMA_TS)) {
if(transforms[i] != JCCR_2) {
else if(u_has_coeffs && IS_JCCR_MODE(transforms[i])) {
if (transforms[i] == JCCR_1) {
for (int j = 0; j < width * height; j++) {
v_recon[offset * i + j] = CLIP_TO_PIXEL((int16_t)v_pred[j] + (state->frame->jccr_sign ? -u_recon_resi[j] : u_recon_resi[j]));
v_recon[trans_offset * i + j] = CLIP_TO_PIXEL(v_pred[j] + u_recon_resi[j]);
}
} else {
}
else if(transforms[i] == JCCR_3) {
for (int j = 0; j < width * height; j++) {
v_recon[offset * i + j] = CLIP_TO_PIXEL((int16_t)v_pred[j] + (state->frame->jccr_sign ? -u_recon_resi[j] : u_recon_resi[j]));
v_recon[trans_offset * i + j] = CLIP_TO_PIXEL(v_pred[j] + (state->frame->jccr_sign ? -u_recon_resi[j] : u_recon_resi[j]));
}
}
else {
for (int j = 0; j < width * height; j++) {
v_recon[trans_offset * i + j] = CLIP_TO_PIXEL(v_pred[j] + ((state->frame->jccr_sign ? -u_recon_resi[j] : u_recon_resi[j]) >> 1));
}
}
}
else {
uvg_pixels_blit(v_pred, &v_recon[offset * i], width, height, width, width);
uvg_pixels_blit(v_pred, &v_recon[trans_offset * i], width, height, width, width);
}
int ssd_u;
int ssd_v;
unsigned ssd_u = 0;
unsigned ssd_v = 0;
if (!state->encoder_control->cfg.lossless) {
int index = lcu_px.y * LCU_WIDTH_C + lcu_px.x;
ssd_u = uvg_pixels_calc_ssd(&lcu->ref.u[index], &lcu->rec.u[index],
LCU_WIDTH_C, LCU_WIDTH_C,
ssd_u = uvg_pixels_calc_ssd(&lcu->ref.u[offset], &u_recon[trans_offset * i],
LCU_WIDTH_C, width,
width);
ssd_v = uvg_pixels_calc_ssd(&lcu->ref.v[index], &lcu->rec.v[index],
LCU_WIDTH_C, LCU_WIDTH_C,
ssd_v = uvg_pixels_calc_ssd(&lcu->ref.v[offset], &v_recon[trans_offset * i],
LCU_WIDTH_C, width,
width);
}
double u_bits = 0;
double v_bits = 0;
state->search_cabac.update = 1;
if(state->encoder_control->cfg.jccr) {
CABAC_FBITS_UPDATE(&state->search_cabac, &state->search_cabac.ctx.joint_cb_cr[transforms[i]],
transforms[i] != DCT7_CHROMA && transforms[i] != CHROMA_TS, u_bits, "jccr_flag"
);
}
int cbf_u = transforms[i] & 2 || (u_has_coeffs && !(transforms[i] & 1));
CABAC_FBITS_UPDATE(&state->search_cabac, &state->search_cabac.ctx.qt_cbf_model_cb[0],
cbf_u, u_bits, "cbf_u"
);
int cbf_v = transforms[i] & 1 || (v_has_coeffs && !(transforms[i] & 2));
CABAC_FBITS_UPDATE(&state->search_cabac, &state->search_cabac.ctx.qt_cbf_model_cr[cbf_u],
transforms[i] & 1 || (v_has_coeffs && !(transforms[i] & 2)), v_bits, "cbf_v"
cbf_v, v_bits, "cbf_v"
);
memcpy(&state->search_cabac, &temp_cabac, sizeof(cabac_data_t));
if (state->encoder_control->cfg.jccr && (cbf_u || cbf_v)) {
CABAC_FBITS_UPDATE(&state->search_cabac, &state->search_cabac.ctx.joint_cb_cr[cbf_u * 2 + cbf_v - 1],
transforms[i] != DCT7_CHROMA && transforms[i] != CHROMA_TS, v_bits, "jccr_flag"
);
}
if (cbf_u || (transforms[i] == JCCR_1 && u_has_coeffs)) {
if(can_use_tr_skip && !IS_JCCR_MODE(transforms[i])) {
CABAC_FBITS_UPDATE(&state->search_cabac, &state->search_cabac.ctx.transform_skip_model_chroma,
transforms[i] == CHROMA_TS, u_bits, "tr_skip_u"
);
}
double coeff_cost = kvz_get_coeff_cost(
state,
u_quant_coeff,
NULL,
width,
COLOR_U,
scan_order,
transforms[i] == CHROMA_TS);
u_bits += coeff_cost;
}
if (cbf_v && !IS_JCCR_MODE(transforms[i])) {
if (can_use_tr_skip) {
CABAC_FBITS_UPDATE(&state->search_cabac, &state->search_cabac.ctx.transform_skip_model_chroma,
transforms[i] == CHROMA_TS, v_bits, "tr_skip_v"
);
}
v_bits += kvz_get_coeff_cost(
state,
v_quant_coeff,
NULL,
width,
COLOR_V,
scan_order,
transforms[i] == CHROMA_TS);
}
if(!IS_JCCR_MODE(transforms[i])) {
double u_cost = KVZ_CHROMA_MULT * ssd_u + u_bits * state->frame->lambda;
double v_cost = KVZ_CHROMA_MULT * ssd_v + v_bits * state->frame->lambda;
if(u_cost < best_u_cost) {
best_u_cost = u_cost;
best_u_index = transforms[i];
}
if(v_cost < best_v_cost) {
best_v_cost = v_cost;
best_v_index = transforms[i];
}
}
else {
double cost = KVZ_CHROMA_MULT * (ssd_u + ssd_v) + (u_bits + v_bits) * state->frame->lambda;
if (cost < best_combined_cost) {
best_combined_cost = cost;
best_combined_index = transforms[i];
}
}
memcpy(&state->search_cabac, &temp_cabac, sizeof(cabac_data_t));
}
if(best_u_cost + best_v_cost < best_combined_cost) {
chroma_data[mode_i].pred_cu.joint_cb_cr = 0;
chroma_data[mode_i].pred_cu.tr_skip |= (best_u_index == CHROMA_TS) << COLOR_U;
chroma_data[mode_i].pred_cu.tr_skip |= (best_v_index == CHROMA_TS) << COLOR_V;
chroma_data[mode_i].cost += best_u_cost + best_v_cost;
}
else {
chroma_data[mode_i].pred_cu.joint_cb_cr = best_combined_index;
chroma_data[mode_i].cost += best_combined_cost;
}
}
else {
state->search_cabac.update = 1;
chroma_data[mode_i].cost = mode_bits * state->lambda;
uvg_intra_recon_cu(state,
x_px, y_px,
depth, &chroma_data[i],
&chroma_data[i].pred_cu,
depth, &chroma_data[mode_i],
&chroma_data[mode_i].pred_cu,
lcu);
double mode_bits = uvg_chroma_mode_bits(state, mode, luma_mode);
chroma_data[i].cost = mode_bits * state->lambda;
if(tr_cu->depth != tr_cu->tr_depth || !state->encoder_control->cfg.jccr) {
chroma_data[i].cost += uvg_cu_rd_cost_chroma(state, lcu_px.x, lcu_px.y, depth, &chroma_data[i].pred_cu, lcu);
chroma_data[mode_i].cost += uvg_cu_rd_cost_chroma(state, lcu_px.x, lcu_px.y, depth, &chroma_data[mode_i].pred_cu, lcu);
} else {
uvg_select_jccr_mode(state, lcu_px.x, lcu_px.y, depth, &chroma_data[i].pred_cu, lcu, &chroma_data[i].cost);
uvg_select_jccr_mode(state, lcu_px.x, lcu_px.y, depth, &chroma_data[mode_i].pred_cu, lcu, &chroma_data[mode_i].cost);
}
memcpy(&state->search_cabac, &temp_cabac, sizeof(cabac_data_t));
}
@ -1708,6 +1869,7 @@ int8_t uvg_search_intra_chroma_rdo(
return 100;
}
#undef IS_JCCR_MODE
int8_t uvg_search_cu_intra_chroma(encoder_state_t * const state,
const int x_px, const int y_px,
@ -2090,5 +2252,6 @@ void uvg_search_cu_intra(
search_data[0].pred_cu.mts_last_scan_pos = false;
search_data[0].pred_cu.violates_mts_coeff_constraint = false;
}
printf("%f\n", search_data[0].cost);
*mode_out = search_data[0];
}

View file

@ -1723,44 +1723,44 @@ static INLINE __m128i get_residual_8x1_avx2(const uint8_t* a_in, const uint8_t*
return diff;
}
void generate_residual_avx2(const uint8_t* ref_in, const uint8_t* pred_in, int16_t* residual, int width, int in_stride) {
static void generate_residual_avx2(const uint8_t* ref_in, const uint8_t* pred_in, int16_t* residual, int width, int ref_stride, int pred_stride) {
__m128i diff = _mm_setzero_si128();
switch (width) {
case 4:
diff = get_residual_4x1_avx2(ref_in + 0 * in_stride, pred_in + 0 * in_stride);
diff = get_residual_4x1_avx2(ref_in + 0 * ref_stride, pred_in + 0 * pred_stride);
_mm_storel_epi64((__m128i*) & (residual[0]), diff);
diff = get_residual_4x1_avx2(ref_in + 1 * in_stride, pred_in + 1 * in_stride);
diff = get_residual_4x1_avx2(ref_in + 1 * ref_stride, pred_in + 1 * pred_stride);
_mm_storel_epi64((__m128i*) & (residual[4]), diff);
diff = get_residual_4x1_avx2(ref_in + 2 * in_stride, pred_in + 2 * in_stride);
diff = get_residual_4x1_avx2(ref_in + 2 * ref_stride, pred_in + 2 * pred_stride);
_mm_storel_epi64((__m128i*) & (residual[8]), diff);
diff = get_residual_4x1_avx2(ref_in + 3 * in_stride, pred_in + 3 * in_stride);
diff = get_residual_4x1_avx2(ref_in + 3 * ref_stride, pred_in + 3 * pred_stride);
_mm_storel_epi64((__m128i*) & (residual[12]), diff);
break;
case 8:
diff = get_residual_8x1_avx2(&ref_in[0 * in_stride], &pred_in[0 * in_stride]);
diff = get_residual_8x1_avx2(&ref_in[0 * ref_stride], &pred_in[0 * pred_stride]);
_mm_storeu_si128((__m128i*) & (residual[0]), diff);
diff = get_residual_8x1_avx2(&ref_in[1 * in_stride], &pred_in[1 * in_stride]);
diff = get_residual_8x1_avx2(&ref_in[1 * ref_stride], &pred_in[1 * pred_stride]);
_mm_storeu_si128((__m128i*) & (residual[8]), diff);
diff = get_residual_8x1_avx2(&ref_in[2 * in_stride], &pred_in[2 * in_stride]);
diff = get_residual_8x1_avx2(&ref_in[2 * ref_stride], &pred_in[2 * pred_stride]);
_mm_storeu_si128((__m128i*) & (residual[16]), diff);
diff = get_residual_8x1_avx2(&ref_in[3 * in_stride], &pred_in[3 * in_stride]);
diff = get_residual_8x1_avx2(&ref_in[3 * ref_stride], &pred_in[3 * pred_stride]);
_mm_storeu_si128((__m128i*) & (residual[24]), diff);
diff = get_residual_8x1_avx2(&ref_in[4 * in_stride], &pred_in[4 * in_stride]);
diff = get_residual_8x1_avx2(&ref_in[4 * ref_stride], &pred_in[4 * pred_stride]);
_mm_storeu_si128((__m128i*) & (residual[32]), diff);
diff = get_residual_8x1_avx2(&ref_in[5 * in_stride], &pred_in[5 * in_stride]);
diff = get_residual_8x1_avx2(&ref_in[5 * ref_stride], &pred_in[5 * pred_stride]);
_mm_storeu_si128((__m128i*) & (residual[40]), diff);
diff = get_residual_8x1_avx2(&ref_in[6 * in_stride], &pred_in[6 * in_stride]);
diff = get_residual_8x1_avx2(&ref_in[6 * ref_stride], &pred_in[6 * pred_stride]);
_mm_storeu_si128((__m128i*) & (residual[48]), diff);
diff = get_residual_8x1_avx2(&ref_in[7 * in_stride], &pred_in[7 * in_stride]);
diff = get_residual_8x1_avx2(&ref_in[7 * ref_stride], &pred_in[7 * pred_stride]);
_mm_storeu_si128((__m128i*) & (residual[56]), diff);
break;
default:
for (int y = 0; y < width; ++y) {
for (int x = 0; x < width; x += 16) {
diff = get_residual_8x1_avx2(&ref_in[x + y * in_stride], &pred_in[x + y * in_stride]);
diff = get_residual_8x1_avx2(&ref_in[x + y * ref_stride], &pred_in[x + y * pred_stride]);
_mm_storeu_si128((__m128i*) & residual[x + y * width], diff);
diff = get_residual_8x1_avx2(&ref_in[(x + 8) + y * in_stride], &pred_in[(x + 8) + y * in_stride]);
diff = get_residual_8x1_avx2(&ref_in[(x + 8) + y * ref_stride], &pred_in[(x + 8) + y * pred_stride]);
_mm_storeu_si128((__m128i*) & residual[(x + 8) + y * width], diff);
}
}

View file

@ -623,7 +623,7 @@ int uvg_quantize_residual_avx2(encoder_state_t *const state,
assert(width >= TR_MIN_WIDTH);
// Get residual. (ref_in - pred_in -> residual)
kvz_generate_residual(ref_in, pred_in, residual, width, in_stride);
kvz_generate_residual(ref_in, pred_in, residual, width, in_stride, in_stride);
if (state->tile->frame->lmcs_aps->m_sliceReshapeInfo.enableChromaAdj && color != COLOR_Y) {
int y, x;

View file

@ -782,12 +782,13 @@ static double pixel_var_generic(const uvg_pixel *arr, const uint32_t len)
}
/**
 * \brief Compute the prediction residual of a square block.
 *
 * residual = ref - pred, element-wise over a width x width block.
 * Reference and prediction buffers may use different row strides
 * (e.g. when the prediction comes from a compact search buffer while
 * the reference lives in the full frame).
 *
 * \param ref_in       Pointer to the top-left reference pixel.
 * \param pred_in      Pointer to the top-left prediction pixel.
 * \param residual     Output buffer, densely packed (stride == width).
 * \param width        Block width and height in pixels.
 * \param ref_stride   Row stride of ref_in in pixels.
 * \param pred_stride  Row stride of pred_in in pixels.
 */
static void generate_residual_generic(const kvz_pixel* ref_in, const kvz_pixel* pred_in, int16_t* residual,
  int width, int ref_stride, int pred_stride)
{
  for (int y = 0; y < width; ++y) {
    for (int x = 0; x < width; ++x) {
      // Cast is safe: pixel difference fits in int16_t for supported bit depths.
      residual[x + y * width] = (int16_t)(ref_in[x + y * ref_stride] - pred_in[x + y * pred_stride]);
    }
  }
}

View file

@ -207,8 +207,7 @@ int uvg_quant_cbcr_residual_generic(
) {
ALIGNED(64) int16_t u_residual[TR_MAX_WIDTH * TR_MAX_WIDTH];
ALIGNED(64) int16_t v_residual[TR_MAX_WIDTH * TR_MAX_WIDTH];
ALIGNED(64) int16_t u1_residual[2][TR_MAX_WIDTH * TR_MAX_WIDTH];
ALIGNED(64) int16_t v1_residual[TR_MAX_WIDTH * TR_MAX_WIDTH];
ALIGNED(64) int16_t combined_residual[TR_MAX_WIDTH * TR_MAX_WIDTH];
ALIGNED(64) coeff_t coeff[TR_MAX_WIDTH * TR_MAX_WIDTH];
{
@ -220,80 +219,64 @@ int uvg_quant_cbcr_residual_generic(
}
}
}
kvz_generate_residual(u_ref_in, u_pred_in, u_residual, width, in_stride);
kvz_generate_residual(v_ref_in, v_pred_in, v_residual, width, in_stride);
kvz_generate_residual(u_ref_in, u_pred_in, u_residual, width, in_stride, in_stride);
kvz_generate_residual(v_ref_in, v_pred_in, v_residual, width, in_stride, in_stride);
int best_cbf_mask = -1;
int64_t best_cost = INT64_MAX;
// This changes the order of the cbf_masks so 2 and 3 are swapped compared with VTM
for(int i = cur_cu->type == CU_INTRA ? 1 : 3; i < 4; i++) {
int64_t d1 = 0;
const int cbf_mask = i * (state->frame->jccr_sign ? -1 : 1);
const int cbf_mask = cur_cu->joint_cb_cr * (state->frame->jccr_sign ? -1 : 1);
for (int y = 0; y < width; y++)
{
for (int x = 0; x < width; x++)
{
int cbx = u_residual[x + y * width], crx = v_residual[x + y * width];
const int16_t cbx = u_residual[x + y * width], crx = v_residual[x + y * width];
if (cbf_mask == 2)
{
u1_residual[i - 2][x + y * width] = ((4 * cbx + 2 * crx) / 5);
d1 += square(cbx - u1_residual[i - 2][x + y * width]) + square(crx - (u1_residual[i - 2][x + y * width] >> 1));
combined_residual[x + y * width] = (4 * cbx + 2 * crx) / 5;
}
else if (cbf_mask == -2)
{
u1_residual[i - 2][x + y * width] = ((4 * cbx - 2 * crx) / 5);
d1 += square(cbx - u1_residual[i - 2][x + y * width]) + square(crx - (-u1_residual[i - 2][x + y * width] >> 1));
combined_residual[x + y * width] = (4 * cbx - 2 * crx) / 5;
}
else if (cbf_mask == 3)
{
u1_residual[i - 2][x + y * width] = ((cbx + crx) / 2);
d1 += square(cbx - u1_residual[i - 2][x + y * width]) + square(crx - u1_residual[i - 2][x + y * width]);
combined_residual[x + y * width] = (cbx + crx) / 2;
}
else if (cbf_mask == -3)
{
u1_residual[i - 2][x + y * width] = ((cbx - crx) / 2);
d1 += square(cbx - u1_residual[i - 2][x + y * width]) + square(crx + u1_residual[i - 2][x + y * width]);
combined_residual[x + y * width] = (cbx - crx) / 2;
}
else if (cbf_mask == 1)
{
v1_residual[x + y * width] = ((4 * crx + 2 * cbx) / 5);
d1 += square(cbx - (v1_residual[x + y * width] >> 1)) + square(crx - v1_residual[x + y * width]);
combined_residual[x + y * width] = (4 * crx + 2 * cbx) / 5;
}
else if (cbf_mask == -1)
{
v1_residual[x + y * width] = ((4 * crx - 2 * cbx) / 5);
d1 += square(cbx - (-v1_residual[x + y * width] >> 1)) + square(crx - v1_residual[x + y * width]);
combined_residual[x + y * width] = (4 * crx - 2 * cbx) / 5;
}
else
{
d1 += square(cbx);
//d2 += square(crx);
assert(0);
}
}
}
if (d1 < best_cost) {
best_cbf_mask = i;
best_cost = d1;
}
}
uvg_transform2d(state->encoder_control, best_cbf_mask == 1 ? v1_residual : u1_residual[best_cbf_mask - 2], coeff, width, best_cbf_mask == 1 ? COLOR_V : COLOR_U, cur_cu);
uvg_transform2d(state->encoder_control, combined_residual, coeff, width, cur_cu->joint_cb_cr == 1 ? COLOR_V : COLOR_U, cur_cu);
if (state->encoder_control->cfg.rdoq_enable &&
(width > 4 || !state->encoder_control->cfg.rdoq_skip))
{
int8_t tr_depth = cur_cu->tr_depth - cur_cu->depth;
tr_depth += (cur_cu->part_size == SIZE_NxN ? 1 : 0);
uvg_rdoq(state, coeff, coeff_out, width, width, best_cbf_mask == 1 ? COLOR_V : COLOR_U,
uvg_rdoq(state, coeff, coeff_out, width, width, cur_cu->joint_cb_cr == 1 ? COLOR_V : COLOR_U,
scan_order, cur_cu->type, tr_depth, cur_cu->cbf);
}
else if (state->encoder_control->cfg.rdoq_enable && false) {
uvg_ts_rdoq(state, coeff, coeff_out, width, width, best_cbf_mask == 2 ? COLOR_V : COLOR_U,
uvg_ts_rdoq(state, coeff, coeff_out, width, width, cur_cu->joint_cb_cr == 2 ? COLOR_V : COLOR_U,
scan_order);
}
else {
uvg_quant(state, coeff, coeff_out, width, width, best_cbf_mask == 1 ? COLOR_V : COLOR_U,
uvg_quant(state, coeff, coeff_out, width, width, cur_cu->joint_cb_cr == 1 ? COLOR_V : COLOR_U,
scan_order, cur_cu->type, cur_cu->tr_idx == MTS_SKIP && false);
}
@ -309,13 +292,12 @@ int uvg_quant_cbcr_residual_generic(
}
if (has_coeffs && !early_skip) {
int y, x;
// Get quantized residual. (coeff_out -> coeff -> residual)
uvg_dequant(state, coeff_out, coeff, width, width, best_cbf_mask == 1 ? COLOR_V : COLOR_U,
uvg_dequant(state, coeff_out, coeff, width, width, cur_cu->joint_cb_cr == 1 ? COLOR_V : COLOR_U,
cur_cu->type, cur_cu->tr_idx == MTS_SKIP && false);
uvg_itransform2d(state->encoder_control, best_cbf_mask == 1 ? v1_residual : u1_residual[best_cbf_mask - 2], coeff, width, best_cbf_mask == 1 ? COLOR_V : COLOR_U, cur_cu);
uvg_itransform2d(state->encoder_control, combined_residual, coeff, width, cur_cu->joint_cb_cr == 1 ? COLOR_V : COLOR_U, cur_cu);
//if (state->tile->frame->lmcs_aps->m_sliceReshapeInfo.enableChromaAdj && color != COLOR_Y) {
@ -336,39 +318,39 @@ int uvg_quant_cbcr_residual_generic(
// }
// }
//}
const int temp = best_cbf_mask * (state->frame->jccr_sign ? -1 : 1);
const int temp = cur_cu->joint_cb_cr * (state->frame->jccr_sign ? -1 : 1);
// Get quantized reconstruction. (residual + pred_in -> rec_out)
for (int y = 0; y < width; y++) {
for (int x = 0; x < width; x++) {
if (temp == 2) {
u_residual[x + y * width] = u1_residual[best_cbf_mask - 2][x + y * width];
v_residual[x + y * width] = u1_residual[best_cbf_mask - 2][x + y * width] >> 1;
u_residual[x + y * width] = combined_residual[x + y * width];
v_residual[x + y * width] = combined_residual[x + y * width] >> 1;
}
else if (temp == -2) {
u_residual[x + y * width] = u1_residual[best_cbf_mask - 2][x + y * width];
v_residual[x + y * width] = -u1_residual[best_cbf_mask - 2][x + y * width] >> 1;
u_residual[x + y * width] = combined_residual[x + y * width];
v_residual[x + y * width] = -combined_residual[x + y * width] >> 1;
}
else if (temp == 3) {
u_residual[x + y * width] = u1_residual[best_cbf_mask - 2][x + y * width];
v_residual[x + y * width] = u1_residual[best_cbf_mask - 2][x + y * width];
u_residual[x + y * width] = combined_residual[x + y * width];
v_residual[x + y * width] = combined_residual[x + y * width];
}
else if (temp == -3) {
// non-normative clipping to prevent 16-bit overflow
u_residual[x + y * width] = u1_residual[best_cbf_mask - 2][x + y * width]; // == -32768 && sizeof(Pel) == 2) ? 32767 : -v1_residual[best_cbf_mask][x];
v_residual[x + y * width] = -u1_residual[best_cbf_mask - 2][x + y * width];
u_residual[x + y * width] = combined_residual[x + y * width]; // == -32768 && sizeof(Pel) == 2) ? 32767 : -v1_residual[best_cbf_mask][x];
v_residual[x + y * width] = -combined_residual[x + y * width];
}
else if (temp == 1) {
u_residual[x + y * width] = v1_residual[x + y * width] >> 1;
v_residual[x + y * width] = v1_residual[x + y * width];
u_residual[x + y * width] = combined_residual[x + y * width] >> 1;
v_residual[x + y * width] = combined_residual[x + y * width];
}
else if (temp == -1) {
u_residual[x + y * width] = v1_residual[x + y * width] >> 1;
v_residual[x + y * width] = -v1_residual[x + y * width];
u_residual[x + y * width] = -combined_residual[x + y * width] >> 1;
v_residual[x + y * width] = combined_residual[x + y * width];
}
}
}
for (y = 0; y < width; ++y) {
for (x = 0; x < width; ++x) {
for (int y = 0; y < width; ++y) {
for (int x = 0; x < width; ++x) {
int16_t u_val = u_residual[x + y * width] + u_pred_in[x + y * in_stride];
u_rec_out[x + y * out_stride] = (uvg_pixel)CLIP(0, PIXEL_MAX, u_val);
int16_t v_val = v_residual[x + y * width] + v_pred_in[x + y * in_stride];
@ -379,20 +361,16 @@ int uvg_quant_cbcr_residual_generic(
else/* if (rec_out != pred_in)*/ {
// With no coeffs and rec_out == pred_int we skip copying the coefficients
// because the reconstruction is just the prediction.
int y, x;
for (y = 0; y < width; ++y) {
for (x = 0; x < width; ++x) {
for (int y = 0; y < width; ++y) {
for (int x = 0; x < width; ++x) {
u_rec_out[x + y * out_stride] = u_pred_in[x + y * in_stride];
v_rec_out[x + y * out_stride] = v_pred_in[x + y * in_stride];
}
}
}
return has_coeffs ? best_cbf_mask : 0;
return has_coeffs ? cur_cu->joint_cb_cr : 0;
}
/**
@ -431,7 +409,7 @@ int uvg_quantize_residual_generic(encoder_state_t *const state,
const int height = width; // TODO: height for non-square blocks
// Get residual. (ref_in - pred_in -> residual)
kvz_generate_residual(ref_in, pred_in, residual, width, in_stride);
kvz_generate_residual(ref_in, pred_in, residual, width, in_stride, in_stride);
if (state->tile->frame->lmcs_aps->m_sliceReshapeInfo.enableChromaAdj && color != COLOR_Y) {
int y, x;

View file

@ -149,7 +149,7 @@ typedef void (inter_recon_bipred_func)(lcu_t * const lcu,
typedef double (pixel_var_func)(const uvg_pixel *buf, const uint32_t len);
typedef void (generate_residual_func)(const kvz_pixel* ref_in, const kvz_pixel* pred_in, int16_t* residual, int width, int in_stride);
typedef void (generate_residual_func)(const kvz_pixel* ref_in, const kvz_pixel* pred_in, int16_t* residual, int width, int ref_stride, int pred_stride);
// Declare function pointers.
extern reg_sad_func * uvg_reg_sad;
@ -229,6 +229,7 @@ cost_pixel_nxn_multi_func * kvz_pixels_get_sad_dual_func(unsigned n);
{"ver_sad", (void**) &uvg_ver_sad}, \
{"hor_sad", (void**) &uvg_hor_sad}, \
{"pixel_var", (void**) &uvg_pixel_var}, \
{"generate_residual", (void**) &kvz_generate_residual}, \

View file

@ -650,9 +650,8 @@ static void quantize_tr_residual(encoder_state_t * const state,
}
const bool can_use_trskip = tr_width <= (1 << state->encoder_control->cfg.trskip_max_size) &&
color == COLOR_Y &&
cfg->trskip_enable &&
cur_pu->tr_idx == 1;
cur_pu->tr_skip & (1 << color);
uint8_t has_coeffs;
@ -696,7 +695,6 @@ static void quantize_tr_residual(encoder_state_t * const state,
pred,
coeff,
lmcs_chroma_adj);
cur_pu->tr_skip = tr_skip;
} else {
if(color == COLOR_UV) {
has_coeffs = uvg_quant_cbcr_residual(