Merge branch 'joint_cbcr' into 'master'

[jccr] Add joint coding of chroma residual

See merge request cs/ultravideo/vvc/uvg266!6
Joose Sainio 2021-09-06 11:43:06 +03:00
commit 450cbd356c
24 changed files with 463 additions and 86 deletions
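
The commit adds the encoder side of VVC joint coding of chroma residual (JCCR): instead of two chroma coefficient blocks, a single down-mixed block is coded and both chroma residuals are derived from it at reconstruction time. A minimal sketch of that derivation follows; the function name is illustrative, the mode numbering follows the VVC spec (1: only the Cb cbf set, 2: both set, 3: only the Cr cbf set), and csign is effectively +1 here because the commit always writes ph_joint_cbcr_sign_flag as 0.

#include <stdint.h>

/* Derive the Cb and Cr residuals of one block from the joint residual.
 * The ">> 1" is the spec's arithmetic shift (assumed here, as on common
 * compilers, also for negative values). */
static void jccr_derive_chroma_residual(const int16_t *res_joint,
                                        int16_t *res_cb, int16_t *res_cr,
                                        int num_samples, int mode, int csign)
{
  for (int i = 0; i < num_samples; ++i) {
    const int r = res_joint[i];
    switch (mode) {
      case 1:  res_cb[i] = (int16_t)r;                  res_cr[i] = (int16_t)((csign * r) >> 1); break;
      case 2:  res_cb[i] = (int16_t)r;                  res_cr[i] = (int16_t)(csign * r);        break;
      case 3:  res_cb[i] = (int16_t)((csign * r) >> 1); res_cr[i] = (int16_t)r;                  break;
      default: res_cb[i] = 0;                           res_cr[i] = 0;                           break;
    }
  }
}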

View file

@ -23,7 +23,7 @@ AC_CONFIG_SRCDIR([src/encmain.c])
#
# Here is a somewhat sane guide to lib versioning: http://apr.apache.org/versioning.html
ver_major=6
ver_minor=5
ver_minor=6
ver_release=0
# Prevents configure from adding a lot of defines to the CFLAGS

View file

@ -95,7 +95,7 @@ typedef struct
cabac_ctx_t luma_planar_model[2];
cabac_ctx_t multi_ref_line[2];
cabac_ctx_t bdpcm_mode[4];
cabac_ctx_t joint_bc_br[3];
cabac_ctx_t joint_cb_cr[3];
cabac_ctx_t transform_skip_model_luma;
cabac_ctx_t transform_skip_model_chroma;
cabac_ctx_t transform_skip_sig_coeff[3];

View file

@ -180,10 +180,7 @@ int kvz_config_init(kvz_config *cfg)
cfg->fastrd_sampling_on = 0;
cfg->fastrd_accuracy_check_on = 0;
cfg->fastrd_learning_outdir_fn = NULL;
int8_t in[] = { 17, 27, 32, 44 };
int8_t out[] = { 17, 29, 34, 41 };
cfg->chroma_scale_out[0][0] = cfg->chroma_scale_in[0][0] = 17;
cfg->chroma_scale_out[0][1] = cfg->chroma_scale_in[0][1] = 27;
cfg->chroma_scale_out[0][2] = cfg->chroma_scale_in[0][2] = 32;
@ -195,6 +192,8 @@ int kvz_config_init(kvz_config *cfg)
parse_qp_map(cfg, 0);
cfg->jccr = 0;
return 1;
}
@ -1466,6 +1465,9 @@ int kvz_config_parse(kvz_config *cfg, const char *name, const char *value)
parse_qp_map(cfg, 0);
return success;
}
else if OPT("jccr") {
cfg->jccr = (bool)atobool(value);
}
else {
return 0;
}
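
Library users reach the new option through the same parser the --jccr flag uses. Below is a small sketch under the assumption that the config is driven through the public kvazaar API (kvz_api_get, config_alloc, config_init, config_parse); the helper name is made up.

#include "kvazaar.h"

/* Build a config with joint chroma residual coding enabled, mirroring --jccr. */
static kvz_config *make_jccr_config(void)
{
  const kvz_api *api = kvz_api_get(8);   /* 8-bit build */
  kvz_config *cfg = api->config_alloc();
  api->config_init(cfg);
  /* "1" goes through atobool() in the parser hunk above and sets cfg->jccr. */
  api->config_parse(cfg, "jccr", "1");
  return cfg;
}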

View file

@ -162,6 +162,8 @@ static const struct option long_options[] = {
{ "fastrd-outdir", required_argument, NULL, 0 },
{ "chroma-qp-in", required_argument, NULL, 0 },
{ "chroma-qp-out", required_argument, NULL, 0 },
{ "jccr", no_argument, NULL, 0 },
{ "no-jccr", no_argument, NULL, 0 },
{0, 0, 0, 0}
};
@ -613,6 +615,8 @@ void print_help(void)
" - both: MTS applied for both intra and inter blocks.\n"
" - implicit: uses implicit MTS. Applies DST7 instead \n"
" of DCT2 to certain intra blocks.\n"
" --(no-)jccr : Joint coding of chroma residual. "
" Requires rdo> = 2. [disabled]\n"
"\n"
/* Word wrap to this width to stay under 80 characters (including ") *************/
"Parallel processing:\n"

View file

@ -455,7 +455,7 @@ void kvz_init_contexts(encoder_state_t *state, int8_t QP, int8_t slice)
for (i = 0; i < 3; i++) {
kvz_ctx_init(&cabac->ctx.cu_skip_flag_model[i], QP, INIT_SKIP_FLAG[slice][i], INIT_SKIP_FLAG[3][i]);
kvz_ctx_init(&cabac->ctx.joint_bc_br[i], QP, INIT_JOINT_CB_CR_FLAG[slice][i], INIT_JOINT_CB_CR_FLAG[3][i]);
kvz_ctx_init(&cabac->ctx.joint_cb_cr[i], QP, INIT_JOINT_CB_CR_FLAG[slice][i], INIT_JOINT_CB_CR_FLAG[3][i]);
kvz_ctx_init(&cabac->ctx.transform_skip_sig_coeff[i], QP, INIT_TRANSFORM_SKIP_SIG_COEFF[slice][i], INIT_TRANSFORM_SKIP_SIG_COEFF[3][i]);
kvz_ctx_init(&cabac->ctx.transform_skip_sig[i], QP, INIT_TRANSFORM_SKIP_SIG[slice][i], INIT_TRANSFORM_SKIP_SIG[3][i]);
}

View file

@ -127,15 +127,16 @@ typedef struct {
*/
typedef struct
{
uint8_t type : 2; //!< \brief block type, one of cu_type_t values
uint8_t depth : 3; //!< \brief depth / size of this block
uint8_t part_size : 3; //!< \brief partition mode, one of part_mode_t values
uint8_t tr_depth : 3; //!< \brief transform depth
uint8_t skipped : 1; //!< \brief flag to indicate this block is skipped
uint8_t merged : 1; //!< \brief flag to indicate this block is merged
uint8_t merge_idx : 3; //!< \brief merge index
uint8_t tr_skip : 1; //!< \brief transform skip flag
uint8_t tr_idx : 3; //!< \brief transform index
uint8_t type : 2; //!< \brief block type, one of cu_type_t values
uint8_t depth : 3; //!< \brief depth / size of this block
uint8_t part_size : 3; //!< \brief partition mode, one of part_mode_t values
uint8_t tr_depth : 3; //!< \brief transform depth
uint8_t skipped : 1; //!< \brief flag to indicate this block is skipped
uint8_t merged : 1; //!< \brief flag to indicate this block is merged
uint8_t merge_idx : 3; //!< \brief merge index
uint8_t tr_skip : 1; //!< \brief transform skip flag
uint8_t tr_idx : 3; //!< \brief transform index
uint8_t joint_cb_cr : 2; //!< \brief joint chroma residual coding
uint16_t cbf;
@ -299,6 +300,7 @@ typedef ALIGNED(8) struct {
coeff_t y[LCU_LUMA_SIZE];
coeff_t u[LCU_CHROMA_SIZE];
coeff_t v[LCU_CHROMA_SIZE];
coeff_t joint_uv[LCU_CHROMA_SIZE];
} lcu_coeff_t;
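
The new 2-bit joint_cb_cr field doubles as the chroma cbf mask of the joint mode: bit 0 stands in for the Cb cbf and bit 1 for the Cr cbf, which is how encode_transform_coeff derives cb_flag_u and cb_flag_v later in this diff. A sketch of that reading, with made-up helper names:

#include <stdint.h>

/* Chroma cbf flags implied by a non-zero joint_cb_cr mask
 * (0 means joint coding was not chosen for this TU). */
static inline int jccr_cbf_cb(uint8_t joint_cb_cr) { return joint_cb_cr & 1; }
static inline int jccr_cbf_cr(uint8_t joint_cb_cr) { return (joint_cb_cr & 2) >> 1; }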

View file

@ -328,31 +328,41 @@ void kvz_encode_last_significant_xy(cabac_data_t * const cabac,
}
}
static void encode_chroma_tu(encoder_state_t* const state, int x, int y, int depth, const uint8_t width_c, const cu_info_t* cur_pu, int8_t* scan_idx, lcu_coeff_t* coeff) {
static void encode_chroma_tu(encoder_state_t* const state, int x, int y, int depth, const uint8_t width_c, const cu_info_t* cur_pu, int8_t* scan_idx, lcu_coeff_t* coeff, uint8_t joint_chroma) {
int x_local = (x >> 1) % LCU_WIDTH_C;
int y_local = (y >> 1) % LCU_WIDTH_C;
cabac_data_t* const cabac = &state->cabac;
*scan_idx = kvz_get_scan_order(cur_pu->type, cur_pu->intra.mode_chroma, depth);
if(!joint_chroma){
const coeff_t *coeff_u = &coeff->u[xy_to_zorder(LCU_WIDTH_C, x_local, y_local)];
const coeff_t *coeff_v = &coeff->v[xy_to_zorder(LCU_WIDTH_C, x_local, y_local)];
const coeff_t *coeff_u = &coeff->u[xy_to_zorder(LCU_WIDTH_C, x_local, y_local)];
const coeff_t *coeff_v = &coeff->v[xy_to_zorder(LCU_WIDTH_C, x_local, y_local)];
if (cbf_is_set(cur_pu->cbf, depth, COLOR_U)) {
if(state->encoder_control->cfg.trskip_enable && width_c == 4){
cabac->cur_ctx = &cabac->ctx.transform_skip_model_chroma;
// HEVC only supports transform_skip for Luma
// TODO: transform skip for chroma blocks
CABAC_BIN(cabac, 0, "transform_skip_flag");
if (cbf_is_set(cur_pu->cbf, depth, COLOR_U)) {
if(state->encoder_control->cfg.trskip_enable && width_c == 4){
cabac->cur_ctx = &cabac->ctx.transform_skip_model_chroma;
// HEVC only supports transform_skip for Luma
// TODO: transform skip for chroma blocks
CABAC_BIN(cabac, 0, "transform_skip_flag");
}
kvz_encode_coeff_nxn(state, &state->cabac, coeff_u, width_c, 1, *scan_idx, NULL, false);
}
kvz_encode_coeff_nxn(state, &state->cabac, coeff_u, width_c, 1, *scan_idx, NULL, false);
}
if (cbf_is_set(cur_pu->cbf, depth, COLOR_V)) {
if (cbf_is_set(cur_pu->cbf, depth, COLOR_V)) {
if (state->encoder_control->cfg.trskip_enable && width_c == 4) {
cabac->cur_ctx = &cabac->ctx.transform_skip_model_chroma;
CABAC_BIN(cabac, 0, "transform_skip_flag");
}
kvz_encode_coeff_nxn(state, &state->cabac, coeff_v, width_c, 2, *scan_idx, NULL, false);
}
}
else {
const coeff_t *coeff_uv = &coeff->joint_uv[xy_to_zorder(LCU_WIDTH_C, x_local, y_local)];
if (state->encoder_control->cfg.trskip_enable && width_c == 4) {
cabac->cur_ctx = &cabac->ctx.transform_skip_model_chroma;
CABAC_BIN(cabac, 0, "transform_skip_flag");
}
kvz_encode_coeff_nxn(state, &state->cabac, coeff_v, width_c, 2, *scan_idx, NULL, false);
kvz_encode_coeff_nxn(state, &state->cabac, coeff_uv, width_c, 2, *scan_idx, NULL, false);
}
}
@ -370,16 +380,6 @@ static void encode_transform_unit(encoder_state_t * const state,
int8_t scan_idx = kvz_get_scan_order(cur_pu->type, cur_pu->intra.mode, depth);
if (state->encoder_control->chroma_format != KVZ_CSP_400) {
// joint_cb_cr
/*
if (type == 2 && cbf_mask) {
cabac->cur_ctx = &(cabac->ctx.joint_bc_br[0]);
CABAC_BIN(cabac, 0, "joint_cb_cr");
}
*/
}
int cbf_y = cbf_is_set(cur_pu->cbf, depth, COLOR_Y);
if (cbf_y && !only_chroma) {
@ -410,6 +410,7 @@ static void encode_transform_unit(encoder_state_t * const state,
}
}
bool joint_chroma = cur_pu->joint_cb_cr != 0;
if (depth == MAX_DEPTH) {
// For size 4x4 luma transform the corresponding chroma transforms are
// also of size 4x4 covering 8x8 luma pixels. The residual is coded in
@ -428,8 +429,8 @@ static void encode_transform_unit(encoder_state_t * const state,
bool chroma_cbf_set = cbf_is_set(cur_pu->cbf, depth, COLOR_U) ||
cbf_is_set(cur_pu->cbf, depth, COLOR_V);
if (chroma_cbf_set) {
encode_chroma_tu(state, x, y, depth, width_c, cur_pu, &scan_idx, coeff);
if (chroma_cbf_set || joint_chroma) {
encode_chroma_tu(state, x, y, depth, width_c, cur_pu, &scan_idx, coeff, joint_chroma);
}
}
@ -483,8 +484,8 @@ static void encode_transform_coeff(encoder_state_t * const state,
const int cb_flag_y = cbf_is_set(cur_pu->cbf, depth, COLOR_Y);
const int cb_flag_u = cbf_is_set(cur_cu->cbf, depth, COLOR_U);
const int cb_flag_v = cbf_is_set(cur_cu->cbf, depth, COLOR_V);
const int cb_flag_u = cur_pu->joint_cb_cr ? cur_pu->joint_cb_cr & 1 : cbf_is_set(cur_cu->cbf, depth, COLOR_U);
const int cb_flag_v = cur_pu->joint_cb_cr ? ((cur_pu->joint_cb_cr & 2) >> 1) : cbf_is_set(cur_cu->cbf, depth, COLOR_V);
// The split_transform_flag is not signaled when:
// - transform size is greater than 32 (depth == 0)
@ -519,7 +520,7 @@ static void encode_transform_coeff(encoder_state_t * const state,
}
if (true) {
cabac->cur_ctx = &(cabac->ctx.qt_cbf_model_cr[cb_flag_u ? 1 : 0]);
CABAC_BIN(cabac, cb_flag_v, "cbf_cr");
CABAC_BIN(cabac, cb_flag_v, "cbf_cr");
}
}
}
@ -570,7 +571,10 @@ static void encode_transform_coeff(encoder_state_t * const state,
state->must_code_qp_delta = false;
}
if((cb_flag_u || cb_flag_v ) && (depth != 4 || only_chroma) && state->encoder_control->cfg.jccr) {
cabac->cur_ctx = &cabac->ctx.joint_cb_cr[cb_flag_u * 2 + cb_flag_v - 1];
CABAC_BIN(cabac, cur_pu->joint_cb_cr != 0, "tu_joint_cbcr_residual_flag");
}
encode_transform_unit(state, x, y, depth, only_chroma, coeff);
}
}
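
The context for tu_joint_cbcr_residual_flag is picked from the signalled chroma cbf pair, so the three joint_cb_cr contexts added to the CABAC struct correspond to (cbf_cb, cbf_cr) = (0,1), (1,0) and (1,1); the flag is not written when both cbfs are zero, which keeps the index non-negative. A sketch of the index computation, with an illustrative name:

/* Context index for tu_joint_cbcr_residual_flag; at least one of the two
 * chroma cbf flags must be set before this is evaluated. */
static inline int joint_cbcr_ctx(int cbf_cb, int cbf_cr)
{
  return cbf_cb * 2 + cbf_cr - 1;   /* (0,1) -> 0, (1,0) -> 1, (1,1) -> 2 */
}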

View file

@ -614,7 +614,7 @@ static void encoder_state_write_bitstream_seq_parameter_set(bitstream_t* stream,
if (encoder->chroma_format != KVZ_CSP_400) {
WRITE_U(stream, 0, 1, "sps_joint_cbcr_enabled_flag");
WRITE_U(stream, encoder->cfg.jccr, 1, "sps_joint_cbcr_enabled_flag");
WRITE_U(stream, 1, 1, "same_qp_table_for_chroma");
for (int i = 0; i < encoder->cfg.num_used_table; i++) {
@ -1265,6 +1265,11 @@ void kvz_encoder_state_write_bitstream_slice_header(
WRITE_UE(stream, state->frame->slicetype, "sh_slice_type");
}
if (encoder->cfg.jccr) {
WRITE_U(stream, 0, 1, "ph_joint_cbcr_sign_flag");
}
if (state->frame->pictype == KVZ_NAL_CRA_NUT || state->frame->pictype == KVZ_NAL_IDR_N_LP || state->frame->pictype == KVZ_NAL_IDR_W_RADL || state->frame->pictype == KVZ_NAL_GDR_NUT)
{
WRITE_U(stream, 0, 1, "sh_no_output_of_prior_pics_flag");
@ -1322,7 +1327,6 @@ void kvz_encoder_state_write_bitstream_slice_header(
int slice_qp_delta = state->frame->QP - encoder->cfg.qp;
WRITE_SE(stream, slice_qp_delta, "sh_qp_delta");
if (encoder->cfg.sao_type) {
WRITE_U(stream, 1, 1, "sh_sao_luma_flag");
if (encoder->chroma_format != KVZ_CSP_400) {

View file

@ -65,6 +65,7 @@
#define RESHAPE_SIGNAL_HLG 2
#define RESHAPE_SIGNAL_NULL 100
/**
* \defgroup Bitstream
* HEVC bitstream coding
@ -327,7 +328,7 @@ typedef int16_t coeff_t;
#define MAX_TR_DYNAMIC_RANGE 15
//Constants
typedef enum { COLOR_Y = 0, COLOR_U, COLOR_V } color_t;
typedef enum { COLOR_Y = 0, COLOR_U, COLOR_V, COLOR_UV } color_t;
// Hardware data (abstraction of defines). Extend for other compilers

View file

@ -220,6 +220,8 @@ hi_prec_buf_t * kvz_hi_prec_buf_t_alloc(int luma_size)
yuv->y = (int16_t *)malloc(luma_size * sizeof(*yuv->y));
yuv->u = (int16_t *)malloc(luma_size / 2 * sizeof(*yuv->u));
yuv->v = (int16_t *)malloc(luma_size / 2 * sizeof(*yuv->v));
yuv->joint_u = (int16_t *)malloc(luma_size / 2 * sizeof(*yuv->u));
yuv->joint_v = (int16_t *)malloc(luma_size / 2 * sizeof(*yuv->v));
yuv->size = luma_size;
return yuv;
@ -230,6 +232,8 @@ void kvz_hi_prec_buf_t_free(hi_prec_buf_t * yuv)
free(yuv->y);
free(yuv->u);
free(yuv->v);
free(yuv->joint_v);
free(yuv->joint_u);
free(yuv);
}

View file

@ -36,6 +36,8 @@ typedef struct {
kvz_pixel y[LCU_LUMA_SIZE];
kvz_pixel u[LCU_CHROMA_SIZE];
kvz_pixel v[LCU_CHROMA_SIZE];
kvz_pixel joint_u[LCU_CHROMA_SIZE];
kvz_pixel joint_v[LCU_CHROMA_SIZE];
enum kvz_chroma_format chroma_format;
} lcu_yuv_t;
@ -44,6 +46,8 @@ typedef struct {
int16_t *y;
int16_t *u;
int16_t *v;
int16_t *joint_u;
int16_t *joint_v;
} hi_prec_buf_t;
typedef struct {

View file

@ -603,19 +603,25 @@ static void intra_recon_tb_leaf(
const int index = lcu_px.x + lcu_px.y * lcu_width;
kvz_pixel *block = NULL;
kvz_pixel *block2 = NULL;
switch (color) {
case COLOR_Y:
block = &lcu->rec.y[index];
break;
case COLOR_U:
block = &lcu->rec.u[index];
block2 = &lcu->rec.joint_u[index];
break;
case COLOR_V:
block = &lcu->rec.v[index];
block2 = &lcu->rec.joint_v[index];
break;
}
kvz_pixels_blit(pred, block , width, width, width, lcu_width);
if(color != COLOR_Y && cfg->jccr) {
kvz_pixels_blit(pred, block2, width, width, width, lcu_width);
}
}
/**
@ -683,7 +689,7 @@ void kvz_intra_recon_cu(
}
} else {
const bool has_luma = mode_luma != -1;
const bool has_chroma = mode_chroma != -1 && x % 8 == 0 && y % 8 == 0;
const bool has_chroma = mode_chroma != -1 && (x % 8 == 0 && y % 8 == 0);
// Process a leaf TU.
if (has_luma) {
intra_recon_tb_leaf(state, x, y, depth, mode_luma, lcu, COLOR_Y);

View file

@ -29,7 +29,6 @@
#include <stdint.h>
#include <stdio.h>
#ifdef __cplusplus
extern "C" {
#endif
@ -490,6 +489,8 @@ typedef struct kvz_config
int8_t chroma_scale_in[3][17];
int8_t chroma_scale_out[3][17];
int8_t jccr;
} kvz_config;
/**

View file

@ -80,7 +80,7 @@ static INLINE void copy_cu_pixels(int x_local, int y_local, int width, lcu_t *fr
}
}
static INLINE void copy_cu_coeffs(int x_local, int y_local, int width, lcu_t *from, lcu_t *to)
static INLINE void copy_cu_coeffs(int x_local, int y_local, int width, lcu_t *from, lcu_t *to, bool joint)
{
const int luma_z = xy_to_zorder(LCU_WIDTH, x_local, y_local);
copy_coeffs(&from->coeff.y[luma_z], &to->coeff.y[luma_z], width);
@ -89,18 +89,22 @@ static INLINE void copy_cu_coeffs(int x_local, int y_local, int width, lcu_t *fr
const int chroma_z = xy_to_zorder(LCU_WIDTH_C, x_local >> 1, y_local >> 1);
copy_coeffs(&from->coeff.u[chroma_z], &to->coeff.u[chroma_z], width >> 1);
copy_coeffs(&from->coeff.v[chroma_z], &to->coeff.v[chroma_z], width >> 1);
if (joint) {
copy_coeffs(&from->coeff.joint_uv[chroma_z], &to->coeff.joint_uv[chroma_z], width >> 1);
}
}
}
/**
* Copy all non-reference CU data from next level to current level.
*/
static void work_tree_copy_up(int x_local, int y_local, int depth, lcu_t *work_tree)
static void work_tree_copy_up(int x_local, int y_local, int depth, lcu_t *work_tree, bool joint)
{
const int width = LCU_WIDTH >> depth;
copy_cu_info (x_local, y_local, width, &work_tree[depth + 1], &work_tree[depth]);
copy_cu_pixels(x_local, y_local, width, &work_tree[depth + 1], &work_tree[depth]);
copy_cu_coeffs(x_local, y_local, width, &work_tree[depth + 1], &work_tree[depth]);
copy_cu_coeffs(x_local, y_local, width, &work_tree[depth + 1], &work_tree[depth], joint);
}
@ -298,7 +302,7 @@ double kvz_cu_rd_cost_luma(const encoder_state_t *const state,
double kvz_cu_rd_cost_chroma(const encoder_state_t *const state,
const int x_px, const int y_px, const int depth,
const cu_info_t *const pred_cu,
cu_info_t * pred_cu,
lcu_t *const lcu)
{
const vector2d_t lcu_px = { (x_px & ~7) / 2, (y_px & ~7) / 2 };
@ -306,7 +310,9 @@ double kvz_cu_rd_cost_chroma(const encoder_state_t *const state,
cu_info_t *const tr_cu = LCU_GET_CU_AT_PX(lcu, x_px, y_px);
double tr_tree_bits = 0;
double joint_cbcr_tr_tree_bits = 0;
double coeff_bits = 0;
double joint_coeff_bits = 0;
assert(x_px >= 0 && x_px < LCU_WIDTH);
assert(y_px >= 0 && y_px < LCU_WIDTH);
@ -323,13 +329,21 @@ double kvz_cu_rd_cost_chroma(const encoder_state_t *const state,
if (tr_depth == 0 || cbf_is_set(pred_cu->cbf, depth - 1, COLOR_U)) {
tr_tree_bits += CTX_ENTROPY_FBITS(ctx, cbf_is_set(pred_cu->cbf, depth, COLOR_U));
}
if(state->encoder_control->cfg.jccr) {
joint_cbcr_tr_tree_bits += CTX_ENTROPY_FBITS(ctx, pred_cu->joint_cb_cr & 1);
}
int is_set = cbf_is_set(pred_cu->cbf, depth, COLOR_U);
ctx = &(state->cabac.ctx.qt_cbf_model_cr[is_set]);
if (tr_depth == 0 || cbf_is_set(pred_cu->cbf, depth - 1, COLOR_V)) {
tr_tree_bits += CTX_ENTROPY_FBITS(ctx, cbf_is_set(pred_cu->cbf, depth, COLOR_V));
}
if(state->encoder_control->cfg.jccr) {
ctx = &(state->cabac.ctx.qt_cbf_model_cr[pred_cu->joint_cb_cr & 1]);
joint_cbcr_tr_tree_bits += CTX_ENTROPY_FBITS(ctx, (pred_cu->joint_cb_cr & 2) >> 1);
}
}
if (tr_cu->tr_depth > depth) {
int offset = LCU_WIDTH >> (depth + 1);
int sum = 0;
@ -342,8 +356,22 @@ double kvz_cu_rd_cost_chroma(const encoder_state_t *const state,
return sum + tr_tree_bits * state->lambda;
}
if (state->encoder_control->cfg.jccr) {
int cbf_mask = cbf_is_set(pred_cu->cbf, depth, COLOR_U) * 2 + cbf_is_set(pred_cu->cbf, depth, COLOR_V) - 1;
const cabac_ctx_t* ctx = NULL;
if (cbf_mask != -1) {
ctx = &(state->cabac.ctx.joint_cb_cr[cbf_mask]);
tr_tree_bits += CTX_ENTROPY_FBITS(ctx, 0);
}
if(pred_cu->joint_cb_cr) {
ctx = &(state->cabac.ctx.joint_cb_cr[(pred_cu->joint_cb_cr & 1) * 2 + ((pred_cu->joint_cb_cr & 2) >> 1) - 1]);
joint_cbcr_tr_tree_bits += CTX_ENTROPY_FBITS(ctx, 1);
}
}
// Chroma SSD
int ssd = 0;
int joint_ssd = 0;
if (!state->encoder_control->cfg.lossless) {
int index = lcu_px.y * LCU_WIDTH_C + lcu_px.x;
int ssd_u = kvz_pixels_calc_ssd(&lcu->ref.u[index], &lcu->rec.u[index],
@ -353,6 +381,16 @@ double kvz_cu_rd_cost_chroma(const encoder_state_t *const state,
LCU_WIDTH_C, LCU_WIDTH_C,
width);
ssd = ssd_u + ssd_v;
if(state->encoder_control->cfg.jccr) {
int ssd_u_joint = kvz_pixels_calc_ssd(&lcu->ref.u[index], &lcu->rec.joint_u[index],
LCU_WIDTH_C, LCU_WIDTH_C,
width);
int ssd_v_joint = kvz_pixels_calc_ssd(&lcu->ref.v[index], &lcu->rec.joint_v[index],
LCU_WIDTH_C, LCU_WIDTH_C,
width);
joint_ssd = ssd_u_joint + ssd_v_joint;
}
}
{
@ -361,10 +399,35 @@ double kvz_cu_rd_cost_chroma(const encoder_state_t *const state,
coeff_bits += kvz_get_coeff_cost(state, &lcu->coeff.u[index], width, 2, scan_order, 0);
coeff_bits += kvz_get_coeff_cost(state, &lcu->coeff.v[index], width, 2, scan_order, 0);
if(state->encoder_control->cfg.jccr) {
joint_coeff_bits += kvz_get_coeff_cost(state, &lcu->coeff.joint_uv[index], width, 2, scan_order, 0);
}
}
double bits = tr_tree_bits + coeff_bits;
return (double)ssd + bits * state->c_lambda;
double joint_bits = joint_cbcr_tr_tree_bits + joint_coeff_bits;
double cost = (double)ssd + bits * state->c_lambda;
double joint_cost = (double)joint_ssd + joint_bits * state->c_lambda;
if ((cost < joint_cost || !pred_cu->joint_cb_cr) || !state->encoder_control->cfg.jccr) {
pred_cu->joint_cb_cr = 0;
return cost;
}
cbf_clear(&pred_cu->cbf, depth, COLOR_U);
cbf_clear(&pred_cu->cbf, depth, COLOR_V);
if (pred_cu->joint_cb_cr & 1) {
cbf_set(&pred_cu->cbf, depth, COLOR_U);
}
if (pred_cu->joint_cb_cr & 2) {
cbf_set(&pred_cu->cbf, depth, COLOR_V);
}
int lcu_width = LCU_WIDTH_C;
const int index = lcu_px.x + lcu_px.y * lcu_width;
kvz_pixels_blit(&lcu->rec.joint_u[index], &lcu->rec.u[index], width, width, lcu_width, lcu_width);
kvz_pixels_blit(&lcu->rec.joint_v[index], &lcu->rec.v[index], width, width, lcu_width, lcu_width);
return joint_cost;
}
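
kvz_cu_rd_cost_chroma now prices both chroma codings and keeps the cheaper one: the separate-plane candidate and the joint candidate each get an SSD plus lambda-weighted bit cost, and when the joint one wins the cbf flags are rewritten from joint_cb_cr and the joint reconstruction is blitted over the regular one. A reduced sketch of the comparison itself (types and names illustrative):

/* RD terms of one chroma coding candidate. */
typedef struct {
  double ssd;    /* chroma SSD of the reconstruction */
  double bits;   /* cbf, flag and coefficient bits */
} chroma_rd_t;

/* Return the cheaper cost and report whether the joint candidate won. */
static double cheaper_chroma_coding(chroma_rd_t separate, chroma_rd_t joint,
                                    double c_lambda, int *use_joint)
{
  const double cost       = separate.ssd + separate.bits * c_lambda;
  const double joint_cost = joint.ssd + joint.bits * c_lambda;
  *use_joint = joint_cost < cost;
  return *use_joint ? joint_cost : cost;
}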
@ -518,6 +581,7 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,
cur_cu->tr_idx = 0;
cur_cu->violates_mts_coeff_constraint = 0;
cur_cu->mts_last_scan_pos = 0;
cur_cu->joint_cb_cr = 0;
// If the CU is completely inside the frame at this depth, search for
// prediction modes at this depth.
@ -814,7 +878,7 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,
if (split_cost < cost) {
// Copy split modes to this depth.
cost = split_cost;
work_tree_copy_up(x_local, y_local, depth, work_tree);
work_tree_copy_up(x_local, y_local, depth, work_tree, state->encoder_control->cfg.jccr);
#if KVZ_DEBUG
//debug_split = 1;
#endif
@ -1027,4 +1091,7 @@ void kvz_search_lcu(encoder_state_t * const state, const int x, const int y, con
copy_coeffs(work_tree[0].coeff.y, coeff->y, LCU_WIDTH);
copy_coeffs(work_tree[0].coeff.u, coeff->u, LCU_WIDTH_C);
copy_coeffs(work_tree[0].coeff.v, coeff->v, LCU_WIDTH_C);
if (state->encoder_control->cfg.jccr) {
copy_coeffs(work_tree[0].coeff.joint_uv, coeff->joint_uv, LCU_WIDTH_C);
}
}

View file

@ -43,7 +43,7 @@ double kvz_cu_rd_cost_luma(const encoder_state_t *const state,
lcu_t *const lcu);
double kvz_cu_rd_cost_chroma(const encoder_state_t *const state,
const int x_px, const int y_px, const int depth,
const cu_info_t *const pred_cu,
cu_info_t * pred_cu,
lcu_t *const lcu);
void kvz_lcu_fill_trdepth(lcu_t *lcu, int x_px, int y_px, int depth, int tr_depth);

View file

@ -319,7 +319,7 @@ static double search_intra_trdepth(encoder_state_t * const state,
kvz_intra_recon_cu(state,
x_px, y_px,
depth,
intra_mode, chroma_mode,
intra_mode, -1,
pred_cu, lcu);
// TODO: Not sure if this should be 0 or 1 but at least seems to work with 1
@ -334,15 +334,23 @@ static double search_intra_trdepth(encoder_state_t * const state,
}
double rd_cost = kvz_cu_rd_cost_luma(state, lcu_px.x, lcu_px.y, depth, pred_cu, lcu);
if (reconstruct_chroma) {
rd_cost += kvz_cu_rd_cost_chroma(state, lcu_px.x, lcu_px.y, depth, pred_cu, lcu);
}
//if (reconstruct_chroma) {
// rd_cost += kvz_cu_rd_cost_chroma(state, lcu_px.x, lcu_px.y, depth, pred_cu, lcu);
//}
if (rd_cost < best_rd_cost) {
best_rd_cost = rd_cost;
best_tr_idx = pred_cu->tr_idx;
}
}
if(reconstruct_chroma) {
kvz_intra_recon_cu(state,
x_px, y_px,
depth,
-1, chroma_mode,
pred_cu, lcu);
best_rd_cost += kvz_cu_rd_cost_chroma(state, lcu_px.x, lcu_px.y, depth, pred_cu, lcu);
}
pred_cu->tr_skip = best_tr_idx == MTS_SKIP;
pred_cu->tr_idx = best_tr_idx;
nosplit_cost += best_rd_cost;
@ -718,6 +726,7 @@ static int8_t search_intra_rdo(encoder_state_t * const state,
pred_cu.part_size = ((depth == MAX_PU_DEPTH) ? SIZE_NxN : SIZE_2Nx2N);
pred_cu.intra.mode = modes[rdo_mode];
pred_cu.intra.mode_chroma = modes[rdo_mode];
pred_cu.joint_cb_cr = 0;
FILL(pred_cu.cbf, 0);
// Reset transform split data in lcu.cu for this area.

View file

@ -364,17 +364,17 @@ static INLINE unsigned kvz_math_floor_log2(unsigned value)
*
*/
void kvz_quant_avx2(const encoder_state_t * const state, const coeff_t * __restrict coef, coeff_t * __restrict q_coef, int32_t width,
int32_t height, int8_t type, int8_t scan_idx, int8_t block_type, int8_t transform_skip)
int32_t height, color_t color, int8_t scan_idx, int8_t block_type, int8_t transform_skip)
{
const encoder_control_t * const encoder = state->encoder_control;
const uint32_t log2_block_size = kvz_g_convert_to_bit[width] + 2;
const uint32_t * const scan = kvz_g_sig_last_scan[scan_idx][log2_block_size - 1];
int32_t qp_scaled = kvz_get_scaled_qp(type, state->qp, (encoder->bitdepth - 8) * 6, encoder->qp_map[0]);
int32_t qp_scaled = kvz_get_scaled_qp(color, state->qp, (encoder->bitdepth - 8) * 6, encoder->qp_map[0]);
qp_scaled = transform_skip ? MAX(qp_scaled, 4 + 6 * MIN_QP_PRIME_TS) : qp_scaled;
uint32_t log2_tr_width = kvz_math_floor_log2(height);
uint32_t log2_tr_height = kvz_math_floor_log2(width);
const int32_t scalinglist_type = (block_type == CU_INTRA ? 0 : 3) + (int8_t)("\0\3\1\2"[type]);
const int32_t scalinglist_type = (block_type == CU_INTRA ? 0 : 3) + (int8_t)("\0\3\1\2"[color]);
const int32_t *quant_coeff = encoder->scaling_list.quant_coeff[log2_tr_width][log2_tr_height][scalinglist_type][qp_scaled % 6];
const int32_t transform_shift = MAX_TR_DYNAMIC_RANGE - encoder->bitdepth - ((log2_tr_width + log2_tr_height) >> 1); //!< Represents scaling through forward transform
const int32_t q_bits = QUANT_SHIFT + qp_scaled / 6 + (transform_skip ? 0 : transform_shift);
@ -721,7 +721,7 @@ int kvz_quantize_residual_avx2(encoder_state_t *const state,
if (has_coeffs && !early_skip) {
// Get quantized residual. (coeff_out -> coeff -> residual)
kvz_dequant(state, coeff_out, coeff, width, width, (color == COLOR_Y ? 0 : (color == COLOR_U ? 2 : 3)),
kvz_dequant(state, coeff_out, coeff, width, width, color,
cur_cu->type, cur_cu->tr_idx == MTS_SKIP && color == COLOR_Y);
if (use_trskip) {
kvz_itransformskip(state->encoder_control, residual, coeff, width);
@ -771,7 +771,7 @@ int kvz_quantize_residual_avx2(encoder_state_t *const state,
* \brief inverse quantize transformed and quantized coefficents
*
*/
void kvz_dequant_avx2(const encoder_state_t * const state, coeff_t *q_coef, coeff_t *coef, int32_t width, int32_t height,int8_t type, int8_t block_type, int8_t transform_skip)
void kvz_dequant_avx2(const encoder_state_t * const state, coeff_t *q_coef, coeff_t *coef, int32_t width, int32_t height,color_t color, int8_t block_type, int8_t transform_skip)
{
const encoder_control_t * const encoder = state->encoder_control;
int32_t shift,add,coeff_q;
@ -779,7 +779,7 @@ void kvz_dequant_avx2(const encoder_state_t * const state, coeff_t *q_coef, coef
int32_t transform_shift = MAX_TR_DYNAMIC_RANGE - encoder->bitdepth - ((kvz_math_floor_log2(width) + kvz_math_floor_log2(height)) >> 1); // Represents scaling through forward transform
int32_t qp_scaled = kvz_get_scaled_qp(type, state->qp, (encoder->bitdepth-8)*6, encoder->qp_map[0]);
int32_t qp_scaled = kvz_get_scaled_qp(color, state->qp, (encoder->bitdepth-8)*6, encoder->qp_map[0]);
qp_scaled = transform_skip ? MAX(qp_scaled, 4 + 6 * MIN_QP_PRIME_TS) : qp_scaled;
shift = 20 - QUANT_SHIFT - (transform_skip ? 0 : transform_shift);
@ -788,7 +788,7 @@ void kvz_dequant_avx2(const encoder_state_t * const state, coeff_t *q_coef, coef
{
uint32_t log2_tr_width = kvz_math_floor_log2(height) + 2;
uint32_t log2_tr_height = kvz_math_floor_log2(width) + 2;
int32_t scalinglist_type = (block_type == CU_INTRA ? 0 : 3) + (int8_t)("\0\3\1\2"[type]);
int32_t scalinglist_type = (block_type == CU_INTRA ? 0 : 3) + (int8_t)(color);
const int32_t* dequant_coef = encoder->scaling_list.de_quant_coeff[log2_tr_width - 2][log2_tr_height - 2][scalinglist_type][qp_scaled % 6];
shift += 4;

View file

@ -38,17 +38,17 @@
*
*/
void kvz_quant_generic(const encoder_state_t * const state, coeff_t *coef, coeff_t *q_coef, int32_t width,
int32_t height, int8_t type, int8_t scan_idx, int8_t block_type, int8_t transform_skip)
int32_t height, color_t color, int8_t scan_idx, int8_t block_type, int8_t transform_skip)
{
const encoder_control_t * const encoder = state->encoder_control;
const uint32_t log2_block_size = kvz_g_convert_to_bit[width] + 2;
const uint32_t * const scan = kvz_g_sig_last_scan[scan_idx][log2_block_size - 1];
int32_t qp_scaled = kvz_get_scaled_qp(type, state->qp, (encoder->bitdepth - 8) * 6, encoder->qp_map[0]);
int32_t qp_scaled = kvz_get_scaled_qp(color, state->qp, (encoder->bitdepth - 8) * 6, encoder->qp_map[0]);
qp_scaled = transform_skip ? MAX(qp_scaled, 4 + 6 * MIN_QP_PRIME_TS) : qp_scaled;
uint32_t log2_tr_width = kvz_math_floor_log2(height);
uint32_t log2_tr_height = kvz_math_floor_log2(width);
const int32_t scalinglist_type = (block_type == CU_INTRA ? 0 : 3) + (int8_t)("\0\3\1\2"[type]);
const int32_t scalinglist_type = (block_type == CU_INTRA ? 0 : 3) + (int8_t)("\0\3\1\2"[color]);
const int32_t *quant_coeff = encoder->scaling_list.quant_coeff[log2_tr_width][log2_tr_height][scalinglist_type][qp_scaled % 6];
const int32_t transform_shift = MAX_TR_DYNAMIC_RANGE - encoder->bitdepth - ((log2_tr_height + log2_tr_width) >> 1); //!< Represents scaling through forward transform
const int32_t q_bits = QUANT_SHIFT + qp_scaled / 6 + (transform_skip ? 0 : transform_shift);
@ -172,6 +172,214 @@ void kvz_quant_generic(const encoder_state_t * const state, coeff_t *coef, coeff
}
}
static INLINE int64_t square(int x) {
return x * (int64_t)x;
}
int kvz_quant_cbcr_residual_generic(
encoder_state_t* const state,
const cu_info_t* const cur_cu,
const int width,
const coeff_scan_order_t scan_order,
const int in_stride, const int out_stride,
const kvz_pixel* const u_ref_in,
const kvz_pixel* const v_ref_in,
const kvz_pixel* const u_pred_in,
const kvz_pixel* const v_pred_in,
kvz_pixel* u_rec_out,
kvz_pixel* v_rec_out,
coeff_t* coeff_out,
bool early_skip,
int lmcs_chroma_adj
) {
ALIGNED(64) int16_t u_residual[TR_MAX_WIDTH * TR_MAX_WIDTH];
ALIGNED(64) int16_t v_residual[TR_MAX_WIDTH * TR_MAX_WIDTH];
ALIGNED(64) int16_t u1_residual[2][TR_MAX_WIDTH * TR_MAX_WIDTH];
ALIGNED(64) int16_t v1_residual[TR_MAX_WIDTH * TR_MAX_WIDTH];
ALIGNED(64) coeff_t coeff[TR_MAX_WIDTH * TR_MAX_WIDTH];
{
int y, x;
for (y = 0; y < width; ++y) {
for (x = 0; x < width; ++x) {
u_residual[x + y * width] = (int16_t)(u_ref_in[x + y * in_stride] - u_pred_in[x + y * in_stride]);
v_residual[x + y * width] = (int16_t)(v_ref_in[x + y * in_stride] - v_pred_in[x + y * in_stride]);
}
}
}
int best_cbf_mask = -1;
int64_t best_cost = INT64_MAX;
// This changes the order of the cbf_masks so 2 and 3 are swapped compared with VTM
for(int cbf_mask = cur_cu->type == CU_INTRA ? 1 : 3; cbf_mask < 4; cbf_mask++) {
int64_t d1 = 0;
for (int y = 0; y < width; y++)
{
for (int x = 0; x < width; x++)
{
int cbx = u_residual[x + y * width], crx = v_residual[x + y * width];
if (cbf_mask == 1)
{
u1_residual[cbf_mask / 2][x + y * width] = ((4 * cbx + 2 * crx) / 5);
d1 += square(cbx - u1_residual[cbf_mask / 2][x + y * width]) + square(crx - (u1_residual[cbf_mask / 2][x + y * width] >> 1));
}
else if (cbf_mask == -1)
{
u1_residual[cbf_mask / 2][x + y * width] = ((4 * cbx - 2 * crx) / 5);
d1 += square(cbx - u1_residual[cbf_mask / 2][x + y * width]) + square(crx - (-u1_residual[cbf_mask / 2][x + y * width] >> 1));
}
else if (cbf_mask == 3)
{
u1_residual[cbf_mask / 2][x + y * width] = ((cbx + crx) / 2);
d1 += square(cbx - u1_residual[cbf_mask / 2][x + y * width]) + square(crx - u1_residual[cbf_mask / 2][x + y * width]);
}
else if (cbf_mask == -3)
{
u1_residual[cbf_mask / 2][x + y * width] = ((cbx - crx) / 2);
d1 += square(cbx - u1_residual[cbf_mask / 2][x + y * width]) + square(crx + u1_residual[cbf_mask / 2][x + y * width]);
}
else if (cbf_mask == 2)
{
v1_residual[x + y * width] = ((4 * crx + 2 * cbx) / 5);
d1 += square(cbx - (v1_residual[x + y * width] >> 1)) + square(crx - v1_residual[x + y * width]);
}
else if (cbf_mask == -2)
{
v1_residual[x + y * width] = ((4 * crx - 2 * cbx) / 5);
d1 += square(cbx - (-v1_residual[x + y * width] >> 1)) + square(crx - v1_residual[x + y * width]);
}
else
{
d1 += square(cbx);
//d2 += square(crx);
}
}
}
if (d1 < best_cost) {
best_cbf_mask = cbf_mask;
best_cost = d1;
}
}
kvz_transform2d(state->encoder_control, best_cbf_mask == 2 ? v1_residual : u1_residual[best_cbf_mask / 2], coeff, width, best_cbf_mask == 2 ? COLOR_V : COLOR_U, cur_cu);
if (state->encoder_control->cfg.rdoq_enable &&
(width > 4 || !state->encoder_control->cfg.rdoq_skip))
{
int8_t tr_depth = cur_cu->tr_depth - cur_cu->depth;
tr_depth += (cur_cu->part_size == SIZE_NxN ? 1 : 0);
kvz_rdoq(state, coeff, coeff_out, width, width, best_cbf_mask == 2 ? COLOR_V : COLOR_U,
scan_order, cur_cu->type, tr_depth, cur_cu->cbf);
}
else if (state->encoder_control->cfg.rdoq_enable && false) {
kvz_ts_rdoq(state, coeff, coeff_out, width, width, best_cbf_mask == 2 ? COLOR_V : COLOR_U,
scan_order);
}
else {
kvz_quant(state, coeff, coeff_out, width, width, best_cbf_mask == 2 ? COLOR_V : COLOR_U,
scan_order, cur_cu->type, cur_cu->tr_idx == MTS_SKIP && false);
}
int8_t has_coeffs = 0;
{
int i;
for (i = 0; i < width * width; ++i) {
if (coeff_out[i] != 0) {
has_coeffs = 1;
break;
}
}
}
if (has_coeffs && !early_skip) {
int y, x;
// Get quantized residual. (coeff_out -> coeff -> residual)
kvz_dequant(state, coeff_out, coeff, width, width, best_cbf_mask == 2 ? COLOR_V : COLOR_U,
cur_cu->type, cur_cu->tr_idx == MTS_SKIP && false);
kvz_itransform2d(state->encoder_control, best_cbf_mask == 2 ? v1_residual : u1_residual[best_cbf_mask / 2], coeff, width, best_cbf_mask == 2 ? COLOR_V : COLOR_U, cur_cu);
//if (state->tile->frame->lmcs_aps->m_sliceReshapeInfo.enableChromaAdj && color != COLOR_Y) {
// int y, x;
// int sign, absval;
// int maxAbsclipBD = (1 << KVZ_BIT_DEPTH) - 1;
// for (y = 0; y < width; ++y) {
// for (x = 0; x < width; ++x) {
// residual[x + y * width] = (int16_t)CLIP((int16_t)(-maxAbsclipBD - 1), (int16_t)maxAbsclipBD, residual[x + y * width]);
// sign = residual[x + y * width] >= 0 ? 1 : -1;
// absval = sign * residual[x + y * width];
// int val = sign * ((absval * lmcs_chroma_adj + (1 << (CSCALE_FP_PREC - 1))) >> CSCALE_FP_PREC);
// if (sizeof(kvz_pixel) == 2) // avoid overflow when storing data
// {
// val = CLIP(-32768, 32767, val);
// }
// residual[x + y * width] = (int16_t)val;
// }
// }
//}
// Get quantized reconstruction. (residual + pred_in -> rec_out)
for (int y = 0; y < width; y++) {
for (int x = 0; x < width; x++) {
if (best_cbf_mask == 1) {
u_residual[x + y * width] = u1_residual[best_cbf_mask / 2][x + y * width];
v_residual[x + y * width] = u1_residual[best_cbf_mask / 2][x + y * width] >> 1;
}
else if (best_cbf_mask == -1) {
u_residual[x + y * width] = u1_residual[best_cbf_mask / 2][x + y * width];
v_residual[x + y * width] = -u1_residual[best_cbf_mask / 2][x + y * width] >> 1;
}
else if (best_cbf_mask == 3) {
u_residual[x + y * width] = u1_residual[best_cbf_mask / 2][x + y * width];
v_residual[x + y * width] = u1_residual[best_cbf_mask / 2][x + y * width];
}
else if (best_cbf_mask == -3) {
// non-normative clipping to prevent 16-bit overflow
u_residual[x + y * width] = u1_residual[best_cbf_mask / 2][x + y * width]; // == -32768 && sizeof(Pel) == 2) ? 32767 : -v1_residual[best_cbf_mask][x];
v_residual[x + y * width] = -u1_residual[best_cbf_mask / 2][x + y * width];
}
else if (best_cbf_mask == 2) {
u_residual[x + y * width] = v1_residual[x + y * width] >> 1;
v_residual[x + y * width] = v1_residual[x + y * width];
}
else if (best_cbf_mask == -2) {
u_residual[x + y * width] = v1_residual[x + y * width] >> 1;
v_residual[x + y * width] = -v1_residual[x + y * width];
}
}
}
for (y = 0; y < width; ++y) {
for (x = 0; x < width; ++x) {
int16_t u_val = u_residual[x + y * width] + u_pred_in[x + y * in_stride];
u_rec_out[x + y * out_stride] = (kvz_pixel)CLIP(0, PIXEL_MAX, u_val);
int16_t v_val = v_residual[x + y * width] + v_pred_in[x + y * in_stride];
v_rec_out[x + y * out_stride] = (kvz_pixel)CLIP(0, PIXEL_MAX, v_val);
}
}
}
else/* if (rec_out != pred_in)*/ {
// With no coeffs and rec_out == pred_int we skip copying the coefficients
// because the reconstruction is just the prediction.
int y, x;
for (y = 0; y < width; ++y) {
for (x = 0; x < width; ++x) {
u_rec_out[x + y * out_stride] = u_pred_in[x + y * in_stride];
v_rec_out[x + y * out_stride] = v_pred_in[x + y * in_stride];
}
}
}
return has_coeffs ? best_cbf_mask : 0;
}
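
For every candidate cbf_mask the loop above down-mixes the Cb and Cr residuals into one block and measures the squared error of rebuilding both planes from it; since ph_joint_cbcr_sign_flag is always written as 0, only the positive-sign variants are reachable. A per-sample sketch of that search using the same formulas; the function name is made up and the mask-0 branch is shown only for completeness, it is not tried by the loop.

#include <stdint.h>

static int64_t sq(int64_t v) { return v * v; }

/* Distortion of reconstructing one (cb, cr) residual pair from the joint
 * residual of the given cbf_mask. */
static int64_t jccr_mask_distortion(int cb, int cr, int cbf_mask)
{
  int joint, rec_cb, rec_cr;
  switch (cbf_mask) {
    case 1:  joint = (4 * cb + 2 * cr) / 5; rec_cb = joint;      rec_cr = joint >> 1; break;
    case 2:  joint = (4 * cr + 2 * cb) / 5; rec_cb = joint >> 1; rec_cr = joint;      break;
    case 3:  joint = (cb + cr) / 2;         rec_cb = joint;      rec_cr = joint;      break;
    default: return sq(cb) + sq(cr);        /* cost of coding no joint residual */
  }
  return sq(cb - rec_cb) + sq(cr - rec_cr);
}

The mask with the smallest summed distortion is the only one that then gets transformed and quantized.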
/**
* \brief Quantize residual and get both the reconstruction and coeffs.
*
@ -271,7 +479,7 @@ int kvz_quantize_residual_generic(encoder_state_t *const state,
int y, x;
// Get quantized residual. (coeff_out -> coeff -> residual)
kvz_dequant(state, coeff_out, coeff, width, width, (color == COLOR_Y ? 0 : (color == COLOR_U ? 2 : 3)),
kvz_dequant(state, coeff_out, coeff, width, width, color,
cur_cu->type, cur_cu->tr_idx == MTS_SKIP && color == COLOR_Y);
if (use_trskip) {
kvz_itransformskip(state->encoder_control, residual, coeff, width);
@ -326,7 +534,7 @@ int kvz_quantize_residual_generic(encoder_state_t *const state,
* \brief inverse quantize transformed and quantized coefficents
*
*/
void kvz_dequant_generic(const encoder_state_t * const state, coeff_t *q_coef, coeff_t *coef, int32_t width, int32_t height,int8_t type, int8_t block_type, int8_t transform_skip)
void kvz_dequant_generic(const encoder_state_t * const state, coeff_t *q_coef, coeff_t *coef, int32_t width, int32_t height,color_t color, int8_t block_type, int8_t transform_skip)
{
const encoder_control_t * const encoder = state->encoder_control;
int32_t shift,add,coeff_q;
@ -334,7 +542,7 @@ void kvz_dequant_generic(const encoder_state_t * const state, coeff_t *q_coef, c
int32_t transform_shift = MAX_TR_DYNAMIC_RANGE - encoder->bitdepth - ((kvz_math_floor_log2(width) + kvz_math_floor_log2(height)) >> 1); // Represents scaling through forward transform
int32_t qp_scaled = kvz_get_scaled_qp(type, state->qp, (encoder->bitdepth-8)*6, encoder->qp_map[0]);
int32_t qp_scaled = kvz_get_scaled_qp(color, state->qp, (encoder->bitdepth-8)*6, encoder->qp_map[0]);
qp_scaled = transform_skip ? MAX(qp_scaled, 4 + 6 * MIN_QP_PRIME_TS) : qp_scaled;
shift = 20 - QUANT_SHIFT - (transform_skip ? 0 : transform_shift);
@ -343,7 +551,7 @@ void kvz_dequant_generic(const encoder_state_t * const state, coeff_t *q_coef, c
{
uint32_t log2_tr_width = kvz_math_floor_log2(height) + 2;
uint32_t log2_tr_height = kvz_math_floor_log2(width) + 2;
int32_t scalinglist_type = (block_type == CU_INTRA ? 0 : 3) + (int8_t)("\0\3\1\2"[type]);
int32_t scalinglist_type = (block_type == CU_INTRA ? 0 : 3) + (int8_t)(color);
const int32_t *dequant_coef = encoder->scaling_list.de_quant_coeff[log2_tr_width -2][log2_tr_height -2][scalinglist_type][qp_scaled%6];
shift += 4;
@ -413,6 +621,7 @@ int kvz_strategy_register_quant_generic(void* opaque, uint8_t bitdepth)
bool success = true;
success &= kvz_strategyselector_register(opaque, "quant", "generic", 0, &kvz_quant_generic);
success &= kvz_strategyselector_register(opaque, "quant_cbcr_residual", "generic", 0, &kvz_quant_cbcr_residual_generic);
success &= kvz_strategyselector_register(opaque, "quantize_residual", "generic", 0, &kvz_quantize_residual_generic);
success &= kvz_strategyselector_register(opaque, "dequant", "generic", 0, &kvz_dequant_generic);
success &= kvz_strategyselector_register(opaque, "coeff_abs_sum", "generic", 0, &coeff_abs_sum_generic);

View file

@ -36,7 +36,7 @@
int kvz_strategy_register_quant_generic(void* opaque, uint8_t bitdepth);
void kvz_quant_generic(const encoder_state_t * const state, coeff_t *coef, coeff_t *q_coef, int32_t width,
int32_t height, int8_t type, int8_t scan_idx, int8_t block_type, int8_t transform_skip);
int32_t height, color_t color, int8_t scan_idx, int8_t block_type, int8_t transform_skip);
int kvz_quantize_residual_generic(encoder_state_t *const state,
const cu_info_t *const cur_cu, const int width, const color_t color,
@ -46,4 +46,21 @@ int kvz_quantize_residual_generic(encoder_state_t *const state,
kvz_pixel *rec_out, coeff_t *coeff_out,
bool early_skip, int lmcs_chroma_adj);
int kvz_quant_cbcr_residual_generic(
encoder_state_t* const state,
const cu_info_t* const cur_cu,
const int width,
const coeff_scan_order_t scan_order,
const int in_stride, const int out_stride,
const kvz_pixel* const u_ref_in,
const kvz_pixel* const v_ref_in,
const kvz_pixel* const u_pred_in,
const kvz_pixel* const v_pred_in,
kvz_pixel* u_rec_out,
kvz_pixel* v_rec_out,
coeff_t* coeff_out,
bool early_skip,
int lmcs_chroma_adj
);
#endif //STRATEGIES_QUANT_GENERIC_H_

View file

@ -27,6 +27,7 @@
// Define function pointers.
quant_func *kvz_quant;
quant_cbcr_func *kvz_quant_cbcr_residual;
quant_residual_func *kvz_quantize_residual;
dequant_func *kvz_dequant;
coeff_abs_sum_func *kvz_coeff_abs_sum;

View file

@ -34,7 +34,22 @@
// Declare function pointers.
typedef unsigned (quant_func)(const encoder_state_t * const state, coeff_t *coef, coeff_t *q_coef, int32_t width,
int32_t height, int8_t type, int8_t scan_idx, int8_t block_type, int8_t transform_skip);
int32_t height, color_t color, int8_t scan_idx, int8_t block_type, int8_t transform_skip);
typedef unsigned (quant_cbcr_func)(
encoder_state_t* const state,
const cu_info_t* const cur_cu,
const int width,
const coeff_scan_order_t scan_order,
const int in_stride, const int out_stride,
const kvz_pixel* const u_ref_in,
const kvz_pixel* const v_ref_in,
const kvz_pixel* const u_pred_in,
const kvz_pixel* const v_pred_in,
kvz_pixel* u_rec_out,
kvz_pixel* v_rec_out,
coeff_t* coeff_out,
bool early_skip,
int lmcs_chroma_adj);
typedef unsigned (quant_residual_func)(encoder_state_t *const state,
const cu_info_t *const cur_cu, const int width, const color_t color,
const coeff_scan_order_t scan_order, const int use_trskip,
@ -43,13 +58,14 @@ typedef unsigned (quant_residual_func)(encoder_state_t *const state,
kvz_pixel *rec_out, coeff_t *coeff_out,
bool early_skip, int lmcs_chroma_adj);
typedef unsigned (dequant_func)(const encoder_state_t * const state, coeff_t *q_coef, coeff_t *coef, int32_t width,
int32_t height, int8_t type, int8_t block_type, int8_t transform_skip);
int32_t height, color_t color, int8_t block_type, int8_t transform_skip);
typedef uint32_t (fast_coeff_cost_func)(const coeff_t *coeff, int32_t width, uint64_t weights);
typedef uint32_t (coeff_abs_sum_func)(const coeff_t *coeffs, size_t length);
// Declare function pointers.
extern quant_func * kvz_quant;
extern quant_cbcr_func* kvz_quant_cbcr_residual;
extern quant_residual_func * kvz_quantize_residual;
extern dequant_func *kvz_dequant;
extern coeff_abs_sum_func *kvz_coeff_abs_sum;
@ -60,6 +76,7 @@ int kvz_strategy_register_quant(void* opaque, uint8_t bitdepth);
#define STRATEGIES_QUANT_EXPORTS \
{"quant", (void**) &kvz_quant}, \
{"quant_cbcr_residual", (void**) &kvz_quant_cbcr_residual}, \
{"quantize_residual", (void**) &kvz_quantize_residual}, \
{"dequant", (void**) &kvz_dequant}, \
{"coeff_abs_sum", (void**) &kvz_coeff_abs_sum}, \

View file

@ -127,10 +127,10 @@ static void rdpcm(const int width,
* \brief Get scaled QP used in quantization
*
*/
int32_t kvz_get_scaled_qp(int8_t type, int8_t qp, int8_t qp_offset, int8_t const * const chroma_scale)
int32_t kvz_get_scaled_qp(color_t color, int8_t qp, int8_t qp_offset, int8_t const * const chroma_scale)
{
int32_t qp_scaled = 0;
if(type == 0) {
if(color == 0) {
qp_scaled = qp + qp_offset;
} else {
qp_scaled = CLIP(-qp_offset, 57, qp);
@ -306,13 +306,13 @@ static void quantize_tr_residual(encoder_state_t * const state,
{
const kvz_config *cfg = &state->encoder_control->cfg;
const int32_t shift = color == COLOR_Y ? 0 : 1;
const vector2d_t lcu_px = { SUB_SCU(x) >> shift, SUB_SCU(y) >> shift };
const vector2d_t lcu_px = { SUB_SCU(x) >> shift, SUB_SCU(y) >> shift};
// If luma is 4x4, do chroma for the 8x8 luma area when handling the top
// left PU because the coordinates are correct.
bool handled_elsewhere = color != COLOR_Y &&
depth > MAX_DEPTH &&
(lcu_px.x % 4 != 0 || lcu_px.y % 4 != 0);
depth == MAX_DEPTH &&
(x % 4 != 0 || y % 4 != 0);
if (handled_elsewhere) {
return;
}
@ -367,7 +367,7 @@ static void quantize_tr_residual(encoder_state_t * const state,
cfg->trskip_enable &&
cur_pu->tr_idx == 1;
bool has_coeffs;
uint8_t has_coeffs;
int lmcs_chroma_adj = 0;
@ -411,6 +411,25 @@ static void quantize_tr_residual(encoder_state_t * const state,
lmcs_chroma_adj);
cur_pu->tr_skip = tr_skip;
} else {
if(color == COLOR_UV) {
has_coeffs = kvz_quant_cbcr_residual(
state,
cur_pu,
tr_width,
scan_idx,
lcu_width,
lcu_width,
&lcu->ref.u[offset], &lcu->ref.v[offset],
&lcu->rec.joint_u[offset], &lcu->rec.joint_v[offset],
&lcu->rec.joint_u[offset], &lcu->rec.joint_v[offset],
&lcu->coeff.joint_uv[z_index],
early_skip,
lmcs_chroma_adj
);
cur_pu->joint_cb_cr = has_coeffs;
return;
}
has_coeffs = kvz_quantize_residual(state,
cur_pu,
tr_width,
@ -425,6 +444,7 @@ static void quantize_tr_residual(encoder_state_t * const state,
coeff,
early_skip,
lmcs_chroma_adj);
}
cbf_clear(&cur_pu->cbf, depth, color);
@ -519,6 +539,9 @@ void kvz_quantize_lcu_residual(encoder_state_t * const state,
if (chroma) {
quantize_tr_residual(state, COLOR_U, x, y, depth, cur_pu, lcu, early_skip);
quantize_tr_residual(state, COLOR_V, x, y, depth, cur_pu, lcu, early_skip);
if(state->encoder_control->cfg.jccr && cur_pu->tr_depth == cur_pu->depth){
quantize_tr_residual(state, COLOR_UV, x, y, depth, cur_pu, lcu, early_skip);
}
}
}
}

View file

@ -53,7 +53,7 @@ void kvz_itransform2d(const encoder_control_t * const encoder,
const cu_info_t *tu);
int32_t kvz_get_scaled_qp(int8_t type, int8_t qp, int8_t qp_offset, int8_t const* const chroma_scale);
int32_t kvz_get_scaled_qp(color_t color, int8_t qp, int8_t qp_offset, int8_t const* const chroma_scale);
void kvz_quantize_lcu_residual(encoder_state_t *state,
bool luma,

View file

@ -12,4 +12,6 @@ valgrind_test $common_args --rd=2 --no-transform-skip --qp 37
valgrind_test $common_args --rd=2 --no-transform-skip --qp 37 --signhide --rdoq
valgrind_test $common_args --alf=full --no-wpp --threads=0 --owf=0
valgrind_test $common_args --alf=full --wpp --threads=1
valgrind_test $common_args --jccr
valgrind_test $common_args --jccr --rdoq --rd=2 --mts=intra