diff --git a/configure.ac b/configure.ac
index 4447e5bf..dbbb1b72 100644
--- a/configure.ac
+++ b/configure.ac
@@ -23,7 +23,7 @@ AC_CONFIG_SRCDIR([src/encmain.c])
 #
 # Here is a somewhat sane guide to lib versioning: http://apr.apache.org/versioning.html
 ver_major=6
-ver_minor=5
+ver_minor=6
 ver_release=0
 
 # Prevents configure from adding a lot of defines to the CFLAGS
diff --git a/src/cabac.h b/src/cabac.h
index f86a633a..9b946ab1 100644
--- a/src/cabac.h
+++ b/src/cabac.h
@@ -95,7 +95,7 @@ typedef struct
     cabac_ctx_t luma_planar_model[2];
     cabac_ctx_t multi_ref_line[2];
     cabac_ctx_t bdpcm_mode[4];
-    cabac_ctx_t joint_bc_br[3];
+    cabac_ctx_t joint_cb_cr[3];
     cabac_ctx_t transform_skip_model_luma;
     cabac_ctx_t transform_skip_model_chroma;
     cabac_ctx_t transform_skip_sig_coeff[3];
diff --git a/src/cfg.c b/src/cfg.c
index 2d06a718..37cf6b04 100644
--- a/src/cfg.c
+++ b/src/cfg.c
@@ -180,10 +180,7 @@ int kvz_config_init(kvz_config *cfg)
   cfg->fastrd_sampling_on = 0;
   cfg->fastrd_accuracy_check_on = 0;
   cfg->fastrd_learning_outdir_fn = NULL;
-
-  int8_t in[] = { 17, 27, 32, 44 };
-  int8_t out[] = { 17, 29, 34, 41 };
-
+  
   cfg->chroma_scale_out[0][0] = cfg->chroma_scale_in[0][0] = 17;
   cfg->chroma_scale_out[0][1] = cfg->chroma_scale_in[0][1] = 27;
   cfg->chroma_scale_out[0][2] = cfg->chroma_scale_in[0][2] = 32;
@@ -195,6 +192,8 @@ int kvz_config_init(kvz_config *cfg)
 
   parse_qp_map(cfg, 0);
 
+  cfg->jccr = 0;
+
   return 1;
 }
 
@@ -1466,6 +1465,9 @@ int kvz_config_parse(kvz_config *cfg, const char *name, const char *value)
     parse_qp_map(cfg, 0);
     return success;
   }
+  else if OPT("jccr") {
+    cfg->jccr = (bool)atobool(value);
+  }
   else {
     return 0;
   }
diff --git a/src/cli.c b/src/cli.c
index 2ec02f8b..24cf22c4 100644
--- a/src/cli.c
+++ b/src/cli.c
@@ -162,6 +162,8 @@ static const struct option long_options[] = {
   { "fastrd-outdir",      required_argument, NULL, 0 },
   { "chroma-qp-in",       required_argument, NULL, 0 },
   { "chroma-qp-out",      required_argument, NULL, 0 },
+  { "jccr",                     no_argument, NULL, 0 },
+  { "no-jccr",                  no_argument, NULL, 0 },
   {0, 0, 0, 0}
 };
 
@@ -613,6 +615,8 @@ void print_help(void)
     "                                   - both: MTS applied for both intra and inter blocks.\n"
     "                                   - implicit: uses implicit MTS. Applies DST7 instead \n"
     "                                               of DCT2 to certain intra blocks.\n"
+    "      --(no-)jccr            : Joint coding of chroma residual.\n"
+    "                               Requires rdo >= 2. [disabled]\n"
     "\n"
     /* Word wrap to this width to stay under 80 characters (including ") *************/
     "Parallel processing:\n"
diff --git a/src/context.c b/src/context.c
index dd7853a2..ace5f46c 100644
--- a/src/context.c
+++ b/src/context.c
@@ -455,7 +455,7 @@ void kvz_init_contexts(encoder_state_t *state, int8_t QP, int8_t slice)
 
   for (i = 0; i < 3; i++) {
     kvz_ctx_init(&cabac->ctx.cu_skip_flag_model[i], QP, INIT_SKIP_FLAG[slice][i], INIT_SKIP_FLAG[3][i]);
-    kvz_ctx_init(&cabac->ctx.joint_bc_br[i], QP, INIT_JOINT_CB_CR_FLAG[slice][i], INIT_JOINT_CB_CR_FLAG[3][i]);   
+    kvz_ctx_init(&cabac->ctx.joint_cb_cr[i], QP, INIT_JOINT_CB_CR_FLAG[slice][i], INIT_JOINT_CB_CR_FLAG[3][i]);   
     kvz_ctx_init(&cabac->ctx.transform_skip_sig_coeff[i], QP, INIT_TRANSFORM_SKIP_SIG_COEFF[slice][i], INIT_TRANSFORM_SKIP_SIG_COEFF[3][i]);
     kvz_ctx_init(&cabac->ctx.transform_skip_sig[i], QP, INIT_TRANSFORM_SKIP_SIG[slice][i], INIT_TRANSFORM_SKIP_SIG[3][i]);
   }
diff --git a/src/cu.h b/src/cu.h
index 093840dc..c0f395db 100644
--- a/src/cu.h
+++ b/src/cu.h
@@ -127,15 +127,16 @@ typedef struct {
  */
 typedef struct
 {
-  uint8_t type      : 2; //!< \brief block type, one of cu_type_t values
-  uint8_t depth     : 3; //!< \brief depth / size of this block
-  uint8_t part_size : 3; //!< \brief partition mode, one of part_mode_t values
-  uint8_t tr_depth  : 3; //!< \brief transform depth
-  uint8_t skipped   : 1; //!< \brief flag to indicate this block is skipped
-  uint8_t merged    : 1; //!< \brief flag to indicate this block is merged
-  uint8_t merge_idx : 3; //!< \brief merge index
-  uint8_t tr_skip   : 1; //!< \brief transform skip flag
-  uint8_t tr_idx : 3; //!< \brief transform index
+  uint8_t type        : 2; //!< \brief block type, one of cu_type_t values
+  uint8_t depth       : 3; //!< \brief depth / size of this block
+  uint8_t part_size   : 3; //!< \brief partition mode, one of part_mode_t values
+  uint8_t tr_depth    : 3; //!< \brief transform depth
+  uint8_t skipped     : 1; //!< \brief flag to indicate this block is skipped
+  uint8_t merged      : 1; //!< \brief flag to indicate this block is merged
+  uint8_t merge_idx   : 3; //!< \brief merge index
+  uint8_t tr_skip     : 1; //!< \brief transform skip flag
+  uint8_t tr_idx      : 3; //!< \brief transform index
+  uint8_t joint_cb_cr : 2; //!< \brief joint Cb-Cr residual coding: 0 = off, else bit 0 = U (Cb) cbf, bit 1 = V (Cr) cbf
 
   uint16_t cbf;
 
@@ -299,6 +300,7 @@ typedef ALIGNED(8) struct {
   coeff_t y[LCU_LUMA_SIZE];
   coeff_t u[LCU_CHROMA_SIZE];
   coeff_t v[LCU_CHROMA_SIZE];
+  coeff_t joint_uv[LCU_CHROMA_SIZE];
 } lcu_coeff_t;
 
 
diff --git a/src/encode_coding_tree.c b/src/encode_coding_tree.c
index 799941e6..a25304db 100644
--- a/src/encode_coding_tree.c
+++ b/src/encode_coding_tree.c
@@ -328,31 +328,41 @@ void kvz_encode_last_significant_xy(cabac_data_t * const cabac,
   }
 }
 
-static void encode_chroma_tu(encoder_state_t* const state, int x, int y, int depth, const uint8_t width_c, const cu_info_t* cur_pu, int8_t* scan_idx, lcu_coeff_t* coeff) {
+static void encode_chroma_tu(encoder_state_t* const state, int x, int y, int depth, const uint8_t width_c, const cu_info_t* cur_pu, int8_t* scan_idx, lcu_coeff_t* coeff, uint8_t joint_chroma) {
   int x_local = (x >> 1) % LCU_WIDTH_C;
   int y_local = (y >> 1) % LCU_WIDTH_C;
   cabac_data_t* const cabac = &state->cabac;
   *scan_idx = kvz_get_scan_order(cur_pu->type, cur_pu->intra.mode_chroma, depth);
+  if(!joint_chroma){
+    const coeff_t *coeff_u = &coeff->u[xy_to_zorder(LCU_WIDTH_C, x_local, y_local)];
+    const coeff_t *coeff_v = &coeff->v[xy_to_zorder(LCU_WIDTH_C, x_local, y_local)];
 
-  const coeff_t *coeff_u = &coeff->u[xy_to_zorder(LCU_WIDTH_C, x_local, y_local)];
-  const coeff_t *coeff_v = &coeff->v[xy_to_zorder(LCU_WIDTH_C, x_local, y_local)];
-
-  if (cbf_is_set(cur_pu->cbf, depth, COLOR_U)) {
-    if(state->encoder_control->cfg.trskip_enable && width_c == 4){
-      cabac->cur_ctx = &cabac->ctx.transform_skip_model_chroma;
-      // HEVC only supports transform_skip for Luma
-      // TODO: transform skip for chroma blocks
-      CABAC_BIN(cabac, 0, "transform_skip_flag");
+    if (cbf_is_set(cur_pu->cbf, depth, COLOR_U)) {
+      if(state->encoder_control->cfg.trskip_enable && width_c == 4){
+        cabac->cur_ctx = &cabac->ctx.transform_skip_model_chroma;
+        // HEVC only supports transform_skip for Luma
+        // TODO: transform skip for chroma blocks
+        CABAC_BIN(cabac, 0, "transform_skip_flag");
+      }
+      kvz_encode_coeff_nxn(state, &state->cabac, coeff_u, width_c, 1, *scan_idx, NULL, false);
     }
-    kvz_encode_coeff_nxn(state, &state->cabac, coeff_u, width_c, 1, *scan_idx, NULL, false);
-  }
 
-  if (cbf_is_set(cur_pu->cbf, depth, COLOR_V)) {
+    if (cbf_is_set(cur_pu->cbf, depth, COLOR_V)) {
+      if (state->encoder_control->cfg.trskip_enable && width_c == 4) {
+        cabac->cur_ctx = &cabac->ctx.transform_skip_model_chroma;
+        CABAC_BIN(cabac, 0, "transform_skip_flag");
+      }
+      kvz_encode_coeff_nxn(state, &state->cabac, coeff_v, width_c, 2, *scan_idx, NULL, false);
+    }
+  }
+  else {
+    const coeff_t *coeff_uv = &coeff->joint_uv[xy_to_zorder(LCU_WIDTH_C, x_local, y_local)];
     if (state->encoder_control->cfg.trskip_enable && width_c == 4) {
       cabac->cur_ctx = &cabac->ctx.transform_skip_model_chroma;
       CABAC_BIN(cabac, 0, "transform_skip_flag");
     }
-    kvz_encode_coeff_nxn(state, &state->cabac, coeff_v, width_c, 2, *scan_idx, NULL, false);
+    kvz_encode_coeff_nxn(state, &state->cabac, coeff_uv, width_c, 2, *scan_idx, NULL, false);
+    
   }
 }
 
@@ -370,16 +380,6 @@ static void encode_transform_unit(encoder_state_t * const state,
 
   int8_t scan_idx = kvz_get_scan_order(cur_pu->type, cur_pu->intra.mode, depth);
 
-  if (state->encoder_control->chroma_format != KVZ_CSP_400) {
-    // joint_cb_cr
-    /*
-    if (type == 2 && cbf_mask) {
-      cabac->cur_ctx = &(cabac->ctx.joint_bc_br[0]);
-      CABAC_BIN(cabac, 0, "joint_cb_cr");
-    }
-    */
-  }
-
   int cbf_y = cbf_is_set(cur_pu->cbf, depth, COLOR_Y);
 
   if (cbf_y && !only_chroma) {
@@ -410,6 +410,7 @@ static void encode_transform_unit(encoder_state_t * const state,
     }
   }
 
+  bool joint_chroma = cur_pu->joint_cb_cr != 0;
   if (depth == MAX_DEPTH) {
     // For size 4x4 luma transform the corresponding chroma transforms are
     // also of size 4x4 covering 8x8 luma pixels. The residual is coded in
@@ -428,8 +429,8 @@ static void encode_transform_unit(encoder_state_t * const state,
 
   bool chroma_cbf_set = cbf_is_set(cur_pu->cbf, depth, COLOR_U) ||
                         cbf_is_set(cur_pu->cbf, depth, COLOR_V);
-  if (chroma_cbf_set) {
-    encode_chroma_tu(state, x, y, depth, width_c, cur_pu, &scan_idx, coeff);
+  if (chroma_cbf_set || joint_chroma) {
+    encode_chroma_tu(state, x, y, depth, width_c, cur_pu, &scan_idx, coeff, joint_chroma);
   }
 }
 
@@ -483,8 +484,8 @@ static void encode_transform_coeff(encoder_state_t * const state,
  
 
   const int cb_flag_y = cbf_is_set(cur_pu->cbf, depth, COLOR_Y);
-  const int cb_flag_u = cbf_is_set(cur_cu->cbf, depth, COLOR_U);
-  const int cb_flag_v = cbf_is_set(cur_cu->cbf, depth, COLOR_V);
+  const int cb_flag_u = cur_pu->joint_cb_cr ? cur_pu->joint_cb_cr & 1 : cbf_is_set(cur_cu->cbf, depth, COLOR_U);
+  const int cb_flag_v = cur_pu->joint_cb_cr ? ((cur_pu->joint_cb_cr & 2) >> 1) : cbf_is_set(cur_cu->cbf, depth, COLOR_V);
 
   // The split_transform_flag is not signaled when:
   // - transform size is greater than 32 (depth == 0)
@@ -519,7 +520,7 @@ static void encode_transform_coeff(encoder_state_t * const state,
       }
       if (true) {
         cabac->cur_ctx = &(cabac->ctx.qt_cbf_model_cr[cb_flag_u ? 1 : 0]);
-        CABAC_BIN(cabac, cb_flag_v, "cbf_cr");
+        CABAC_BIN(cabac,  cb_flag_v, "cbf_cr");
       }
     }
   }
@@ -570,7 +571,10 @@ static void encode_transform_coeff(encoder_state_t * const state,
 
       state->must_code_qp_delta = false;
     }
-
+    if((cb_flag_u || cb_flag_v ) && (depth != 4 || only_chroma) && state->encoder_control->cfg.jccr) {
+      cabac->cur_ctx = &cabac->ctx.joint_cb_cr[cb_flag_u * 2 + cb_flag_v - 1];
+      CABAC_BIN(cabac, cur_pu->joint_cb_cr != 0, "tu_joint_cbcr_residual_flag");
+    }
     encode_transform_unit(state, x, y, depth, only_chroma, coeff);
   }
 }
diff --git a/src/encoder_state-bitstream.c b/src/encoder_state-bitstream.c
index 2552cc58..641f48b9 100644
--- a/src/encoder_state-bitstream.c
+++ b/src/encoder_state-bitstream.c
@@ -614,7 +614,7 @@ static void encoder_state_write_bitstream_seq_parameter_set(bitstream_t* stream,
 
 
   if (encoder->chroma_format != KVZ_CSP_400) {
-    WRITE_U(stream, 0, 1, "sps_joint_cbcr_enabled_flag");
+    WRITE_U(stream, encoder->cfg.jccr, 1, "sps_joint_cbcr_enabled_flag");
     WRITE_U(stream, 1, 1, "same_qp_table_for_chroma");
 
     for (int i = 0; i < encoder->cfg.num_used_table; i++) {
@@ -1265,6 +1265,11 @@ void kvz_encoder_state_write_bitstream_slice_header(
     WRITE_UE(stream, state->frame->slicetype, "sh_slice_type");
   }
 
+  // Only present when sps_joint_cbcr_enabled_flag was sent as 1; the SPS never sends it for 4:0:0.
+  if (encoder->chroma_format != KVZ_CSP_400 && encoder->cfg.jccr) {
+    WRITE_U(stream, 0, 1, "ph_joint_cbcr_sign_flag");
+  }
+
   if (state->frame->pictype == KVZ_NAL_CRA_NUT || state->frame->pictype == KVZ_NAL_IDR_N_LP || state->frame->pictype == KVZ_NAL_IDR_W_RADL || state->frame->pictype == KVZ_NAL_GDR_NUT)
   {
     WRITE_U(stream, 0, 1, "sh_no_output_of_prior_pics_flag");
@@ -1322,7 +1327,6 @@ void kvz_encoder_state_write_bitstream_slice_header(
   int slice_qp_delta = state->frame->QP - encoder->cfg.qp;
   WRITE_SE(stream, slice_qp_delta, "sh_qp_delta");
 
-
   if (encoder->cfg.sao_type) {
     WRITE_U(stream, 1, 1, "sh_sao_luma_flag");
     if (encoder->chroma_format != KVZ_CSP_400) {
diff --git a/src/global.h b/src/global.h
index 97954fad..b3d24048 100644
--- a/src/global.h
+++ b/src/global.h
@@ -65,6 +65,7 @@
 #define RESHAPE_SIGNAL_HLG 2
 #define RESHAPE_SIGNAL_NULL 100
 
+
 /**
  * \defgroup Bitstream
  * HEVC bitstream coding
@@ -327,7 +328,7 @@ typedef int16_t coeff_t;
 #define MAX_TR_DYNAMIC_RANGE 15
 
 //Constants
-typedef enum { COLOR_Y = 0, COLOR_U, COLOR_V } color_t;
+typedef enum { COLOR_Y = 0, COLOR_U, COLOR_V, COLOR_UV } color_t;
 
 
 // Hardware data (abstraction of defines). Extend for other compilers
diff --git a/src/image.c b/src/image.c
index c0a9eb0f..44d1ee45 100644
--- a/src/image.c
+++ b/src/image.c
@@ -220,6 +220,8 @@ hi_prec_buf_t * kvz_hi_prec_buf_t_alloc(int luma_size)
   yuv->y = (int16_t *)malloc(luma_size * sizeof(*yuv->y));
   yuv->u = (int16_t *)malloc(luma_size / 2 * sizeof(*yuv->u));
   yuv->v = (int16_t *)malloc(luma_size / 2 * sizeof(*yuv->v));
+  yuv->joint_u = (int16_t *)malloc(luma_size / 2 * sizeof(*yuv->joint_u)); // same element count as u
+  yuv->joint_v = (int16_t *)malloc(luma_size / 2 * sizeof(*yuv->joint_v)); // same element count as v
   yuv->size = luma_size;
 
   return yuv;
@@ -230,6 +232,8 @@ void kvz_hi_prec_buf_t_free(hi_prec_buf_t * yuv)
   free(yuv->y);
   free(yuv->u);
   free(yuv->v);
+  free(yuv->joint_v);
+  free(yuv->joint_u);
   free(yuv);
 }
 
diff --git a/src/image.h b/src/image.h
index 950066bf..11e6452f 100644
--- a/src/image.h
+++ b/src/image.h
@@ -36,6 +36,8 @@ typedef struct {
   kvz_pixel y[LCU_LUMA_SIZE];
   kvz_pixel u[LCU_CHROMA_SIZE];
   kvz_pixel v[LCU_CHROMA_SIZE];
+  kvz_pixel joint_u[LCU_CHROMA_SIZE];
+  kvz_pixel joint_v[LCU_CHROMA_SIZE];
   enum kvz_chroma_format chroma_format;
 } lcu_yuv_t;
 
@@ -44,6 +46,8 @@ typedef struct {
   int16_t *y;
   int16_t *u;
   int16_t *v;
+  int16_t *joint_u;
+  int16_t *joint_v;
 } hi_prec_buf_t;
 
 typedef struct {
diff --git a/src/intra.c b/src/intra.c
index 65113441..d5924c01 100644
--- a/src/intra.c
+++ b/src/intra.c
@@ -603,19 +603,25 @@ static void intra_recon_tb_leaf(
 
   const int index = lcu_px.x + lcu_px.y * lcu_width;
   kvz_pixel *block = NULL;
+  kvz_pixel *block2 = NULL;
   switch (color) {
     case COLOR_Y:
       block = &lcu->rec.y[index];
       break;
     case COLOR_U:
       block = &lcu->rec.u[index];
+      block2 = &lcu->rec.joint_u[index];
       break;
     case COLOR_V:
       block = &lcu->rec.v[index];
+      block2 = &lcu->rec.joint_v[index];
       break;
   }
 
   kvz_pixels_blit(pred, block , width, width, width, lcu_width);
+  if(color != COLOR_Y && cfg->jccr) {
+    kvz_pixels_blit(pred, block2, width, width, width, lcu_width);
+  }
 }
 
 /**
@@ -683,7 +689,7 @@ void kvz_intra_recon_cu(
     }
   } else {
     const bool has_luma = mode_luma != -1;
-    const bool has_chroma = mode_chroma != -1 && x % 8 == 0 && y % 8 == 0;
+    const bool has_chroma = mode_chroma != -1 &&  (x % 8 == 0 && y % 8 == 0);
     // Process a leaf TU.
     if (has_luma) {
       intra_recon_tb_leaf(state, x, y, depth, mode_luma, lcu, COLOR_Y);
diff --git a/src/kvazaar.h b/src/kvazaar.h
index 7b9abc9b..f5d62baa 100644
--- a/src/kvazaar.h
+++ b/src/kvazaar.h
@@ -29,7 +29,6 @@
 #include <stdint.h>
 #include <stdio.h>
 
-
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -490,6 +489,8 @@ typedef struct kvz_config
 
   int8_t chroma_scale_in[3][17];
   int8_t chroma_scale_out[3][17];
+
+  int8_t jccr;
 } kvz_config;
 
 /**
diff --git a/src/search.c b/src/search.c
index c874f6c2..b207b3e5 100644
--- a/src/search.c
+++ b/src/search.c
@@ -80,7 +80,7 @@ static INLINE void copy_cu_pixels(int x_local, int y_local, int width, lcu_t *fr
   }
 }
 
-static INLINE void copy_cu_coeffs(int x_local, int y_local, int width, lcu_t *from, lcu_t *to)
+static INLINE void copy_cu_coeffs(int x_local, int y_local, int width, lcu_t *from, lcu_t *to, bool joint)
 {
   const int luma_z = xy_to_zorder(LCU_WIDTH, x_local, y_local);
   copy_coeffs(&from->coeff.y[luma_z], &to->coeff.y[luma_z], width);
@@ -89,18 +89,22 @@ static INLINE void copy_cu_coeffs(int x_local, int y_local, int width, lcu_t *fr
     const int chroma_z = xy_to_zorder(LCU_WIDTH_C, x_local >> 1, y_local >> 1);
     copy_coeffs(&from->coeff.u[chroma_z], &to->coeff.u[chroma_z], width >> 1);
     copy_coeffs(&from->coeff.v[chroma_z], &to->coeff.v[chroma_z], width >> 1);
+    if (joint) {
+      copy_coeffs(&from->coeff.joint_uv[chroma_z], &to->coeff.joint_uv[chroma_z], width >> 1);
+    }
   }
 }
 
 /**
  * Copy all non-reference CU data from next level to current level.
  */
-static void work_tree_copy_up(int x_local, int y_local, int depth, lcu_t *work_tree)
+static void work_tree_copy_up(int x_local, int y_local, int depth, lcu_t *work_tree, bool joint)
 {
   const int width = LCU_WIDTH >> depth;
   copy_cu_info  (x_local, y_local, width, &work_tree[depth + 1], &work_tree[depth]);
   copy_cu_pixels(x_local, y_local, width, &work_tree[depth + 1], &work_tree[depth]);
-  copy_cu_coeffs(x_local, y_local, width, &work_tree[depth + 1], &work_tree[depth]);
+  copy_cu_coeffs(x_local, y_local, width, &work_tree[depth + 1], &work_tree[depth], joint);
+  
 }
 
 
@@ -298,7 +302,7 @@ double kvz_cu_rd_cost_luma(const encoder_state_t *const state,
 
 double kvz_cu_rd_cost_chroma(const encoder_state_t *const state,
                          const int x_px, const int y_px, const int depth,
-                         const cu_info_t *const pred_cu,
+                         cu_info_t * pred_cu,
                          lcu_t *const lcu)
 {
   const vector2d_t lcu_px = { (x_px & ~7) / 2, (y_px & ~7) / 2 };
@@ -306,7 +310,9 @@ double kvz_cu_rd_cost_chroma(const encoder_state_t *const state,
   cu_info_t *const tr_cu = LCU_GET_CU_AT_PX(lcu, x_px, y_px);
 
   double tr_tree_bits = 0;
+  double joint_cbcr_tr_tree_bits = 0;
   double coeff_bits = 0;
+  double joint_coeff_bits = 0;
 
   assert(x_px >= 0 && x_px < LCU_WIDTH);
   assert(y_px >= 0 && y_px < LCU_WIDTH);
@@ -323,13 +329,21 @@ double kvz_cu_rd_cost_chroma(const encoder_state_t *const state,
     if (tr_depth == 0 || cbf_is_set(pred_cu->cbf, depth - 1, COLOR_U)) {
       tr_tree_bits += CTX_ENTROPY_FBITS(ctx, cbf_is_set(pred_cu->cbf, depth, COLOR_U));
     }
+    if(state->encoder_control->cfg.jccr) {
+      joint_cbcr_tr_tree_bits += CTX_ENTROPY_FBITS(ctx, pred_cu->joint_cb_cr & 1);
+    }
     int is_set = cbf_is_set(pred_cu->cbf, depth, COLOR_U);
     ctx = &(state->cabac.ctx.qt_cbf_model_cr[is_set]);
     if (tr_depth == 0 || cbf_is_set(pred_cu->cbf, depth - 1, COLOR_V)) {
       tr_tree_bits += CTX_ENTROPY_FBITS(ctx, cbf_is_set(pred_cu->cbf, depth, COLOR_V));
     }
+    if(state->encoder_control->cfg.jccr) {
+      ctx = &(state->cabac.ctx.qt_cbf_model_cr[pred_cu->joint_cb_cr & 1]);
+      joint_cbcr_tr_tree_bits += CTX_ENTROPY_FBITS(ctx, (pred_cu->joint_cb_cr & 2) >> 1);
+    }
   }
 
+
   if (tr_cu->tr_depth > depth) {
     int offset = LCU_WIDTH >> (depth + 1);
     int sum = 0;
@@ -342,8 +356,22 @@ double kvz_cu_rd_cost_chroma(const encoder_state_t *const state,
     return sum + tr_tree_bits * state->lambda;
   }
 
+  if (state->encoder_control->cfg.jccr) {
+    int cbf_mask = cbf_is_set(pred_cu->cbf, depth, COLOR_U) * 2 + cbf_is_set(pred_cu->cbf, depth, COLOR_V) - 1;
+    const cabac_ctx_t* ctx = NULL;
+    if (cbf_mask != -1) {
+      ctx = &(state->cabac.ctx.joint_cb_cr[cbf_mask]);
+      tr_tree_bits += CTX_ENTROPY_FBITS(ctx, 0);      
+    }
+    if(pred_cu->joint_cb_cr) {
+      ctx = &(state->cabac.ctx.joint_cb_cr[(pred_cu->joint_cb_cr & 1) * 2 + ((pred_cu->joint_cb_cr & 2) >> 1) - 1]);
+      joint_cbcr_tr_tree_bits += CTX_ENTROPY_FBITS(ctx, 1);
+    }
+  }
+
   // Chroma SSD
   int ssd = 0;
+  int joint_ssd = 0;
   if (!state->encoder_control->cfg.lossless) {
     int index = lcu_px.y * LCU_WIDTH_C + lcu_px.x;
     int ssd_u = kvz_pixels_calc_ssd(&lcu->ref.u[index], &lcu->rec.u[index],
@@ -353,6 +381,16 @@ double kvz_cu_rd_cost_chroma(const encoder_state_t *const state,
                                     LCU_WIDTH_C,        LCU_WIDTH_C,
                                     width);
     ssd = ssd_u + ssd_v;
+
+    if(state->encoder_control->cfg.jccr) {
+      int ssd_u_joint = kvz_pixels_calc_ssd(&lcu->ref.u[index], &lcu->rec.joint_u[index],
+        LCU_WIDTH_C, LCU_WIDTH_C,
+        width);
+      int ssd_v_joint = kvz_pixels_calc_ssd(&lcu->ref.v[index], &lcu->rec.joint_v[index],
+        LCU_WIDTH_C, LCU_WIDTH_C,
+        width);
+      joint_ssd = ssd_u_joint + ssd_v_joint;
+    }
   }
 
   {
@@ -361,10 +399,35 @@ double kvz_cu_rd_cost_chroma(const encoder_state_t *const state,
 
     coeff_bits += kvz_get_coeff_cost(state, &lcu->coeff.u[index], width, 2, scan_order, 0);
     coeff_bits += kvz_get_coeff_cost(state, &lcu->coeff.v[index], width, 2, scan_order, 0);
+
+    if(state->encoder_control->cfg.jccr) {
+      joint_coeff_bits += kvz_get_coeff_cost(state, &lcu->coeff.joint_uv[index], width, 2, scan_order, 0);
+    }
   }
 
+
   double bits = tr_tree_bits + coeff_bits;
-  return (double)ssd + bits * state->c_lambda;
+  double joint_bits = joint_cbcr_tr_tree_bits + joint_coeff_bits;
+
+  double cost = (double)ssd + bits * state->c_lambda;
+  double joint_cost = (double)joint_ssd + joint_bits * state->c_lambda;
+  if ((cost < joint_cost || !pred_cu->joint_cb_cr) || !state->encoder_control->cfg.jccr) {
+    pred_cu->joint_cb_cr = 0;
+    return cost;    
+  }
+  cbf_clear(&pred_cu->cbf, depth, COLOR_U);
+  cbf_clear(&pred_cu->cbf, depth, COLOR_V);
+  if (pred_cu->joint_cb_cr & 1) {
+    cbf_set(&pred_cu->cbf, depth, COLOR_U);
+  }
+  if (pred_cu->joint_cb_cr & 2) {
+    cbf_set(&pred_cu->cbf, depth, COLOR_V);
+  }
+  int lcu_width = LCU_WIDTH_C;
+  const int index = lcu_px.x + lcu_px.y * lcu_width;
+  kvz_pixels_blit(&lcu->rec.joint_u[index], &lcu->rec.u[index], width, width, lcu_width, lcu_width);
+  kvz_pixels_blit(&lcu->rec.joint_v[index], &lcu->rec.v[index], width, width, lcu_width, lcu_width);
+  return joint_cost;
 }
 
 
@@ -518,6 +581,7 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,
   cur_cu->tr_idx = 0;
   cur_cu->violates_mts_coeff_constraint = 0;
   cur_cu->mts_last_scan_pos = 0;
+  cur_cu->joint_cb_cr = 0;
 
   // If the CU is completely inside the frame at this depth, search for
   // prediction modes at this depth.
@@ -814,7 +878,7 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,
     if (split_cost < cost) {
       // Copy split modes to this depth.
       cost = split_cost;
-      work_tree_copy_up(x_local, y_local, depth, work_tree);
+      work_tree_copy_up(x_local, y_local, depth, work_tree, state->encoder_control->cfg.jccr);
 #if KVZ_DEBUG
       //debug_split = 1;
 #endif
@@ -1027,4 +1091,7 @@ void kvz_search_lcu(encoder_state_t * const state, const int x, const int y, con
   copy_coeffs(work_tree[0].coeff.y, coeff->y, LCU_WIDTH);
   copy_coeffs(work_tree[0].coeff.u, coeff->u, LCU_WIDTH_C);
   copy_coeffs(work_tree[0].coeff.v, coeff->v, LCU_WIDTH_C);
+  if (state->encoder_control->cfg.jccr) {
+    copy_coeffs(work_tree[0].coeff.joint_uv, coeff->joint_uv, LCU_WIDTH_C);
+  }
 }
diff --git a/src/search.h b/src/search.h
index e1225099..a53cbf9c 100644
--- a/src/search.h
+++ b/src/search.h
@@ -43,7 +43,7 @@ double kvz_cu_rd_cost_luma(const encoder_state_t *const state,
                        lcu_t *const lcu);
 double kvz_cu_rd_cost_chroma(const encoder_state_t *const state,
                          const int x_px, const int y_px, const int depth,
-                         const cu_info_t *const pred_cu,
+                         cu_info_t * pred_cu,
                          lcu_t *const lcu);
 void kvz_lcu_fill_trdepth(lcu_t *lcu, int x_px, int y_px, int depth, int tr_depth);
 
diff --git a/src/search_intra.c b/src/search_intra.c
index 30623aa4..09b2b49b 100644
--- a/src/search_intra.c
+++ b/src/search_intra.c
@@ -319,7 +319,7 @@ static double search_intra_trdepth(encoder_state_t * const state,
       kvz_intra_recon_cu(state,
         x_px, y_px,
         depth,
-        intra_mode, chroma_mode,
+        intra_mode, -1,
         pred_cu, lcu);
 
       // TODO: Not sure if this should be 0 or 1 but at least seems to work with 1
@@ -334,15 +334,23 @@ static double search_intra_trdepth(encoder_state_t * const state,
       }
 
       double rd_cost = kvz_cu_rd_cost_luma(state, lcu_px.x, lcu_px.y, depth, pred_cu, lcu);
-      if (reconstruct_chroma) {
-        rd_cost += kvz_cu_rd_cost_chroma(state, lcu_px.x, lcu_px.y, depth, pred_cu, lcu);
-      }
+      // Chroma cost is deliberately left out of rd_cost here, so the
+      // best_rd_cost comparison below uses luma cost only. When reconstruct_chroma
+      // is set, chroma is reconstructed and its RD cost added to best_rd_cost later.
 
       if (rd_cost < best_rd_cost) {
         best_rd_cost = rd_cost;
         best_tr_idx = pred_cu->tr_idx;
       }
     }
+    if(reconstruct_chroma) {
+      kvz_intra_recon_cu(state,
+        x_px, y_px,
+        depth,
+        -1, chroma_mode,
+        pred_cu, lcu);
+      best_rd_cost += kvz_cu_rd_cost_chroma(state, lcu_px.x, lcu_px.y, depth, pred_cu, lcu);
+    }
     pred_cu->tr_skip = best_tr_idx == MTS_SKIP;
     pred_cu->tr_idx = best_tr_idx;
     nosplit_cost += best_rd_cost;
@@ -718,6 +726,7 @@ static int8_t search_intra_rdo(encoder_state_t * const state,
     pred_cu.part_size = ((depth == MAX_PU_DEPTH) ? SIZE_NxN : SIZE_2Nx2N);
     pred_cu.intra.mode = modes[rdo_mode];
     pred_cu.intra.mode_chroma = modes[rdo_mode];
+    pred_cu.joint_cb_cr = 0;
     FILL(pred_cu.cbf, 0);
 
     // Reset transform split data in lcu.cu for this area.
diff --git a/src/strategies/avx2/quant-avx2.c b/src/strategies/avx2/quant-avx2.c
index d731eef4..7a6e8e62 100644
--- a/src/strategies/avx2/quant-avx2.c
+++ b/src/strategies/avx2/quant-avx2.c
@@ -364,17 +364,17 @@ static INLINE unsigned kvz_math_floor_log2(unsigned value)
  *
  */
 void kvz_quant_avx2(const encoder_state_t * const state, const coeff_t * __restrict coef, coeff_t * __restrict q_coef, int32_t width,
-  int32_t height, int8_t type, int8_t scan_idx, int8_t block_type, int8_t transform_skip)
+  int32_t height, color_t color, int8_t scan_idx, int8_t block_type, int8_t transform_skip)
 {
   const encoder_control_t * const encoder = state->encoder_control;
   const uint32_t log2_block_size = kvz_g_convert_to_bit[width] + 2;
   const uint32_t * const scan = kvz_g_sig_last_scan[scan_idx][log2_block_size - 1];
 
-  int32_t qp_scaled = kvz_get_scaled_qp(type, state->qp, (encoder->bitdepth - 8) * 6, encoder->qp_map[0]);
+  int32_t qp_scaled = kvz_get_scaled_qp(color, state->qp, (encoder->bitdepth - 8) * 6, encoder->qp_map[0]);
   qp_scaled = transform_skip ? MAX(qp_scaled, 4 + 6 * MIN_QP_PRIME_TS) : qp_scaled;
   uint32_t log2_tr_width = kvz_math_floor_log2(height);
   uint32_t log2_tr_height = kvz_math_floor_log2(width);
-  const int32_t scalinglist_type = (block_type == CU_INTRA ? 0 : 3) + (int8_t)("\0\3\1\2"[type]);
+  const int32_t scalinglist_type = (block_type == CU_INTRA ? 0 : 3) + (int8_t)("\0\3\1\2"[color]);
   const int32_t *quant_coeff = encoder->scaling_list.quant_coeff[log2_tr_width][log2_tr_height][scalinglist_type][qp_scaled % 6];
   const int32_t transform_shift = MAX_TR_DYNAMIC_RANGE - encoder->bitdepth - ((log2_tr_width + log2_tr_height) >> 1); //!< Represents scaling through forward transform
   const int32_t q_bits = QUANT_SHIFT + qp_scaled / 6 + (transform_skip ? 0 : transform_shift);
@@ -721,7 +721,7 @@ int kvz_quantize_residual_avx2(encoder_state_t *const state,
   if (has_coeffs && !early_skip) {
 
     // Get quantized residual. (coeff_out -> coeff -> residual)
-    kvz_dequant(state, coeff_out, coeff, width, width, (color == COLOR_Y ? 0 : (color == COLOR_U ? 2 : 3)),
+    kvz_dequant(state, coeff_out, coeff, width, width, color,
       cur_cu->type, cur_cu->tr_idx == MTS_SKIP && color == COLOR_Y);
     if (use_trskip) {
       kvz_itransformskip(state->encoder_control, residual, coeff, width);
@@ -771,7 +771,7 @@ int kvz_quantize_residual_avx2(encoder_state_t *const state,
  * \brief inverse quantize transformed and quantized coefficents
  *
  */
-void kvz_dequant_avx2(const encoder_state_t * const state, coeff_t *q_coef, coeff_t *coef, int32_t width, int32_t height,int8_t type, int8_t block_type, int8_t transform_skip)
+void kvz_dequant_avx2(const encoder_state_t * const state, coeff_t *q_coef, coeff_t *coef, int32_t width, int32_t height,color_t color, int8_t block_type, int8_t transform_skip)
 {
   const encoder_control_t * const encoder = state->encoder_control;
   int32_t shift,add,coeff_q;
@@ -779,7 +779,7 @@ void kvz_dequant_avx2(const encoder_state_t * const state, coeff_t *q_coef, coef
   int32_t transform_shift = MAX_TR_DYNAMIC_RANGE - encoder->bitdepth - ((kvz_math_floor_log2(width) + kvz_math_floor_log2(height)) >> 1); // Represents scaling through forward transform
 
 
-  int32_t qp_scaled = kvz_get_scaled_qp(type, state->qp, (encoder->bitdepth-8)*6, encoder->qp_map[0]);
+  int32_t qp_scaled = kvz_get_scaled_qp(color, state->qp, (encoder->bitdepth-8)*6, encoder->qp_map[0]);
   qp_scaled = transform_skip ? MAX(qp_scaled, 4 + 6 * MIN_QP_PRIME_TS) : qp_scaled;
 
   shift = 20 - QUANT_SHIFT - (transform_skip ? 0 : transform_shift);
@@ -788,7 +788,7 @@ void kvz_dequant_avx2(const encoder_state_t * const state, coeff_t *q_coef, coef
   {
     uint32_t log2_tr_width = kvz_math_floor_log2(height) + 2;
     uint32_t log2_tr_height = kvz_math_floor_log2(width) + 2;
-    int32_t scalinglist_type = (block_type == CU_INTRA ? 0 : 3) + (int8_t)("\0\3\1\2"[type]);
+    int32_t scalinglist_type = (block_type == CU_INTRA ? 0 : 3) + (int8_t)(color);
 
     const int32_t* dequant_coef = encoder->scaling_list.de_quant_coeff[log2_tr_width - 2][log2_tr_height - 2][scalinglist_type][qp_scaled % 6];
     shift += 4;
diff --git a/src/strategies/generic/quant-generic.c b/src/strategies/generic/quant-generic.c
index db2ea83c..7d8a6829 100644
--- a/src/strategies/generic/quant-generic.c
+++ b/src/strategies/generic/quant-generic.c
@@ -38,17 +38,17 @@
 *
 */
 void kvz_quant_generic(const encoder_state_t * const state, coeff_t *coef, coeff_t *q_coef, int32_t width,
-  int32_t height, int8_t type, int8_t scan_idx, int8_t block_type, int8_t transform_skip)
+  int32_t height, color_t color, int8_t scan_idx, int8_t block_type, int8_t transform_skip)
 {
   const encoder_control_t * const encoder = state->encoder_control;
   const uint32_t log2_block_size = kvz_g_convert_to_bit[width] + 2;
   const uint32_t * const scan = kvz_g_sig_last_scan[scan_idx][log2_block_size - 1];
 
-  int32_t qp_scaled = kvz_get_scaled_qp(type, state->qp, (encoder->bitdepth - 8) * 6, encoder->qp_map[0]);
+  int32_t qp_scaled = kvz_get_scaled_qp(color, state->qp, (encoder->bitdepth - 8) * 6, encoder->qp_map[0]);
   qp_scaled = transform_skip ? MAX(qp_scaled, 4 + 6 * MIN_QP_PRIME_TS) : qp_scaled;
   uint32_t log2_tr_width = kvz_math_floor_log2(height);
   uint32_t log2_tr_height = kvz_math_floor_log2(width);
-  const int32_t scalinglist_type = (block_type == CU_INTRA ? 0 : 3) + (int8_t)("\0\3\1\2"[type]);
+  const int32_t scalinglist_type = (block_type == CU_INTRA ? 0 : 3) + (int8_t)("\0\3\1\2"[color]);
   const int32_t *quant_coeff = encoder->scaling_list.quant_coeff[log2_tr_width][log2_tr_height][scalinglist_type][qp_scaled % 6];
   const int32_t transform_shift = MAX_TR_DYNAMIC_RANGE - encoder->bitdepth - ((log2_tr_height + log2_tr_width) >> 1); //!< Represents scaling through forward transform
   const int32_t q_bits = QUANT_SHIFT + qp_scaled / 6 + (transform_skip ? 0 : transform_shift);
@@ -172,6 +172,214 @@ void kvz_quant_generic(const encoder_state_t * const state, coeff_t *coef, coeff
   }
 }
 
+/** \brief Square of x; widened to int64_t before multiplying so the product cannot overflow int. */
+static INLINE int64_t square(int x) {
+  return (int64_t)x * x;
+}
+
+/**
+ * \brief Jointly quantize the Cb and Cr residuals and reconstruct both planes.
+ *
+ * Every candidate joint mode folds the U and V residuals into a single
+ * residual block. The mode with the smallest squared reconstruction error is
+ * transformed and quantized into coeff_out. If any coefficient is non-zero
+ * and early_skip is false, the block is dequantized, inverse transformed,
+ * split back into U and V residuals and added to the predictions; otherwise
+ * the predictions are copied to the outputs unchanged.
+ *
+ * Only positive modes are searched: 1, 2 and 3 for intra CUs, only 3 for
+ * other CU types. The branches for negative (sign-flipped) modes are kept
+ * for future use and are unreachable at the moment.
+ *
+ * \param state       Encoder state.
+ * \param cur_cu      CU the block belongs to.
+ * \param width       Width and height of the square block in pixels.
+ * \param scan_order  Coefficient scan order passed to the quantizer.
+ * \param in_stride   Stride of the reference and prediction buffers.
+ * \param out_stride  Stride of the reconstruction buffers.
+ * \param u_ref_in    Reference U pixels.
+ * \param v_ref_in    Reference V pixels.
+ * \param u_pred_in   Predicted U pixels.
+ * \param v_pred_in   Predicted V pixels.
+ * \param u_rec_out   Receives the reconstructed U pixels.
+ * \param v_rec_out   Receives the reconstructed V pixels.
+ * \param coeff_out   Receives the quantized joint coefficients.
+ * \param early_skip  If true, the reconstruction is always the prediction.
+ * \param lmcs_chroma_adj  Unused; LMCS chroma scaling is not applied here.
+ *
+ * \returns The selected joint mode if any coefficient is non-zero, else 0.
+ */
+int kvz_quant_cbcr_residual_generic(
+  encoder_state_t* const state,
+  const cu_info_t* const cur_cu,
+  const int width,
+  const coeff_scan_order_t scan_order,
+  const int in_stride, const int out_stride,
+  const kvz_pixel* const u_ref_in,
+  const kvz_pixel* const v_ref_in,
+  const kvz_pixel* const u_pred_in,
+  const kvz_pixel* const v_pred_in,
+  kvz_pixel* u_rec_out,
+  kvz_pixel* v_rec_out,
+  coeff_t* coeff_out,
+  bool early_skip,
+  int lmcs_chroma_adj
+  ) {
+  ALIGNED(64) int16_t u_residual[TR_MAX_WIDTH * TR_MAX_WIDTH];
+  ALIGNED(64) int16_t v_residual[TR_MAX_WIDTH * TR_MAX_WIDTH];
+  // u1_residual[0] holds the joint residual of modes +-1, u1_residual[1] that
+  // of modes +-3; v1_residual holds the joint residual of modes +-2.
+  ALIGNED(64) int16_t u1_residual[2][TR_MAX_WIDTH * TR_MAX_WIDTH];
+  ALIGNED(64) int16_t v1_residual[TR_MAX_WIDTH * TR_MAX_WIDTH];
+  ALIGNED(64) coeff_t coeff[TR_MAX_WIDTH * TR_MAX_WIDTH];
+
+  // Original residuals, stored densely with a stride of width.
+  for (int y = 0; y < width; ++y) {
+    for (int x = 0; x < width; ++x) {
+      u_residual[x + y * width] = (int16_t)(u_ref_in[x + y * in_stride] - u_pred_in[x + y * in_stride]);
+      v_residual[x + y * width] = (int16_t)(v_ref_in[x + y * in_stride] - v_pred_in[x + y * in_stride]);
+    }
+  }
+
+  int best_cbf_mask = -1;
+  int64_t best_cost = INT64_MAX;
+
+  // This changes the order of the cbf_masks so 2 and 3 are swapped compared with VTM
+  for (int cbf_mask = cur_cu->type == CU_INTRA ? 1 : 3; cbf_mask < 4; cbf_mask++) {
+    // Pick the buffer by magnitude so that a negative (sign-flipped) mode can
+    // never address u1_residual[-1]; C integer division truncates -3 / 2 to -1.
+    const int abs_mask = cbf_mask < 0 ? -cbf_mask : cbf_mask;
+    int16_t* const joint = abs_mask == 2 ? v1_residual : u1_residual[abs_mask / 2];
+    int64_t d1 = 0;
+    for (int i = 0; i < width * width; i++) {
+      const int cbx = u_residual[i];
+      const int crx = v_residual[i];
+      if (cbf_mask == 1) {
+        joint[i] = ((4 * cbx + 2 * crx) / 5);
+        d1 += square(cbx - joint[i]) + square(crx - (joint[i] >> 1));
+      }
+      else if (cbf_mask == -1) {
+        joint[i] = ((4 * cbx - 2 * crx) / 5);
+        d1 += square(cbx - joint[i]) + square(crx - (-joint[i] >> 1));
+      }
+      else if (cbf_mask == 3) {
+        joint[i] = ((cbx + crx) / 2);
+        d1 += square(cbx - joint[i]) + square(crx - joint[i]);
+      }
+      else if (cbf_mask == -3) {
+        joint[i] = ((cbx - crx) / 2);
+        d1 += square(cbx - joint[i]) + square(crx + joint[i]);
+      }
+      else if (cbf_mask == 2) {
+        joint[i] = ((4 * crx + 2 * cbx) / 5);
+        d1 += square(cbx - (joint[i] >> 1)) + square(crx - joint[i]);
+      }
+      else if (cbf_mask == -2) {
+        joint[i] = ((4 * crx - 2 * cbx) / 5);
+        d1 += square(cbx - (-joint[i] >> 1)) + square(crx - joint[i]);
+      }
+      else {
+        d1 += square(cbx);
+      }
+    }
+    if (d1 < best_cost) {
+      best_cbf_mask = cbf_mask;
+      best_cost = d1;
+    }
+  }
+
+  // Resolve the winning buffer and colour once, again by magnitude so that a
+  // future negative mode selects the same buffer as its positive counterpart.
+  const int best_abs_mask = best_cbf_mask < 0 ? -best_cbf_mask : best_cbf_mask;
+  int16_t* const joint_residual = best_abs_mask == 2 ? v1_residual : u1_residual[best_abs_mask / 2];
+  // Mode 2 stores the full-weight value in V, so it is coded as COLOR_V.
+  const color_t joint_color = best_abs_mask == 2 ? COLOR_V : COLOR_U;
+
+  kvz_transform2d(state->encoder_control, joint_residual, coeff, width, joint_color, cur_cu);
+
+  if (state->encoder_control->cfg.rdoq_enable &&
+    (width > 4 || !state->encoder_control->cfg.rdoq_skip))
+  {
+    int8_t tr_depth = cur_cu->tr_depth - cur_cu->depth;
+    tr_depth += (cur_cu->part_size == SIZE_NxN ? 1 : 0);
+    kvz_rdoq(state, coeff, coeff_out, width, width, joint_color,
+      scan_order, cur_cu->type, tr_depth, cur_cu->cbf);
+  }
+  else {
+    // Transform skip is never requested for the joint residual.
+    kvz_quant(state, coeff, coeff_out, width, width, joint_color,
+      scan_order, cur_cu->type, false);
+  }
+
+  int8_t has_coeffs = 0;
+  for (int i = 0; i < width * width; ++i) {
+    if (coeff_out[i] != 0) {
+      has_coeffs = 1;
+      break;
+    }
+  }
+
+  if (has_coeffs && !early_skip) {
+    // Get quantized residual. (coeff_out -> coeff -> joint_residual)
+    kvz_dequant(state, coeff_out, coeff, width, width, joint_color,
+      cur_cu->type, false);
+
+    kvz_itransform2d(state->encoder_control, joint_residual, coeff, width, joint_color, cur_cu);
+
+    // Split the joint residual back into U and V residuals. LMCS chroma
+    // scaling (lmcs_chroma_adj) is not applied on this path.
+    for (int i = 0; i < width * width; i++) {
+      const int16_t joint_val = joint_residual[i];
+      if (best_cbf_mask == 1) {
+        u_residual[i] = joint_val;
+        v_residual[i] = joint_val >> 1;
+      }
+      else if (best_cbf_mask == -1) {
+        u_residual[i] = joint_val;
+        v_residual[i] = -joint_val >> 1;
+      }
+      else if (best_cbf_mask == 3) {
+        u_residual[i] = joint_val;
+        v_residual[i] = joint_val;
+      }
+      else if (best_cbf_mask == -3) {
+        // No clipping is done, so -32768 would not fit in int16_t after negation.
+        u_residual[i] = joint_val;
+        v_residual[i] = -joint_val;
+      }
+      else if (best_cbf_mask == 2) {
+        u_residual[i] = joint_val >> 1;
+        v_residual[i] = joint_val;
+      }
+      else if (best_cbf_mask == -2) {
+        // Mirrors the forward mapping: U is the negated half, V the joint value.
+        u_residual[i] = -joint_val >> 1;
+        v_residual[i] = joint_val;
+      }
+    }
+    // Get quantized reconstruction. (residual + pred_in -> rec_out)
+    for (int y = 0; y < width; ++y) {
+      for (int x = 0; x < width; ++x) {
+        int16_t u_val = u_residual[x + y * width] + u_pred_in[x + y * in_stride];
+        u_rec_out[x + y * out_stride] = (kvz_pixel)CLIP(0, PIXEL_MAX, u_val);
+        int16_t v_val = v_residual[x + y * width] + v_pred_in[x + y * in_stride];
+        v_rec_out[x + y * out_stride] = (kvz_pixel)CLIP(0, PIXEL_MAX, v_val);
+      }
+    }
+  }
+  else {
+    // No coefficients, or early skip: the reconstruction is the prediction.
+    for (int y = 0; y < width; ++y) {
+      for (int x = 0; x < width; ++x) {
+        u_rec_out[x + y * out_stride] = u_pred_in[x + y * in_stride];
+        v_rec_out[x + y * out_stride] = v_pred_in[x + y * in_stride];
+      }
+    }
+  }
+
+  return has_coeffs ? best_cbf_mask : 0;
+}
+
 /**
 * \brief Quantize residual and get both the reconstruction and coeffs.
 *
@@ -271,7 +479,7 @@ int kvz_quantize_residual_generic(encoder_state_t *const state,
     int y, x;
 
     // Get quantized residual. (coeff_out -> coeff -> residual)
-    kvz_dequant(state, coeff_out, coeff, width, width, (color == COLOR_Y ? 0 : (color == COLOR_U ? 2 : 3)),
+    kvz_dequant(state, coeff_out, coeff, width, width, color,
       cur_cu->type, cur_cu->tr_idx == MTS_SKIP && color == COLOR_Y);
     if (use_trskip) {
       kvz_itransformskip(state->encoder_control, residual, coeff, width);
@@ -326,7 +534,7 @@ int kvz_quantize_residual_generic(encoder_state_t *const state,
  * \brief inverse quantize transformed and quantized coefficents
  *
  */
-void kvz_dequant_generic(const encoder_state_t * const state, coeff_t *q_coef, coeff_t *coef, int32_t width, int32_t height,int8_t type, int8_t block_type, int8_t transform_skip)
+void kvz_dequant_generic(const encoder_state_t * const state, coeff_t *q_coef, coeff_t *coef, int32_t width, int32_t height,color_t color, int8_t block_type, int8_t transform_skip)
 {
   const encoder_control_t * const encoder = state->encoder_control;
   int32_t shift,add,coeff_q;
@@ -334,7 +542,7 @@ void kvz_dequant_generic(const encoder_state_t * const state, coeff_t *q_coef, c
   int32_t transform_shift = MAX_TR_DYNAMIC_RANGE - encoder->bitdepth - ((kvz_math_floor_log2(width) + kvz_math_floor_log2(height)) >> 1); // Represents scaling through forward transform
 
 
-  int32_t qp_scaled = kvz_get_scaled_qp(type, state->qp, (encoder->bitdepth-8)*6, encoder->qp_map[0]);
+  int32_t qp_scaled = kvz_get_scaled_qp(color, state->qp, (encoder->bitdepth-8)*6, encoder->qp_map[0]);
   qp_scaled = transform_skip ? MAX(qp_scaled, 4 + 6 * MIN_QP_PRIME_TS) : qp_scaled;
 
   shift = 20 - QUANT_SHIFT - (transform_skip ? 0 : transform_shift);
@@ -343,7 +551,7 @@ void kvz_dequant_generic(const encoder_state_t * const state, coeff_t *q_coef, c
   {
     uint32_t log2_tr_width = kvz_math_floor_log2(height) + 2;
     uint32_t log2_tr_height = kvz_math_floor_log2(width) + 2;
-    int32_t scalinglist_type = (block_type == CU_INTRA ? 0 : 3) + (int8_t)("\0\3\1\2"[type]);
+    int32_t scalinglist_type = (block_type == CU_INTRA ? 0 : 3) + (int8_t)(color);
 
     const int32_t *dequant_coef = encoder->scaling_list.de_quant_coeff[log2_tr_width -2][log2_tr_height -2][scalinglist_type][qp_scaled%6];
     shift += 4;
@@ -413,6 +621,7 @@ int kvz_strategy_register_quant_generic(void* opaque, uint8_t bitdepth)
   bool success = true;
 
   success &= kvz_strategyselector_register(opaque, "quant", "generic", 0, &kvz_quant_generic);
+  success &= kvz_strategyselector_register(opaque, "quant_cbcr_residual", "generic", 0, &kvz_quant_cbcr_residual_generic);
   success &= kvz_strategyselector_register(opaque, "quantize_residual", "generic", 0, &kvz_quantize_residual_generic);
   success &= kvz_strategyselector_register(opaque, "dequant", "generic", 0, &kvz_dequant_generic);
   success &= kvz_strategyselector_register(opaque, "coeff_abs_sum", "generic", 0, &coeff_abs_sum_generic);
diff --git a/src/strategies/generic/quant-generic.h b/src/strategies/generic/quant-generic.h
index 442e8238..34269522 100644
--- a/src/strategies/generic/quant-generic.h
+++ b/src/strategies/generic/quant-generic.h
@@ -36,7 +36,7 @@
 
 int kvz_strategy_register_quant_generic(void* opaque, uint8_t bitdepth);
 void kvz_quant_generic(const encoder_state_t * const state, coeff_t *coef, coeff_t *q_coef, int32_t width,
-  int32_t height, int8_t type, int8_t scan_idx, int8_t block_type, int8_t transform_skip);
+  int32_t height, color_t color, int8_t scan_idx, int8_t block_type, int8_t transform_skip);
 
 int kvz_quantize_residual_generic(encoder_state_t *const state,
   const cu_info_t *const cur_cu, const int width, const color_t color,
@@ -46,4 +46,21 @@ int kvz_quantize_residual_generic(encoder_state_t *const state,
   kvz_pixel *rec_out, coeff_t *coeff_out,
   bool early_skip, int lmcs_chroma_adj);
 
+// Generic C implementation of joint Cb-Cr residual quantization. Takes the
+// reference and prediction pixels of both chroma planes, writes both
+// reconstructions and one block of quantized coefficients to coeff_out.
+// in_stride applies to the ref/pred buffers, out_stride to the rec buffers.
+int kvz_quant_cbcr_residual_generic(
+  encoder_state_t* const state,
+  const cu_info_t* const cur_cu,
+  const int width,
+  const coeff_scan_order_t scan_order,
+  const int in_stride, const int out_stride,
+  const kvz_pixel* const u_ref_in, const kvz_pixel* const v_ref_in,
+  const kvz_pixel* const u_pred_in, const kvz_pixel* const v_pred_in,
+  kvz_pixel* u_rec_out, kvz_pixel* v_rec_out,
+  coeff_t* coeff_out,
+  bool early_skip,
+  int lmcs_chroma_adj);
+
 #endif //STRATEGIES_QUANT_GENERIC_H_
diff --git a/src/strategies/strategies-quant.c b/src/strategies/strategies-quant.c
index 0a7e8f91..5a45fdb0 100644
--- a/src/strategies/strategies-quant.c
+++ b/src/strategies/strategies-quant.c
@@ -27,6 +27,7 @@
 
 // Define function pointers.
 quant_func *kvz_quant;
+quant_cbcr_func *kvz_quant_cbcr_residual;
 quant_residual_func *kvz_quantize_residual;
 dequant_func *kvz_dequant;
 coeff_abs_sum_func *kvz_coeff_abs_sum;
diff --git a/src/strategies/strategies-quant.h b/src/strategies/strategies-quant.h
index 83dc48eb..55c0bed2 100644
--- a/src/strategies/strategies-quant.h
+++ b/src/strategies/strategies-quant.h
@@ -34,7 +34,22 @@
 
 // Declare function pointers.
 typedef unsigned (quant_func)(const encoder_state_t * const state, coeff_t *coef, coeff_t *q_coef, int32_t width,
-  int32_t height, int8_t type, int8_t scan_idx, int8_t block_type, int8_t transform_skip);
+  int32_t height, color_t color, int8_t scan_idx, int8_t block_type, int8_t transform_skip);
+// Signature of the joint Cb-Cr residual quantization strategy.
+// NOTE(review): kvz_quant_cbcr_residual_generic is declared to return int,
+// not unsigned -- confirm which return type is intended.
+typedef unsigned (quant_cbcr_func)(
+  encoder_state_t* const state,
+  const cu_info_t* const cur_cu,
+  const int width,
+  const coeff_scan_order_t scan_order,
+  const int in_stride, const int out_stride,
+  const kvz_pixel* const u_ref_in, const kvz_pixel* const v_ref_in,
+  const kvz_pixel* const u_pred_in, const kvz_pixel* const v_pred_in,
+  kvz_pixel* u_rec_out, kvz_pixel* v_rec_out,
+  coeff_t* coeff_out,
+  bool early_skip,
+  int lmcs_chroma_adj);
 typedef unsigned (quant_residual_func)(encoder_state_t *const state,
   const cu_info_t *const cur_cu, const int width, const color_t color,
   const coeff_scan_order_t scan_order, const int use_trskip,
@@ -43,13 +58,14 @@ typedef unsigned (quant_residual_func)(encoder_state_t *const state,
   kvz_pixel *rec_out, coeff_t *coeff_out,
   bool early_skip, int lmcs_chroma_adj);
 typedef unsigned (dequant_func)(const encoder_state_t * const state, coeff_t *q_coef, coeff_t *coef, int32_t width,
-  int32_t height, int8_t type, int8_t block_type, int8_t transform_skip);
+  int32_t height, color_t color, int8_t block_type, int8_t transform_skip);
 typedef uint32_t (fast_coeff_cost_func)(const coeff_t *coeff, int32_t width, uint64_t weights);
 
 typedef uint32_t (coeff_abs_sum_func)(const coeff_t *coeffs, size_t length);
 
 // Declare function pointers.
 extern quant_func * kvz_quant;
+extern quant_cbcr_func* kvz_quant_cbcr_residual;
 extern quant_residual_func * kvz_quantize_residual;
 extern dequant_func *kvz_dequant;
 extern coeff_abs_sum_func *kvz_coeff_abs_sum;
@@ -60,6 +76,7 @@ int kvz_strategy_register_quant(void* opaque, uint8_t bitdepth);
 
 #define STRATEGIES_QUANT_EXPORTS \
   {"quant", (void**) &kvz_quant}, \
+  {"quant_cbcr_residual", (void**) &kvz_quant_cbcr_residual}, \
   {"quantize_residual", (void**) &kvz_quantize_residual}, \
   {"dequant", (void**) &kvz_dequant}, \
   {"coeff_abs_sum", (void**) &kvz_coeff_abs_sum}, \
diff --git a/src/transform.c b/src/transform.c
index 79ce6ef0..bb768218 100644
--- a/src/transform.c
+++ b/src/transform.c
@@ -127,10 +127,10 @@ static void rdpcm(const int width,
  * \brief Get scaled QP used in quantization
  *
  */
-int32_t kvz_get_scaled_qp(int8_t type, int8_t qp, int8_t qp_offset, int8_t const * const chroma_scale)
+int32_t kvz_get_scaled_qp(color_t color, int8_t qp, int8_t qp_offset, int8_t const * const chroma_scale)
 {
   int32_t qp_scaled = 0;
-  if(type == 0) {
+  if(color == 0) {
     qp_scaled = qp + qp_offset;
   } else {
     qp_scaled = CLIP(-qp_offset, 57, qp);
@@ -306,13 +306,13 @@ static void quantize_tr_residual(encoder_state_t * const state,
 {
   const kvz_config *cfg    = &state->encoder_control->cfg;
   const int32_t shift      = color == COLOR_Y ? 0 : 1;
-  const vector2d_t lcu_px  = { SUB_SCU(x) >> shift, SUB_SCU(y) >> shift };
+  const vector2d_t lcu_px  = { SUB_SCU(x) >> shift, SUB_SCU(y) >> shift};
 
   // If luma is 4x4, do chroma for the 8x8 luma area when handling the top
   // left PU because the coordinates are correct.
   bool handled_elsewhere = color != COLOR_Y &&
-                           depth > MAX_DEPTH &&
-                           (lcu_px.x % 4 != 0 || lcu_px.y % 4 != 0);
+                           depth == MAX_DEPTH &&
+                           (x % 4 != 0 || y % 4 != 0);
   if (handled_elsewhere) {
     return;
   }
@@ -367,7 +367,7 @@ static void quantize_tr_residual(encoder_state_t * const state,
                               cfg->trskip_enable && 
                               cur_pu->tr_idx == 1;
 
-  bool has_coeffs;
+  uint8_t has_coeffs;
 
 
   int lmcs_chroma_adj = 0;
@@ -411,6 +411,25 @@ static void quantize_tr_residual(encoder_state_t * const state,
                                               lmcs_chroma_adj);
     cur_pu->tr_skip = tr_skip;
   } else {
+    if(color == COLOR_UV) {
+      has_coeffs = kvz_quant_cbcr_residual(
+        state,
+        cur_pu,
+        tr_width,
+        scan_idx,
+        lcu_width,
+        lcu_width,
+        &lcu->ref.u[offset], &lcu->ref.v[offset],
+        &lcu->rec.joint_u[offset], &lcu->rec.joint_v[offset],
+        &lcu->rec.joint_u[offset], &lcu->rec.joint_v[offset],
+        &lcu->coeff.joint_uv[z_index],
+        early_skip,
+        lmcs_chroma_adj
+      );
+      cur_pu->joint_cb_cr = has_coeffs;
+      return;
+    }
+
     has_coeffs = kvz_quantize_residual(state,
                                        cur_pu,
                                        tr_width,
@@ -425,6 +444,7 @@ static void quantize_tr_residual(encoder_state_t * const state,
                                        coeff,
                                        early_skip,
                                        lmcs_chroma_adj);
+    
   }
 
   cbf_clear(&cur_pu->cbf, depth, color);
@@ -519,6 +539,9 @@ void kvz_quantize_lcu_residual(encoder_state_t * const state,
     if (chroma) {
       quantize_tr_residual(state, COLOR_U, x, y, depth, cur_pu, lcu, early_skip);
       quantize_tr_residual(state, COLOR_V, x, y, depth, cur_pu, lcu, early_skip);
+      if(state->encoder_control->cfg.jccr && cur_pu->tr_depth == cur_pu->depth){
+        quantize_tr_residual(state, COLOR_UV, x, y, depth, cur_pu, lcu, early_skip);
+      }
     }
   }
 }
diff --git a/src/transform.h b/src/transform.h
index 27cbdf91..77bc2607 100644
--- a/src/transform.h
+++ b/src/transform.h
@@ -53,7 +53,7 @@ void kvz_itransform2d(const encoder_control_t * const encoder,
                       const cu_info_t *tu);
 
 
-int32_t kvz_get_scaled_qp(int8_t type, int8_t qp, int8_t qp_offset, int8_t const* const chroma_scale);
+int32_t kvz_get_scaled_qp(color_t color, int8_t qp, int8_t qp_offset, int8_t const* const chroma_scale);
 
 void kvz_quantize_lcu_residual(encoder_state_t *state,
                                bool luma,
diff --git a/tests/test_intra.sh b/tests/test_intra.sh
index e7b7c4c9..722fe9fc 100755
--- a/tests/test_intra.sh
+++ b/tests/test_intra.sh
@@ -12,4 +12,6 @@ valgrind_test $common_args --rd=2 --no-transform-skip --qp 37
 valgrind_test $common_args --rd=2 --no-transform-skip --qp 37 --signhide --rdoq 
 valgrind_test $common_args --alf=full --no-wpp --threads=0 --owf=0
 valgrind_test $common_args --alf=full --wpp --threads=1
+valgrind_test $common_args --jccr
+valgrind_test $common_args --jccr --rdoq --rd=2 --mts=intra