Merge branch 'CCLM' into 'master'

CCLM implementation See merge request cs/ultravideo/vvc/uvg266!7
2024-11-27 11:24:05 +00:00 · 2021-11-26 08:54:24 +02:00 · 2021-11-26 08:54:24 +02:00 · 7aeef8e9b8
parent d5c212b77e 385e91399a
commit 7aeef8e9b8
18 changed files with 596 additions and 64 deletions
--- a/configure.ac
+++ b/configure.ac
@ -23,7 +23,7 @@ AC_CONFIG_SRCDIR([src/encmain.c])
 #
 # Here is a somewhat sane guide to lib versioning: http://apr.apache.org/versioning.html
 ver_major=6
-ver_minor=6
+ver_minor=7
 ver_release=0

 # Prevents configure from adding a lot of defines to the CFLAGS
--- a/src/cabac.h
+++ b/src/cabac.h
@ -117,6 +117,8 @@ typedef struct
    cabac_ctx_t transform_skip_gt1[4];
    cabac_ctx_t transform_skip_par;
    cabac_ctx_t transform_skip_gt2[5];
+    cabac_ctx_t cclm_flag;
+    cabac_ctx_t cclm_model;

  } ctx;
 } cabac_data_t;
--- a/src/cfg.c
+++ b/src/cfg.c
@ -209,6 +209,8 @@ int kvz_config_init(kvz_config *cfg)

  cfg->amvr = 0;

+  cfg->cclm = 0;
+
  return 1;
 }

@ -1486,6 +1488,9 @@ int kvz_config_parse(kvz_config *cfg, const char *name, const char *value)
  else if OPT("amvr") {
    cfg->amvr = (bool)atobool(value);
  }
+  else if OPT("cclm") {
+    cfg->cclm = (bool)atobool(value);
+  }
  else {
    return 0;
  }
--- a/src/cli.c
+++ b/src/cli.c
@ -178,6 +178,8 @@ static const struct option long_options[] = {
  { "no-jccr",                  no_argument, NULL, 0 },
  { "amvr",                     no_argument, NULL, 0 },
  { "no-amvr",                  no_argument, NULL, 0 },
+  { "cclm",                     no_argument, NULL, 0 },
+  { "no-cclm",                  no_argument, NULL, 0 },
  {0, 0, 0, 0}
 };

@ -550,8 +552,8 @@ void print_help(void)
    "                                   - 0: Skip intra if inter is good enough.\n"
    "                                   - 1: Rough intra mode search with SATD.\n"
    "                                   - 2: Refine intra mode search with SSE.\n"
-    "                                   - 3: Try all intra modes and enable intra\n"
-    "                                        chroma mode search.\n"
+    "                                   - 3: Enable intra chroma mode search.\n"
+    "                                   - 4: Try all intra modes.\n"
    "      --(no-)mv-rdo          : Rate-distortion optimized motion vector costs\n"
    "                               [disabled]\n"
    "      --(no-)zero-coeff-rdo  : If a CU is set inter, check if forcing zero\n"
@ -629,8 +631,12 @@ void print_help(void)
    "                                   - both: MTS applied for both intra and inter blocks.\n"
    "                                   - implicit: uses implicit MTS. Applies DST7 instead \n"
    "                                               of DCT2 to certain intra blocks.\n"
-    "      --(no-)jccr            : Joint coding of chroma residual.\n"
-    "                               Requires rdo> = 2. [disabled]\n"      
+    "      --(no-)jccr            : Joint coding of chroma residual.\n"
+    "                               Requires rdo> = 2. [disabled]\n"
+    "      --(no-)cclm            : Cross component linear model. \n"
+    "                               Extra chroma prediction modes that are formed\n"
+    "                               via linear transformation from the luma\n"
+    "                               prediction. Requires rdo >=3. [disabled]\n"
    "      --(no-)amvr            : Adaptive Motion Vector Resolution.\n"
    "                               Code some mv's with reduced resolution [disabled]\n"
    "\n"
--- a/src/context.c
+++ b/src/context.c
@ -395,6 +395,20 @@ static const uint8_t INIT_IMV_FLAG[4][5] = {
  {   0,   5,   0,   0,   4, },
 };

+// Initial CABAC context values for the cclm_flag context. kvz_init_contexts()
+// picks entry [slice] as the initial value and passes entry [3] as the last
+// argument of kvz_ctx_init(). The table is one-dimensional, so the elements
+// are plain scalars rather than brace-enclosed rows.
+static const uint8_t INIT_CCLM_FLAG[4] = {
+  26,
+  34,
+  59,
+   4,
+};
+
+// Initial CABAC context values for the cclm_model context. kvz_init_contexts()
+// picks entry [slice] as the initial value and passes entry [3] as the last
+// argument of kvz_ctx_init(). The table is one-dimensional, so the elements
+// are plain scalars rather than brace-enclosed rows.
+static const uint8_t INIT_CCLM_MODEL[4] = {
+  27,
+  27,
+  27,
+   9,
+};
+
 /*
 static const uint16_t g_inistateToCount[128] = {
  614,   647,   681,   718,   756,   797,   839,   884,   932,   982,   1034,  1089,  1148,  1209,  1274,  1342,
@ -471,6 +485,9 @@ void kvz_init_contexts(encoder_state_t *state, int8_t QP, int8_t slice)

  kvz_ctx_init(&cabac->ctx.chroma_pred_model, QP, INIT_CHROMA_PRED_MODE[slice], INIT_CHROMA_PRED_MODE[3]);

+  kvz_ctx_init(&cabac->ctx.cclm_flag, QP, INIT_CCLM_FLAG[slice], INIT_CCLM_FLAG[3]);
+  kvz_ctx_init(&cabac->ctx.cclm_model, QP, INIT_CCLM_MODEL[slice], INIT_CCLM_MODEL[3]);
+

  for (i = 0; i < 3; i++) {
    kvz_ctx_init(&cabac->ctx.cu_skip_flag_model[i], QP, INIT_SKIP_FLAG[slice][i], INIT_SKIP_FLAG[3][i]);
--- a/src/encode_coding_tree.c
+++ b/src/encode_coding_tree.c
@ -701,7 +701,7 @@ static bool encode_inter_prediction_unit(encoder_state_t * const state,
  return non_zero_mvd;
 }

-static void encode_chroma_intra_cu(cabac_data_t* const cabac, const cu_info_t* const cur_cu, int x, int y, const videoframe_t* const frame, const int cu_width) {
+static void encode_chroma_intra_cu(cabac_data_t* const cabac, const cu_info_t* const cur_cu, int x, int y, const videoframe_t* const frame, const int cu_width, const int cclm_enabled) {
  unsigned pred_mode = 0;
  unsigned chroma_pred_modes[8] = {0, 50, 18, 1, 67, 81, 82, 83};
  const int pu_x = PU_GET_X(cur_cu->part_size, cu_width, x, 0);
@ -710,7 +710,23 @@ static void encode_chroma_intra_cu(cabac_data_t* const cabac, const cu_info_t* c
  int8_t chroma_intra_dir = first_pu->intra.mode_chroma;
  int8_t luma_intra_dir = first_pu->intra.mode;

+
  bool derived_mode = chroma_intra_dir == luma_intra_dir;
+  bool cclm_mode = chroma_intra_dir > 67;
+
+  if (cclm_enabled) {
+    cabac->cur_ctx = &cabac->ctx.cclm_flag;
+    CABAC_BIN(cabac, cclm_mode, "cclm_flag");
+    if(cclm_mode) {
+      cabac->cur_ctx = &cabac->ctx.cclm_model;
+      CABAC_BIN(cabac, chroma_intra_dir != 81, "cclm_model_1");
+      if(chroma_intra_dir != 81) {
+        CABAC_BIN_EP(cabac, chroma_intra_dir == 83, "cclm_model_2");
+      }
+      return;
+    }
+
+  }
  cabac->cur_ctx = &(cabac->ctx.chroma_pred_model);
  CABAC_BIN(cabac, derived_mode ? 0 : 1, "intra_chroma_pred_mode");

@ -722,7 +738,7 @@ static void encode_chroma_intra_cu(cabac_data_t* const cabac, const cu_info_t* c
          break;
        }
      }*/
-    for (; pred_mode < 8; pred_mode++) {
+    for (; pred_mode < 5; pred_mode++) {
      if (chroma_intra_dir == chroma_pred_modes[pred_mode]) {
        break;
      }
@ -983,7 +999,7 @@ static void encode_intra_coding_unit(encoder_state_t * const state,

  // Code chroma prediction mode.
  if (state->encoder_control->chroma_format != KVZ_CSP_400 && depth != 4) {
-    encode_chroma_intra_cu(cabac, cur_cu, x, y, frame, cu_width);
+    encode_chroma_intra_cu(cabac, cur_cu, x, y, frame, cu_width, state->encoder_control->cfg.cclm);
  }

  encode_transform_coeff(state, x, y, depth, 0, 0, 0, 0, coeff);
@ -991,7 +1007,7 @@ static void encode_intra_coding_unit(encoder_state_t * const state,
  encode_mts_idx(state, cabac, cur_cu);

  if (state->encoder_control->chroma_format != KVZ_CSP_400 && depth == 4 && x % 8 && y % 8) {
-    encode_chroma_intra_cu(cabac, cur_cu, x, y, frame, cu_width);
+    encode_chroma_intra_cu(cabac, cur_cu, x, y, frame, cu_width, state->encoder_control->cfg.cclm);
    encode_transform_coeff(state, x, y, depth, 0, 0, 0, 1, coeff);
  }

--- a/src/encoder_state-bitstream.c
+++ b/src/encoder_state-bitstream.c
@ -722,7 +722,7 @@ static void encoder_state_write_bitstream_seq_parameter_set(bitstream_t* stream,
  WRITE_U(stream, 0, 1, "sps_mip_enabled_flag");
  // if(!no_cclm_constraint_flag)
  if(encoder->chroma_format != KVZ_CSP_400) {
-    WRITE_U(stream, 0, 1, "sps_cclm_enabled_flag");
+    WRITE_U(stream, encoder->cfg.cclm, 1, "sps_cclm_enabled_flag");
  }
  if (encoder->chroma_format == KVZ_CSP_420) {
    WRITE_U(stream, 0, 1, "sps_chroma_horizontal_collocated_flag");
--- a/src/encoder_state-ctors_dtors.c
+++ b/src/encoder_state-ctors_dtors.c
@ -122,7 +122,7 @@ static int encoder_state_config_tile_init(encoder_state_t * const state,
                                          const int width, const int height, const int width_in_lcu, const int height_in_lcu) {
  
  const encoder_control_t * const encoder = state->encoder_control;
-  state->tile->frame = kvz_videoframe_alloc(width, height, state->encoder_control->chroma_format, encoder->cfg.alf_type);
+  state->tile->frame = kvz_videoframe_alloc(width, height, state->encoder_control->chroma_format, encoder->cfg.alf_type, encoder->cfg.cclm);
  
  state->tile->frame->rec = NULL;
  
--- a/src/intra.c
+++ b/src/intra.c
@ -248,6 +248,300 @@ static void intra_pred_dc(
 }


+/**
+ * \brief Chroma intra mode numbers of the cross-component linear model modes.
+ *
+ * get_cclm_parameters() takes its model samples from both the above and the
+ * left neighbours for LM_CHROMA_IDX, only from the left neighbours for
+ * LM_CHROMA_L_IDX and only from the above neighbours for LM_CHROMA_T_IDX.
+ */
+enum lm_mode
+{
+  LM_CHROMA_IDX = 81,
+  LM_CHROMA_L_IDX = 82,
+  LM_CHROMA_T_IDX = 83,
+};
+
+
+/**
+ * \brief Derive the linear model parameters of a CCLM mode for one block.
+ *
+ * Picks up to four (luma, chroma) sample pairs from the neighbouring
+ * references, averages the two pairs with the smallest and the two pairs
+ * with the largest luma value, and fits the model through the two averages.
+ * If no neighbour is usable, the model is the constant
+ * 1 << (bitdepth - 1).
+ *
+ * \param state                   encoder state, only the bit depth is read
+ * \param width                   block width; doubled locally to get the luma
+ *                                width, so presumably given in chroma samples
+ * \param height                  block height, same unit as width
+ * \param mode                    LM_CHROMA_IDX, LM_CHROMA_L_IDX or LM_CHROMA_T_IDX
+ * \param x0                      x position of the block, used for availability
+ * \param y0                      y position of the block, used for availability
+ * \param avai_above_right_units  number of available units above-right of the block
+ * \param avai_left_below_units   number of available units below-left of the block
+ * \param luma_src                luma references; top[] and left[] are indexed
+ *                                with the same positions as the chroma references
+ * \param chroma_ref              chroma references, read starting at index 1
+ * \param a                       output: scale
+ * \param b                       output: offset
+ * \param shift                   output: right shift applied after scaling
+ */
+static void get_cclm_parameters(
+  encoder_state_t const* const state,
+  int8_t width, int8_t height, int8_t mode,
+  int x0, int y0, int avai_above_right_units, int avai_left_below_units,
+  kvz_intra_ref* luma_src, kvz_intra_references*chroma_ref,
+  int16_t *a, int16_t*b, int16_t*shift) {
+
+  const int base_unit_size = 1 << (6 - PU_DEPTH_INTRA_MAX);
+
+  // TODO: take into account YUV422
+  const int unit_w = base_unit_size >> 1;
+  const int unit_h = base_unit_size >> 1;
+
+  const int c_height = height;
+  const int c_width = width;
+  height *= 2;
+  width *= 2;
+
+  const int tu_width_in_units = c_width / unit_w;
+  const int tu_height_in_units = c_height / unit_h;
+
+
+  // NOTE(review): these two locals are never read in this function.
+  int top_template_samp_num = width; // for MDLM, the template sample number is 2W or 2H;
+  int left_template_samp_num = height;
+
+  // These are used for calculating some stuff for non-square CUs
+  //int total_above_units = (top_template_samp_num + (unit_w - 1)) / unit_w;
+  //int total_left_units = (left_template_samp_num + (unit_h - 1)) / unit_h;
+  //int total_units = total_left_units + total_above_units + 1;
+  //int above_right_units = total_above_units - tu_width_in_units;
+  //int left_below_units = total_left_units - tu_height_in_units;
+  //int avai_above_right_units = 0;  // TODO these are non zero only with non-square CUs
+  //int avai_left_below_units = 0;
+  // NOTE(review): the above count is limited by the height in units and the
+  // left count by the width in units; the two limits only coincide for
+  // square blocks -- confirm before enabling non-square CUs.
+  int avai_above_units = CLIP(0, tu_height_in_units, y0/base_unit_size);
+  int avai_left_units = CLIP(0, tu_width_in_units, x0 / base_unit_size);
+
+  bool above_available = avai_above_units != 0;
+  bool left_available = avai_left_units != 0;
+    
+  char internal_bit_depth = state->encoder_control->bitdepth;
+
+  // Index 0 holds the averaged luma value and index 1 the averaged chroma
+  // value. The initial values are overwritten unconditionally further down.
+  int min_luma[2] = { MAX_INT, 0 };
+  int max_luma[2] = { -MAX_INT, 0 };
+  
+  kvz_pixel* src;
+  int actualTopTemplateSampNum = 0;
+  int actualLeftTemplateSampNum = 0;
+  // The T mode ignores the left neighbours, the L mode ignores the above
+  // neighbours; both extend their template by the available extra units.
+  if (mode == LM_CHROMA_T_IDX)
+  {
+    left_available = 0;
+    avai_above_right_units = avai_above_right_units > (c_height / unit_w) ? c_height / unit_w : avai_above_right_units;
+    actualTopTemplateSampNum = unit_w * (avai_above_units + avai_above_right_units);
+  }
+  else if (mode == LM_CHROMA_L_IDX)
+  {
+    above_available = 0;
+    avai_left_below_units = avai_left_below_units > (c_width / unit_h) ? c_width / unit_h : avai_left_below_units;
+    actualLeftTemplateSampNum = unit_h * (avai_left_units + avai_left_below_units);
+  }
+  else if (mode == LM_CHROMA_IDX)
+  {
+    actualTopTemplateSampNum = c_width;
+    actualLeftTemplateSampNum = c_height;
+  }
+  int startPos[2]; //0:Above, 1: Left
+  int pickStep[2];
+
+  // When only one side is usable, all four samples are taken from that side;
+  // otherwise two samples are taken from each side.
+  int aboveIs4 = left_available ? 0 : 1;
+  int leftIs4 = above_available ? 0 : 1;
+
+  startPos[0] = actualTopTemplateSampNum >> (2 + aboveIs4);
+  pickStep[0] = MAX(1, actualTopTemplateSampNum >> (1 + aboveIs4));
+
+  startPos[1] = actualLeftTemplateSampNum >> (2 + leftIs4);
+  pickStep[1] = MAX(1, actualLeftTemplateSampNum >> (1 + leftIs4));
+
+  kvz_pixel selectLumaPix[4] = { 0, 0, 0, 0 };
+  kvz_pixel selectChromaPix[4] = { 0, 0, 0, 0 };
+
+  int cntT, cntL;
+  cntT = cntL = 0;
+  int cnt = 0;
+  if (above_available)
+  {
+    cntT = MIN(actualTopTemplateSampNum, (1 + aboveIs4) << 1);
+    src = luma_src->top;
+    const kvz_pixel* cur = chroma_ref->ref.top + 1;
+    for (int pos = startPos[0]; cnt < cntT; pos += pickStep[0], cnt++)
+    {
+      selectLumaPix[cnt] = src[pos];
+      selectChromaPix[cnt] = cur[pos];
+    }
+  }
+
+  if (left_available)
+  {
+    cntL = MIN(actualLeftTemplateSampNum, (1 + leftIs4) << 1);
+    src = luma_src->left;
+    const kvz_pixel* cur = chroma_ref->ref.left + 1;
+    // This loop declares its own cnt, which shadows the outer counter; the
+    // left samples are stored after the cntT samples taken from above.
+    for (int pos = startPos[1], cnt = 0; cnt < cntL; pos += pickStep[1], cnt++)
+    {
+      selectLumaPix[cnt + cntT] = src[pos];
+      selectChromaPix[cnt + cntT] = cur[pos];
+    }
+  }
+  cnt = cntL + cntT;
+
+  // With only two samples, duplicate them so that all four slots are filled.
+  if (cnt == 2)
+  {
+    selectLumaPix[3] = selectLumaPix[0]; selectChromaPix[3] = selectChromaPix[0];
+    selectLumaPix[2] = selectLumaPix[1]; selectChromaPix[2] = selectChromaPix[1];
+    selectLumaPix[0] = selectLumaPix[1]; selectChromaPix[0] = selectChromaPix[1];
+    selectLumaPix[1] = selectLumaPix[3]; selectChromaPix[1] = selectChromaPix[3];
+  }
+
+  // Partition the four sample indices by luma value: after the swaps
+  // tmpMinGrp refers to the two smaller and tmpMaxGrp to the two larger ones.
+  int minGrpIdx[2] = { 0, 2 };
+  int maxGrpIdx[2] = { 1, 3 };
+  int* tmpMinGrp = minGrpIdx;
+  int* tmpMaxGrp = maxGrpIdx;
+  if (selectLumaPix[tmpMinGrp[0]] > selectLumaPix[tmpMinGrp[1]])
+  {
+    SWAP(tmpMinGrp[0], tmpMinGrp[1], int);
+  }
+  if (selectLumaPix[tmpMaxGrp[0]] > selectLumaPix[tmpMaxGrp[1]])
+  {
+    SWAP(tmpMaxGrp[0], tmpMaxGrp[1], int);
+  }
+  if (selectLumaPix[tmpMinGrp[0]] > selectLumaPix[tmpMaxGrp[1]])
+  {
+    SWAP(tmpMinGrp, tmpMaxGrp, int*);
+  }
+  if (selectLumaPix[tmpMinGrp[1]] > selectLumaPix[tmpMaxGrp[0]])
+  {
+    SWAP(tmpMinGrp[1], tmpMaxGrp[0], int);
+  }
+
+  // Rounded averages of each group: [0] is luma, [1] is chroma.
+  min_luma[0] = (selectLumaPix[tmpMinGrp[0]] + selectLumaPix[tmpMinGrp[1]] + 1) >> 1;
+  min_luma[1] = (selectChromaPix[tmpMinGrp[0]] + selectChromaPix[tmpMinGrp[1]] + 1) >> 1;
+  max_luma[0] = (selectLumaPix[tmpMaxGrp[0]] + selectLumaPix[tmpMaxGrp[1]] + 1) >> 1;
+  max_luma[1] = (selectChromaPix[tmpMaxGrp[0]] + selectChromaPix[tmpMaxGrp[1]] + 1) >> 1;
+
+  if (left_available || above_available)
+  {
+    int diff = max_luma[0] - min_luma[0];
+    if (diff > 0)
+    {
+      // Table-driven derivation of the scale and shift from the luma and
+      // chroma differences; presumably the division-free slope computation
+      // of the VVC specification -- confirm against the standard.
+      int diffC = max_luma[1] - min_luma[1];
+      int x = kvz_math_floor_log2(diff);
+      static const uint8_t DivSigTable[1 << 4] = {
+        // 4bit significands - 8 ( MSB is omitted )
+        0,  7,  6,  5,  5,  4,  4,  3,  3,  2,  2,  1,  1,  1,  1,  0
+      };
+      int normDiff = (diff << 4 >> x) & 15;
+      int v = DivSigTable[normDiff] | 8;
+      x += normDiff != 0;
+
+      int y = diffC ? kvz_math_floor_log2(abs(diffC)) + 1 : 0;
+      int add = 1 << y >> 1;
+      *a = (diffC * v + add) >> y;
+      *shift = 3 + x - y;
+      if (*shift < 1)
+      {
+        *shift = 1;
+        *a = ((*a == 0) ? 0 : (*a < 0) ? -15 : 15);   // a=Sign(a)*15
+      }
+      *b = min_luma[1] - ((*a * min_luma[0]) >> *shift);
+    }
+    else
+    {
+      // All selected luma averages are equal: predict the constant chroma
+      // average.
+      *a = 0;
+      *b = min_luma[1];
+      *shift = 0;
+    }
+  }
+  else
+  {
+    // No usable neighbours: predict the mid value of the pixel range.
+    *a = 0;
+
+    *b = 1 << (internal_bit_depth - 1);
+
+    *shift = 0;
+  }
+}
+
+/**
+ * \brief Apply a cross-component linear model to a block of samples.
+ *
+ * Every output sample is ((src * a) >> shift) + b, clipped with
+ * CLIP_TO_PIXEL. Each sample is read before the same position is written,
+ * so src and dst may point to the same buffer.
+ *
+ * \param cclm_params  model parameters a, shift and b
+ * \param src          input samples, height rows of stride samples
+ * \param dst          output samples, same layout as src
+ * \param stride       number of samples in a row; also used as the row width
+ * \param height       number of rows
+ */
+static void linear_transform_cclm(cclm_parameters_t* cclm_params, kvz_pixel * src, kvz_pixel * dst, int stride, int height) {
+  const int a = cclm_params->a;
+  const int right_shift = cclm_params->shift;
+  const int b = cclm_params->b;
+
+  for (int row = 0; row < height; ++row) {
+    const int row_start = row * stride;
+    for (int col = 0; col < stride; ++col) {
+      const int pos = row_start + col;
+      int sample = ((src[pos] * a) >> right_shift) + b;
+      sample = CLIP_TO_PIXEL(sample);
+      dst[pos] = sample;
+    }
+  }
+}
+
+
+/**
+ * \brief Derive the CCLM parameters of a chroma block and optionally predict it.
+ *
+ * Collects the downsampled luma samples above and to the left of the block,
+ * copies the downsampled luma of the block itself from
+ * frame->cclm_luma_rec, derives the model with get_cclm_parameters() and
+ * stores it in cclm_params. When dst is not NULL the model is applied to the
+ * downsampled luma and the result is written to dst.
+ *
+ * \param state        encoder state; cfg.cclm is asserted to be set
+ * \param color        colour channel; not read in this function
+ * \param width        block width; the downsampled luma block copied here is
+ *                     width x height samples
+ * \param height       block height
+ * \param x0           x position of the block in luma pixels
+ * \param y0           y position of the block in luma pixels
+ * \param stride       stride of the luma picture; the downsampled buffers are
+ *                     indexed with stride / 2
+ * \param mode         LM_CHROMA_IDX, LM_CHROMA_L_IDX or LM_CHROMA_T_IDX (asserted)
+ * \param lcu          LCU that contains the block
+ * \param chroma_ref   chroma intra references of the block
+ * \param dst          destination of the prediction (row length width), or NULL
+ * \param cclm_params  output: derived model parameters
+ */
+void kvz_predict_cclm(
+  encoder_state_t const* const state,
+  const color_t color,
+  const int8_t width,
+  const int8_t height,
+  const int16_t x0,
+  const int16_t y0,
+  const int16_t stride,
+  const int8_t mode,
+  lcu_t* const lcu,
+  kvz_intra_references* chroma_ref,
+  kvz_pixel* dst,
+  cclm_parameters_t* cclm_params
+)
+{
+  assert(mode == LM_CHROMA_IDX || mode == LM_CHROMA_L_IDX || mode == LM_CHROMA_T_IDX);
+  assert(state->encoder_control->cfg.cclm);
+
+  
+  // sampled_luma_ref.top / .left are only filled in when y0 / x0 is non-zero.
+  kvz_intra_ref sampled_luma_ref;
+  kvz_pixel sampled_luma[LCU_CHROMA_SIZE];
+
+  int x_scu = SUB_SCU(x0);
+  int y_scu = SUB_SCU(y0);
+
+  int available_above_right = 0;
+  int available_left_below = 0;
+
+
+  kvz_pixel *y_rec = lcu->rec.y + x_scu + y_scu * LCU_WIDTH;
+
+  // Essentially what this does is that it uses 6-tap filtering to downsample
+  // the luma intra references down to match the resolution of the chroma channel.
+  // The luma reference is only needed when we are not on the edge of the picture.
+  // Because the reference pixels that are needed on the edge of the ctu this code
+  // is kinda messy but what can you do
+
+  if (y0) {
+    // Count the units to the above-right of the block whose CU is already set.
+    // NOTE(review): the CU pointer is formed before x_extension is range
+    // checked; it is only dereferenced when x_extension < LCU_WIDTH.
+    for (; available_above_right < width / 2; available_above_right++) {
+      int x_extension = x_scu + width * 2 + 4 * available_above_right;
+      cu_info_t* pu = LCU_GET_CU_AT_PX(lcu, x_extension, y_scu - 4);
+      if (x_extension >= LCU_WIDTH || pu->type == CU_NOTSET) break;
+    }
+    if(y_scu == 0) {
+      // Block is on the top row of its LCU: the reference row is copied from
+      // frame->cclm_luma_rec_top_line, which downsample_cclm_rec() fills.
+      if(!state->encoder_control->cfg.wpp) available_above_right = MIN(width / 2, (state->tile->frame->width - x0 - width * 2) / 4);
+      memcpy(sampled_luma_ref.top, &state->tile->frame->cclm_luma_rec_top_line[x0 / 2 + (y0 / 64 - 1) * (stride / 2)], sizeof(kvz_pixel) * (width + available_above_right * 2));
+    }
+    else {
+      // Downsample the two luma rows above the block. Inside this branch
+      // y_scu is non-zero, so the rows are read from the LCU reconstruction
+      // except for the left tap of the first sample on the left LCU edge.
+      for (int x = 0; x < width * (available_above_right ? 4 : 2); x += 2) {
+        bool left_padding = x0 || x;
+        int s = 4;
+        s += y_scu ? y_rec[x - LCU_WIDTH * 2] * 2            : state->tile->frame->rec->y[x0 + x + (y0 - 2) * stride] * 2;
+        s += y_scu ? y_rec[x - LCU_WIDTH * 2 + 1]            : state->tile->frame->rec->y[x0 + x + 1 + (y0 - 2) * stride];
+        s += y_scu && !(x0 && !x && !x_scu) ? y_rec[x - LCU_WIDTH * 2 - left_padding] : state->tile->frame->rec->y[x0 + x - left_padding + (y0 - 2) * stride];
+        s += y_scu ? y_rec[x - LCU_WIDTH] * 2                : state->tile->frame->rec->y[x0 + x + (y0 - 1) * stride] * 2;
+        s += y_scu ? y_rec[x - LCU_WIDTH + 1]                : state->tile->frame->rec->y[x0 + x + 1 + (y0 - 1) * stride];
+        s += y_scu && !(x0 && !x && !x_scu) ? y_rec[x - LCU_WIDTH - left_padding]     : state->tile->frame->rec->y[x0 + x - left_padding + (y0 - 1) * stride];
+        sampled_luma_ref.top[x / 2] = s >> 3;
+      }
+    }
+  }
+
+  if(x0) {
+    // Count the units to the below-left of the block whose CU is already set.
+    for (; available_left_below < height / 2; available_left_below++) {
+      int y_extension = y_scu + height * 2 + 4 * available_left_below;
+      cu_info_t* pu = LCU_GET_CU_AT_PX(lcu, x_scu - 4, y_extension);
+      if (y_extension >= LCU_WIDTH || pu->type == CU_NOTSET) break;
+      // NOTE(review): special case for a block at (32, 0) inside the LCU next
+      // to a depth 0 CU; the reason is not visible here -- confirm.
+      if(x_scu == 32 && y_scu == 0 && pu->depth == 0) break;
+    }
+    // The left reference is the column of already downsampled luma directly
+    // to the left of the block.
+    for(int i = 0; i < height + available_left_below * 2; i++) {
+      sampled_luma_ref.left[i] = state->tile->frame->cclm_luma_rec[(y0/2 + i) * (stride/2) + x0 / 2 - 1];
+    }    
+  }
+
+  // Copy the downsampled luma of the block itself into a contiguous buffer.
+  kvz_pixels_blit(&state->tile->frame->cclm_luma_rec[x0 / 2 + (y0 * stride) / 4], sampled_luma, width, height, stride / 2, width);
+
+  int16_t a, b, shift;
+  get_cclm_parameters(state, width, height, mode,x0, y0, available_above_right, available_left_below, &sampled_luma_ref, chroma_ref, &a, &b, &shift);
+  cclm_params->shift = shift;
+  cclm_params->a = a;
+  cclm_params->b = b;
+
+  // Callers that only need the parameters pass dst == NULL.
+  if(dst)
+    linear_transform_cclm(cclm_params, sampled_luma, dst, width, height);
+}
+
 void kvz_intra_predict(
  encoder_state_t *const state,
  kvz_intra_references *refs,
@ -573,6 +867,7 @@ static void intra_recon_tb_leaf(
  int y,
  int depth,
  int8_t intra_mode,
+  cclm_parameters_t *cclm_params,
  lcu_t *lcu,
  color_t color)
 {
@ -592,14 +887,29 @@ static void intra_recon_tb_leaf(
    state->tile->frame->width,
    state->tile->frame->height,
  };
-  const vector2d_t lcu_px = { SUB_SCU(x) >> shift, SUB_SCU(y) >> shift};
+  int x_scu = SUB_SCU(x);
+  int y_scu = SUB_SCU(y);
+  const vector2d_t lcu_px = {x_scu >> shift, y_scu >> shift };

  kvz_intra_references refs;
  kvz_intra_build_reference(log2width, color, &luma_px, &pic_px, lcu, &refs, cfg->wpp);

  kvz_pixel pred[32 * 32];
+  int stride = state->tile->frame->source->stride;
  const bool filter_boundary = color == COLOR_Y && !(cfg->lossless && cfg->implicit_rdpcm);
-  kvz_intra_predict(state, &refs, log2width, intra_mode, color, pred, filter_boundary);
+  if(intra_mode < 68) {
+    kvz_intra_predict(state, &refs, log2width, intra_mode, color, pred, filter_boundary);
+  } else {
+    kvz_pixels_blit(&state->tile->frame->cclm_luma_rec[x / 2 + (y * stride) / 4], pred, width, width, stride / 2, width);
+    if(cclm_params == NULL) {
+      cclm_parameters_t temp_params;
+      kvz_predict_cclm(
+        state, color, width, width, x, y, stride, intra_mode, lcu, &refs, pred, &temp_params);
+    }
+    else {
+      linear_transform_cclm(&cclm_params[color == COLOR_U ? 0 : 1], pred, pred, width, width);
+    }
+  }

  const int index = lcu_px.x + lcu_px.y * lcu_width;
  kvz_pixel *block = NULL;
@ -634,6 +944,7 @@ static void intra_recon_tb_leaf(
 * \param mode_luma     intra mode for luma, or -1 to skip luma recon
 * \param mode_chroma   intra mode for chroma, or -1 to skip chroma recon
 * \param cur_cu        pointer to the CU, or NULL to fetch CU from LCU
+ * \param cclm_params   pointer for the cclm_parameters, can be NULL if the mode is not cclm mode
 * \param lcu           containing LCU
 */
 void kvz_intra_recon_cu(
@ -644,6 +955,7 @@ void kvz_intra_recon_cu(
  int8_t mode_luma,
  int8_t mode_chroma,
  cu_info_t *cur_cu,
+  cclm_parameters_t *cclm_params,
  lcu_t *lcu)
 {
  const vector2d_t lcu_px = { SUB_SCU(x), SUB_SCU(y) };
@ -668,10 +980,10 @@ void kvz_intra_recon_cu(
    const int32_t x2 = x + offset;
    const int32_t y2 = y + offset;

-    kvz_intra_recon_cu(state, x,  y,  depth + 1, mode_luma, mode_chroma, NULL, lcu);
-    kvz_intra_recon_cu(state, x2, y,  depth + 1, mode_luma, mode_chroma, NULL, lcu);
-    kvz_intra_recon_cu(state, x,  y2, depth + 1, mode_luma, mode_chroma, NULL, lcu);
-    kvz_intra_recon_cu(state, x2, y2, depth + 1, mode_luma, mode_chroma, NULL, lcu);
+    kvz_intra_recon_cu(state, x,  y,  depth + 1, mode_luma, mode_chroma, NULL, NULL, lcu);
+    kvz_intra_recon_cu(state, x2, y,  depth + 1, mode_luma, mode_chroma, NULL, NULL, lcu);
+    kvz_intra_recon_cu(state, x,  y2, depth + 1, mode_luma, mode_chroma, NULL, NULL, lcu);
+    kvz_intra_recon_cu(state, x2, y2, depth + 1, mode_luma, mode_chroma, NULL, NULL, lcu);

    // Propagate coded block flags from child CUs to parent CU.
    uint16_t child_cbfs[3] = {
@ -692,11 +1004,11 @@ void kvz_intra_recon_cu(
    const bool has_chroma = mode_chroma != -1 &&  (x % 8 == 0 && y % 8 == 0);
    // Process a leaf TU.
    if (has_luma) {
-      intra_recon_tb_leaf(state, x, y, depth, mode_luma, lcu, COLOR_Y);
+      intra_recon_tb_leaf(state, x, y, depth, mode_luma, cclm_params, lcu, COLOR_Y);
    }
    if (has_chroma) {
-      intra_recon_tb_leaf(state, x, y, depth, mode_chroma, lcu, COLOR_U);
-      intra_recon_tb_leaf(state, x, y, depth, mode_chroma, lcu, COLOR_V);
+      intra_recon_tb_leaf(state, x, y, depth, mode_chroma, cclm_params, lcu, COLOR_U);
+      intra_recon_tb_leaf(state, x, y, depth, mode_chroma, cclm_params, lcu, COLOR_V);
    }

    kvz_quantize_lcu_residual(state, has_luma, has_chroma, x, y, depth, cur_cu, lcu, false);
--- a/src/intra.h
+++ b/src/intra.h
@ -54,6 +54,12 @@ typedef struct
  bool filtered_initialized;
 } kvz_intra_references;

+/**
+ * \brief Parameters of a cross-component linear model.
+ *
+ * linear_transform_cclm() in intra.c computes a predicted sample as
+ * ((luma * a) >> shift) + b and clips the result to the pixel range.
+ */
+typedef struct
+{
+  int16_t a;      // scale applied to the downsampled luma sample
+  int16_t shift;  // right shift applied after scaling
+  int16_t b;      // offset added after the shift
+} cclm_parameters_t;

 /**
 * \brief Function for deriving intra luma predictions
@ -118,5 +124,21 @@ void kvz_intra_recon_cu(
  int8_t mode_luma,
  int8_t mode_chroma,
  cu_info_t *cur_cu,
+  cclm_parameters_t* cclm_params,
  lcu_t *lcu);

+
+/**
+ * \brief Derive the CCLM parameters of a chroma block and optionally predict it.
+ *
+ * \param state        encoder state; the implementation asserts cfg.cclm
+ * \param color        colour channel of the block
+ * \param width        block width
+ * \param height       block height
+ * \param x0           x position of the block in luma pixels
+ * \param y0           y position of the block in luma pixels
+ * \param stride       stride of the luma picture
+ * \param mode         CCLM chroma mode; the implementation asserts 81, 82 or 83
+ * \param lcu          LCU that contains the block
+ * \param chroma_ref   chroma intra references of the block
+ * \param dst          destination of the prediction, or NULL to only derive
+ *                     the parameters
+ * \param cclm_params  output: derived model parameters
+ */
+void kvz_predict_cclm(
+  encoder_state_t const* const state,
+  const color_t color,
+  const int8_t width,
+  const int8_t height,
+  const int16_t x0,
+  const int16_t y0,
+  const int16_t stride,
+  const int8_t mode,
+  lcu_t* const lcu,
+  kvz_intra_references* chroma_ref,
+  kvz_pixel* dst,
+  cclm_parameters_t* cclm_params
+);
--- a/src/kvazaar.h
+++ b/src/kvazaar.h
@ -516,6 +516,8 @@ typedef struct kvz_config

  int8_t jccr;

+  int8_t cclm;
+
  int8_t amvr; /* \brief Adaptive motion vector resolution parameter */
 } kvz_config;

--- a/src/search.c
+++ b/src/search.c
@ -241,6 +241,44 @@ static double cu_zero_coeff_cost(const encoder_state_t *state, lcu_t *work_tree,
 }


+/**
+ * \brief Downsample reconstructed luma into the frame level CCLM buffers.
+ *
+ * Does nothing when cfg.cclm is not set. Otherwise every output sample of
+ * frame->cclm_luma_rec is a rounded weighted sum over two luma rows with
+ * weights 1, 2, 1 per row. When the block ends on a multiple of 64 luma rows,
+ * the last luma row is additionally filtered with weights 1, 2, 1 into
+ * frame->cclm_luma_rec_top_line.
+ *
+ * \param state        encoder state
+ * \param x            x position of the block in luma pixels
+ * \param y            y position of the block in luma pixels
+ * \param width        number of output samples per row (callers pass cu_width / 2)
+ * \param height       number of output rows (callers pass cu_width / 2)
+ * \param y_rec        luma reconstruction of the LCU, row length LCU_WIDTH
+ * \param extra_pixel  left neighbour of the first sample of the top-line row,
+ *                     used when the block starts at the left edge of the LCU
+ *                     but not at x == 0
+ */
+static void downsample_cclm_rec(encoder_state_t *state, int x, int y, int width, int height, kvz_pixel *y_rec, kvz_pixel extra_pixel) {
+  if (!state->encoder_control->cfg.cclm) return;
+  int x_scu = SUB_SCU(x);
+  int y_scu = SUB_SCU(y);
+  y_rec += x_scu + y_scu * LCU_WIDTH;
+  int stride = state->tile->frame->source->stride;
+
+  // Rows whose first luma row is at or beyond cfg.height are not produced.
+  for (int y_ = 0; y_ < height && y_ * 2 + y < state->encoder_control->cfg.height; y_++) {
+    for (int x_ = 0; x_ < width; x_++) {
+      int s = 4;
+      s += y_rec[2 * x_] * 2;
+      s += y_rec[2 * x_ + 1];
+      // If we are at the edge of the CTU read the pixel from the frame reconstruct buffer,
+      // *except* when we are also at the edge of the frame, in which case we want to duplicate
+      // the edge pixel
+      s += !x_scu && !x_ && x ? state->tile->frame->rec->y[x - 1 + (y + y_ * 2) * stride] : y_rec[2 * x_ - ((x_ + x) > 0)];
+      s += y_rec[2 * x_ + LCU_WIDTH] * 2;
+      s += y_rec[2 * x_ + 1 + LCU_WIDTH];
+      s += !x_scu && !x_ && x ? state->tile->frame->rec->y[x - 1 + (y + y_ * 2 + 1) * stride] : y_rec[2 * x_ - ((x_ + x) > 0) + LCU_WIDTH];
+      int index = x / 2 + x_ + (y / 2 + y_ )* stride / 2;
+      state->tile->frame->cclm_luma_rec[index] = s >> 3;
+    }
+    // Each output row consumes two luma rows.
+    y_rec += LCU_WIDTH * 2;
+  }
+  if((y + height * 2) % 64 == 0) {
+    int line = y / 64 * stride / 2;
+    // Step back to the last luma row that the loop above covered.
+    // NOTE(review): this assumes the loop ran all height iterations -- confirm
+    // for blocks that cross the bottom edge of the picture.
+    y_rec -= LCU_WIDTH;
+    for (int i = 0; i < width; ++i) {
+      int s = 2;
+      s += y_rec[i * 2] * 2;
+      s += y_rec[i * 2 + 1];
+      s += !x_scu && !i && x ? extra_pixel : y_rec[i * 2 - ((i + x) > 0)] ;
+      state->tile->frame->cclm_luma_rec_top_line[i + x / 2 + line] = s >> 2;
+    }
+  }
+}
+
+
 /**
 * Calculate RD cost for a Coding Unit.
 * \return Cost of block
@ -709,7 +747,11 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,
                         x, y,
                         depth,
                         cur_cu->intra.mode, -1, // skip chroma
-                         NULL, lcu);
+                         NULL, NULL, lcu);
+
+      downsample_cclm_rec(
+        state, x, y, cu_width / 2, cu_width / 2, lcu->rec.y, lcu->left_ref.y[64]
+      );

      // TODO: This heavily relies to square CUs
      if ((depth != 4 || (x % 8 && y % 8)) && state->encoder_control->chroma_format != KVZ_CSP_400) {
@ -717,8 +759,9 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,
        // rd2. Possibly because the luma mode search already takes chroma
        // into account, so there is less of a chanse of luma mode being
        // really bad for chroma.
-        if (ctrl->cfg.rdo == 3) {
-          cur_cu->intra.mode_chroma = kvz_search_cu_intra_chroma(state, x, y, depth, lcu);
+        cclm_parameters_t cclm_params[2];
+        if (ctrl->cfg.rdo >= 3) {
+          cur_cu->intra.mode_chroma = kvz_search_cu_intra_chroma(state, x, y, depth, lcu, cclm_params);
          lcu_fill_cu_info(lcu, x_local, y_local, cu_width, cu_width, cur_cu);
        }

@ -726,7 +769,7 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,
                           x & ~7, y & ~7, // TODO: as does this
                           depth,
                           -1, cur_cu->intra.mode_chroma, // skip luma
-                           NULL, lcu);
+                           NULL, cclm_params, lcu);
      }
    } else if (cur_cu->type == CU_INTER) {

@ -862,7 +905,7 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,
    // gets used, at least in the most obvious cases, while avoiding any
    // searching.
    if (cur_cu->type == CU_NOTSET && depth < MAX_PU_DEPTH
-        && x + cu_width <= frame->width && y + cu_width <= frame->height)
+        && x + cu_width <= frame->width && y + cu_width <= frame->height && 0)
    {
      cu_info_t *cu_d1 = LCU_GET_CU_AT_PX(&work_tree[depth + 1], x_local, y_local);

@ -883,7 +926,7 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,
                           x, y,
                           depth,
                           cur_cu->intra.mode, mode_chroma,
-                           NULL, lcu);
+                           NULL,NULL, lcu);

        cost += kvz_cu_rd_cost_luma(state, x_local, y_local, depth, cur_cu, lcu);
        if (has_chroma) {
@ -912,6 +955,9 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,
      // Copy this CU's mode all the way down for use in adjacent CUs mode
      // search.
      work_tree_copy_down(x_local, y_local, depth, work_tree);
+      downsample_cclm_rec(
+        state, x, y, cu_width / 2, cu_width / 2, lcu->rec.y, lcu->left_ref.y[64]
+      );

      if (state->frame->slicetype != KVZ_SLICE_I) {
        // Reset HMVP to the beginning of this CU level search and add this CU as the mvp
@ -924,6 +970,9 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,
    // Need to copy modes down since the lower level of the work tree is used
    // when searching SMP and AMP blocks.
    work_tree_copy_down(x_local, y_local, depth, work_tree);
+    downsample_cclm_rec(
+      state, x, y, cu_width / 2, cu_width / 2, lcu->rec.y, lcu->left_ref.y[64]
+    );

    if (state->frame->slicetype != KVZ_SLICE_I) {
      // Reset HMVP to the beginning of this CU level search and add this CU as the mvp
--- a/src/search_inter.c
+++ b/src/search_inter.c
@ -1937,7 +1937,7 @@ static void search_pu_inter(encoder_state_t * const state,
    }

    // TODO: this probably should have a separate command line option
-    if (cfg->rdo == 3) {
+    if (cfg->rdo >= 3) {
      search_pu_inter_bipred(&info, depth, lcu, cur_cu, inter_cost, inter_bitcost);
    }
  }
--- a/src/search_intra.c
+++ b/src/search_intra.c
@ -258,6 +258,7 @@ static double search_intra_trdepth(encoder_state_t * const state,
                                   int intra_mode, int cost_treshold,
                                   cu_info_t *const pred_cu,
                                   lcu_t *const lcu,
+                                   cclm_parameters_t *cclm_params,
                                   const int mts_mode)
 {
  assert(depth >= 0 && depth <= MAX_PU_DEPTH);
@ -332,7 +333,7 @@ static double search_intra_trdepth(encoder_state_t * const state,
        x_px, y_px,
        depth,
        intra_mode, -1,
-        pred_cu, lcu);
+        pred_cu, cclm_params, lcu);

      // TODO: Not sure if this should be 0 or 1 but at least seems to work with 1
      if (pred_cu->tr_idx > 1)
@ -360,7 +361,7 @@ static double search_intra_trdepth(encoder_state_t * const state,
        x_px, y_px,
        depth,
        -1, chroma_mode,
-        pred_cu, lcu);
+        pred_cu, cclm_params, lcu);
      best_rd_cost += kvz_cu_rd_cost_chroma(state, lcu_px.x, lcu_px.y, depth, pred_cu, lcu);
    }
    pred_cu->tr_skip = best_tr_idx == MTS_SKIP;
@ -391,15 +392,15 @@ static double search_intra_trdepth(encoder_state_t * const state,
  if (depth < max_depth && depth < MAX_PU_DEPTH) {
    split_cost = 3 * state->lambda;

-    split_cost += search_intra_trdepth(state, x_px, y_px, depth + 1, max_depth, intra_mode, nosplit_cost, pred_cu, lcu, -1);
+    split_cost += search_intra_trdepth(state, x_px, y_px, depth + 1, max_depth, intra_mode, nosplit_cost, pred_cu, lcu, cclm_params, -1);
    if (split_cost < nosplit_cost) {
-      split_cost += search_intra_trdepth(state, x_px + offset, y_px, depth + 1, max_depth, intra_mode, nosplit_cost, pred_cu, lcu, -1);
+      split_cost += search_intra_trdepth(state, x_px + offset, y_px, depth + 1, max_depth, intra_mode, nosplit_cost, pred_cu, lcu, cclm_params, -1);
    }
    if (split_cost < nosplit_cost) {
-      split_cost += search_intra_trdepth(state, x_px, y_px + offset, depth + 1, max_depth, intra_mode, nosplit_cost, pred_cu, lcu, -1);
+      split_cost += search_intra_trdepth(state, x_px, y_px + offset, depth + 1, max_depth, intra_mode, nosplit_cost, pred_cu, lcu, cclm_params, -1);
    }
    if (split_cost < nosplit_cost) {
-      split_cost += search_intra_trdepth(state, x_px + offset, y_px + offset, depth + 1, max_depth, intra_mode, nosplit_cost, pred_cu, lcu, -1);
+      split_cost += search_intra_trdepth(state, x_px + offset, y_px + offset, depth + 1, max_depth, intra_mode, nosplit_cost, pred_cu, lcu, cclm_params, -1);
    }

    double cbf_bits = 0.0;
@ -454,20 +455,22 @@ static void search_intra_chroma_rough(encoder_state_t * const state,
                                      const kvz_pixel *orig_u, const kvz_pixel *orig_v, int16_t origstride,
                                      kvz_intra_references *refs_u, kvz_intra_references *refs_v,
                                      int8_t luma_mode,
-                                      int8_t modes[5], double costs[5])
+                                      int8_t modes[8], double costs[8], lcu_t* lcu)
 {
  assert(!(x_px & 4 || y_px & 4));

  const unsigned width = MAX(LCU_WIDTH_C >> depth, TR_MIN_WIDTH);
  const int_fast8_t log2_width_c = MAX(LOG2_LCU_WIDTH - (depth + 1), 2);

-  for (int i = 0; i < 5; ++i) {
+  for (int i = 0; i < 8; ++i) {
    costs[i] = 0;
  }

  cost_pixel_nxn_func *const satd_func = kvz_pixels_get_satd_func(width);
  //cost_pixel_nxn_func *const sad_func = kvz_pixels_get_sad_func(width);

+  cclm_parameters_t cclm_params;
+  
  kvz_pixel _pred[32 * 32 + SIMD_ALIGNMENT];
  kvz_pixel *pred = ALIGNED_POINTER(_pred, SIMD_ALIGNMENT);

@ -476,19 +479,31 @@ static void search_intra_chroma_rough(encoder_state_t * const state,

  kvz_pixels_blit(orig_u, orig_block, width, width, origstride, width);
  for (int i = 0; i < 5; ++i) {
-    if (modes[i] == luma_mode) continue;
+    if (modes[i] == -1) continue;
    kvz_intra_predict(state, refs_u, log2_width_c, modes[i], COLOR_U, pred, false);
    //costs[i] += get_cost(encoder_state, pred, orig_block, satd_func, sad_func, width);
    costs[i] += satd_func(pred, orig_block);
  }
+  for (int i = 5; i < 8; i++) {
+    if (!state->encoder_control->cfg.cclm) break; // modes[5..7] are the CCLM candidates; the caller may run this search with CCLM disabled
+    kvz_predict_cclm(
+      state,
+      COLOR_U, width, width, x_px, y_px, state->tile->frame->source->stride, modes[i], lcu, refs_u,  pred, &cclm_params);
+  }

  kvz_pixels_blit(orig_v, orig_block, width, width, origstride, width);
  for (int i = 0; i < 5; ++i) {
-    if (modes[i] == luma_mode) continue;
+    if (modes[i] == -1) continue;
    kvz_intra_predict(state, refs_v, log2_width_c, modes[i], COLOR_V, pred, false);
    //costs[i] += get_cost(encoder_state, pred, orig_block, satd_func, sad_func, width);
    costs[i] += satd_func(pred, orig_block);
  }
+  for (int i = 5; i < 8; i++) {
+    if (!state->encoder_control->cfg.cclm) break; // same guard as for the U plane
+    kvz_predict_cclm(
+      state,
+      COLOR_V, width, width, x_px, y_px, state->tile->frame->source->stride, modes[i], lcu, refs_v, pred, &cclm_params); // V plane uses the V references, not refs_u
+  }

  kvz_sort_modes(modes, costs, 5);
 }
@ -744,7 +759,7 @@ static int8_t search_intra_rdo(encoder_state_t * const state,
    // Reset transform split data in lcu.cu for this area.
    kvz_lcu_fill_trdepth(lcu, x_px, y_px, depth, depth);

-    double mode_cost = search_intra_trdepth(state, x_px, y_px, depth, tr_depth, modes[rdo_mode], MAX_INT, &pred_cu, lcu, -1);
+    double mode_cost = search_intra_trdepth(state, x_px, y_px, depth, tr_depth, modes[rdo_mode], MAX_INT, &pred_cu, lcu, NULL, -1);
    costs[rdo_mode] += mode_cost;
    trafo[rdo_mode] = pred_cu.tr_idx;

@ -769,7 +784,7 @@ static int8_t search_intra_rdo(encoder_state_t * const state,
    pred_cu.intra.mode = modes[0];
    pred_cu.intra.mode_chroma = modes[0];
    FILL(pred_cu.cbf, 0);
-    search_intra_trdepth(state, x_px, y_px, depth, tr_depth, modes[0], MAX_INT, &pred_cu, lcu, trafo[0]);
+    search_intra_trdepth(state, x_px, y_px, depth, tr_depth, modes[0], MAX_INT, &pred_cu, lcu, NULL, trafo[0]);
  }

  return modes_to_check;
@ -810,7 +825,19 @@ double kvz_chroma_mode_bits(const encoder_state_t *state, int8_t chroma_mode, in
  if (chroma_mode == luma_mode) {
    mode_bits = CTX_ENTROPY_FBITS(ctx, 0);
  } else {
-    mode_bits = 2.0 + CTX_ENTROPY_FBITS(ctx, 1);
+    if(chroma_mode < 67) { // regular intra chroma mode; the CCLM modes (81..83) are costed in the else branch
+      mode_bits = 2.0 + CTX_ENTROPY_FBITS(ctx, 1);
+    }
+    else {
+      ctx = &(state->cabac.ctx.cclm_model);
+      mode_bits = CTX_ENTROPY_FBITS(ctx, chroma_mode != 81);
+      if (chroma_mode != 81) mode_bits += 1;
+    }
+  }
+  // Technically this is encoded first but for this method of counting bits it does not matter
+  if(state->encoder_control->cfg.cclm) {
+    ctx = &(state->cabac.ctx.cclm_flag);
+    mode_bits += CTX_ENTROPY_FBITS(ctx, chroma_mode > 67);
  }

  return mode_bits;
@ -820,31 +847,87 @@ double kvz_chroma_mode_bits(const encoder_state_t *state, int8_t chroma_mode, in
 int8_t kvz_search_intra_chroma_rdo(encoder_state_t * const state,
                                  int x_px, int y_px, int depth,
                                  int8_t intra_mode,
-                                  int8_t modes[5], int8_t num_modes,
-                                  lcu_t *const lcu)
+                                  int8_t modes[8], int8_t num_modes,
+                                  lcu_t *const lcu, cclm_parameters_t *best_cclm)
 {
  const bool reconstruct_chroma = (depth != 4) || (x_px & 4 && y_px & 4);

+
+  kvz_intra_references refs[2];
+  const vector2d_t luma_px = { x_px & ~7, y_px & ~7 };
+  const vector2d_t pic_px = {
+    state->tile->frame->width,
+    state->tile->frame->height,
+  };
+
+
  if (reconstruct_chroma) {
+
+    int c_width = MAX(32 >> (depth), 4);
+
+    kvz_intra_build_reference(MAX(LOG2_LCU_WIDTH - depth - 1, 2), COLOR_U, &luma_px, &pic_px, lcu, &refs[0], state->encoder_control->cfg.wpp);
+    kvz_intra_build_reference(MAX(LOG2_LCU_WIDTH - depth - 1, 2), COLOR_V, &luma_px, &pic_px, lcu, &refs[1], state->encoder_control->cfg.wpp);
+
+    cclm_parameters_t cclm_params[2] = { 0 };
+
    const vector2d_t lcu_px = { SUB_SCU(x_px), SUB_SCU(y_px) };
    cu_info_t *const tr_cu = LCU_GET_CU_AT_PX(lcu, lcu_px.x, lcu_px.y);

    struct {
      double cost;
      int8_t mode;
+      cclm_parameters_t cclm[2];
    } chroma, best_chroma;

+    chroma.cclm[0] = chroma.cclm[1] = best_chroma.cclm[0] = best_chroma.cclm[1] = cclm_params[0]; // start from the zeroed cclm_params: non-CCLM modes never set .cclm, yet it is copied out through best_cclm after the loop
+
    best_chroma.mode = 0;
    best_chroma.cost = MAX_INT;

    for (int8_t chroma_mode_i = 0; chroma_mode_i < num_modes; ++chroma_mode_i) {
      chroma.mode = modes[chroma_mode_i];
+      if (chroma.mode == -1) continue;
+      if(chroma.mode < 67 || depth == 0) {
+        kvz_intra_recon_cu(state,
+          x_px, y_px,
+          depth,
+          -1, chroma.mode, // skip luma
+          NULL, NULL, lcu);
+      }
+      else {

-      kvz_intra_recon_cu(state,
-                         x_px, y_px,
-                         depth,
-                         -1, chroma.mode, // skip luma
-                         NULL, lcu);
+        kvz_predict_cclm(
+          state, COLOR_U,
+          c_width, c_width,
+          x_px & ~7, y_px & ~7,
+          state->tile->frame->source->stride,
+          chroma.mode, 
+          lcu,
+          &refs[0], NULL,
+          &cclm_params[0]);
+
+        chroma.cclm[0] = cclm_params[0];
+
+        kvz_predict_cclm(
+          state, COLOR_V,
+          c_width, c_width,
+          x_px & ~7, y_px & ~7,
+          state->tile->frame->source->stride, 
+          chroma.mode, 
+          lcu, 
+          &refs[1], NULL,
+          &cclm_params[1]);
+
+        chroma.cclm[1] = cclm_params[1];
+
+        kvz_intra_recon_cu(
+          state,
+          x_px, y_px,
+          depth,
+          -1, chroma.mode, // skip luma
+          NULL, cclm_params, lcu
+        );
+      }
      chroma.cost = kvz_cu_rd_cost_chroma(state, lcu_px.x, lcu_px.y, depth, tr_cu, lcu);

      double mode_bits = kvz_chroma_mode_bits(state, chroma.mode, intra_mode);
@ -854,6 +937,8 @@ int8_t kvz_search_intra_chroma_rdo(encoder_state_t * const state,
        best_chroma = chroma;
      }
    }
+    best_cclm[0] = best_chroma.cclm[0];
+    best_cclm[1] = best_chroma.cclm[1];

    return best_chroma.mode;
  }
@ -864,15 +949,15 @@ int8_t kvz_search_intra_chroma_rdo(encoder_state_t * const state,

 int8_t kvz_search_cu_intra_chroma(encoder_state_t * const state,
                              const int x_px, const int y_px,
-                              const int depth, lcu_t *lcu)
+                              const int depth, lcu_t *lcu, cclm_parameters_t *best_cclm)
 {
  const vector2d_t lcu_px = { SUB_SCU(x_px), SUB_SCU(y_px) };

  cu_info_t *cur_pu = LCU_GET_CU_AT_PX(lcu, lcu_px.x, lcu_px.y);
  int8_t intra_mode = cur_pu->intra.mode;

-  double costs[5];
-  int8_t modes[5] = { 0, 50, 18, 1, 67 };
+  double costs[8];
+  int8_t modes[8] = { 0, 50, 18, 1, -1, 81, 82, 83 };
  if (intra_mode != 0 && intra_mode != 50 && intra_mode != 18 && intra_mode != 1) {
    modes[4] = intra_mode;
  }
@ -884,14 +969,14 @@ int8_t kvz_search_cu_intra_chroma(encoder_state_t * const state,
  const int8_t modes_in_depth[5] = { 1, 1, 1, 1, 2 };
  int num_modes = modes_in_depth[depth];

-  if (state->encoder_control->cfg.rdo == 3) {
-    num_modes = modes[4] == intra_mode ? 5 : 4;
+  if (state->encoder_control->cfg.rdo >= 3) {
+    num_modes = state->encoder_control->cfg.cclm ? 8 : 5;
  }

  // Don't do rough mode search if all modes are selected.
  // FIXME: It might make more sense to only disable rough search if
  // num_modes is 0.is 0.
-  if (num_modes != 1 && num_modes != 5 && num_modes != 4) {
+  if (num_modes != 1 && num_modes != 5 && num_modes != 4 && num_modes != 8) {
    const int_fast8_t log2_width_c = MAX(LOG2_LCU_WIDTH - depth - 1, 2);
    const vector2d_t pic_px = { state->tile->frame->width, state->tile->frame->height };
    const vector2d_t luma_px = { x_px, y_px };
@ -909,12 +994,12 @@ int8_t kvz_search_cu_intra_chroma(encoder_state_t * const state,
    search_intra_chroma_rough(state, x_px, y_px, depth,
                              ref_u, ref_v, LCU_WIDTH_C,
                              &refs_u, &refs_v,
-                              intra_mode, modes, costs);
+                              intra_mode, modes, costs, lcu);
  }

  int8_t intra_mode_chroma = intra_mode;
  if (num_modes > 1) {
-    intra_mode_chroma = kvz_search_intra_chroma_rdo(state, x_px, y_px, depth, intra_mode, modes, num_modes, lcu);
+    intra_mode_chroma = kvz_search_intra_chroma_rdo(state, x_px, y_px, depth, intra_mode, modes, num_modes, lcu, best_cclm);
  }

  return intra_mode_chroma;
@ -969,7 +1054,7 @@ void kvz_search_cu_intra(encoder_state_t * const state,
  kvz_pixel *ref_pixels = &lcu->ref.y[lcu_px.x + lcu_px.y * LCU_WIDTH];

  int8_t number_of_modes = 0;
-  bool skip_rough_search = (depth == 0 || state->encoder_control->cfg.rdo >= 3);
+  bool skip_rough_search = (depth == 0 || state->encoder_control->cfg.rdo >= 4);
  if (!skip_rough_search) {
    number_of_modes = search_intra_rough(state,
                                         ref_pixels, LCU_WIDTH,
@ -990,9 +1075,9 @@ void kvz_search_cu_intra(encoder_state_t * const state,
  const int32_t rdo_level = state->encoder_control->cfg.rdo;
  if (rdo_level >= 2 || skip_rough_search) {
    int number_of_modes_to_search;
-    if (rdo_level == 3) {
+    if (rdo_level == 4) {
      number_of_modes_to_search = 67;
-    } else if (rdo_level == 2) {
+    } else if (rdo_level == 2 || rdo_level == 3) {
      number_of_modes_to_search = (cu_width == 4) ? 3 : 2;
    } else {
      // Check only the predicted modes.
--- a/src/search_intra.h
+++ b/src/search_intra.h
@ -41,17 +41,18 @@
 #include "cu.h"
 #include "encoderstate.h"
 #include "global.h" // IWYU pragma: keep
+#include "intra.h"


 double kvz_luma_mode_bits(const encoder_state_t *state, 
-                      int8_t luma_mode, const int8_t *intra_preds);
+                          int8_t luma_mode, const int8_t *intra_preds);
                       
 double kvz_chroma_mode_bits(const encoder_state_t *state,
                        int8_t chroma_mode, int8_t luma_mode);

 int8_t kvz_search_cu_intra_chroma(encoder_state_t * const state,
                              const int x_px, const int y_px,
-                              const int depth, lcu_t *lcu);
+                              const int depth, lcu_t *lcu, cclm_parameters_t* best_cclm);

 void kvz_search_cu_intra(encoder_state_t * const state,
                         const int x_px, const int y_px,
--- a/src/videoframe.c
+++ b/src/videoframe.c
@ -46,7 +46,7 @@
 videoframe_t * kvz_videoframe_alloc(int32_t width,
                                    int32_t height,
                                    enum kvz_chroma_format chroma_format,
-                                    enum kvz_alf alf_type)
+                                    enum kvz_alf alf_type, bool cclm)
 {
  videoframe_t *frame = calloc(1, sizeof(videoframe_t));
  if (!frame) return 0;
@ -59,8 +59,13 @@ videoframe_t * kvz_videoframe_alloc(int32_t width,
  frame->sao_luma = MALLOC(sao_info_t, frame->width_in_lcu * frame->height_in_lcu);
  if (chroma_format != KVZ_CSP_400) {
    frame->sao_chroma = MALLOC(sao_info_t, frame->width_in_lcu * frame->height_in_lcu);
+    if (cclm) {
+      assert(chroma_format == KVZ_CSP_420);
+      frame->cclm_luma_rec = MALLOC(kvz_pixel, (((width + 7) & ~7) + FRAME_PADDING_LUMA) * (((height + 7) & ~7) + FRAME_PADDING_LUMA) / 4);
+      frame->cclm_luma_rec_top_line = MALLOC(kvz_pixel, (((width + 7) & ~7) + FRAME_PADDING_LUMA) / 2 * CEILDIV(height, 64));
+    }
  }
-
+  
  return frame;
 }

@ -76,6 +81,12 @@ int kvz_videoframe_free(videoframe_t * const frame)
    kvz_image_free(frame->rec_lmcs);
    frame->source_lmcs_mapped = false;
  }
+  if(frame->cclm_luma_rec) {
+    FREE_POINTER(frame->cclm_luma_rec);
+  }
+  if(frame->cclm_luma_rec_top_line) {
+    FREE_POINTER(frame->cclm_luma_rec_top_line);
+  }

  kvz_image_free(frame->source);
  frame->source = NULL;
--- a/src/videoframe.h
+++ b/src/videoframe.h
@ -53,6 +53,9 @@ typedef struct videoframe
  kvz_picture *rec;            //!< \brief Reconstructed image.
  kvz_picture *rec_lmcs;       //!< \brief LMCS mapped reconstructed image, if available, otherwise points to source.

+  kvz_pixel *cclm_luma_rec;    //!< \brief buffer for the downsampled luma reconstruction for cclm
+  kvz_pixel *cclm_luma_rec_top_line;    //!< \brief buffer of downsampled luma reconstruction lines for cclm, allocated as one half-width line per 64 rows of frame height (presumably the line above each LCU row; confirm)
+
  uint8_t* lmcs_avg_processed; //!< \brief For each LCU, indicates if already calculated average of border pixels is available
  int32_t* lmcs_avg;           //!< \brief Average of LCU border pixels

@ -78,7 +81,7 @@ typedef struct videoframe
 } videoframe_t;


-videoframe_t *kvz_videoframe_alloc(int32_t width, int32_t height, enum kvz_chroma_format chroma_format, enum kvz_alf alf_type);
+videoframe_t *kvz_videoframe_alloc(int32_t width, int32_t height, enum kvz_chroma_format chroma_format, enum kvz_alf alf_type, bool cclm);
 int kvz_videoframe_free(videoframe_t * const frame);

 void kvz_videoframe_set_poc(videoframe_t * frame, int32_t poc);
--- a/tests/test_intra.sh
+++ b/tests/test_intra.sh
@ -10,8 +10,9 @@ common_args='256x128 10 yuv420p -p1 --preset=ultrafast --threads=0 --no-wpp --no
 valgrind_test $common_args --rd=1
 valgrind_test $common_args --rd=2 --no-transform-skip --qp 37
 valgrind_test $common_args --rd=2 --no-transform-skip --qp 37 --signhide --rdoq 
+valgrind_test $common_args --rd=3
 valgrind_test $common_args --alf=full --no-wpp --threads=0 --owf=0
 valgrind_test $common_args --alf=full --wpp --threads=1
-valgrind_test $common_args --jccr
 valgrind_test $common_args --jccr --rdoq --rd=2 --mts=intra
+valgrind_test $common_args --rd=3 --cclm --jccr