[cclm] Fix search and parameter generation for CCLM

2024-11-30 12:44:07 +00:00 · 2021-11-19 11:54:51 +02:00 · 2021-11-19 11:54:51 +02:00 · f030158703
parent d9875a3d92
commit f030158703
7 changed files with 170 additions and 60 deletions
--- a/src/encoder_state-ctors_dtors.c
+++ b/src/encoder_state-ctors_dtors.c
@ -122,7 +122,7 @@ static int encoder_state_config_tile_init(encoder_state_t * const state,
                                          const int width, const int height, const int width_in_lcu, const int height_in_lcu) {
  
  const encoder_control_t * const encoder = state->encoder_control;
-  state->tile->frame = kvz_videoframe_alloc(width, height, state->encoder_control->chroma_format, encoder->cfg.alf_type);
+  state->tile->frame = kvz_videoframe_alloc(width, height, state->encoder_control->chroma_format, encoder->cfg.alf_type, encoder->cfg.cclm);
  
  state->tile->frame->rec = NULL;
  
--- a/src/intra.c
+++ b/src/intra.c
@ -259,7 +259,7 @@ enum lm_mode
 static void get_cclm_parameters(
  encoder_state_t const* const state,
  int8_t width, int8_t height, int8_t mode,
-  int x0, int y0,
+  int x0, int y0, int avai_above_right_units, int avai_left_below_units,
  kvz_intra_ref* luma_src, kvz_intra_references*chroma_ref,
  int16_t *a, int16_t*b, int16_t*shift) {

@ -287,8 +287,8 @@ static void get_cclm_parameters(
  //int total_units = total_left_units + total_above_units + 1;
  //int above_right_units = total_above_units - tu_width_in_units;
  //int left_below_units = total_left_units - tu_height_in_units;
-  int avai_above_right_units = 0;  // TODO these are non zero only with non-square CUs
-  int avai_left_below_units = 0;
+  //int avai_above_right_units = 0;  // TODO these are non zero only with non-square CUs
+  //int avai_left_below_units = 0;
  int avai_above_units = CLIP(0, tu_height_in_units, y0/base_unit_size);
  int avai_left_units = CLIP(0, tu_width_in_units, x0 / base_unit_size);

@ -465,7 +465,7 @@ void kvz_predict_cclm(
  const int16_t y0,
  const int16_t stride,
  const int8_t mode,
-  kvz_pixel const *  y_rec,
+  lcu_t* const lcu,
  kvz_intra_references* chroma_ref,
  kvz_pixel* dst,
  cclm_parameters_t* cclm_params
@ -480,61 +480,92 @@ void kvz_predict_cclm(

  int x_scu = SUB_SCU(x0);
  int y_scu = SUB_SCU(y0);
-  y_rec += x_scu + y_scu * LCU_WIDTH;
+
+  int available_above_right = 0;
+  int available_left_below = 0;
+
+
+  kvz_pixel *y_rec = lcu->rec.y + x_scu + y_scu * LCU_WIDTH;

  // Essentially what this does is that it uses 6-tap filtering to downsample
  // the luma intra references down to match the resolution of the chroma channel.
  // The luma reference is only needed when we are not on the edge of the picture.
  // Because the reference pixels that are needed on the edge of the ctu this code
  // is kinda messy but what can you do
-  if(x0) {
-    for(int y = 0; y < height * 2; y+=2) {
+
+  if (y0) {
+    for (; available_above_right < width / 2; available_above_right++) {
+      int x_extension = x_scu + width * 2 + 4 * available_above_right;
+      cu_info_t* pu = LCU_GET_CU_AT_PX(lcu, x_extension, y_scu - 4);
+      if (pu->type == CU_NOTSET || x_extension > LCU_WIDTH) break;
+    }
+    if(y_scu == 0) {
+      if(!state->encoder_control->cfg.wpp) available_above_right = MIN(width / 2, (state->tile->frame->width - x0 - width * 2) / 4);
+      for (int x = 0; x < width * (available_above_right ? 4 : 2); x += 2) {
+        bool left_padding = x0 || x;
+        sampled_luma_ref.top[x / 2] = (state->tile->frame->rec->y[x0 + x + (y0 - 1) * stride] * 2 +
+          state->tile->frame->rec->y[x0 + x + 1 + (y0 - 1) * stride] +
+          state->tile->frame->rec->y[x0 + x - left_padding + (y0 - 1) * stride] + 
+          2) >> 2;
+      }
+    }
+    else {
+      for (int x = 0; x < width * (available_above_right ? 4 : 2); x += 2) {
+        bool left_padding = x0 || x;
        int s = 4;
-      s += x_scu ? y_rec[y * LCU_WIDTH - 1] * 2       : state->tile->frame->rec->y[x0 - 1 + (y0 + y) * stride] * 2;
-      s += x_scu ? y_rec[y * LCU_WIDTH - 2]           : state->tile->frame->rec->y[x0 - 2 + (y0 + y) * stride];
-      s += x_scu ? y_rec[(y + 1) * LCU_WIDTH - 1] * 2 : state->tile->frame->rec->y[x0 - 1 + (y0 + y + 1) * stride] * 2;
-      s += x_scu ? y_rec[(y + 1) * LCU_WIDTH - 2]     : state->tile->frame->rec->y[x0 - 2 + (y0 + y + 1) * stride];
-      s +=         y_rec[y * LCU_WIDTH];
-      s +=         y_rec[(y + 1) * LCU_WIDTH];
+        s += y_scu ? y_rec[x - LCU_WIDTH * 2] * 2            : state->tile->frame->rec->y[x0 + x + (y0 - 2) * stride] * 2;
+        s += y_scu ? y_rec[x - LCU_WIDTH * 2 + 1]            : state->tile->frame->rec->y[x0 + x + 1 + (y0 - 2) * stride];
+        s += y_scu && !(x0 && !x && !x_scu) ? y_rec[x - LCU_WIDTH * 2 - left_padding] : state->tile->frame->rec->y[x0 + x - left_padding + (y0 - 2) * stride];
+        s += y_scu ? y_rec[x - LCU_WIDTH] * 2                : state->tile->frame->rec->y[x0 + x + (y0 - 1) * stride] * 2;
+        s += y_scu ? y_rec[x - LCU_WIDTH + 1]                : state->tile->frame->rec->y[x0 + x + 1 + (y0 - 1) * stride];
+        s += y_scu && !(x0 && !x && !x_scu) ? y_rec[x - LCU_WIDTH - left_padding]     : state->tile->frame->rec->y[x0 + x - left_padding + (y0 - 1) * stride];
+        sampled_luma_ref.top[x / 2] = s >> 3;
+      }
+    }
+  }
+
+  if(x0) {
+    for (; available_left_below < height / 2; available_left_below++) {
+      int y_extension = y_scu + height * 2 + 4 * available_left_below;
+      cu_info_t* pu = LCU_GET_CU_AT_PX(lcu, x_scu - 4, y_extension);
+      if (pu->type == CU_NOTSET || y_extension > LCU_WIDTH) break;
+      if(x_scu == 32 && y_scu == 0 && pu->depth == 0) break;
+    }
+    for(int y = 0; y < height * (available_left_below ? 4 : 2); y+=2) {
+      int s = 4;
+      s += x_scu ? y_rec[y * LCU_WIDTH - 2] * 2       : state->tile->frame->rec->y[x0 - 2 + (y0 + y) * stride] * 2;
+      s += x_scu ? y_rec[y * LCU_WIDTH - 1]           : state->tile->frame->rec->y[x0 - 1 + (y0 + y) * stride];
+      s += x_scu ? y_rec[y * LCU_WIDTH - 3]           : state->tile->frame->rec->y[x0 - 3 + (y0 + y) * stride];
+      s += x_scu ? y_rec[(y + 1) * LCU_WIDTH - 2] * 2 : state->tile->frame->rec->y[x0 - 2 + (y0 + y + 1) * stride] * 2;
+      s += x_scu ? y_rec[(y + 1) * LCU_WIDTH - 1]     : state->tile->frame->rec->y[x0 - 1 + (y0 + y + 1) * stride];
+      s += x_scu ? y_rec[(y + 1) * LCU_WIDTH - 3]     : state->tile->frame->rec->y[x0 - 3 + (y0 + y + 1) * stride];
      sampled_luma_ref.left[y/2] = s >> 3;
    }
  }

-  if(y0) {
-    for(int x = 0; x < width*2; x += 2) {
-      bool left_padding = x0 || x;
-      int s = 4;
-      s += y_scu ? y_rec[x - LCU_WIDTH * 2] * 2            : state->tile->frame->rec->y[x0 + x +(y0 - 2) * stride] * 2;
-      s += y_scu ? y_rec[x - LCU_WIDTH] * 2                : state->tile->frame->rec->y[x0 + x +(y0 - 1) * stride] * 2;
-      s += y_scu ? y_rec[x - LCU_WIDTH * 2 - left_padding] : state->tile->frame->rec->y[x0 + x - left_padding + (y0 - 2) * stride];
-      s += y_scu ? y_rec[x - LCU_WIDTH - left_padding]     : state->tile->frame->rec->y[x0 + x - left_padding + (y0 - 1) * stride];
-      s += y_scu ? y_rec[x - LCU_WIDTH * 2 + 1]            : state->tile->frame->rec->y[x0 + x + 1 + (y0 - 2) * stride];
-      s += y_scu ? y_rec[x - LCU_WIDTH + 1]                : state->tile->frame->rec->y[x0 + x + 1 + (y0 - 1) * stride];
-      sampled_luma_ref.top[x / 2] = s >> 3;
-    }
-  }
+

  // Downsample the reconstructed luma sample so that they can be mapped into the chroma
  // to generate the chroma prediction
-  for (int y = 0; y < height * 2; y+=2) {
-    for (int x = 0; x <  width * 2; x+=2) {
+  for (int y = 0; y < height; y++) {
+    for (int x = 0; x <  width; x++) {
      int s = 4;
      s += y_rec[2 * x] * 2;
      s += y_rec[2 * x + 1];
      // If we are at the edge of the CTU read the pixel from the frame reconstruct buffer,
      // *except* when we are also at the edge of the frame, in which case we want to duplicate
      // the edge pixel
-      s += !x_scu && !x && x0 ? state->tile->frame->rec->y[x0 - 1 + y0 * stride] : y_rec[2 * x - ((x + x0) > 0)];
+      s += !x_scu && !x && x0 ? state->tile->frame->rec->y[x0 - 1 + (y0 + y*2) * stride] : y_rec[2 * x - ((x + x0) > 0)];
      s += y_rec[2 * x + LCU_WIDTH] * 2;
      s += y_rec[2 * x + 1 + LCU_WIDTH];
-      s += !x_scu && !x && x0 ? state->tile->frame->rec->y[x0 - 1 + (y0 + 1) * stride] : y_rec[2 * x - ((x + x0) > 0) + stride];
-      sampled_luma[x / 2 + y / 2 * width] = s >> 3;
+      s += !x_scu && !x && x0 ? state->tile->frame->rec->y[x0 - 1 + (y0 + y * 2 + 1) * stride] : y_rec[2 * x - ((x + x0) > 0) + LCU_WIDTH];
+      sampled_luma[x + y * width] = s >> 3;
    }
-    y_rec += LCU_WIDTH;
+    y_rec += LCU_WIDTH * 2;
  }

  int16_t a, b, shift;
-  get_cclm_parameters(state, width, height, mode,x0, y0, &sampled_luma_ref, chroma_ref, &a, &b, &shift);
+  get_cclm_parameters(state, width, height, mode,x0, y0, available_above_right, available_left_below, &sampled_luma_ref, chroma_ref, &a, &b, &shift);
  cclm_params->shift = shift;
  cclm_params->a = a;
  cclm_params->b = b;
@ -889,7 +920,8 @@ static void intra_recon_tb_leaf(
    state->tile->frame->height,
  };
  int x_scu = SUB_SCU(x);
-  const vector2d_t lcu_px = {x_scu >> shift, SUB_SCU(y) >> shift};
+  int y_scu = SUB_SCU(y);
+  const vector2d_t lcu_px = {x_scu >> shift, y_scu >> shift };

  kvz_intra_references refs;
  kvz_intra_build_reference(log2width, color, &luma_px, &pic_px, lcu, &refs, cfg->wpp);
@ -901,26 +933,27 @@ static void intra_recon_tb_leaf(
    kvz_intra_predict(state, &refs, log2width, intra_mode, color, pred, filter_boundary);
  } else {
    kvz_pixel *y_rec = lcu->rec.y;
-    for (int y_ = 0; y_ < width * 2; y_ += 2) {
-      for (int x_ = 0; x_ < width * 2; x_ += 2) {
+    y_rec += x_scu + y_scu * LCU_WIDTH;
+    for (int y_ = 0; y_ < width; y_++) {
+      for (int x_ = 0; x_ < width; x_++) {
        int s = 4;
        s += y_rec[2 * x_] * 2;
        s += y_rec[2 * x_ + 1];
        // If we are at the edge of the CTU read the pixel from the frame reconstruct buffer,
        // *except* when we are also at the edge of the frame, in which case we want to duplicate
        // the edge pixel
-        s += !x_scu && !x_ && x ? state->tile->frame->rec->y[x - 1 + y * stride] : y_rec[2 * x_ - ((x_ + x) > 0)];
+        s += !x_scu && !x_ && x ? state->tile->frame->rec->y[x - 1 + (y + y_ * 2) * stride] : y_rec[2 * x_ - ((x_ + x) > 0)];
        s += y_rec[2 * x_ + LCU_WIDTH] * 2;
        s += y_rec[2 * x_ + 1 + LCU_WIDTH];
-        s += !x_scu && !x_ && x ? state->tile->frame->rec->y[x - 1 + (y + 1) * stride] : y_rec[2 * x_ - ((x_ + x) > 0) + stride];
-        pred[x_ / 2 + y_ * width / 2] = s >> 3;
+        s += !x_scu && !x_ && x ? state->tile->frame->rec->y[x - 1 + (y + y_ * 2 + 1) * stride] : y_rec[2 * x_ - ((x_ + x) > 0) + LCU_WIDTH];
+        pred[x_  + y_ * width] = s >> 3;
      }
-      y_rec += LCU_WIDTH;
+      y_rec += LCU_WIDTH * 2;
    }
    if(cclm_params == NULL) {
      cclm_parameters_t temp_params;
      kvz_predict_cclm(
-        state, color, width, width, x, y, stride, intra_mode, lcu->rec.y, &refs, pred, &temp_params);
+        state, color, width, width, x, y, stride, intra_mode, lcu, &refs, pred, &temp_params);
    }
    else {
      linear_transform_cclm(&cclm_params[color == COLOR_U ? 0 : 1], pred, pred, width, width);
@ -996,10 +1029,10 @@ void kvz_intra_recon_cu(
    const int32_t x2 = x + offset;
    const int32_t y2 = y + offset;

-    kvz_intra_recon_cu(state, x,  y,  depth + 1, mode_luma, mode_chroma, NULL, cclm_params, lcu);
-    kvz_intra_recon_cu(state, x2, y,  depth + 1, mode_luma, mode_chroma, NULL, cclm_params, lcu);
-    kvz_intra_recon_cu(state, x,  y2, depth + 1, mode_luma, mode_chroma, NULL, cclm_params, lcu);
-    kvz_intra_recon_cu(state, x2, y2, depth + 1, mode_luma, mode_chroma, NULL, cclm_params, lcu);
+    kvz_intra_recon_cu(state, x,  y,  depth + 1, mode_luma, mode_chroma, NULL, NULL, lcu);
+    kvz_intra_recon_cu(state, x2, y,  depth + 1, mode_luma, mode_chroma, NULL, NULL, lcu);
+    kvz_intra_recon_cu(state, x,  y2, depth + 1, mode_luma, mode_chroma, NULL, NULL, lcu);
+    kvz_intra_recon_cu(state, x2, y2, depth + 1, mode_luma, mode_chroma, NULL, NULL, lcu);

    // Propagate coded block flags from child CUs to parent CU.
    uint16_t child_cbfs[3] = {
--- a/src/intra.h
+++ b/src/intra.h
@ -137,7 +137,7 @@ void kvz_predict_cclm(
  const int16_t y0,
  const int16_t stride,
  const int8_t mode,
-  kvz_pixel const* y_rec,
+  lcu_t* const lcu,
  kvz_intra_references* chroma_ref,
  kvz_pixel* dst,
  cclm_parameters_t* cclm_params
--- a/src/search.c
+++ b/src/search.c
@ -241,6 +241,33 @@ static double cu_zero_coeff_cost(const encoder_state_t *state, lcu_t *work_tree,
 }


+static void downsample_cclm_rec(encoder_state_t *state, int x, int y, int width, int height, kvz_pixel *y_rec) { // Downsample the luma in y_rec for the block at luma position (x, y) and store it in frame->cclm_luma_rec; width/height count OUTPUT samples (callers pass cu_width / 2).
+  if (!state->encoder_control->cfg.cclm) return; // Nothing to do when CCLM is disabled in the config.
+  int x_scu = SUB_SCU(x); // Offset of x used to index y_rec (presumably the position inside the LCU -- SUB_SCU is defined elsewhere).
+  int y_scu = SUB_SCU(y); // Same for y.
+  y_rec += x_scu + y_scu * LCU_WIDTH; // Move to the block's first luma sample; y_rec is addressed with a row stride of LCU_WIDTH.
+  int stride = state->tile->frame->source->stride; // Source picture stride; also used below to index frame->rec->y. NOTE(review): assumes rec has the same stride as source -- confirm.
+
+  for (int y_ = 0; y_ < height && y_ * 2 + y < state->encoder_control->cfg.height; y_++) { // One output row per two luma rows; stops once luma row y + 2 * y_ reaches cfg.height. NOTE(review): there is no matching guard against cfg.width in the inner loop -- confirm this is intended.
+    for (int x_ = 0; x_ < width; x_++) { // One output sample per two luma columns.
+      int s = 4; // Rounding offset for the final >> 3.
+      s += y_rec[2 * x_] * 2; // Top row, centre sample, weight 2.
+      s += y_rec[2 * x_ + 1]; // Top row, right neighbour, weight 1.
+      // Left neighbour: when x_scu == 0, x_ == 0 and x != 0 it is read from the frame
+      // reconstruction buffer at column x - 1; otherwise it comes from y_rec, and when
+      // x_ + x == 0 the offset becomes 0 so the centre sample itself is reused.
+      s += !x_scu && !x_ && x ? state->tile->frame->rec->y[x - 1 + (y + y_ * 2) * stride] : y_rec[2 * x_ - ((x_ + x) > 0)];
+      s += y_rec[2 * x_ + LCU_WIDTH] * 2; // Bottom row, centre sample, weight 2.
+      s += y_rec[2 * x_ + 1 + LCU_WIDTH]; // Bottom row, right neighbour, weight 1.
+      s += !x_scu && !x_ && x ? state->tile->frame->rec->y[x - 1 + (y + y_ * 2 + 1) * stride] : y_rec[2 * x_ - ((x_ + x) > 0) + LCU_WIDTH]; // Bottom row, left neighbour, same source selection as the top row.
+      int index = x / 2 + x_ + (y / 2 + y_ )* stride / 2; // Destination offset in half-resolution coordinates; note this evaluates as ((y / 2 + y_) * stride) / 2.
+      state->tile->frame->cclm_luma_rec[index] = s >> 3; // The six weights (1-2-1 over two rows) sum to 8, so >> 3 yields the rounded weighted average.
+    }
+    y_rec += LCU_WIDTH * 2; // Advance two luma rows for the next output row.
+  }
+}
+
+
 /**
 * Calculate RD cost for a Coding Unit.
 * \return Cost of block
@ -711,6 +738,10 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,
                         cur_cu->intra.mode, -1, // skip chroma
                         NULL, NULL, lcu);

+      downsample_cclm_rec(
+        state, x, y, cu_width / 2, cu_width / 2, lcu->rec.y
+      );
+
      // TODO: This heavily relies to square CUs
      if ((depth != 4 || (x % 8 && y % 8)) && state->encoder_control->chroma_format != KVZ_CSP_400) {
        // There is almost no benefit to doing the chroma mode search for
@ -863,7 +894,7 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,
    // gets used, at least in the most obvious cases, while avoiding any
    // searching.
    if (cur_cu->type == CU_NOTSET && depth < MAX_PU_DEPTH
-        && x + cu_width <= frame->width && y + cu_width <= frame->height)
+        && x + cu_width <= frame->width && y + cu_width <= frame->height && 0)
    {
      cu_info_t *cu_d1 = LCU_GET_CU_AT_PX(&work_tree[depth + 1], x_local, y_local);

@ -913,6 +944,9 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,
      // Copy this CU's mode all the way down for use in adjacent CUs mode
      // search.
      work_tree_copy_down(x_local, y_local, depth, work_tree);
+      downsample_cclm_rec(
+        state, x, y, cu_width / 2, cu_width / 2, lcu->rec.y
+      );

      if (state->frame->slicetype != KVZ_SLICE_I) {
        // Reset HMVP to the beginning of this CU level search and add this CU as the mvp
@ -925,6 +959,9 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,
    // Need to copy modes down since the lower level of the work tree is used
    // when searching SMP and AMP blocks.
    work_tree_copy_down(x_local, y_local, depth, work_tree);
+    downsample_cclm_rec(
+      state, x, y, cu_width / 2, cu_width / 2, lcu->rec.y
+    );

    if (state->frame->slicetype != KVZ_SLICE_I) {
      // Reset HMVP to the beginning of this CU level search and add this CU as the mvp
--- a/src/search_intra.c
+++ b/src/search_intra.c
@ -488,7 +488,7 @@ static void search_intra_chroma_rough(encoder_state_t * const state,
    assert(state->encoder_control->cfg.cclm);
    kvz_predict_cclm(
      state,
-      COLOR_U, width, width, x_px, y_px, state->tile->frame->source->stride, modes[i], lcu->rec.y, refs_u,  pred, &cclm_params);
+      COLOR_U, width, width, x_px, y_px, state->tile->frame->source->stride, modes[i], lcu, refs_u,  pred, &cclm_params);
  }

  kvz_pixels_blit(orig_v, orig_block, width, width, origstride, width);
@ -498,6 +498,12 @@ static void search_intra_chroma_rough(encoder_state_t * const state,
    //costs[i] += get_cost(encoder_state, pred, orig_block, satd_func, sad_func, width);
    costs[i] += satd_func(pred, orig_block);
  }
+  for (int i = 5; i < 8; i++) {
+    assert(state->encoder_control->cfg.cclm);
+    kvz_predict_cclm(
+      state,
+      COLOR_V, width, width, x_px, y_px, state->tile->frame->source->stride, modes[i], lcu, refs_u, pred, &cclm_params);
+  }

  kvz_sort_modes(modes, costs, 5);
 }
@ -836,17 +842,22 @@ int8_t kvz_search_intra_chroma_rdo(encoder_state_t * const state,


  kvz_intra_references refs[2];
-  const vector2d_t luma_px = { x_px, y_px };
+  const vector2d_t luma_px = { x_px & ~7, y_px & ~7 };
  const vector2d_t pic_px = {
    state->tile->frame->width,
    state->tile->frame->height,
  };
+
+
+  if (reconstruct_chroma) {
+
+    int c_width = MAX(32 >> (depth), 4);
+
    kvz_intra_build_reference(MAX(LOG2_LCU_WIDTH - depth - 1, 2), COLOR_U, &luma_px, &pic_px, lcu, &refs[0], state->encoder_control->cfg.wpp);
    kvz_intra_build_reference(MAX(LOG2_LCU_WIDTH - depth - 1, 2), COLOR_V, &luma_px, &pic_px, lcu, &refs[1], state->encoder_control->cfg.wpp);

    cclm_parameters_t cclm_params[2] = { 0 };

-  if (reconstruct_chroma) {
    const vector2d_t lcu_px = { SUB_SCU(x_px), SUB_SCU(y_px) };
    cu_info_t *const tr_cu = LCU_GET_CU_AT_PX(lcu, lcu_px.x, lcu_px.y);

@ -864,7 +875,7 @@ int8_t kvz_search_intra_chroma_rdo(encoder_state_t * const state,
    for (int8_t chroma_mode_i = 0; chroma_mode_i < num_modes; ++chroma_mode_i) {
      chroma.mode = modes[chroma_mode_i];
      if (chroma.mode == -1) continue;
-      if(chroma.mode < 67) {
+      if(chroma.mode < 67 || depth == 0) {
        kvz_intra_recon_cu(state,
          x_px, y_px,
          depth,
@ -872,18 +883,38 @@ int8_t kvz_search_intra_chroma_rdo(encoder_state_t * const state,
          NULL, NULL, lcu);
      }
      else {
+
        kvz_predict_cclm(
-          state, COLOR_U, 32 >> (depth), 32 >> (depth), x_px, y_px, state->tile->frame->source->stride, chroma.mode, lcu->rec.y, &refs[0], NULL, &cclm_params[0]);
+          state, COLOR_U,
+          c_width, c_width,
+          x_px & ~7, y_px & ~7,
+          state->tile->frame->source->stride,
+          chroma.mode, 
+          lcu,
+          &refs[0], NULL,
+          &cclm_params[0]);
+
        chroma.cclm[0] = cclm_params[0];
+
        kvz_predict_cclm(
-          state, COLOR_V, 32 >> (depth), 32 >> (depth), x_px, y_px, state->tile->frame->source->stride, chroma.mode, lcu->rec.y, &refs[1], NULL, &cclm_params[1]);
+          state, COLOR_V,
+          c_width, c_width,
+          x_px & ~7, y_px & ~7,
+          state->tile->frame->source->stride, 
+          chroma.mode, 
+          lcu, 
+          &refs[1], NULL,
+          &cclm_params[1]);
+
        chroma.cclm[1] = cclm_params[1];

-        kvz_intra_recon_cu(state,
+        kvz_intra_recon_cu(
+          state,
          x_px, y_px,
          depth,
          -1, chroma.mode, // skip luma
-          NULL, cclm_params, lcu);
+          NULL, cclm_params, lcu
+        );
      }
      chroma.cost = kvz_cu_rd_cost_chroma(state, lcu_px.x, lcu_px.y, depth, tr_cu, lcu);

--- a/src/videoframe.c
+++ b/src/videoframe.c
@ -46,7 +46,7 @@
 videoframe_t * kvz_videoframe_alloc(int32_t width,
                                    int32_t height,
                                    enum kvz_chroma_format chroma_format,
-                                    enum kvz_alf alf_type)
+                                    enum kvz_alf alf_type, bool cclm)
 {
  videoframe_t *frame = calloc(1, sizeof(videoframe_t));
  if (!frame) return 0;
@ -59,6 +59,10 @@ videoframe_t * kvz_videoframe_alloc(int32_t width,
  frame->sao_luma = MALLOC(sao_info_t, frame->width_in_lcu * frame->height_in_lcu);
  if (chroma_format != KVZ_CSP_400) {
    frame->sao_chroma = MALLOC(sao_info_t, frame->width_in_lcu * frame->height_in_lcu);
+    if (cclm) {
+      assert(chroma_format == KVZ_CSP_420);
+      frame->cclm_luma_rec = MALLOC(kvz_pixel, (((width + 7) & ~7) + FRAME_PADDING_LUMA) * (((height + 7) & ~7) + FRAME_PADDING_LUMA) / 4);
+    }
  }
  
  return frame;
@ -76,6 +80,9 @@ int kvz_videoframe_free(videoframe_t * const frame)
    kvz_image_free(frame->rec_lmcs);
    frame->source_lmcs_mapped = false;
  }
+  if(frame->cclm_luma_rec) {
+    FREE_POINTER(frame->cclm_luma_rec);
+  }

  kvz_image_free(frame->source);
  frame->source = NULL;
--- a/src/videoframe.h
+++ b/src/videoframe.h
@ -53,6 +53,8 @@ typedef struct videoframe
  kvz_picture *rec;            //!< \brief Reconstructed image.
  kvz_picture *rec_lmcs;       //!< \brief LMCS mapped reconstructed image, if available, otherwise points to source.

+  kvz_pixel *cclm_luma_rec;    //!< \brief buffer for the downsampled luma reconstruction for cclm
+
  uint8_t* lmcs_avg_processed; //!< \brief For each LCU, indicates if already calculated average of border pixels is available
  int32_t* lmcs_avg;           //!< \brief Average of LCU border pixels

@ -78,7 +80,7 @@ typedef struct videoframe
 } videoframe_t;


-videoframe_t *kvz_videoframe_alloc(int32_t width, int32_t height, enum kvz_chroma_format chroma_format, enum kvz_alf alf_type);
+videoframe_t *kvz_videoframe_alloc(int32_t width, int32_t height, enum kvz_chroma_format chroma_format, enum kvz_alf alf_type, bool cclm);
 int kvz_videoframe_free(videoframe_t * const frame);

 void kvz_videoframe_set_poc(videoframe_t * frame, int32_t poc);