diff --git a/src/encoder_state-ctors_dtors.c b/src/encoder_state-ctors_dtors.c
index c7c9985d..7a5cff52 100644
--- a/src/encoder_state-ctors_dtors.c
+++ b/src/encoder_state-ctors_dtors.c
@@ -122,7 +122,7 @@ static int encoder_state_config_tile_init(encoder_state_t * const state,
                                           const int width, const int height, const int width_in_lcu, const int height_in_lcu) {
   
   const encoder_control_t * const encoder = state->encoder_control;
-  state->tile->frame = kvz_videoframe_alloc(width, height, state->encoder_control->chroma_format, encoder->cfg.alf_type);
+  state->tile->frame = kvz_videoframe_alloc(width, height, state->encoder_control->chroma_format, encoder->cfg.alf_type, encoder->cfg.cclm);
   
   state->tile->frame->rec = NULL;
   
diff --git a/src/intra.c b/src/intra.c
index 79079404..f786781c 100644
--- a/src/intra.c
+++ b/src/intra.c
@@ -259,7 +259,7 @@ enum lm_mode
 static void get_cclm_parameters(
   encoder_state_t const* const state,
   int8_t width, int8_t height, int8_t mode,
-  int x0, int y0,
+  int x0, int y0, int avai_above_right_units, int avai_left_below_units,
   kvz_intra_ref* luma_src, kvz_intra_references*chroma_ref,
   int16_t *a, int16_t*b, int16_t*shift) {
 
@@ -287,8 +287,8 @@ static void get_cclm_parameters(
   //int total_units = total_left_units + total_above_units + 1;
   //int above_right_units = total_above_units - tu_width_in_units;
   //int left_below_units = total_left_units - tu_height_in_units;
-  int avai_above_right_units = 0;  // TODO these are non zero only with non-square CUs
-  int avai_left_below_units = 0;
+  //int avai_above_right_units = 0;  // TODO these are non zero only with non-square CUs
+  //int avai_left_below_units = 0;
   int avai_above_units = CLIP(0, tu_height_in_units, y0/base_unit_size);
   int avai_left_units = CLIP(0, tu_width_in_units, x0 / base_unit_size);
 
@@ -465,7 +465,7 @@ void kvz_predict_cclm(
   const int16_t y0,
   const int16_t stride,
   const int8_t mode,
-  kvz_pixel const *  y_rec,
+  lcu_t* const lcu,
   kvz_intra_references* chroma_ref,
   kvz_pixel* dst,
   cclm_parameters_t* cclm_params
@@ -480,61 +480,92 @@ void kvz_predict_cclm(
 
   int x_scu = SUB_SCU(x0);
   int y_scu = SUB_SCU(y0);
-  y_rec += x_scu + y_scu * LCU_WIDTH;
+
+  int available_above_right = 0;
+  int available_left_below = 0;
+
+
+  kvz_pixel *y_rec = lcu->rec.y + x_scu + y_scu * LCU_WIDTH;
 
   // Essentially what this does is that it uses 6-tap filtering to downsample
   // the luma intra references down to match the resolution of the chroma channel.
   // The luma reference is only needed when we are not on the edge of the picture.
   // Because the reference pixels that are needed on the edge of the ctu this code
   // is kinda messy but what can you do
+
+  if (y0) {
+    for (; available_above_right < width / 2; available_above_right++) {
+      int x_extension = x_scu + width * 2 + 4 * available_above_right;
+      cu_info_t* pu = LCU_GET_CU_AT_PX(lcu, x_extension, y_scu - 4);
+      if (pu->type == CU_NOTSET || x_extension > LCU_WIDTH) break;
+    }
+    if(y_scu == 0) {
+      if(!state->encoder_control->cfg.wpp) available_above_right = MIN(width / 2, (state->tile->frame->width - x0 - width * 2) / 4);
+      for (int x = 0; x < width * (available_above_right ? 4 : 2); x += 2) {
+        bool left_padding = x0 || x;
+        sampled_luma_ref.top[x / 2] = (state->tile->frame->rec->y[x0 + x + (y0 - 1) * stride] * 2 +
+          state->tile->frame->rec->y[x0 + x + 1 + (y0 - 1) * stride] +
+          state->tile->frame->rec->y[x0 + x - left_padding + (y0 - 1) * stride] + 
+          2) >> 2;
+      }
+    }
+    else {
+      for (int x = 0; x < width * (available_above_right ? 4 : 2); x += 2) {
+        bool left_padding = x0 || x;
+        int s = 4;
+        s += y_scu ? y_rec[x - LCU_WIDTH * 2] * 2            : state->tile->frame->rec->y[x0 + x + (y0 - 2) * stride] * 2;
+        s += y_scu ? y_rec[x - LCU_WIDTH * 2 + 1]            : state->tile->frame->rec->y[x0 + x + 1 + (y0 - 2) * stride];
+        s += y_scu && !(x0 && !x && !x_scu) ? y_rec[x - LCU_WIDTH * 2 - left_padding] : state->tile->frame->rec->y[x0 + x - left_padding + (y0 - 2) * stride];
+        s += y_scu ? y_rec[x - LCU_WIDTH] * 2                : state->tile->frame->rec->y[x0 + x + (y0 - 1) * stride] * 2;
+        s += y_scu ? y_rec[x - LCU_WIDTH + 1]                : state->tile->frame->rec->y[x0 + x + 1 + (y0 - 1) * stride];
+        s += y_scu && !(x0 && !x && !x_scu) ? y_rec[x - LCU_WIDTH - left_padding]     : state->tile->frame->rec->y[x0 + x - left_padding + (y0 - 1) * stride];
+        sampled_luma_ref.top[x / 2] = s >> 3;
+      }
+    }
+  }
+
   if(x0) {
-    for(int y = 0; y < height * 2; y+=2) {
+    for (; available_left_below < height / 2; available_left_below++) {
+      int y_extension = y_scu + height * 2 + 4 * available_left_below;
+      cu_info_t* pu = LCU_GET_CU_AT_PX(lcu, x_scu - 4, y_extension);
+      if (pu->type == CU_NOTSET || y_extension > LCU_WIDTH) break;
+      if(x_scu == 32 && y_scu == 0 && pu->depth == 0) break;
+    }
+    for(int y = 0; y < height * (available_left_below ? 4 : 2); y+=2) {
       int s = 4;
-      s += x_scu ? y_rec[y * LCU_WIDTH - 1] * 2       : state->tile->frame->rec->y[x0 - 1 + (y0 + y) * stride] * 2;
-      s += x_scu ? y_rec[y * LCU_WIDTH - 2]           : state->tile->frame->rec->y[x0 - 2 + (y0 + y) * stride];
-      s += x_scu ? y_rec[(y + 1) * LCU_WIDTH - 1] * 2 : state->tile->frame->rec->y[x0 - 1 + (y0 + y + 1) * stride] * 2;
-      s += x_scu ? y_rec[(y + 1) * LCU_WIDTH - 2]     : state->tile->frame->rec->y[x0 - 2 + (y0 + y + 1) * stride];
-      s +=         y_rec[y * LCU_WIDTH];
-      s +=         y_rec[(y + 1) * LCU_WIDTH];
+      s += x_scu ? y_rec[y * LCU_WIDTH - 2] * 2       : state->tile->frame->rec->y[x0 - 2 + (y0 + y) * stride] * 2;
+      s += x_scu ? y_rec[y * LCU_WIDTH - 1]           : state->tile->frame->rec->y[x0 - 1 + (y0 + y) * stride];
+      s += x_scu ? y_rec[y * LCU_WIDTH - 3]           : state->tile->frame->rec->y[x0 - 3 + (y0 + y) * stride];
+      s += x_scu ? y_rec[(y + 1) * LCU_WIDTH - 2] * 2 : state->tile->frame->rec->y[x0 - 2 + (y0 + y + 1) * stride] * 2;
+      s += x_scu ? y_rec[(y + 1) * LCU_WIDTH - 1]     : state->tile->frame->rec->y[x0 - 1 + (y0 + y + 1) * stride];
+      s += x_scu ? y_rec[(y + 1) * LCU_WIDTH - 3]     : state->tile->frame->rec->y[x0 - 3 + (y0 + y + 1) * stride];
       sampled_luma_ref.left[y/2] = s >> 3;
     }
   }
 
-  if(y0) {
-    for(int x = 0; x < width*2; x += 2) {
-      bool left_padding = x0 || x;
-      int s = 4;
-      s += y_scu ? y_rec[x - LCU_WIDTH * 2] * 2            : state->tile->frame->rec->y[x0 + x +(y0 - 2) * stride] * 2;
-      s += y_scu ? y_rec[x - LCU_WIDTH] * 2                : state->tile->frame->rec->y[x0 + x +(y0 - 1) * stride] * 2;
-      s += y_scu ? y_rec[x - LCU_WIDTH * 2 - left_padding] : state->tile->frame->rec->y[x0 + x - left_padding + (y0 - 2) * stride];
-      s += y_scu ? y_rec[x - LCU_WIDTH - left_padding]     : state->tile->frame->rec->y[x0 + x - left_padding + (y0 - 1) * stride];
-      s += y_scu ? y_rec[x - LCU_WIDTH * 2 + 1]            : state->tile->frame->rec->y[x0 + x + 1 + (y0 - 2) * stride];
-      s += y_scu ? y_rec[x - LCU_WIDTH + 1]                : state->tile->frame->rec->y[x0 + x + 1 + (y0 - 1) * stride];
-      sampled_luma_ref.top[x / 2] = s >> 3;
-    }
-  }
+
 
   // Downsample the reconstructed luma sample so that they can be mapped into the chroma
   // to generate the chroma prediction
-  for (int y = 0; y < height * 2; y+=2) {
-    for (int x = 0; x <  width * 2; x+=2) {
+  for (int y = 0; y < height; y++) {
+    for (int x = 0; x <  width; x++) {
       int s = 4;
       s += y_rec[2 * x] * 2;
       s += y_rec[2 * x + 1];
       // If we are at the edge of the CTU read the pixel from the frame reconstruct buffer,
       // *except* when we are also at the edge of the frame, in which case we want to duplicate
       // the edge pixel
-      s += !x_scu && !x && x0 ? state->tile->frame->rec->y[x0 - 1 + y0 * stride] : y_rec[2 * x - ((x + x0) > 0)];
+      s += !x_scu && !x && x0 ? state->tile->frame->rec->y[x0 - 1 + (y0 + y*2) * stride] : y_rec[2 * x - ((x + x0) > 0)];
       s += y_rec[2 * x + LCU_WIDTH] * 2;
       s += y_rec[2 * x + 1 + LCU_WIDTH];
-      s += !x_scu && !x && x0 ? state->tile->frame->rec->y[x0 - 1 + (y0 + 1) * stride] : y_rec[2 * x - ((x + x0) > 0) + stride];
-      sampled_luma[x / 2 + y / 2 * width] = s >> 3;
+      s += !x_scu && !x && x0 ? state->tile->frame->rec->y[x0 - 1 + (y0 + y * 2 + 1) * stride] : y_rec[2 * x - ((x + x0) > 0) + LCU_WIDTH];
+      sampled_luma[x + y * width] = s >> 3;
     }
-    y_rec += LCU_WIDTH;
+    y_rec += LCU_WIDTH * 2;
   }
 
   int16_t a, b, shift;
-  get_cclm_parameters(state, width, height, mode,x0, y0, &sampled_luma_ref, chroma_ref, &a, &b, &shift);
+  get_cclm_parameters(state, width, height, mode,x0, y0, available_above_right, available_left_below, &sampled_luma_ref, chroma_ref, &a, &b, &shift);
   cclm_params->shift = shift;
   cclm_params->a = a;
   cclm_params->b = b;
@@ -889,7 +920,8 @@ static void intra_recon_tb_leaf(
     state->tile->frame->height,
   };
   int x_scu = SUB_SCU(x);
-  const vector2d_t lcu_px = {x_scu >> shift, SUB_SCU(y) >> shift};
+  int y_scu = SUB_SCU(y);
+  const vector2d_t lcu_px = {x_scu >> shift, y_scu >> shift };
 
   kvz_intra_references refs;
   kvz_intra_build_reference(log2width, color, &luma_px, &pic_px, lcu, &refs, cfg->wpp);
@@ -901,26 +933,27 @@ static void intra_recon_tb_leaf(
     kvz_intra_predict(state, &refs, log2width, intra_mode, color, pred, filter_boundary);
   } else {
     kvz_pixel *y_rec = lcu->rec.y;
-    for (int y_ = 0; y_ < width * 2; y_ += 2) {
-      for (int x_ = 0; x_ < width * 2; x_ += 2) {
+    y_rec += x_scu + y_scu * LCU_WIDTH;
+    for (int y_ = 0; y_ < width; y_++) {
+      for (int x_ = 0; x_ < width; x_++) {
         int s = 4;
         s += y_rec[2 * x_] * 2;
         s += y_rec[2 * x_ + 1];
         // If we are at the edge of the CTU read the pixel from the frame reconstruct buffer,
         // *except* when we are also at the edge of the frame, in which case we want to duplicate
         // the edge pixel
-        s += !x_scu && !x_ && x ? state->tile->frame->rec->y[x - 1 + y * stride] : y_rec[2 * x_ - ((x_ + x) > 0)];
+        s += !x_scu && !x_ && x ? state->tile->frame->rec->y[x - 1 + (y + y_ * 2) * stride] : y_rec[2 * x_ - ((x_ + x) > 0)];
         s += y_rec[2 * x_ + LCU_WIDTH] * 2;
         s += y_rec[2 * x_ + 1 + LCU_WIDTH];
-        s += !x_scu && !x_ && x ? state->tile->frame->rec->y[x - 1 + (y + 1) * stride] : y_rec[2 * x_ - ((x_ + x) > 0) + stride];
-        pred[x_ / 2 + y_ * width / 2] = s >> 3;
+        s += !x_scu && !x_ && x ? state->tile->frame->rec->y[x - 1 + (y + y_ * 2 + 1) * stride] : y_rec[2 * x_ - ((x_ + x) > 0) + LCU_WIDTH];
+        pred[x_  + y_ * width] = s >> 3;
       }
-      y_rec += LCU_WIDTH;
+      y_rec += LCU_WIDTH * 2;
     }
     if(cclm_params == NULL) {
       cclm_parameters_t temp_params;
       kvz_predict_cclm(
-        state, color, width, width, x, y, stride, intra_mode, lcu->rec.y, &refs, pred, &temp_params);
+        state, color, width, width, x, y, stride, intra_mode, lcu, &refs, pred, &temp_params);
     }
     else {
       linear_transform_cclm(&cclm_params[color == COLOR_U ? 0 : 1], pred, pred, width, width);
@@ -996,10 +1029,10 @@ void kvz_intra_recon_cu(
     const int32_t x2 = x + offset;
     const int32_t y2 = y + offset;
 
-    kvz_intra_recon_cu(state, x,  y,  depth + 1, mode_luma, mode_chroma, NULL, cclm_params, lcu);
-    kvz_intra_recon_cu(state, x2, y,  depth + 1, mode_luma, mode_chroma, NULL, cclm_params, lcu);
-    kvz_intra_recon_cu(state, x,  y2, depth + 1, mode_luma, mode_chroma, NULL, cclm_params, lcu);
-    kvz_intra_recon_cu(state, x2, y2, depth + 1, mode_luma, mode_chroma, NULL, cclm_params, lcu);
+    kvz_intra_recon_cu(state, x,  y,  depth + 1, mode_luma, mode_chroma, NULL, NULL, lcu);
+    kvz_intra_recon_cu(state, x2, y,  depth + 1, mode_luma, mode_chroma, NULL, NULL, lcu);
+    kvz_intra_recon_cu(state, x,  y2, depth + 1, mode_luma, mode_chroma, NULL, NULL, lcu);
+    kvz_intra_recon_cu(state, x2, y2, depth + 1, mode_luma, mode_chroma, NULL, NULL, lcu);
 
     // Propagate coded block flags from child CUs to parent CU.
     uint16_t child_cbfs[3] = {
diff --git a/src/intra.h b/src/intra.h
index eb737be7..846d77b2 100644
--- a/src/intra.h
+++ b/src/intra.h
@@ -137,7 +137,7 @@ void kvz_predict_cclm(
   const int16_t y0,
   const int16_t stride,
   const int8_t mode,
-  kvz_pixel const* y_rec,
+  lcu_t* const lcu,
   kvz_intra_references* chroma_ref,
   kvz_pixel* dst,
   cclm_parameters_t* cclm_params
diff --git a/src/search.c b/src/search.c
index 685c8428..5691d361 100644
--- a/src/search.c
+++ b/src/search.c
@@ -241,6 +241,33 @@ static double cu_zero_coeff_cost(const encoder_state_t *state, lcu_t *work_tree,
 }
 
 
+/** \brief Downsample reconstructed luma (2:1 in x and y) for a width x height block at luma (x, y) into frame->cclm_luma_rec. */
+static void downsample_cclm_rec(encoder_state_t *state, int x, int y, int width, int height, kvz_pixel *y_rec) {
+  if (!state->encoder_control->cfg.cclm || !state->tile->frame->cclm_luma_rec) return;  // CCLM off, or no buffer (allocator skips it for 4:0:0)
+  int x_scu = SUB_SCU(x);
+  int y_scu = SUB_SCU(y);
+  y_rec += x_scu + y_scu * LCU_WIDTH;  // first luma sample of the block inside the LCU_WIDTH-strided buffer
+  int stride = state->tile->frame->source->stride;  // NOTE(review): also used to index rec->y and the output buffer; confirm the strides match
+  // Each output row consumes two luma rows; stop once the luma row reaches the configured picture height.
+  for (int y_ = 0; y_ < height && y_ * 2 + y < state->encoder_control->cfg.height; y_++) {
+    for (int x_ = 0; x_ < width; x_++) {
+      int s = 4;  // rounding offset for the final >> 3 (the six filter weights sum to 8)
+      s += y_rec[2 * x_] * 2;
+      s += y_rec[2 * x_ + 1];
+      // Left tap: at the left edge of the CTU it is read from the frame reconstruct buffer, *except*
+      // when that is also the left edge of the frame (x == 0), where the edge pixel is duplicated instead.
+      s += !x_scu && !x_ && x ? state->tile->frame->rec->y[x - 1 + (y + y_ * 2) * stride] : y_rec[2 * x_ - ((x_ + x) > 0)];
+      s += y_rec[2 * x_ + LCU_WIDTH] * 2;
+      s += y_rec[2 * x_ + 1 + LCU_WIDTH];
+      s += !x_scu && !x_ && x ? state->tile->frame->rec->y[x - 1 + (y + y_ * 2 + 1) * stride] : y_rec[2 * x_ - ((x_ + x) > 0) + LCU_WIDTH];
+      int index = x / 2 + x_ + (y / 2 + y_ )* stride / 2;
+      state->tile->frame->cclm_luma_rec[index] = s >> 3;
+    }
+    y_rec += LCU_WIDTH * 2;
+  }
+}
+
+
 /**
 * Calculate RD cost for a Coding Unit.
 * \return Cost of block
@@ -711,6 +738,10 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,
                          cur_cu->intra.mode, -1, // skip chroma
                          NULL, NULL, lcu);
 
+      downsample_cclm_rec(
+        state, x, y, cu_width / 2, cu_width / 2, lcu->rec.y
+      );
+
       // TODO: This heavily relies to square CUs
       if ((depth != 4 || (x % 8 && y % 8)) && state->encoder_control->chroma_format != KVZ_CSP_400) {
         // There is almost no benefit to doing the chroma mode search for
@@ -863,7 +894,7 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,
     // gets used, at least in the most obvious cases, while avoiding any
     // searching.
     if (cur_cu->type == CU_NOTSET && depth < MAX_PU_DEPTH
-        && x + cu_width <= frame->width && y + cu_width <= frame->height)
+        && x + cu_width <= frame->width && y + cu_width <= frame->height && 0)
     {
       cu_info_t *cu_d1 = LCU_GET_CU_AT_PX(&work_tree[depth + 1], x_local, y_local);
 
@@ -913,6 +944,9 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,
       // Copy this CU's mode all the way down for use in adjacent CUs mode
       // search.
       work_tree_copy_down(x_local, y_local, depth, work_tree);
+      downsample_cclm_rec(
+        state, x, y, cu_width / 2, cu_width / 2, lcu->rec.y
+      );
 
       if (state->frame->slicetype != KVZ_SLICE_I) {
         // Reset HMVP to the beginning of this CU level search and add this CU as the mvp
@@ -925,6 +959,9 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,
     // Need to copy modes down since the lower level of the work tree is used
     // when searching SMP and AMP blocks.
     work_tree_copy_down(x_local, y_local, depth, work_tree);
+    downsample_cclm_rec(
+      state, x, y, cu_width / 2, cu_width / 2, lcu->rec.y
+    );
 
     if (state->frame->slicetype != KVZ_SLICE_I) {
       // Reset HMVP to the beginning of this CU level search and add this CU as the mvp
diff --git a/src/search_intra.c b/src/search_intra.c
index e4afdcf1..f225c777 100644
--- a/src/search_intra.c
+++ b/src/search_intra.c
@@ -488,7 +488,7 @@ static void search_intra_chroma_rough(encoder_state_t * const state,
     assert(state->encoder_control->cfg.cclm);
     kvz_predict_cclm(
       state,
-      COLOR_U, width, width, x_px, y_px, state->tile->frame->source->stride, modes[i], lcu->rec.y, refs_u,  pred, &cclm_params);
+      COLOR_U, width, width, x_px, y_px, state->tile->frame->source->stride, modes[i], lcu, refs_u,  pred, &cclm_params);
   }
 
   kvz_pixels_blit(orig_v, orig_block, width, width, origstride, width);
@@ -498,6 +498,12 @@ static void search_intra_chroma_rough(encoder_state_t * const state,
     //costs[i] += get_cost(encoder_state, pred, orig_block, satd_func, sad_func, width);
     costs[i] += satd_func(pred, orig_block);
   }
+  for (int i = 5; i < 8; i++) {
+    assert(state->encoder_control->cfg.cclm);
+    kvz_predict_cclm(
+      state,
+      COLOR_V, width, width, x_px, y_px, state->tile->frame->source->stride, modes[i], lcu, refs_u, pred, &cclm_params);
+  }
 
   kvz_sort_modes(modes, costs, 5);
 }
@@ -836,17 +842,22 @@ int8_t kvz_search_intra_chroma_rdo(encoder_state_t * const state,
 
 
   kvz_intra_references refs[2];
-  const vector2d_t luma_px = { x_px, y_px };
+  const vector2d_t luma_px = { x_px & ~7, y_px & ~7 };
   const vector2d_t pic_px = {
     state->tile->frame->width,
     state->tile->frame->height,
   };
-  kvz_intra_build_reference(MAX(LOG2_LCU_WIDTH - depth - 1, 2), COLOR_U, &luma_px, &pic_px, lcu, &refs[0], state->encoder_control->cfg.wpp);
-  kvz_intra_build_reference(MAX(LOG2_LCU_WIDTH - depth - 1, 2), COLOR_V, &luma_px, &pic_px, lcu, &refs[1], state->encoder_control->cfg.wpp);
 
-  cclm_parameters_t cclm_params[2] = {0};
 
   if (reconstruct_chroma) {
+
+    int c_width = MAX(32 >> (depth), 4);
+
+    kvz_intra_build_reference(MAX(LOG2_LCU_WIDTH - depth - 1, 2), COLOR_U, &luma_px, &pic_px, lcu, &refs[0], state->encoder_control->cfg.wpp);
+    kvz_intra_build_reference(MAX(LOG2_LCU_WIDTH - depth - 1, 2), COLOR_V, &luma_px, &pic_px, lcu, &refs[1], state->encoder_control->cfg.wpp);
+
+    cclm_parameters_t cclm_params[2] = { 0 };
+
     const vector2d_t lcu_px = { SUB_SCU(x_px), SUB_SCU(y_px) };
     cu_info_t *const tr_cu = LCU_GET_CU_AT_PX(lcu, lcu_px.x, lcu_px.y);
 
@@ -864,7 +875,7 @@ int8_t kvz_search_intra_chroma_rdo(encoder_state_t * const state,
     for (int8_t chroma_mode_i = 0; chroma_mode_i < num_modes; ++chroma_mode_i) {
       chroma.mode = modes[chroma_mode_i];
       if (chroma.mode == -1) continue;
-      if(chroma.mode < 67) {
+      if(chroma.mode < 67 || depth == 0) {
         kvz_intra_recon_cu(state,
           x_px, y_px,
           depth,
@@ -872,18 +883,38 @@ int8_t kvz_search_intra_chroma_rdo(encoder_state_t * const state,
           NULL, NULL, lcu);
       }
       else {
+
         kvz_predict_cclm(
-          state, COLOR_U, 32 >> (depth), 32 >> (depth), x_px, y_px, state->tile->frame->source->stride, chroma.mode, lcu->rec.y, &refs[0], NULL, &cclm_params[0]);
+          state, COLOR_U,
+          c_width, c_width,
+          x_px & ~7, y_px & ~7,
+          state->tile->frame->source->stride,
+          chroma.mode, 
+          lcu,
+          &refs[0], NULL,
+          &cclm_params[0]);
+
         chroma.cclm[0] = cclm_params[0];
+
         kvz_predict_cclm(
-          state, COLOR_V, 32 >> (depth), 32 >> (depth), x_px, y_px, state->tile->frame->source->stride, chroma.mode, lcu->rec.y, &refs[1], NULL, &cclm_params[1]);
+          state, COLOR_V,
+          c_width, c_width,
+          x_px & ~7, y_px & ~7,
+          state->tile->frame->source->stride, 
+          chroma.mode, 
+          lcu, 
+          &refs[1], NULL,
+          &cclm_params[1]);
+
         chroma.cclm[1] = cclm_params[1];
 
-        kvz_intra_recon_cu(state,
+        kvz_intra_recon_cu(
+          state,
           x_px, y_px,
           depth,
           -1, chroma.mode, // skip luma
-          NULL, cclm_params, lcu);
+          NULL, cclm_params, lcu
+        );
       }
       chroma.cost = kvz_cu_rd_cost_chroma(state, lcu_px.x, lcu_px.y, depth, tr_cu, lcu);
 
diff --git a/src/videoframe.c b/src/videoframe.c
index 76ab1da7..77919a84 100644
--- a/src/videoframe.c
+++ b/src/videoframe.c
@@ -46,7 +46,7 @@
 videoframe_t * kvz_videoframe_alloc(int32_t width,
                                     int32_t height,
                                     enum kvz_chroma_format chroma_format,
-                                    enum kvz_alf alf_type)
+                                    enum kvz_alf alf_type, bool cclm)
 {
   videoframe_t *frame = calloc(1, sizeof(videoframe_t));
   if (!frame) return 0;
@@ -59,8 +59,12 @@ videoframe_t * kvz_videoframe_alloc(int32_t width,
   frame->sao_luma = MALLOC(sao_info_t, frame->width_in_lcu * frame->height_in_lcu);
   if (chroma_format != KVZ_CSP_400) {
     frame->sao_chroma = MALLOC(sao_info_t, frame->width_in_lcu * frame->height_in_lcu);
+    if (cclm) {
+      assert(chroma_format == KVZ_CSP_420);
+      frame->cclm_luma_rec = MALLOC(kvz_pixel, (((width + 7) & ~7) + FRAME_PADDING_LUMA) * (((height + 7) & ~7) + FRAME_PADDING_LUMA) / 4);
+    }
   }
-
+  
   return frame;
 }
 
@@ -76,6 +80,9 @@ int kvz_videoframe_free(videoframe_t * const frame)
     kvz_image_free(frame->rec_lmcs);
     frame->source_lmcs_mapped = false;
   }
+  if(frame->cclm_luma_rec) {
+    FREE_POINTER(frame->cclm_luma_rec);
+  }
 
   kvz_image_free(frame->source);
   frame->source = NULL;
diff --git a/src/videoframe.h b/src/videoframe.h
index 3e8a6ed4..74963d85 100644
--- a/src/videoframe.h
+++ b/src/videoframe.h
@@ -53,6 +53,8 @@ typedef struct videoframe
   kvz_picture *rec;            //!< \brief Reconstructed image.
   kvz_picture *rec_lmcs;       //!< \brief LMCS mapped reconstructed image, if available, otherwise points to source.
 
+  kvz_pixel *cclm_luma_rec;    //!< \brief buffer for the downsampled luma reconstruction for cclm
+
   uint8_t* lmcs_avg_processed; //!< \brief For each LCU, indicates if already calculated average of border pixels is available
   int32_t* lmcs_avg;           //!< \brief Average of LCU border pixels
 
@@ -78,7 +80,7 @@ typedef struct videoframe
 } videoframe_t;
 
 
-videoframe_t *kvz_videoframe_alloc(int32_t width, int32_t height, enum kvz_chroma_format chroma_format, enum kvz_alf alf_type);
+videoframe_t *kvz_videoframe_alloc(int32_t width, int32_t height, enum kvz_chroma_format chroma_format, enum kvz_alf alf_type, bool cclm);
 int kvz_videoframe_free(videoframe_t * const frame);
 
 void kvz_videoframe_set_poc(videoframe_t * frame, int32_t poc);