From ef5a205faa42cbed59b7240ae76d9214171bf107 Mon Sep 17 00:00:00 2001
From: Joose Sainio <joose.sainio@tuni.fi>
Date: Fri, 12 Nov 2021 13:11:54 +0200
Subject: [PATCH 01/19] [cclm] WIP: initial implementation of the cclm
 parameter calculation function

---
 src/intra.c | 238 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 238 insertions(+)

diff --git a/src/intra.c b/src/intra.c
index a582d3e1..69ebaf15 100644
--- a/src/intra.c
+++ b/src/intra.c
@@ -248,6 +248,244 @@ static void intra_pred_dc(
 }
 
 
+enum lm_mode
+{
+  LM_CHROMA_IDX = 67,
+  LM_CHROMA_L_IDX = 68,
+  LM_CHROMA_T_IDX = 69,
+};
+
+
+static void get_cclm_parameters(
+  encoder_state_t const* const state,
+  int8_t width, int8_t height, int8_t mode,
+  int x0, int y0,
+  kvz_pixel * luma_src, kvz_pixel *chroma_ref,
+  int *a, int *b, int *shift) {
+
+  const int base_unit_size = 1 << (6 - PU_DEPTH_INTRA_MAX);
+
+  // TODO: take into account YUV422
+  const int unit_w = base_unit_size >> 1;
+  const int unit_h = base_unit_size >> 1;
+
+  const int tu_width_in_units = width / unit_w;
+  const int tu_height_in_units = height / unit_h;
+
+  const int c_height = height / 2;
+  const int c_width = width / 2;
+
+  int top_template_samp_num = width; // for MDLM, the template sample number is 2W or 2H;
+  int left_template_samp_num = height;
+
+  int total_above_units = (top_template_samp_num + (unit_w - 1)) / unit_w;
+  int total_left_units = (left_template_samp_num + (unit_h - 1)) / unit_h;
+  int total_units = total_left_units + total_above_units + 1;
+  int above_right_units = total_above_units - tu_width_in_units;
+  int left_below_units = total_left_units - tu_height_in_units;
+  int avai_above_right_units = 0;  // TODO these are non zero only with non-square CUs
+  int avai_left_below_units = 0;
+  int avai_above_units = CLIP(0, tu_height_in_units, y0/base_unit_size);
+  int avai_left_units = CLIP(0, tu_width_in_units, x0 / base_unit_size);
+
+  bool above_available = avai_above_units != 0;
+  bool left_available = avai_left_units != 0;
+  // Not sure if LCU_CU_WIDTH is correct macro here,
+  // should be 16 for 64 CTU width 32 for 128
+
+  int min_luma[2] = { MAX_INT, 0 };
+  int max_luma[2] = { -MAX_INT, 0 };
+
+  kvz_pixel *src_color0 = luma_src;
+  kvz_pixel*  cur_chroma0 = chroma_ref;
+
+  char internal_bit_depth = state->encoder_control->bitdepth;
+
+  int minLuma[2] = { MAX_INT, 0 };
+  int maxLuma[2] = { -MAX_INT, 0 };
+
+  int32_t src_stride = state->tile->frame->source->stride;
+  kvz_pixel* src = src_color0 - src_stride;
+  int actualTopTemplateSampNum = 0;
+  int actualLeftTemplateSampNum = 0;
+  if (mode == LM_CHROMA_T_IDX)
+  {
+    left_available = 0;
+    avai_above_right_units = avai_above_right_units > (c_height / unit_w) ? c_height / unit_w : avai_above_right_units;
+    actualTopTemplateSampNum = unit_w * (avai_above_units + avai_above_right_units);
+  }
+  else if (mode == LM_CHROMA_L_IDX)
+  {
+    above_available = 0;
+    avai_left_below_units = avai_left_below_units > (c_width / unit_h) ? c_width / unit_h : avai_left_below_units;
+    actualLeftTemplateSampNum = unit_h * (avai_left_units + avai_left_below_units);
+  }
+  else if (mode == LM_CHROMA_IDX)
+  {
+    actualTopTemplateSampNum = c_width;
+    actualLeftTemplateSampNum = c_height;
+  }
+  int startPos[2]; //0:Above, 1: Left
+  int pickStep[2];
+
+  int aboveIs4 = left_available ? 0 : 1;
+  int leftIs4 = above_available ? 0 : 1;
+
+  startPos[0] = actualTopTemplateSampNum >> (2 + aboveIs4);
+  pickStep[0] = MAX(1, actualTopTemplateSampNum >> (1 + aboveIs4));
+
+  startPos[1] = actualLeftTemplateSampNum >> (2 + leftIs4);
+  pickStep[1] = MAX(1, actualLeftTemplateSampNum >> (1 + leftIs4));
+
+  kvz_pixel selectLumaPix[4] = { 0, 0, 0, 0 };
+  kvz_pixel selectChromaPix[4] = { 0, 0, 0, 0 };
+
+  int cntT, cntL;
+  cntT = cntL = 0;
+  int cnt = 0;
+  if (above_available)
+  {
+    cntT = MIN(actualTopTemplateSampNum, (1 + aboveIs4) << 1);
+    src = src_color0 - src_stride;
+    const kvz_pixel* cur = cur_chroma0 + 1;
+    for (int pos = startPos[0]; cnt < cntT; pos += pickStep[0], cnt++)
+    {
+      selectLumaPix[cnt] = src[pos];
+      selectChromaPix[cnt] = cur[pos];
+    }
+  }
+
+  if (left_available)
+  {
+    cntL = MIN(actualLeftTemplateSampNum, (1 + leftIs4) << 1);
+    src = src_color0 - 1;
+    const kvz_pixel* cur = cur_chroma0 + src_stride/2 + 1;
+    for (int pos = startPos[1], cnt = 0; cnt < cntL; pos += pickStep[1], cnt++)
+    {
+      selectLumaPix[cnt + cntT] = src[pos * src_stride];
+      selectChromaPix[cnt + cntT] = cur[pos];
+    }
+  }
+  cnt = cntL + cntT;
+
+  if (cnt == 2)
+  {
+    selectLumaPix[3] = selectLumaPix[0]; selectChromaPix[3] = selectChromaPix[0];
+    selectLumaPix[2] = selectLumaPix[1]; selectChromaPix[2] = selectChromaPix[1];
+    selectLumaPix[0] = selectLumaPix[1]; selectChromaPix[0] = selectChromaPix[1];
+    selectLumaPix[1] = selectLumaPix[3]; selectChromaPix[1] = selectChromaPix[3];
+  }
+
+  int minGrpIdx[2] = { 0, 2 };
+  int maxGrpIdx[2] = { 1, 3 };
+  int* tmpMinGrp = minGrpIdx;
+  int* tmpMaxGrp = maxGrpIdx;
+  if (selectLumaPix[tmpMinGrp[0]] > selectLumaPix[tmpMinGrp[1]])
+  {
+    SWAP(tmpMinGrp[0], tmpMinGrp[1], int);
+  }
+  if (selectLumaPix[tmpMaxGrp[0]] > selectLumaPix[tmpMaxGrp[1]])
+  {
+    SWAP(tmpMaxGrp[0], tmpMaxGrp[1], int);
+  }
+  if (selectLumaPix[tmpMinGrp[0]] > selectLumaPix[tmpMaxGrp[1]])
+  {
+    SWAP(tmpMinGrp, tmpMaxGrp, int);
+  }
+  if (selectLumaPix[tmpMinGrp[1]] > selectLumaPix[tmpMaxGrp[0]])
+  {
+    SWAP(tmpMinGrp[1], tmpMaxGrp[0], int);
+  }
+
+  minLuma[0] = (selectLumaPix[tmpMinGrp[0]] + selectLumaPix[tmpMinGrp[1]] + 1) >> 1;
+  minLuma[1] = (selectChromaPix[tmpMinGrp[0]] + selectChromaPix[tmpMinGrp[1]] + 1) >> 1;
+  maxLuma[0] = (selectLumaPix[tmpMaxGrp[0]] + selectLumaPix[tmpMaxGrp[1]] + 1) >> 1;
+  maxLuma[1] = (selectChromaPix[tmpMaxGrp[0]] + selectChromaPix[tmpMaxGrp[1]] + 1) >> 1;
+
+  if (left_available || above_available)
+  {
+    int diff = maxLuma[0] - minLuma[0];
+    if (diff > 0)
+    {
+      int diffC = maxLuma[1] - minLuma[1];
+      int x = kvz_math_floor_log2(diff);
+      static const uint8_t DivSigTable[1 << 4] = {
+        // 4bit significands - 8 ( MSB is omitted )
+        0,  7,  6,  5,  5,  4,  4,  3,  3,  2,  2,  1,  1,  1,  1,  0
+      };
+      int normDiff = (diff << 4 >> x) & 15;
+      int v = DivSigTable[normDiff] | 8;
+      x += normDiff != 0;
+
+      int y = kvz_math_floor_log2(abs(diffC)) + 1;
+      int add = 1 << y >> 1;
+      *a = (diffC * v + add) >> y;
+      *shift = 3 + x - y;
+      if (*shift < 1)
+      {
+        *shift = 1;
+        *a = ((*a == 0) ? 0 : (*a < 0) ? -15 : 15);   // a=Sign(a)*15
+      }
+      *b = minLuma[1] - ((*a * minLuma[0]) >> *shift);
+    }
+    else
+    {
+      *a = 0;
+      *b = minLuma[1];
+      *shift = 0;
+    }
+  }
+  else
+  {
+    *a = 0;
+
+    *b = 1 << (internal_bit_depth - 1);
+
+    *shift = 0;
+  }
+}
+
+static void linear_transform_cclm(int a, int b, int shift, kvz_pixel * dst) {
+
+}
+
+
+void kvz_predict_cclm(
+  encoder_state_t const* const state,
+  const color_t color,
+  const int8_t width,
+  const int8_t height,
+  const int16_t x0,
+  const int16_t y0,
+  const int16_t stride,
+  const int8_t mode,
+  kvz_pixel* const y_rec,
+  kvz_pixel* dst
+)
+{
+  assert(mode == LM_CHROMA_IDX || mode == LM_CHROMA_L_IDX || mode == LM_CHROMA_T_IDX);
+
+  kvz_pixel sampled_luma[(LCU_WIDTH_C+1)*(LCU_WIDTH_C+1)];
+
+  for (int y = MAX(0, y0 -1); y < y0 + height; y++) {
+    for (int x = MAX(0, x0 - 1); x < x0 + width; x++) {
+      int s = 4;
+      s += y_rec[2 * x] * 2;
+      s += y_rec[2 * x + 1];
+      s += y_rec[2 * x - (x + x0 > 0)];
+      s += y_rec[2 * x + stride] * 2;
+      s += y_rec[2 * x + 1 + stride];
+      s += y_rec[2 * x - (x + x0 > 0) + stride];
+      sampled_luma[x + 1 + (y + 1) * 32] = s >> 3;
+    }
+    y += stride;
+  }
+
+  int a, b, shift;
+  get_cclm_parameters(state, width, height, mode,x0, y0, state->tile->frame->rec->y, state->tile->frame->source->u, &a, &b, &shift);
+  linear_transform_cclm(a, b, shift, dst);
+}
+
 void kvz_intra_predict(
   encoder_state_t *const state,
   kvz_intra_references *refs,

From 93c02644c87d0e10cc939a4a1290fb98c8e81650 Mon Sep 17 00:00:00 2001
From: Joose Sainio <joose.sainio@tuni.fi>
Date: Tue, 16 Nov 2021 07:10:31 +0200
Subject: [PATCH 02/19] [cclm] WIP: fix parameter calculation and add calls to
 the functions

---
 src/intra.c        | 110 ++++++++++++++++++++++++++++-----------------
 src/intra.h        |  14 ++++++
 src/search_intra.c |  48 ++++++++++++++------
 3 files changed, 117 insertions(+), 55 deletions(-)

diff --git a/src/intra.c b/src/intra.c
index 69ebaf15..2393b344 100644
--- a/src/intra.c
+++ b/src/intra.c
@@ -260,7 +260,7 @@ static void get_cclm_parameters(
   encoder_state_t const* const state,
   int8_t width, int8_t height, int8_t mode,
   int x0, int y0,
-  kvz_pixel * luma_src, kvz_pixel *chroma_ref,
+  kvz_intra_ref* luma_src, kvz_intra_references*chroma_ref,
   int *a, int *b, int *shift) {
 
   const int base_unit_size = 1 << (6 - PU_DEPTH_INTRA_MAX);
@@ -269,11 +269,14 @@ static void get_cclm_parameters(
   const int unit_w = base_unit_size >> 1;
   const int unit_h = base_unit_size >> 1;
 
+  const int c_height = height;
+  const int c_width = width;
+  height *= 2;
+  width *= 2;
+
   const int tu_width_in_units = width / unit_w;
   const int tu_height_in_units = height / unit_h;
 
-  const int c_height = height / 2;
-  const int c_width = width / 2;
 
   int top_template_samp_num = width; // for MDLM, the template sample number is 2W or 2H;
   int left_template_samp_num = height;
@@ -292,20 +295,13 @@ static void get_cclm_parameters(
   bool left_available = avai_left_units != 0;
   // Not sure if LCU_CU_WIDTH is correct macro here,
   // should be 16 for 64 CTU width 32 for 128
+    
+  char internal_bit_depth = state->encoder_control->bitdepth;
 
   int min_luma[2] = { MAX_INT, 0 };
   int max_luma[2] = { -MAX_INT, 0 };
-
-  kvz_pixel *src_color0 = luma_src;
-  kvz_pixel*  cur_chroma0 = chroma_ref;
-
-  char internal_bit_depth = state->encoder_control->bitdepth;
-
-  int minLuma[2] = { MAX_INT, 0 };
-  int maxLuma[2] = { -MAX_INT, 0 };
-
-  int32_t src_stride = state->tile->frame->source->stride;
-  kvz_pixel* src = src_color0 - src_stride;
+  
+  kvz_pixel* src;
   int actualTopTemplateSampNum = 0;
   int actualLeftTemplateSampNum = 0;
   if (mode == LM_CHROMA_T_IDX)
@@ -346,8 +342,8 @@ static void get_cclm_parameters(
   if (above_available)
   {
     cntT = MIN(actualTopTemplateSampNum, (1 + aboveIs4) << 1);
-    src = src_color0 - src_stride;
-    const kvz_pixel* cur = cur_chroma0 + 1;
+    src = luma_src->top + 1;
+    const kvz_pixel* cur = chroma_ref->ref.top + 1;
     for (int pos = startPos[0]; cnt < cntT; pos += pickStep[0], cnt++)
     {
       selectLumaPix[cnt] = src[pos];
@@ -358,11 +354,11 @@ static void get_cclm_parameters(
   if (left_available)
   {
     cntL = MIN(actualLeftTemplateSampNum, (1 + leftIs4) << 1);
-    src = src_color0 - 1;
-    const kvz_pixel* cur = cur_chroma0 + src_stride/2 + 1;
+    src = luma_src->left + 1;
+    const kvz_pixel* cur = chroma_ref->ref.left + 1;
     for (int pos = startPos[1], cnt = 0; cnt < cntL; pos += pickStep[1], cnt++)
     {
-      selectLumaPix[cnt + cntT] = src[pos * src_stride];
+      selectLumaPix[cnt + cntT] = src[pos];
       selectChromaPix[cnt + cntT] = cur[pos];
     }
   }
@@ -390,24 +386,24 @@ static void get_cclm_parameters(
   }
   if (selectLumaPix[tmpMinGrp[0]] > selectLumaPix[tmpMaxGrp[1]])
   {
-    SWAP(tmpMinGrp, tmpMaxGrp, int);
+    SWAP(tmpMinGrp, tmpMaxGrp, int*);
   }
   if (selectLumaPix[tmpMinGrp[1]] > selectLumaPix[tmpMaxGrp[0]])
   {
     SWAP(tmpMinGrp[1], tmpMaxGrp[0], int);
   }
 
-  minLuma[0] = (selectLumaPix[tmpMinGrp[0]] + selectLumaPix[tmpMinGrp[1]] + 1) >> 1;
-  minLuma[1] = (selectChromaPix[tmpMinGrp[0]] + selectChromaPix[tmpMinGrp[1]] + 1) >> 1;
-  maxLuma[0] = (selectLumaPix[tmpMaxGrp[0]] + selectLumaPix[tmpMaxGrp[1]] + 1) >> 1;
-  maxLuma[1] = (selectChromaPix[tmpMaxGrp[0]] + selectChromaPix[tmpMaxGrp[1]] + 1) >> 1;
+  min_luma[0] = (selectLumaPix[tmpMinGrp[0]] + selectLumaPix[tmpMinGrp[1]] + 1) >> 1;
+  min_luma[1] = (selectChromaPix[tmpMinGrp[0]] + selectChromaPix[tmpMinGrp[1]] + 1) >> 1;
+  max_luma[0] = (selectLumaPix[tmpMaxGrp[0]] + selectLumaPix[tmpMaxGrp[1]] + 1) >> 1;
+  max_luma[1] = (selectChromaPix[tmpMaxGrp[0]] + selectChromaPix[tmpMaxGrp[1]] + 1) >> 1;
 
   if (left_available || above_available)
   {
-    int diff = maxLuma[0] - minLuma[0];
+    int diff = max_luma[0] - min_luma[0];
     if (diff > 0)
     {
-      int diffC = maxLuma[1] - minLuma[1];
+      int diffC = max_luma[1] - min_luma[1];
       int x = kvz_math_floor_log2(diff);
       static const uint8_t DivSigTable[1 << 4] = {
         // 4bit significands - 8 ( MSB is omitted )
@@ -426,12 +422,12 @@ static void get_cclm_parameters(
         *shift = 1;
         *a = ((*a == 0) ? 0 : (*a < 0) ? -15 : 15);   // a=Sign(a)*15
       }
-      *b = minLuma[1] - ((*a * minLuma[0]) >> *shift);
+      *b = min_luma[1] - ((*a * min_luma[0]) >> *shift);
     }
     else
     {
       *a = 0;
-      *b = minLuma[1];
+      *b = min_luma[1];
       *shift = 0;
     }
   }
@@ -459,30 +455,62 @@ void kvz_predict_cclm(
   const int16_t y0,
   const int16_t stride,
   const int8_t mode,
-  kvz_pixel* const y_rec,
+  kvz_pixel const *  y_rec,
+  kvz_intra_references* chroma_ref,
   kvz_pixel* dst
 )
 {
   assert(mode == LM_CHROMA_IDX || mode == LM_CHROMA_L_IDX || mode == LM_CHROMA_T_IDX);
 
-  kvz_pixel sampled_luma[(LCU_WIDTH_C+1)*(LCU_WIDTH_C+1)];
+  
+  kvz_intra_ref sampled_luma;
 
-  for (int y = MAX(0, y0 -1); y < y0 + height; y++) {
-    for (int x = MAX(0, x0 - 1); x < x0 + width; x++) {
+  int x_scu = SUB_SCU(x0);
+  int y_scu = SUB_SCU(y0);
+
+  if(x0) {
+    for(int y = 0; y < height * 2; y+=2) {
       int s = 4;
-      s += y_rec[2 * x] * 2;
-      s += y_rec[2 * x + 1];
-      s += y_rec[2 * x - (x + x0 > 0)];
-      s += y_rec[2 * x + stride] * 2;
-      s += y_rec[2 * x + 1 + stride];
-      s += y_rec[2 * x - (x + x0 > 0) + stride];
-      sampled_luma[x + 1 + (y + 1) * 32] = s >> 3;
+      s += x_scu ? y_rec[y * LCU_WIDTH - 1] * 2 : state->tile->frame->rec->y[x0 - 1 + (y0 + y) * stride] * 2;
+      s += x_scu ? y_rec[y * LCU_WIDTH - 2]: state->tile->frame->rec->y[x0 - 2 + (y0 + y) * stride];
+      s += x_scu ? y_rec[(y + 1) * LCU_WIDTH - 1] * 2: state->tile->frame->rec->y[x0 - 1 + (y0 + y + 1) * stride] * 2;
+      s += x_scu ? y_rec[(y + 1) * LCU_WIDTH - 2]: state->tile->frame->rec->y[x0 - 2 + (y0 + y + 1) * stride];
+      s += y_rec[y * LCU_WIDTH];
+      s += y_rec[(y + 1) * LCU_WIDTH];
+      sampled_luma.left[y/2] = s >> 3;
     }
-    y += stride;
   }
 
+  if(y0) {
+    for(int x = 0; x < width*2; x += 2) {
+      bool left_padding = x0 || x;
+      int s = 4;
+      s += y_scu ? y_rec[x - LCU_WIDTH * 2] * 2 : state->tile->frame->rec->y[x0 + x +(y0 - 2) * stride] * 2;
+      s += y_scu ? y_rec[x - LCU_WIDTH] * 2 : state->tile->frame->rec->y[x0 + x +(y0 - 1) * stride] * 2;
+      s += y_scu ? y_rec[x - LCU_WIDTH * 2 - left_padding] : state->tile->frame->rec->y[x0 + x - left_padding + (y0 - 2) * stride];
+      s += y_scu ? y_rec[x - LCU_WIDTH - left_padding] : state->tile->frame->rec->y[x0 + x - left_padding + (y0 - 1) * stride];
+      s += y_scu ? y_rec[x - LCU_WIDTH * 2 + 1] : state->tile->frame->rec->y[x0 + x + 1 + (y0 - 2) * stride];
+      s += y_scu ? y_rec[x - LCU_WIDTH + 1] : state->tile->frame->rec->y[x0 + x + 1 + (y0 - 1) * stride];
+      sampled_luma.top[x / 2] = s >> 3;
+    }
+  }
+
+  //for (int y = MAX(0, y0 -1) % 64; y < y0 + height; y++) {
+  //  for (int x = MAX(0, x0 - 1) % 64; x < x0 + width; x++) {
+  //    int s = 4;
+  //    s += y_rec[2 * x] * 2;
+  //    s += y_rec[2 * x + 1];
+  //    s += y_rec[2 * x - (x + x0 > 0)];
+  //    s += y_rec[2 * x + stride] * 2;
+  //    s += y_rec[2 * x + 1 + stride];
+  //    s += y_rec[2 * x - (x + x0 > 0) + stride];
+  //    sampled_luma[x + 1 + (y + 1) * 33] = s >> 3;
+  //  }
+  //  y_rec += 64;
+  //}
+
   int a, b, shift;
-  get_cclm_parameters(state, width, height, mode,x0, y0, state->tile->frame->rec->y, state->tile->frame->source->u, &a, &b, &shift);
+  get_cclm_parameters(state, width, height, mode,x0, y0, &sampled_luma, chroma_ref, &a, &b, &shift);
   linear_transform_cclm(a, b, shift, dst);
 }
 
diff --git a/src/intra.h b/src/intra.h
index e69621c4..5652ec41 100644
--- a/src/intra.h
+++ b/src/intra.h
@@ -120,3 +120,17 @@ void kvz_intra_recon_cu(
   cu_info_t *cur_cu,
   lcu_t *lcu);
 
+
+void kvz_predict_cclm(
+  encoder_state_t const* const state,
+  const color_t color,
+  const int8_t width,
+  const int8_t height,
+  const int16_t x0,
+  const int16_t y0,
+  const int16_t stride,
+  const int8_t mode,
+  kvz_pixel const* y_rec,
+  kvz_intra_references* chroma_ref,
+  kvz_pixel* dst
+);
\ No newline at end of file
diff --git a/src/search_intra.c b/src/search_intra.c
index 206a26a1..4960f6cf 100644
--- a/src/search_intra.c
+++ b/src/search_intra.c
@@ -461,7 +461,7 @@ static void search_intra_chroma_rough(encoder_state_t * const state,
   const unsigned width = MAX(LCU_WIDTH_C >> depth, TR_MIN_WIDTH);
   const int_fast8_t log2_width_c = MAX(LOG2_LCU_WIDTH - (depth + 1), 2);
 
-  for (int i = 0; i < 5; ++i) {
+  for (int i = 0; i < 8; ++i) {
     costs[i] = 0;
   }
 
@@ -476,15 +476,20 @@ static void search_intra_chroma_rough(encoder_state_t * const state,
 
   kvz_pixels_blit(orig_u, orig_block, width, width, origstride, width);
   for (int i = 0; i < 5; ++i) {
-    if (modes[i] == luma_mode) continue;
+    if (modes[i] == -1) continue;
     kvz_intra_predict(state, refs_u, log2_width_c, modes[i], COLOR_U, pred, false);
     //costs[i] += get_cost(encoder_state, pred, orig_block, satd_func, sad_func, width);
     costs[i] += satd_func(pred, orig_block);
   }
+  for (int i = 5; i < 8; i++) {
+    kvz_predict_cclm(
+      state,
+      COLOR_U, width, width, x_px, y_px, state->tile->frame->source->stride, modes[i], state->tile->frame->rec->y, refs_u,  _pred);
+  }
 
   kvz_pixels_blit(orig_v, orig_block, width, width, origstride, width);
   for (int i = 0; i < 5; ++i) {
-    if (modes[i] == luma_mode) continue;
+    if (modes[i] == -1) continue;
     kvz_intra_predict(state, refs_v, log2_width_c, modes[i], COLOR_V, pred, false);
     //costs[i] += get_cost(encoder_state, pred, orig_block, satd_func, sad_func, width);
     costs[i] += satd_func(pred, orig_block);
@@ -820,11 +825,20 @@ double kvz_chroma_mode_bits(const encoder_state_t *state, int8_t chroma_mode, in
 int8_t kvz_search_intra_chroma_rdo(encoder_state_t * const state,
                                   int x_px, int y_px, int depth,
                                   int8_t intra_mode,
-                                  int8_t modes[5], int8_t num_modes,
+                                  int8_t modes[8], int8_t num_modes,
                                   lcu_t *const lcu)
 {
   const bool reconstruct_chroma = (depth != 4) || (x_px & 4 && y_px & 4);
 
+
+  kvz_intra_references refs;
+  const vector2d_t luma_px = { x_px, y_px };
+  const vector2d_t pic_px = {
+    state->tile->frame->width,
+    state->tile->frame->height,
+  };
+  kvz_intra_build_reference(6-depth, COLOR_U, &luma_px, &pic_px, lcu, &refs, state->encoder_control->cfg.wpp);
+
   if (reconstruct_chroma) {
     const vector2d_t lcu_px = { SUB_SCU(x_px), SUB_SCU(y_px) };
     cu_info_t *const tr_cu = LCU_GET_CU_AT_PX(lcu, lcu_px.x, lcu_px.y);
@@ -839,12 +853,18 @@ int8_t kvz_search_intra_chroma_rdo(encoder_state_t * const state,
 
     for (int8_t chroma_mode_i = 0; chroma_mode_i < num_modes; ++chroma_mode_i) {
       chroma.mode = modes[chroma_mode_i];
-
-      kvz_intra_recon_cu(state,
-                         x_px, y_px,
-                         depth,
-                         -1, chroma.mode, // skip luma
-                         NULL, lcu);
+      if (chroma.mode == -1) continue;
+      if(chroma.mode < 67) {
+        kvz_intra_recon_cu(state,
+                           x_px, y_px,
+                           depth,
+                           -1, chroma.mode, // skip luma
+                           NULL, lcu);
+      }
+      else {
+        kvz_predict_cclm(
+          state, COLOR_U, 32 >> (depth), 32 >> (depth), x_px, y_px, state->tile->frame->source->stride, chroma.mode, lcu->rec.y, &refs, NULL);
+      }
       chroma.cost = kvz_cu_rd_cost_chroma(state, lcu_px.x, lcu_px.y, depth, tr_cu, lcu);
 
       double mode_bits = kvz_chroma_mode_bits(state, chroma.mode, intra_mode);
@@ -871,8 +891,8 @@ int8_t kvz_search_cu_intra_chroma(encoder_state_t * const state,
   cu_info_t *cur_pu = LCU_GET_CU_AT_PX(lcu, lcu_px.x, lcu_px.y);
   int8_t intra_mode = cur_pu->intra.mode;
 
-  double costs[5];
-  int8_t modes[5] = { 0, 50, 18, 1, 67 };
+  double costs[8];
+  int8_t modes[8] = { 0, 50, 18, 1, -1, 67, 68, 69 };
   if (intra_mode != 0 && intra_mode != 50 && intra_mode != 18 && intra_mode != 1) {
     modes[4] = intra_mode;
   }
@@ -885,13 +905,13 @@ int8_t kvz_search_cu_intra_chroma(encoder_state_t * const state,
   int num_modes = modes_in_depth[depth];
 
   if (state->encoder_control->cfg.rdo == 3) {
-    num_modes = modes[4] == intra_mode ? 5 : 4;
+    num_modes = 8;
   }
 
   // Don't do rough mode search if all modes are selected.
   // FIXME: It might make more sense to only disable rough search if
   // num_modes is 0.is 0.
-  if (num_modes != 1 && num_modes != 5 && num_modes != 4) {
+  if (num_modes != 1 && num_modes != 5 && num_modes != 4 && num_modes != 8) {
     const int_fast8_t log2_width_c = MAX(LOG2_LCU_WIDTH - depth - 1, 2);
     const vector2d_t pic_px = { state->tile->frame->width, state->tile->frame->height };
     const vector2d_t luma_px = { x_px, y_px };

From 4e8c9043a1fbed71a289d75f716506dabcb7d054 Mon Sep 17 00:00:00 2001
From: Joose Sainio <joose.sainio@tuni.fi>
Date: Tue, 16 Nov 2021 08:31:32 +0200
Subject: [PATCH 03/19] [cclm] CCLM parameter calculation *should* work
 correctly

---
 src/intra.c        | 9 +++++----
 src/search_intra.c | 6 +++---
 2 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/src/intra.c b/src/intra.c
index 2393b344..e1ffe692 100644
--- a/src/intra.c
+++ b/src/intra.c
@@ -274,8 +274,8 @@ static void get_cclm_parameters(
   height *= 2;
   width *= 2;
 
-  const int tu_width_in_units = width / unit_w;
-  const int tu_height_in_units = height / unit_h;
+  const int tu_width_in_units = c_width / unit_w;
+  const int tu_height_in_units = c_height / unit_h;
 
 
   int top_template_samp_num = width; // for MDLM, the template sample number is 2W or 2H;
@@ -342,7 +342,7 @@ static void get_cclm_parameters(
   if (above_available)
   {
     cntT = MIN(actualTopTemplateSampNum, (1 + aboveIs4) << 1);
-    src = luma_src->top + 1;
+    src = luma_src->top;
     const kvz_pixel* cur = chroma_ref->ref.top + 1;
     for (int pos = startPos[0]; cnt < cntT; pos += pickStep[0], cnt++)
     {
@@ -354,7 +354,7 @@ static void get_cclm_parameters(
   if (left_available)
   {
     cntL = MIN(actualLeftTemplateSampNum, (1 + leftIs4) << 1);
-    src = luma_src->left + 1;
+    src = luma_src->left;
     const kvz_pixel* cur = chroma_ref->ref.left + 1;
     for (int pos = startPos[1], cnt = 0; cnt < cntL; pos += pickStep[1], cnt++)
     {
@@ -467,6 +467,7 @@ void kvz_predict_cclm(
 
   int x_scu = SUB_SCU(x0);
   int y_scu = SUB_SCU(y0);
+  y_rec += x_scu + y_scu * LCU_WIDTH;
 
   if(x0) {
     for(int y = 0; y < height * 2; y+=2) {
diff --git a/src/search_intra.c b/src/search_intra.c
index 4960f6cf..52b654bd 100644
--- a/src/search_intra.c
+++ b/src/search_intra.c
@@ -454,7 +454,7 @@ static void search_intra_chroma_rough(encoder_state_t * const state,
                                       const kvz_pixel *orig_u, const kvz_pixel *orig_v, int16_t origstride,
                                       kvz_intra_references *refs_u, kvz_intra_references *refs_v,
                                       int8_t luma_mode,
-                                      int8_t modes[5], double costs[5])
+                                      int8_t modes[8], double costs[8], lcu_t* lcu)
 {
   assert(!(x_px & 4 || y_px & 4));
 
@@ -484,7 +484,7 @@ static void search_intra_chroma_rough(encoder_state_t * const state,
   for (int i = 5; i < 8; i++) {
     kvz_predict_cclm(
       state,
-      COLOR_U, width, width, x_px, y_px, state->tile->frame->source->stride, modes[i], state->tile->frame->rec->y, refs_u,  _pred);
+      COLOR_U, width, width, x_px, y_px, state->tile->frame->source->stride, modes[i], lcu->rec.y, refs_u,  _pred);
   }
 
   kvz_pixels_blit(orig_v, orig_block, width, width, origstride, width);
@@ -929,7 +929,7 @@ int8_t kvz_search_cu_intra_chroma(encoder_state_t * const state,
     search_intra_chroma_rough(state, x_px, y_px, depth,
                               ref_u, ref_v, LCU_WIDTH_C,
                               &refs_u, &refs_v,
-                              intra_mode, modes, costs);
+                              intra_mode, modes, costs, lcu);
   }
 
   int8_t intra_mode_chroma = intra_mode;

From 677bf1edcb974250ca3fdbc4515db45dbe8535a7 Mon Sep 17 00:00:00 2001
From: Joose Sainio <joose.sainio@tuni.fi>
Date: Tue, 16 Nov 2021 09:31:47 +0200
Subject: [PATCH 04/19] [cclm] linear transform

---
 src/intra.c | 49 +++++++++++++++++++++++++++++--------------------
 1 file changed, 29 insertions(+), 20 deletions(-)

diff --git a/src/intra.c b/src/intra.c
index e1ffe692..32d1750c 100644
--- a/src/intra.c
+++ b/src/intra.c
@@ -441,8 +441,16 @@ static void get_cclm_parameters(
   }
 }
 
-static void linear_transform_cclm(int a, int b, int shift, kvz_pixel * dst) {
-
+static void linear_transform_cclm(int scale, int shift, int offset, kvz_pixel * src, kvz_pixel * dst, int stride, int height) {
+  for (int y = 0; y < height; ++y) {
+    for (int x=0; x < stride; ++x) {
+      int val = src[x + y * stride] * scale;
+      val >>= shift;
+      val += offset;
+      val = CLIP_TO_PIXEL(val);
+      dst[x + y * stride] = val;
+    }
+  }
 }
 
 
@@ -463,7 +471,8 @@ void kvz_predict_cclm(
   assert(mode == LM_CHROMA_IDX || mode == LM_CHROMA_L_IDX || mode == LM_CHROMA_T_IDX);
 
   
-  kvz_intra_ref sampled_luma;
+  kvz_intra_ref sampled_luma_ref;
+  kvz_pixel sampled_luma[LCU_CHROMA_SIZE];
 
   int x_scu = SUB_SCU(x0);
   int y_scu = SUB_SCU(y0);
@@ -478,7 +487,7 @@ void kvz_predict_cclm(
       s += x_scu ? y_rec[(y + 1) * LCU_WIDTH - 2]: state->tile->frame->rec->y[x0 - 2 + (y0 + y + 1) * stride];
       s += y_rec[y * LCU_WIDTH];
       s += y_rec[(y + 1) * LCU_WIDTH];
-      sampled_luma.left[y/2] = s >> 3;
+      sampled_luma_ref.left[y/2] = s >> 3;
     }
   }
 
@@ -492,27 +501,27 @@ void kvz_predict_cclm(
       s += y_scu ? y_rec[x - LCU_WIDTH - left_padding] : state->tile->frame->rec->y[x0 + x - left_padding + (y0 - 1) * stride];
       s += y_scu ? y_rec[x - LCU_WIDTH * 2 + 1] : state->tile->frame->rec->y[x0 + x + 1 + (y0 - 2) * stride];
       s += y_scu ? y_rec[x - LCU_WIDTH + 1] : state->tile->frame->rec->y[x0 + x + 1 + (y0 - 1) * stride];
-      sampled_luma.top[x / 2] = s >> 3;
+      sampled_luma_ref.top[x / 2] = s >> 3;
     }
   }
 
-  //for (int y = MAX(0, y0 -1) % 64; y < y0 + height; y++) {
-  //  for (int x = MAX(0, x0 - 1) % 64; x < x0 + width; x++) {
-  //    int s = 4;
-  //    s += y_rec[2 * x] * 2;
-  //    s += y_rec[2 * x + 1];
-  //    s += y_rec[2 * x - (x + x0 > 0)];
-  //    s += y_rec[2 * x + stride] * 2;
-  //    s += y_rec[2 * x + 1 + stride];
-  //    s += y_rec[2 * x - (x + x0 > 0) + stride];
-  //    sampled_luma[x + 1 + (y + 1) * 33] = s >> 3;
-  //  }
-  //  y_rec += 64;
-  //}
+  for (int y = 0; y < height * 2; y+=2) {
+    for (int x = 0; x <  width * 2; x+=2) {
+      int s = 4;
+      s += y_rec[2 * x] * 2;
+      s += y_rec[2 * x + 1];
+      s += x0 && !x ? state->tile->frame->rec->y[x0 - 1 + y0 * stride] : y_rec[2 * x - (x + x0 > 0)];
+      s += y_rec[2 * x + LCU_WIDTH] * 2;
+      s += y_rec[2 * x + 1 + LCU_WIDTH];
+      s += x0 && !x ? state->tile->frame->rec->y[x0 - 1 + (y0 + 1) * stride] : y_rec[2 * x - (x + x0 > 0) + stride];
+      sampled_luma[x + y * width] = s >> 3;
+    }
+    y_rec += LCU_WIDTH;
+  }
 
   int a, b, shift;
-  get_cclm_parameters(state, width, height, mode,x0, y0, &sampled_luma, chroma_ref, &a, &b, &shift);
-  linear_transform_cclm(a, b, shift, dst);
+  get_cclm_parameters(state, width, height, mode,x0, y0, &sampled_luma_ref, chroma_ref, &a, &b, &shift);
+  linear_transform_cclm(a, shift, b,sampled_luma, dst, width, height);
 }
 
 void kvz_intra_predict(

From ecc55c9edf97e9a2be48fff68fa4f4fd579d3389 Mon Sep 17 00:00:00 2001
From: Joose Sainio <joose.sainio@tuni.fi>
Date: Tue, 16 Nov 2021 14:21:38 +0200
Subject: [PATCH 05/19] [cclm] align ref pixel generation code and add comments

---
 src/intra.c | 43 +++++++++++++++++++++++++------------------
 1 file changed, 25 insertions(+), 18 deletions(-)

diff --git a/src/intra.c b/src/intra.c
index 32d1750c..be85b3d3 100644
--- a/src/intra.c
+++ b/src/intra.c
@@ -281,11 +281,12 @@ static void get_cclm_parameters(
   int top_template_samp_num = width; // for MDLM, the template sample number is 2W or 2H;
   int left_template_samp_num = height;
 
-  int total_above_units = (top_template_samp_num + (unit_w - 1)) / unit_w;
-  int total_left_units = (left_template_samp_num + (unit_h - 1)) / unit_h;
-  int total_units = total_left_units + total_above_units + 1;
-  int above_right_units = total_above_units - tu_width_in_units;
-  int left_below_units = total_left_units - tu_height_in_units;
+  // These are used for calculating some stuff for non-square CUs
+  //int total_above_units = (top_template_samp_num + (unit_w - 1)) / unit_w;
+  //int total_left_units = (left_template_samp_num + (unit_h - 1)) / unit_h;
+  //int total_units = total_left_units + total_above_units + 1;
+  //int above_right_units = total_above_units - tu_width_in_units;
+  //int left_below_units = total_left_units - tu_height_in_units;
   int avai_above_right_units = 0;  // TODO these are non zero only with non-square CUs
   int avai_left_below_units = 0;
   int avai_above_units = CLIP(0, tu_height_in_units, y0/base_unit_size);
@@ -293,8 +294,6 @@ static void get_cclm_parameters(
 
   bool above_available = avai_above_units != 0;
   bool left_available = avai_left_units != 0;
-  // Not sure if LCU_CU_WIDTH is correct macro here,
-  // should be 16 for 64 CTU width 32 for 128
     
   char internal_bit_depth = state->encoder_control->bitdepth;
 
@@ -478,15 +477,20 @@ void kvz_predict_cclm(
   int y_scu = SUB_SCU(y0);
   y_rec += x_scu + y_scu * LCU_WIDTH;
 
+  // Essentially what this does is that it uses 6-tap filtering to downsample
+  // the luma intra references down to match the resolution of the chroma channel.
+  // The luma reference is only needed when we are not on the edge of the picture.
+  // Because the reference pixels that are needed on the edge of the ctu this code
+  // is kinda messy but what can you do
   if(x0) {
     for(int y = 0; y < height * 2; y+=2) {
       int s = 4;
-      s += x_scu ? y_rec[y * LCU_WIDTH - 1] * 2 : state->tile->frame->rec->y[x0 - 1 + (y0 + y) * stride] * 2;
-      s += x_scu ? y_rec[y * LCU_WIDTH - 2]: state->tile->frame->rec->y[x0 - 2 + (y0 + y) * stride];
-      s += x_scu ? y_rec[(y + 1) * LCU_WIDTH - 1] * 2: state->tile->frame->rec->y[x0 - 1 + (y0 + y + 1) * stride] * 2;
-      s += x_scu ? y_rec[(y + 1) * LCU_WIDTH - 2]: state->tile->frame->rec->y[x0 - 2 + (y0 + y + 1) * stride];
-      s += y_rec[y * LCU_WIDTH];
-      s += y_rec[(y + 1) * LCU_WIDTH];
+      s += x_scu ? y_rec[y * LCU_WIDTH - 1] * 2       : state->tile->frame->rec->y[x0 - 1 + (y0 + y) * stride] * 2;
+      s += x_scu ? y_rec[y * LCU_WIDTH - 2]           : state->tile->frame->rec->y[x0 - 2 + (y0 + y) * stride];
+      s += x_scu ? y_rec[(y + 1) * LCU_WIDTH - 1] * 2 : state->tile->frame->rec->y[x0 - 1 + (y0 + y + 1) * stride] * 2;
+      s += x_scu ? y_rec[(y + 1) * LCU_WIDTH - 2]     : state->tile->frame->rec->y[x0 - 2 + (y0 + y + 1) * stride];
+      s +=         y_rec[y * LCU_WIDTH];
+      s +=         y_rec[(y + 1) * LCU_WIDTH];
       sampled_luma_ref.left[y/2] = s >> 3;
     }
   }
@@ -495,16 +499,18 @@ void kvz_predict_cclm(
     for(int x = 0; x < width*2; x += 2) {
       bool left_padding = x0 || x;
       int s = 4;
-      s += y_scu ? y_rec[x - LCU_WIDTH * 2] * 2 : state->tile->frame->rec->y[x0 + x +(y0 - 2) * stride] * 2;
-      s += y_scu ? y_rec[x - LCU_WIDTH] * 2 : state->tile->frame->rec->y[x0 + x +(y0 - 1) * stride] * 2;
+      s += y_scu ? y_rec[x - LCU_WIDTH * 2] * 2            : state->tile->frame->rec->y[x0 + x +(y0 - 2) * stride] * 2;
+      s += y_scu ? y_rec[x - LCU_WIDTH] * 2                : state->tile->frame->rec->y[x0 + x +(y0 - 1) * stride] * 2;
       s += y_scu ? y_rec[x - LCU_WIDTH * 2 - left_padding] : state->tile->frame->rec->y[x0 + x - left_padding + (y0 - 2) * stride];
-      s += y_scu ? y_rec[x - LCU_WIDTH - left_padding] : state->tile->frame->rec->y[x0 + x - left_padding + (y0 - 1) * stride];
-      s += y_scu ? y_rec[x - LCU_WIDTH * 2 + 1] : state->tile->frame->rec->y[x0 + x + 1 + (y0 - 2) * stride];
-      s += y_scu ? y_rec[x - LCU_WIDTH + 1] : state->tile->frame->rec->y[x0 + x + 1 + (y0 - 1) * stride];
+      s += y_scu ? y_rec[x - LCU_WIDTH - left_padding]     : state->tile->frame->rec->y[x0 + x - left_padding + (y0 - 1) * stride];
+      s += y_scu ? y_rec[x - LCU_WIDTH * 2 + 1]            : state->tile->frame->rec->y[x0 + x + 1 + (y0 - 2) * stride];
+      s += y_scu ? y_rec[x - LCU_WIDTH + 1]                : state->tile->frame->rec->y[x0 + x + 1 + (y0 - 1) * stride];
       sampled_luma_ref.top[x / 2] = s >> 3;
     }
   }
 
+  // Downsample the reconstructed luma samples so that they can be mapped to the
+  // chroma resolution to generate the chroma prediction
   for (int y = 0; y < height * 2; y+=2) {
     for (int x = 0; x <  width * 2; x+=2) {
       int s = 4;
@@ -521,6 +527,7 @@ void kvz_predict_cclm(
 
   int a, b, shift;
   get_cclm_parameters(state, width, height, mode,x0, y0, &sampled_luma_ref, chroma_ref, &a, &b, &shift);
+
   linear_transform_cclm(a, shift, b,sampled_luma, dst, width, height);
 }
 

From 91877ef2c4ac9c24a67c94ab53787938c84aaca2 Mon Sep 17 00:00:00 2001
From: Joose Sainio <joose.sainio@tuni.fi>
Date: Tue, 16 Nov 2021 15:02:19 +0200
Subject: [PATCH 06/19] [cclm] fix condition for selecting the data from frame
 buffer and not ctu

---
 src/intra.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/intra.c b/src/intra.c
index be85b3d3..2cde088b 100644
--- a/src/intra.c
+++ b/src/intra.c
@@ -516,10 +516,13 @@ void kvz_predict_cclm(
       int s = 4;
       s += y_rec[2 * x] * 2;
       s += y_rec[2 * x + 1];
-      s += x0 && !x ? state->tile->frame->rec->y[x0 - 1 + y0 * stride] : y_rec[2 * x - (x + x0 > 0)];
+      // If we are at the edge of the CTU, read the pixel from the frame reconstruction buffer,
+      // *except* when we are also at the edge of the frame, in which case we want to duplicate
+      // the edge pixel
+      s += !x_scu && !x && x0 ? state->tile->frame->rec->y[x0 - 1 + y0 * stride] : y_rec[2 * x - ((x + x0) > 0)];
       s += y_rec[2 * x + LCU_WIDTH] * 2;
       s += y_rec[2 * x + 1 + LCU_WIDTH];
-      s += x0 && !x ? state->tile->frame->rec->y[x0 - 1 + (y0 + 1) * stride] : y_rec[2 * x - (x + x0 > 0) + stride];
+      s += !x_scu && !x && x0 ? state->tile->frame->rec->y[x0 - 1 + (y0 + 1) * stride] : y_rec[2 * x - ((x + x0) > 0) + stride];
       sampled_luma[x + y * width] = s >> 3;
     }
     y_rec += LCU_WIDTH;

From 70f6afff4f480ce74eac13762d7f9d25ef470fd5 Mon Sep 17 00:00:00 2001
From: Joose Sainio <joose.sainio@tuni.fi>
Date: Wed, 17 Nov 2021 11:19:55 +0200
Subject: [PATCH 07/19] [cclm] Search should be working

The code still has duplication and inefficiencies, but the actual search
*should* be working.
---
 src/intra.c        | 72 ++++++++++++++++++++++++++++++++++------------
 src/intra.h        | 10 ++++++-
 src/search.c       |  9 +++---
 src/search_intra.c | 61 ++++++++++++++++++++++++++-------------
 src/search_intra.h |  5 ++--
 5 files changed, 111 insertions(+), 46 deletions(-)

diff --git a/src/intra.c b/src/intra.c
index 2cde088b..21c539cf 100644
--- a/src/intra.c
+++ b/src/intra.c
@@ -250,9 +250,9 @@ static void intra_pred_dc(
 
 enum lm_mode
 {
-  LM_CHROMA_IDX = 67,
-  LM_CHROMA_L_IDX = 68,
-  LM_CHROMA_T_IDX = 69,
+  LM_CHROMA_IDX = 81,
+  LM_CHROMA_L_IDX = 82,
+  LM_CHROMA_T_IDX = 83,
 };
 
 
@@ -261,7 +261,7 @@ static void get_cclm_parameters(
   int8_t width, int8_t height, int8_t mode,
   int x0, int y0,
   kvz_intra_ref* luma_src, kvz_intra_references*chroma_ref,
-  int *a, int *b, int *shift) {
+  int16_t *a, int16_t*b, int16_t*shift) {
 
   const int base_unit_size = 1 << (6 - PU_DEPTH_INTRA_MAX);
 
@@ -412,7 +412,7 @@ static void get_cclm_parameters(
       int v = DivSigTable[normDiff] | 8;
       x += normDiff != 0;
 
-      int y = kvz_math_floor_log2(abs(diffC)) + 1;
+      int y = diffC ? kvz_math_floor_log2(abs(diffC)) + 1 : 0;
       int add = 1 << y >> 1;
       *a = (diffC * v + add) >> y;
       *shift = 3 + x - y;
@@ -440,7 +440,10 @@ static void get_cclm_parameters(
   }
 }
 
-static void linear_transform_cclm(int scale, int shift, int offset, kvz_pixel * src, kvz_pixel * dst, int stride, int height) {
+static void linear_transform_cclm(cclm_parameters_t* cclm_params, kvz_pixel * src, kvz_pixel * dst, int stride, int height) {
+  int scale = cclm_params->a;
+  int shift = cclm_params->shift;
+  int offset = cclm_params->b;
   for (int y = 0; y < height; ++y) {
     for (int x=0; x < stride; ++x) {
       int val = src[x + y * stride] * scale;
@@ -464,7 +467,8 @@ void kvz_predict_cclm(
   const int8_t mode,
   kvz_pixel const *  y_rec,
   kvz_intra_references* chroma_ref,
-  kvz_pixel* dst
+  kvz_pixel* dst,
+  cclm_parameters_t* cclm_params
 )
 {
   assert(mode == LM_CHROMA_IDX || mode == LM_CHROMA_L_IDX || mode == LM_CHROMA_T_IDX);
@@ -523,15 +527,19 @@ void kvz_predict_cclm(
       s += y_rec[2 * x + LCU_WIDTH] * 2;
       s += y_rec[2 * x + 1 + LCU_WIDTH];
       s += !x_scu && !x && x0 ? state->tile->frame->rec->y[x0 - 1 + (y0 + 1) * stride] : y_rec[2 * x - ((x + x0) > 0) + stride];
-      sampled_luma[x + y * width] = s >> 3;
+      sampled_luma[x / 2 + y / 2 * width] = s >> 3;
     }
     y_rec += LCU_WIDTH;
   }
 
-  int a, b, shift;
+  int16_t a, b, shift;
   get_cclm_parameters(state, width, height, mode,x0, y0, &sampled_luma_ref, chroma_ref, &a, &b, &shift);
+  cclm_params->shift = shift;
+  cclm_params->a = a;
+  cclm_params->b = b;
 
-  linear_transform_cclm(a, shift, b,sampled_luma, dst, width, height);
+  if(dst)
+    linear_transform_cclm(cclm_params, sampled_luma, dst, width, height);
 }
 
 void kvz_intra_predict(
@@ -859,6 +867,7 @@ static void intra_recon_tb_leaf(
   int y,
   int depth,
   int8_t intra_mode,
+  cclm_parameters_t *cclm_params,
   lcu_t *lcu,
   color_t color)
 {
@@ -878,14 +887,37 @@ static void intra_recon_tb_leaf(
     state->tile->frame->width,
     state->tile->frame->height,
   };
-  const vector2d_t lcu_px = { SUB_SCU(x) >> shift, SUB_SCU(y) >> shift};
+  int x_scu = SUB_SCU(x);
+  const vector2d_t lcu_px = {x_scu >> shift, SUB_SCU(y) >> shift};
 
   kvz_intra_references refs;
   kvz_intra_build_reference(log2width, color, &luma_px, &pic_px, lcu, &refs, cfg->wpp);
 
   kvz_pixel pred[32 * 32];
+  int stride = state->tile->frame->source->stride;
   const bool filter_boundary = color == COLOR_Y && !(cfg->lossless && cfg->implicit_rdpcm);
-  kvz_intra_predict(state, &refs, log2width, intra_mode, color, pred, filter_boundary);
+  if(intra_mode < 68) {
+    kvz_intra_predict(state, &refs, log2width, intra_mode, color, pred, filter_boundary);
+  } else {
+    kvz_pixel *y_rec = lcu->rec.y;
+    for (int y_ = 0; y_ < width * 2; y_ += 2) {
+      for (int x_ = 0; x_ < width * 2; x_ += 2) {
+        int s = 4;
+        s += y_rec[2 * x_] * 2;
+        s += y_rec[2 * x_ + 1];
+        // If we are at the edge of the CTU, read the pixel from the frame reconstruction buffer,
+        // *except* when we are also at the edge of the frame, in which case we want to duplicate
+        // the edge pixel
+        s += !x_scu && !x_ && x ? state->tile->frame->rec->y[x - 1 + y * stride] : y_rec[2 * x_ - ((x_ + x) > 0)];
+        s += y_rec[2 * x_ + LCU_WIDTH] * 2;
+        s += y_rec[2 * x_ + 1 + LCU_WIDTH];
+        s += !x_scu && !x_ && x ? state->tile->frame->rec->y[x - 1 + (y + 1) * stride] : y_rec[2 * x_ - ((x_ + x) > 0) + stride];
+        pred[x_ / 2 + y_ * width / 2] = s >> 3;
+      }
+      y_rec += LCU_WIDTH;
+    }
+    linear_transform_cclm(&cclm_params[color == COLOR_U ? 0 : 1], pred, pred, width, width);
+  }
 
   const int index = lcu_px.x + lcu_px.y * lcu_width;
   kvz_pixel *block = NULL;
@@ -920,6 +952,7 @@ static void intra_recon_tb_leaf(
  * \param mode_luma     intra mode for luma, or -1 to skip luma recon
  * \param mode_chroma   intra mode for chroma, or -1 to skip chroma recon
  * \param cur_cu        pointer to the CU, or NULL to fetch CU from LCU
+ * \param cclm_params   pointer to the CCLM parameters; can be NULL if the mode is not a CCLM mode
  * \param lcu           containing LCU
  */
 void kvz_intra_recon_cu(
@@ -930,6 +963,7 @@ void kvz_intra_recon_cu(
   int8_t mode_luma,
   int8_t mode_chroma,
   cu_info_t *cur_cu,
+  cclm_parameters_t *cclm_params,
   lcu_t *lcu)
 {
   const vector2d_t lcu_px = { SUB_SCU(x), SUB_SCU(y) };
@@ -954,10 +988,10 @@ void kvz_intra_recon_cu(
     const int32_t x2 = x + offset;
     const int32_t y2 = y + offset;
 
-    kvz_intra_recon_cu(state, x,  y,  depth + 1, mode_luma, mode_chroma, NULL, lcu);
-    kvz_intra_recon_cu(state, x2, y,  depth + 1, mode_luma, mode_chroma, NULL, lcu);
-    kvz_intra_recon_cu(state, x,  y2, depth + 1, mode_luma, mode_chroma, NULL, lcu);
-    kvz_intra_recon_cu(state, x2, y2, depth + 1, mode_luma, mode_chroma, NULL, lcu);
+    kvz_intra_recon_cu(state, x,  y,  depth + 1, mode_luma, mode_chroma, NULL, cclm_params, lcu);
+    kvz_intra_recon_cu(state, x2, y,  depth + 1, mode_luma, mode_chroma, NULL, cclm_params, lcu);
+    kvz_intra_recon_cu(state, x,  y2, depth + 1, mode_luma, mode_chroma, NULL, cclm_params, lcu);
+    kvz_intra_recon_cu(state, x2, y2, depth + 1, mode_luma, mode_chroma, NULL, cclm_params, lcu);
 
     // Propagate coded block flags from child CUs to parent CU.
     uint16_t child_cbfs[3] = {
@@ -978,11 +1012,11 @@ void kvz_intra_recon_cu(
     const bool has_chroma = mode_chroma != -1 &&  (x % 8 == 0 && y % 8 == 0);
     // Process a leaf TU.
     if (has_luma) {
-      intra_recon_tb_leaf(state, x, y, depth, mode_luma, lcu, COLOR_Y);
+      intra_recon_tb_leaf(state, x, y, depth, mode_luma, cclm_params, lcu, COLOR_Y);
     }
     if (has_chroma) {
-      intra_recon_tb_leaf(state, x, y, depth, mode_chroma, lcu, COLOR_U);
-      intra_recon_tb_leaf(state, x, y, depth, mode_chroma, lcu, COLOR_V);
+      intra_recon_tb_leaf(state, x, y, depth, mode_chroma, cclm_params, lcu, COLOR_U);
+      intra_recon_tb_leaf(state, x, y, depth, mode_chroma, cclm_params, lcu, COLOR_V);
     }
 
     kvz_quantize_lcu_residual(state, has_luma, has_chroma, x, y, depth, cur_cu, lcu, false);
diff --git a/src/intra.h b/src/intra.h
index 5652ec41..eb737be7 100644
--- a/src/intra.h
+++ b/src/intra.h
@@ -54,6 +54,12 @@ typedef struct
   bool filtered_initialized;
 } kvz_intra_references;
 
+typedef struct
+{
+  int16_t a;
+  int16_t shift;
+  int16_t b;
+} cclm_parameters_t;
 
 /**
 * \brief Function for deriving intra luma predictions
@@ -118,6 +124,7 @@ void kvz_intra_recon_cu(
   int8_t mode_luma,
   int8_t mode_chroma,
   cu_info_t *cur_cu,
+  cclm_parameters_t* cclm_params,
   lcu_t *lcu);
 
 
@@ -132,5 +139,6 @@ void kvz_predict_cclm(
   const int8_t mode,
   kvz_pixel const* y_rec,
   kvz_intra_references* chroma_ref,
-  kvz_pixel* dst
+  kvz_pixel* dst,
+  cclm_parameters_t* cclm_params
 );
\ No newline at end of file
diff --git a/src/search.c b/src/search.c
index 278b5d50..685c8428 100644
--- a/src/search.c
+++ b/src/search.c
@@ -709,7 +709,7 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,
                          x, y,
                          depth,
                          cur_cu->intra.mode, -1, // skip chroma
-                         NULL, lcu);
+                         NULL, NULL, lcu);
 
       // TODO: This heavily relies to square CUs
       if ((depth != 4 || (x % 8 && y % 8)) && state->encoder_control->chroma_format != KVZ_CSP_400) {
@@ -717,8 +717,9 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,
         // rd2. Possibly because the luma mode search already takes chroma
         // into account, so there is less of a chanse of luma mode being
         // really bad for chroma.
+        cclm_parameters_t cclm_params[2];
         if (ctrl->cfg.rdo == 3) {
-          cur_cu->intra.mode_chroma = kvz_search_cu_intra_chroma(state, x, y, depth, lcu);
+          cur_cu->intra.mode_chroma = kvz_search_cu_intra_chroma(state, x, y, depth, lcu, cclm_params);
           lcu_fill_cu_info(lcu, x_local, y_local, cu_width, cu_width, cur_cu);
         }
 
@@ -726,7 +727,7 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,
                            x & ~7, y & ~7, // TODO: as does this
                            depth,
                            -1, cur_cu->intra.mode_chroma, // skip luma
-                           NULL, lcu);
+                           NULL, cclm_params, lcu);
       }
     } else if (cur_cu->type == CU_INTER) {
 
@@ -883,7 +884,7 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,
                            x, y,
                            depth,
                            cur_cu->intra.mode, mode_chroma,
-                           NULL, lcu);
+                           NULL,NULL, lcu);
 
         cost += kvz_cu_rd_cost_luma(state, x_local, y_local, depth, cur_cu, lcu);
         if (has_chroma) {
diff --git a/src/search_intra.c b/src/search_intra.c
index 52b654bd..bbef6fbe 100644
--- a/src/search_intra.c
+++ b/src/search_intra.c
@@ -258,6 +258,7 @@ static double search_intra_trdepth(encoder_state_t * const state,
                                    int intra_mode, int cost_treshold,
                                    cu_info_t *const pred_cu,
                                    lcu_t *const lcu,
+                                   cclm_parameters_t *cclm_params,
                                    const int mts_mode)
 {
   assert(depth >= 0 && depth <= MAX_PU_DEPTH);
@@ -332,7 +333,7 @@ static double search_intra_trdepth(encoder_state_t * const state,
         x_px, y_px,
         depth,
         intra_mode, -1,
-        pred_cu, lcu);
+        pred_cu, cclm_params, lcu);
 
       // TODO: Not sure if this should be 0 or 1 but at least seems to work with 1
       if (pred_cu->tr_idx > 1)
@@ -360,7 +361,7 @@ static double search_intra_trdepth(encoder_state_t * const state,
         x_px, y_px,
         depth,
         -1, chroma_mode,
-        pred_cu, lcu);
+        pred_cu, cclm_params, lcu);
       best_rd_cost += kvz_cu_rd_cost_chroma(state, lcu_px.x, lcu_px.y, depth, pred_cu, lcu);
     }
     pred_cu->tr_skip = best_tr_idx == MTS_SKIP;
@@ -391,15 +392,15 @@ static double search_intra_trdepth(encoder_state_t * const state,
   if (depth < max_depth && depth < MAX_PU_DEPTH) {
     split_cost = 3 * state->lambda;
 
-    split_cost += search_intra_trdepth(state, x_px, y_px, depth + 1, max_depth, intra_mode, nosplit_cost, pred_cu, lcu, -1);
+    split_cost += search_intra_trdepth(state, x_px, y_px, depth + 1, max_depth, intra_mode, nosplit_cost, pred_cu, lcu, cclm_params, -1);
     if (split_cost < nosplit_cost) {
-      split_cost += search_intra_trdepth(state, x_px + offset, y_px, depth + 1, max_depth, intra_mode, nosplit_cost, pred_cu, lcu, -1);
+      split_cost += search_intra_trdepth(state, x_px + offset, y_px, depth + 1, max_depth, intra_mode, nosplit_cost, pred_cu, lcu, cclm_params, -1);
     }
     if (split_cost < nosplit_cost) {
-      split_cost += search_intra_trdepth(state, x_px, y_px + offset, depth + 1, max_depth, intra_mode, nosplit_cost, pred_cu, lcu, -1);
+      split_cost += search_intra_trdepth(state, x_px, y_px + offset, depth + 1, max_depth, intra_mode, nosplit_cost, pred_cu, lcu, cclm_params, -1);
     }
     if (split_cost < nosplit_cost) {
-      split_cost += search_intra_trdepth(state, x_px + offset, y_px + offset, depth + 1, max_depth, intra_mode, nosplit_cost, pred_cu, lcu, -1);
+      split_cost += search_intra_trdepth(state, x_px + offset, y_px + offset, depth + 1, max_depth, intra_mode, nosplit_cost, pred_cu, lcu, cclm_params, -1);
     }
 
     double cbf_bits = 0.0;
@@ -468,6 +469,8 @@ static void search_intra_chroma_rough(encoder_state_t * const state,
   cost_pixel_nxn_func *const satd_func = kvz_pixels_get_satd_func(width);
   //cost_pixel_nxn_func *const sad_func = kvz_pixels_get_sad_func(width);
 
+  cclm_parameters_t cclm_params;
+  
   kvz_pixel _pred[32 * 32 + SIMD_ALIGNMENT];
   kvz_pixel *pred = ALIGNED_POINTER(_pred, SIMD_ALIGNMENT);
 
@@ -484,7 +487,7 @@ static void search_intra_chroma_rough(encoder_state_t * const state,
   for (int i = 5; i < 8; i++) {
     kvz_predict_cclm(
       state,
-      COLOR_U, width, width, x_px, y_px, state->tile->frame->source->stride, modes[i], lcu->rec.y, refs_u,  _pred);
+      COLOR_U, width, width, x_px, y_px, state->tile->frame->source->stride, modes[i], lcu->rec.y, refs_u,  pred, &cclm_params);
   }
 
   kvz_pixels_blit(orig_v, orig_block, width, width, origstride, width);
@@ -749,7 +752,7 @@ static int8_t search_intra_rdo(encoder_state_t * const state,
     // Reset transform split data in lcu.cu for this area.
     kvz_lcu_fill_trdepth(lcu, x_px, y_px, depth, depth);
 
-    double mode_cost = search_intra_trdepth(state, x_px, y_px, depth, tr_depth, modes[rdo_mode], MAX_INT, &pred_cu, lcu, -1);
+    double mode_cost = search_intra_trdepth(state, x_px, y_px, depth, tr_depth, modes[rdo_mode], MAX_INT, &pred_cu, lcu, NULL, -1);
     costs[rdo_mode] += mode_cost;
     trafo[rdo_mode] = pred_cu.tr_idx;
 
@@ -774,7 +777,7 @@ static int8_t search_intra_rdo(encoder_state_t * const state,
     pred_cu.intra.mode = modes[0];
     pred_cu.intra.mode_chroma = modes[0];
     FILL(pred_cu.cbf, 0);
-    search_intra_trdepth(state, x_px, y_px, depth, tr_depth, modes[0], MAX_INT, &pred_cu, lcu, trafo[0]);
+    search_intra_trdepth(state, x_px, y_px, depth, tr_depth, modes[0], MAX_INT, &pred_cu, lcu, NULL, trafo[0]);
   }
 
   return modes_to_check;
@@ -826,18 +829,21 @@ int8_t kvz_search_intra_chroma_rdo(encoder_state_t * const state,
                                   int x_px, int y_px, int depth,
                                   int8_t intra_mode,
                                   int8_t modes[8], int8_t num_modes,
-                                  lcu_t *const lcu)
+                                  lcu_t *const lcu, cclm_parameters_t *best_cclm)
 {
   const bool reconstruct_chroma = (depth != 4) || (x_px & 4 && y_px & 4);
 
 
-  kvz_intra_references refs;
+  kvz_intra_references refs[2];
   const vector2d_t luma_px = { x_px, y_px };
   const vector2d_t pic_px = {
     state->tile->frame->width,
     state->tile->frame->height,
   };
-  kvz_intra_build_reference(6-depth, COLOR_U, &luma_px, &pic_px, lcu, &refs, state->encoder_control->cfg.wpp);
+  kvz_intra_build_reference(6-depth, COLOR_U, &luma_px, &pic_px, lcu, &refs[0], state->encoder_control->cfg.wpp);
+  kvz_intra_build_reference(6-depth, COLOR_V, &luma_px, &pic_px, lcu, &refs[1], state->encoder_control->cfg.wpp);
+
+  cclm_parameters_t cclm_params[2] = {0};
 
   if (reconstruct_chroma) {
     const vector2d_t lcu_px = { SUB_SCU(x_px), SUB_SCU(y_px) };
@@ -846,8 +852,11 @@ int8_t kvz_search_intra_chroma_rdo(encoder_state_t * const state,
     struct {
       double cost;
       int8_t mode;
+      cclm_parameters_t cclm[2];
     } chroma, best_chroma;
 
+    // chroma.cclm = cclm_params;
+
     best_chroma.mode = 0;
     best_chroma.cost = MAX_INT;
 
@@ -856,14 +865,24 @@ int8_t kvz_search_intra_chroma_rdo(encoder_state_t * const state,
       if (chroma.mode == -1) continue;
       if(chroma.mode < 67) {
         kvz_intra_recon_cu(state,
-                           x_px, y_px,
-                           depth,
-                           -1, chroma.mode, // skip luma
-                           NULL, lcu);
+          x_px, y_px,
+          depth,
+          -1, chroma.mode, // skip luma
+          NULL, NULL, lcu);
       }
       else {
         kvz_predict_cclm(
-          state, COLOR_U, 32 >> (depth), 32 >> (depth), x_px, y_px, state->tile->frame->source->stride, chroma.mode, lcu->rec.y, &refs, NULL);
+          state, COLOR_U, 32 >> (depth), 32 >> (depth), x_px, y_px, state->tile->frame->source->stride, chroma.mode, lcu->rec.y, &refs[0], NULL, &cclm_params[0]);
+        chroma.cclm[0] = cclm_params[0];
+        kvz_predict_cclm(
+          state, COLOR_V, 32 >> (depth), 32 >> (depth), x_px, y_px, state->tile->frame->source->stride, chroma.mode, lcu->rec.y, &refs[1], NULL, &cclm_params[1]);
+        chroma.cclm[1] = cclm_params[1];
+
+        kvz_intra_recon_cu(state,
+          x_px, y_px,
+          depth,
+          -1, chroma.mode, // skip luma
+          NULL, cclm_params, lcu);
       }
       chroma.cost = kvz_cu_rd_cost_chroma(state, lcu_px.x, lcu_px.y, depth, tr_cu, lcu);
 
@@ -874,6 +893,8 @@ int8_t kvz_search_intra_chroma_rdo(encoder_state_t * const state,
         best_chroma = chroma;
       }
     }
+    best_cclm[0] = best_chroma.cclm[0];
+    best_cclm[1] = best_chroma.cclm[1];
 
     return best_chroma.mode;
   }
@@ -884,7 +905,7 @@ int8_t kvz_search_intra_chroma_rdo(encoder_state_t * const state,
 
 int8_t kvz_search_cu_intra_chroma(encoder_state_t * const state,
                               const int x_px, const int y_px,
-                              const int depth, lcu_t *lcu)
+                              const int depth, lcu_t *lcu, cclm_parameters_t *best_cclm)
 {
   const vector2d_t lcu_px = { SUB_SCU(x_px), SUB_SCU(y_px) };
 
@@ -892,7 +913,7 @@ int8_t kvz_search_cu_intra_chroma(encoder_state_t * const state,
   int8_t intra_mode = cur_pu->intra.mode;
 
   double costs[8];
-  int8_t modes[8] = { 0, 50, 18, 1, -1, 67, 68, 69 };
+  int8_t modes[8] = { 0, 50, 18, 1, -1, 81, 82, 83 };
   if (intra_mode != 0 && intra_mode != 50 && intra_mode != 18 && intra_mode != 1) {
     modes[4] = intra_mode;
   }
@@ -934,7 +955,7 @@ int8_t kvz_search_cu_intra_chroma(encoder_state_t * const state,
 
   int8_t intra_mode_chroma = intra_mode;
   if (num_modes > 1) {
-    intra_mode_chroma = kvz_search_intra_chroma_rdo(state, x_px, y_px, depth, intra_mode, modes, num_modes, lcu);
+    intra_mode_chroma = kvz_search_intra_chroma_rdo(state, x_px, y_px, depth, intra_mode, modes, num_modes, lcu, best_cclm);
   }
 
   return intra_mode_chroma;
diff --git a/src/search_intra.h b/src/search_intra.h
index e7cc79a0..83a109f6 100644
--- a/src/search_intra.h
+++ b/src/search_intra.h
@@ -41,17 +41,18 @@
 #include "cu.h"
 #include "encoderstate.h"
 #include "global.h" // IWYU pragma: keep
+#include "intra.h"
 
 
 double kvz_luma_mode_bits(const encoder_state_t *state, 
-                      int8_t luma_mode, const int8_t *intra_preds);
+                          int8_t luma_mode, const int8_t *intra_preds);
                        
 double kvz_chroma_mode_bits(const encoder_state_t *state,
                         int8_t chroma_mode, int8_t luma_mode);
 
 int8_t kvz_search_cu_intra_chroma(encoder_state_t * const state,
                               const int x_px, const int y_px,
-                              const int depth, lcu_t *lcu);
+                              const int depth, lcu_t *lcu, cclm_parameters_t* best_cclm);
 
 void kvz_search_cu_intra(encoder_state_t * const state,
                          const int x_px, const int y_px,

From 1c431d8f884d270342fc2c3a24efe7f9a86b72c7 Mon Sep 17 00:00:00 2001
From: Joose Sainio <joose.sainio@tuni.fi>
Date: Wed, 17 Nov 2021 11:33:57 +0200
Subject: [PATCH 08/19] [cclm] Add commandline argument

---
 configure.ac                  |  2 +-
 src/cfg.c                     |  5 +++++
 src/cli.c                     | 10 ++++++++--
 src/encoder_state-bitstream.c |  2 +-
 src/kvazaar.h                 |  2 ++
 5 files changed, 17 insertions(+), 4 deletions(-)

diff --git a/configure.ac b/configure.ac
index dbbb1b72..3a0d1582 100644
--- a/configure.ac
+++ b/configure.ac
@@ -23,7 +23,7 @@ AC_CONFIG_SRCDIR([src/encmain.c])
 #
 # Here is a somewhat sane guide to lib versioning: http://apr.apache.org/versioning.html
 ver_major=6
-ver_minor=6
+ver_minor=7
 ver_release=0
 
 # Prevents configure from adding a lot of defines to the CFLAGS
diff --git a/src/cfg.c b/src/cfg.c
index a7a1f074..f85e5c71 100644
--- a/src/cfg.c
+++ b/src/cfg.c
@@ -209,6 +209,8 @@ int kvz_config_init(kvz_config *cfg)
 
   cfg->amvr = 0;
 
+  cfg->cclm = 0;
+
   return 1;
 }
 
@@ -1486,6 +1488,9 @@ int kvz_config_parse(kvz_config *cfg, const char *name, const char *value)
   else if OPT("amvr") {
     cfg->amvr = (bool)atobool(value);
   }
+  else if OPT("cclm") {
+    cfg->cclm = (bool)atobool(value);
+  }
   else {
     return 0;
   }
diff --git a/src/cli.c b/src/cli.c
index edefc814..270e924f 100644
--- a/src/cli.c
+++ b/src/cli.c
@@ -178,6 +178,8 @@ static const struct option long_options[] = {
   { "no-jccr",                  no_argument, NULL, 0 },
   { "amvr",                     no_argument, NULL, 0 },
   { "no-amvr",                  no_argument, NULL, 0 },
+  { "cclm",                     no_argument, NULL, 0 },
+  { "no-cclm",                  no_argument, NULL, 0 },
   {0, 0, 0, 0}
 };
 
@@ -629,8 +631,12 @@ void print_help(void)
     "                                   - both: MTS applied for both intra and inter blocks.\n"
     "                                   - implicit: uses implicit MTS. Applies DST7 instead \n"
     "                                               of DCT2 to certain intra blocks.\n"
-    "      --(no-)jccr            : Joint coding of chroma residual.\n"
-    "                               Requires rdo> = 2. [disabled]\n"      
+    "      --(no-)jccr            : Joint coding of chroma residual. "
+    "                               Requires rdo> = 2. [disabled]\n"
+    "      --(no-)cclm            : Cross component linear model. \n"
+    "                               Extra chroma prediction modes that are formed\n"
+    "                               via linear transformation from the luma\n"
+    "                               prediction. Requires rdo >=3. [disabled\n"
     "      --(no-)amvr            : Adaptive Motion Vector Resolution.\n"
     "                               Code some mv's with reduced resolution [disabled]\n"
     "\n"
diff --git a/src/encoder_state-bitstream.c b/src/encoder_state-bitstream.c
index 7d73544e..537c1be9 100644
--- a/src/encoder_state-bitstream.c
+++ b/src/encoder_state-bitstream.c
@@ -722,7 +722,7 @@ static void encoder_state_write_bitstream_seq_parameter_set(bitstream_t* stream,
   WRITE_U(stream, 0, 1, "sps_mip_enabled_flag");
   // if(!no_cclm_constraint_flag)
   if(encoder->chroma_format != KVZ_CSP_400) {
-    WRITE_U(stream, 0, 1, "sps_cclm_enabled_flag");
+    WRITE_U(stream, encoder->cfg.cclm, 1, "sps_cclm_enabled_flag");
   }
   if (encoder->chroma_format == KVZ_CSP_420) {
     WRITE_U(stream, 0, 1, "sps_chroma_horizontal_collocated_flag");
diff --git a/src/kvazaar.h b/src/kvazaar.h
index 7506d3bc..61c13031 100644
--- a/src/kvazaar.h
+++ b/src/kvazaar.h
@@ -516,6 +516,8 @@ typedef struct kvz_config
 
   int8_t jccr;
 
+  int8_t cclm;
+
   int8_t amvr; /* \brief Adaptive motion vector resolution parameter */
 } kvz_config;
 

From 754607dae4840dbd3f436b8cd6bb984f9b63c536 Mon Sep 17 00:00:00 2001
From: Joose Sainio <joose.sainio@tuni.fi>
Date: Wed, 17 Nov 2021 12:41:42 +0200
Subject: [PATCH 09/19] [cclm] bitstream writing + cabac

---
 src/cabac.h              |  2 ++
 src/context.c            | 17 +++++++++++++++++
 src/encode_coding_tree.c | 24 ++++++++++++++++++++----
 3 files changed, 39 insertions(+), 4 deletions(-)

diff --git a/src/cabac.h b/src/cabac.h
index 34f48c2c..9e7e983b 100644
--- a/src/cabac.h
+++ b/src/cabac.h
@@ -117,6 +117,8 @@ typedef struct
     cabac_ctx_t transform_skip_gt1[4];
     cabac_ctx_t transform_skip_par;
     cabac_ctx_t transform_skip_gt2[5];
+    cabac_ctx_t cclm_flag;
+    cabac_ctx_t cclm_model;
 
   } ctx;
 } cabac_data_t;
diff --git a/src/context.c b/src/context.c
index 7e174fb9..78d3f134 100644
--- a/src/context.c
+++ b/src/context.c
@@ -395,6 +395,20 @@ static const uint8_t INIT_IMV_FLAG[4][5] = {
   {   0,   5,   0,   0,   4, },
 };
 
+static const uint8_t INIT_CCLM_FLAG[4] = {
+  {  26, },
+  {  34, },
+  {  59, },
+  {   4, },
+};
+
+static const uint8_t INIT_CCLM_MODEL[4] = {
+  {  27, },
+  {  27, },
+  {  27, },
+  {   9, },
+};
+
 /*
 static const uint16_t g_inistateToCount[128] = {
   614,   647,   681,   718,   756,   797,   839,   884,   932,   982,   1034,  1089,  1148,  1209,  1274,  1342,
@@ -471,6 +485,9 @@ void kvz_init_contexts(encoder_state_t *state, int8_t QP, int8_t slice)
 
   kvz_ctx_init(&cabac->ctx.chroma_pred_model, QP, INIT_CHROMA_PRED_MODE[slice], INIT_CHROMA_PRED_MODE[3]);
 
+  kvz_ctx_init(&cabac->ctx.cclm_flag, QP, INIT_CCLM_FLAG[slice], INIT_CCLM_FLAG[3]);
+  kvz_ctx_init(&cabac->ctx.cclm_model, QP, INIT_CCLM_MODEL[slice], INIT_CCLM_MODEL[3]);
+
 
   for (i = 0; i < 3; i++) {
     kvz_ctx_init(&cabac->ctx.cu_skip_flag_model[i], QP, INIT_SKIP_FLAG[slice][i], INIT_SKIP_FLAG[3][i]);
diff --git a/src/encode_coding_tree.c b/src/encode_coding_tree.c
index 7de4f61a..cfc0f4cf 100644
--- a/src/encode_coding_tree.c
+++ b/src/encode_coding_tree.c
@@ -701,7 +701,7 @@ static bool encode_inter_prediction_unit(encoder_state_t * const state,
   return non_zero_mvd;
 }
 
-static void encode_chroma_intra_cu(cabac_data_t* const cabac, const cu_info_t* const cur_cu, int x, int y, const videoframe_t* const frame, const int cu_width) {
+static void encode_chroma_intra_cu(cabac_data_t* const cabac, const cu_info_t* const cur_cu, int x, int y, const videoframe_t* const frame, const int cu_width, const int cclm_enabled) {
   unsigned pred_mode = 0;
   unsigned chroma_pred_modes[8] = {0, 50, 18, 1, 67, 81, 82, 83};
   const int pu_x = PU_GET_X(cur_cu->part_size, cu_width, x, 0);
@@ -710,7 +710,23 @@ static void encode_chroma_intra_cu(cabac_data_t* const cabac, const cu_info_t* c
   int8_t chroma_intra_dir = first_pu->intra.mode_chroma;
   int8_t luma_intra_dir = first_pu->intra.mode;
 
+
   bool derived_mode = chroma_intra_dir == luma_intra_dir;
+  bool cclm_mode = chroma_intra_dir > 67;
+
+  if (cclm_enabled) {
+    cabac->cur_ctx = &cabac->ctx.cclm_flag;
+    CABAC_BIN(cabac, cclm_mode, "cclm_flag");
+    if(cclm_mode) {
+      cabac->cur_ctx = &cabac->ctx.cclm_model;
+      CABAC_BIN(cabac, chroma_intra_dir != 81, "cclm_model_1");
+      if(chroma_intra_dir != 81) {
+        CABAC_BIN(cabac, chroma_intra_dir == 83, "cclm_model_2");        
+      }
+      return;
+    }
+
+  }
   cabac->cur_ctx = &(cabac->ctx.chroma_pred_model);
   CABAC_BIN(cabac, derived_mode ? 0 : 1, "intra_chroma_pred_mode");
 
@@ -722,7 +738,7 @@ static void encode_chroma_intra_cu(cabac_data_t* const cabac, const cu_info_t* c
           break;
         }
       }*/
-    for (; pred_mode < 8; pred_mode++) {
+    for (; pred_mode < 5; pred_mode++) {
       if (chroma_intra_dir == chroma_pred_modes[pred_mode]) {
         break;
       }
@@ -983,7 +999,7 @@ static void encode_intra_coding_unit(encoder_state_t * const state,
 
   // Code chroma prediction mode.
   if (state->encoder_control->chroma_format != KVZ_CSP_400 && depth != 4) {
-    encode_chroma_intra_cu(cabac, cur_cu, x, y, frame, cu_width);
+    encode_chroma_intra_cu(cabac, cur_cu, x, y, frame, cu_width, state->encoder_control->cfg.cclm);
   }
 
   encode_transform_coeff(state, x, y, depth, 0, 0, 0, 0, coeff);
@@ -991,7 +1007,7 @@ static void encode_intra_coding_unit(encoder_state_t * const state,
   encode_mts_idx(state, cabac, cur_cu);
 
   if (state->encoder_control->chroma_format != KVZ_CSP_400 && depth == 4 && x % 8 && y % 8) {
-    encode_chroma_intra_cu(cabac, cur_cu, x, y, frame, cu_width);
+    encode_chroma_intra_cu(cabac, cur_cu, x, y, frame, cu_width, state->encoder_control->cfg.cclm);
     encode_transform_coeff(state, x, y, depth, 0, 0, 0, 1, coeff);
   }
 

From 7187678526110b65473fd0b5253fabf82081e129 Mon Sep 17 00:00:00 2001
From: Joose Sainio <joose.sainio@tuni.fi>
Date: Wed, 17 Nov 2021 13:01:06 +0200
Subject: [PATCH 10/19] [cclm] Only do cclm search when it is enabled and add
 test for cclm

---
 src/intra.c         | 1 +
 src/search_intra.c  | 3 ++-
 tests/test_intra.sh | 3 ++-
 3 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/intra.c b/src/intra.c
index 21c539cf..5de302c9 100644
--- a/src/intra.c
+++ b/src/intra.c
@@ -472,6 +472,7 @@ void kvz_predict_cclm(
 )
 {
   assert(mode == LM_CHROMA_IDX || mode == LM_CHROMA_L_IDX || mode == LM_CHROMA_T_IDX);
+  assert(state->encoder_control->cfg.cclm);
 
   
   kvz_intra_ref sampled_luma_ref;
diff --git a/src/search_intra.c b/src/search_intra.c
index bbef6fbe..39e4c48a 100644
--- a/src/search_intra.c
+++ b/src/search_intra.c
@@ -485,6 +485,7 @@ static void search_intra_chroma_rough(encoder_state_t * const state,
     costs[i] += satd_func(pred, orig_block);
   }
   for (int i = 5; i < 8; i++) {
+    assert(state->encoder_control->cfg.cclm);
     kvz_predict_cclm(
       state,
       COLOR_U, width, width, x_px, y_px, state->tile->frame->source->stride, modes[i], lcu->rec.y, refs_u,  pred, &cclm_params);
@@ -926,7 +927,7 @@ int8_t kvz_search_cu_intra_chroma(encoder_state_t * const state,
   int num_modes = modes_in_depth[depth];
 
   if (state->encoder_control->cfg.rdo == 3) {
-    num_modes = 8;
+    num_modes = state->encoder_control->cfg.cclm ? 8 : 5;
   }
 
   // Don't do rough mode search if all modes are selected.
diff --git a/tests/test_intra.sh b/tests/test_intra.sh
index 4c2e93bd..af806a6a 100755
--- a/tests/test_intra.sh
+++ b/tests/test_intra.sh
@@ -10,8 +10,9 @@ common_args='256x128 10 yuv420p -p1 --preset=ultrafast --threads=0 --no-wpp --no
 valgrind_test $common_args --rd=1
 valgrind_test $common_args --rd=2 --no-transform-skip --qp 37
 valgrind_test $common_args --rd=2 --no-transform-skip --qp 37 --signhide --rdoq 
+valgrind_test $common_args --rd=3
 valgrind_test $common_args --alf=full --no-wpp --threads=0 --owf=0
 valgrind_test $common_args --alf=full --wpp --threads=1
-valgrind_test $common_args --jccr
 valgrind_test $common_args --jccr --rdoq --rd=2 --mts=intra
+valgrind_test $common_args --rd=3 --cclm --jccr
 

From 87a458dc85ddc9e4839ba13533f0b2b3ab98d03e Mon Sep 17 00:00:00 2001
From: Joose Sainio <joose.sainio@tuni.fi>
Date: Wed, 17 Nov 2021 13:12:21 +0200
Subject: [PATCH 11/19] [cclm] fix incorrect log_width for chroma ref
 generation

---
 src/search_intra.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/search_intra.c b/src/search_intra.c
index 39e4c48a..e4afdcf1 100644
--- a/src/search_intra.c
+++ b/src/search_intra.c
@@ -841,8 +841,8 @@ int8_t kvz_search_intra_chroma_rdo(encoder_state_t * const state,
     state->tile->frame->width,
     state->tile->frame->height,
   };
-  kvz_intra_build_reference(6-depth, COLOR_U, &luma_px, &pic_px, lcu, &refs[0], state->encoder_control->cfg.wpp);
-  kvz_intra_build_reference(6-depth, COLOR_V, &luma_px, &pic_px, lcu, &refs[1], state->encoder_control->cfg.wpp);
+  kvz_intra_build_reference(MAX(LOG2_LCU_WIDTH - depth - 1, 2), COLOR_U, &luma_px, &pic_px, lcu, &refs[0], state->encoder_control->cfg.wpp);
+  kvz_intra_build_reference(MAX(LOG2_LCU_WIDTH - depth - 1, 2), COLOR_V, &luma_px, &pic_px, lcu, &refs[1], state->encoder_control->cfg.wpp);
 
   cclm_parameters_t cclm_params[2] = {0};
 

From c16ce9a82bb095c4984456eead156751e6940ac9 Mon Sep 17 00:00:00 2001
From: Joose Sainio <joose.sainio@tuni.fi>
Date: Thu, 18 Nov 2021 06:55:42 +0200
Subject: [PATCH 12/19] [cclm] fix bin type for the second cclm mode bit

---
 src/encode_coding_tree.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/encode_coding_tree.c b/src/encode_coding_tree.c
index cfc0f4cf..4cbc4a0f 100644
--- a/src/encode_coding_tree.c
+++ b/src/encode_coding_tree.c
@@ -721,7 +721,7 @@ static void encode_chroma_intra_cu(cabac_data_t* const cabac, const cu_info_t* c
       cabac->cur_ctx = &cabac->ctx.cclm_model;
       CABAC_BIN(cabac, chroma_intra_dir != 81, "cclm_model_1");
       if(chroma_intra_dir != 81) {
-        CABAC_BIN(cabac, chroma_intra_dir == 83, "cclm_model_2");        
+        CABAC_BIN_EP(cabac, chroma_intra_dir == 83, "cclm_model_2");
       }
       return;
     }

From d9875a3d923c6eef94d41f1ff4de104e862f516f Mon Sep 17 00:00:00 2001
From: Joose Sainio <joose.sainio@tuni.fi>
Date: Thu, 18 Nov 2021 07:13:16 +0200
Subject: [PATCH 13/19] [cclm] fix trying cclm on the upper level

---
 src/intra.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/src/intra.c b/src/intra.c
index 5de302c9..79079404 100644
--- a/src/intra.c
+++ b/src/intra.c
@@ -917,7 +917,14 @@ static void intra_recon_tb_leaf(
       }
       y_rec += LCU_WIDTH;
     }
-    linear_transform_cclm(&cclm_params[color == COLOR_U ? 0 : 1], pred, pred, width, width);
+    if(cclm_params == NULL) {
+      cclm_parameters_t temp_params;
+      kvz_predict_cclm(
+        state, color, width, width, x, y, stride, intra_mode, lcu->rec.y, &refs, pred, &temp_params);
+    }
+    else {
+      linear_transform_cclm(&cclm_params[color == COLOR_U ? 0 : 1], pred, pred, width, width);
+    }
   }
 
   const int index = lcu_px.x + lcu_px.y * lcu_width;

From f030158703e3467c10a6a18a2009b1c46f8e5adc Mon Sep 17 00:00:00 2001
From: Joose Sainio <joose.sainio@tuni.fi>
Date: Fri, 19 Nov 2021 11:54:51 +0200
Subject: [PATCH 14/19] [cclm] Fix search and parameter generation for CCLM

---
 src/encoder_state-ctors_dtors.c |   2 +-
 src/intra.c                     | 121 ++++++++++++++++++++------------
 src/intra.h                     |   2 +-
 src/search.c                    |  39 +++++++++-
 src/search_intra.c              |  51 +++++++++++---
 src/videoframe.c                |  11 ++-
 src/videoframe.h                |   4 +-
 7 files changed, 170 insertions(+), 60 deletions(-)

diff --git a/src/encoder_state-ctors_dtors.c b/src/encoder_state-ctors_dtors.c
index c7c9985d..7a5cff52 100644
--- a/src/encoder_state-ctors_dtors.c
+++ b/src/encoder_state-ctors_dtors.c
@@ -122,7 +122,7 @@ static int encoder_state_config_tile_init(encoder_state_t * const state,
                                           const int width, const int height, const int width_in_lcu, const int height_in_lcu) {
   
   const encoder_control_t * const encoder = state->encoder_control;
-  state->tile->frame = kvz_videoframe_alloc(width, height, state->encoder_control->chroma_format, encoder->cfg.alf_type);
+  state->tile->frame = kvz_videoframe_alloc(width, height, state->encoder_control->chroma_format, encoder->cfg.alf_type, encoder->cfg.cclm);
   
   state->tile->frame->rec = NULL;
   
diff --git a/src/intra.c b/src/intra.c
index 79079404..f786781c 100644
--- a/src/intra.c
+++ b/src/intra.c
@@ -259,7 +259,7 @@ enum lm_mode
 static void get_cclm_parameters(
   encoder_state_t const* const state,
   int8_t width, int8_t height, int8_t mode,
-  int x0, int y0,
+  int x0, int y0, int avai_above_right_units, int avai_left_below_units,
   kvz_intra_ref* luma_src, kvz_intra_references*chroma_ref,
   int16_t *a, int16_t*b, int16_t*shift) {
 
@@ -287,8 +287,8 @@ static void get_cclm_parameters(
   //int total_units = total_left_units + total_above_units + 1;
   //int above_right_units = total_above_units - tu_width_in_units;
   //int left_below_units = total_left_units - tu_height_in_units;
-  int avai_above_right_units = 0;  // TODO these are non zero only with non-square CUs
-  int avai_left_below_units = 0;
+  //int avai_above_right_units = 0;  // TODO these are non zero only with non-square CUs
+  //int avai_left_below_units = 0;
   int avai_above_units = CLIP(0, tu_height_in_units, y0/base_unit_size);
   int avai_left_units = CLIP(0, tu_width_in_units, x0 / base_unit_size);
 
@@ -465,7 +465,7 @@ void kvz_predict_cclm(
   const int16_t y0,
   const int16_t stride,
   const int8_t mode,
-  kvz_pixel const *  y_rec,
+  lcu_t* const lcu,
   kvz_intra_references* chroma_ref,
   kvz_pixel* dst,
   cclm_parameters_t* cclm_params
@@ -480,61 +480,92 @@ void kvz_predict_cclm(
 
   int x_scu = SUB_SCU(x0);
   int y_scu = SUB_SCU(y0);
-  y_rec += x_scu + y_scu * LCU_WIDTH;
+
+  int available_above_right = 0;
+  int available_left_below = 0;
+
+
+  kvz_pixel *y_rec = lcu->rec.y + x_scu + y_scu * LCU_WIDTH;
 
   // Essentially what this does is that it uses 6-tap filtering to downsample
   // the luma intra references down to match the resolution of the chroma channel.
   // The luma reference is only needed when we are not on the edge of the picture.
   // Because the reference pixels that are needed on the edge of the ctu this code
   // is kinda messy but what can you do
+
+  if (y0) {
+    for (; available_above_right < width / 2; available_above_right++) {
+      int x_extension = x_scu + width * 2 + 4 * available_above_right;
+      cu_info_t* pu = LCU_GET_CU_AT_PX(lcu, x_extension, y_scu - 4);
+      if (pu->type == CU_NOTSET || x_extension > LCU_WIDTH) break;
+    }
+    if(y_scu == 0) {
+      if(!state->encoder_control->cfg.wpp) available_above_right = MIN(width / 2, (state->tile->frame->width - x0 - width * 2) / 4);
+      for (int x = 0; x < width * (available_above_right ? 4 : 2); x += 2) {
+        bool left_padding = x0 || x;
+        sampled_luma_ref.top[x / 2] = (state->tile->frame->rec->y[x0 + x + (y0 - 1) * stride] * 2 +
+          state->tile->frame->rec->y[x0 + x + 1 + (y0 - 1) * stride] +
+          state->tile->frame->rec->y[x0 + x - left_padding + (y0 - 1) * stride] + 
+          2) >> 2;
+      }
+    }
+    else {
+      for (int x = 0; x < width * (available_above_right ? 4 : 2); x += 2) {
+        bool left_padding = x0 || x;
+        int s = 4;
+        s += y_scu ? y_rec[x - LCU_WIDTH * 2] * 2            : state->tile->frame->rec->y[x0 + x + (y0 - 2) * stride] * 2;
+        s += y_scu ? y_rec[x - LCU_WIDTH * 2 + 1]            : state->tile->frame->rec->y[x0 + x + 1 + (y0 - 2) * stride];
+        s += y_scu && !(x0 && !x && !x_scu) ? y_rec[x - LCU_WIDTH * 2 - left_padding] : state->tile->frame->rec->y[x0 + x - left_padding + (y0 - 2) * stride];
+        s += y_scu ? y_rec[x - LCU_WIDTH] * 2                : state->tile->frame->rec->y[x0 + x + (y0 - 1) * stride] * 2;
+        s += y_scu ? y_rec[x - LCU_WIDTH + 1]                : state->tile->frame->rec->y[x0 + x + 1 + (y0 - 1) * stride];
+        s += y_scu && !(x0 && !x && !x_scu) ? y_rec[x - LCU_WIDTH - left_padding]     : state->tile->frame->rec->y[x0 + x - left_padding + (y0 - 1) * stride];
+        sampled_luma_ref.top[x / 2] = s >> 3;
+      }
+    }
+  }
+
   if(x0) {
-    for(int y = 0; y < height * 2; y+=2) {
+    for (; available_left_below < height / 2; available_left_below++) {
+      int y_extension = y_scu + height * 2 + 4 * available_left_below;
+      cu_info_t* pu = LCU_GET_CU_AT_PX(lcu, x_scu - 4, y_extension);
+      if (pu->type == CU_NOTSET || y_extension > LCU_WIDTH) break;
+      if(x_scu == 32 && y_scu == 0 && pu->depth == 0) break;
+    }
+    for(int y = 0; y < height * (available_left_below ? 4 : 2); y+=2) {
       int s = 4;
-      s += x_scu ? y_rec[y * LCU_WIDTH - 1] * 2       : state->tile->frame->rec->y[x0 - 1 + (y0 + y) * stride] * 2;
-      s += x_scu ? y_rec[y * LCU_WIDTH - 2]           : state->tile->frame->rec->y[x0 - 2 + (y0 + y) * stride];
-      s += x_scu ? y_rec[(y + 1) * LCU_WIDTH - 1] * 2 : state->tile->frame->rec->y[x0 - 1 + (y0 + y + 1) * stride] * 2;
-      s += x_scu ? y_rec[(y + 1) * LCU_WIDTH - 2]     : state->tile->frame->rec->y[x0 - 2 + (y0 + y + 1) * stride];
-      s +=         y_rec[y * LCU_WIDTH];
-      s +=         y_rec[(y + 1) * LCU_WIDTH];
+      s += x_scu ? y_rec[y * LCU_WIDTH - 2] * 2       : state->tile->frame->rec->y[x0 - 2 + (y0 + y) * stride] * 2;
+      s += x_scu ? y_rec[y * LCU_WIDTH - 1]           : state->tile->frame->rec->y[x0 - 1 + (y0 + y) * stride];
+      s += x_scu ? y_rec[y * LCU_WIDTH - 3]           : state->tile->frame->rec->y[x0 - 3 + (y0 + y) * stride];
+      s += x_scu ? y_rec[(y + 1) * LCU_WIDTH - 2] * 2 : state->tile->frame->rec->y[x0 - 2 + (y0 + y + 1) * stride] * 2;
+      s += x_scu ? y_rec[(y + 1) * LCU_WIDTH - 1]     : state->tile->frame->rec->y[x0 - 1 + (y0 + y + 1) * stride];
+      s += x_scu ? y_rec[(y + 1) * LCU_WIDTH - 3]     : state->tile->frame->rec->y[x0 - 3 + (y0 + y + 1) * stride];
       sampled_luma_ref.left[y/2] = s >> 3;
     }
   }
 
-  if(y0) {
-    for(int x = 0; x < width*2; x += 2) {
-      bool left_padding = x0 || x;
-      int s = 4;
-      s += y_scu ? y_rec[x - LCU_WIDTH * 2] * 2            : state->tile->frame->rec->y[x0 + x +(y0 - 2) * stride] * 2;
-      s += y_scu ? y_rec[x - LCU_WIDTH] * 2                : state->tile->frame->rec->y[x0 + x +(y0 - 1) * stride] * 2;
-      s += y_scu ? y_rec[x - LCU_WIDTH * 2 - left_padding] : state->tile->frame->rec->y[x0 + x - left_padding + (y0 - 2) * stride];
-      s += y_scu ? y_rec[x - LCU_WIDTH - left_padding]     : state->tile->frame->rec->y[x0 + x - left_padding + (y0 - 1) * stride];
-      s += y_scu ? y_rec[x - LCU_WIDTH * 2 + 1]            : state->tile->frame->rec->y[x0 + x + 1 + (y0 - 2) * stride];
-      s += y_scu ? y_rec[x - LCU_WIDTH + 1]                : state->tile->frame->rec->y[x0 + x + 1 + (y0 - 1) * stride];
-      sampled_luma_ref.top[x / 2] = s >> 3;
-    }
-  }
+
 
   // Downsample the reconstructed luma sample so that they can be mapped into the chroma
   // to generate the chroma prediction
-  for (int y = 0; y < height * 2; y+=2) {
-    for (int x = 0; x <  width * 2; x+=2) {
+  for (int y = 0; y < height; y++) {
+    for (int x = 0; x <  width; x++) {
       int s = 4;
       s += y_rec[2 * x] * 2;
       s += y_rec[2 * x + 1];
       // If we are at the edge of the CTU read the pixel from the frame reconstruct buffer,
       // *except* when we are also at the edge of the frame, in which case we want to duplicate
       // the edge pixel
-      s += !x_scu && !x && x0 ? state->tile->frame->rec->y[x0 - 1 + y0 * stride] : y_rec[2 * x - ((x + x0) > 0)];
+      s += !x_scu && !x && x0 ? state->tile->frame->rec->y[x0 - 1 + (y0 + y*2) * stride] : y_rec[2 * x - ((x + x0) > 0)];
       s += y_rec[2 * x + LCU_WIDTH] * 2;
       s += y_rec[2 * x + 1 + LCU_WIDTH];
-      s += !x_scu && !x && x0 ? state->tile->frame->rec->y[x0 - 1 + (y0 + 1) * stride] : y_rec[2 * x - ((x + x0) > 0) + stride];
-      sampled_luma[x / 2 + y / 2 * width] = s >> 3;
+      s += !x_scu && !x && x0 ? state->tile->frame->rec->y[x0 - 1 + (y0 + y * 2 + 1) * stride] : y_rec[2 * x - ((x + x0) > 0) + LCU_WIDTH];
+      sampled_luma[x + y * width] = s >> 3;
     }
-    y_rec += LCU_WIDTH;
+    y_rec += LCU_WIDTH * 2;
   }
 
   int16_t a, b, shift;
-  get_cclm_parameters(state, width, height, mode,x0, y0, &sampled_luma_ref, chroma_ref, &a, &b, &shift);
+  get_cclm_parameters(state, width, height, mode,x0, y0, available_above_right, available_left_below, &sampled_luma_ref, chroma_ref, &a, &b, &shift);
   cclm_params->shift = shift;
   cclm_params->a = a;
   cclm_params->b = b;
@@ -889,7 +920,8 @@ static void intra_recon_tb_leaf(
     state->tile->frame->height,
   };
   int x_scu = SUB_SCU(x);
-  const vector2d_t lcu_px = {x_scu >> shift, SUB_SCU(y) >> shift};
+  int y_scu = SUB_SCU(y);
+  const vector2d_t lcu_px = {x_scu >> shift, y_scu >> shift };
 
   kvz_intra_references refs;
   kvz_intra_build_reference(log2width, color, &luma_px, &pic_px, lcu, &refs, cfg->wpp);
@@ -901,26 +933,27 @@ static void intra_recon_tb_leaf(
     kvz_intra_predict(state, &refs, log2width, intra_mode, color, pred, filter_boundary);
   } else {
     kvz_pixel *y_rec = lcu->rec.y;
-    for (int y_ = 0; y_ < width * 2; y_ += 2) {
-      for (int x_ = 0; x_ < width * 2; x_ += 2) {
+    y_rec += x_scu + y_scu * LCU_WIDTH;
+    for (int y_ = 0; y_ < width; y_++) {
+      for (int x_ = 0; x_ < width; x_++) {
         int s = 4;
         s += y_rec[2 * x_] * 2;
         s += y_rec[2 * x_ + 1];
         // If we are at the edge of the CTU read the pixel from the frame reconstruct buffer,
         // *except* when we are also at the edge of the frame, in which case we want to duplicate
         // the edge pixel
-        s += !x_scu && !x_ && x ? state->tile->frame->rec->y[x - 1 + y * stride] : y_rec[2 * x_ - ((x_ + x) > 0)];
+        s += !x_scu && !x_ && x ? state->tile->frame->rec->y[x - 1 + (y + y_ * 2) * stride] : y_rec[2 * x_ - ((x_ + x) > 0)];
         s += y_rec[2 * x_ + LCU_WIDTH] * 2;
         s += y_rec[2 * x_ + 1 + LCU_WIDTH];
-        s += !x_scu && !x_ && x ? state->tile->frame->rec->y[x - 1 + (y + 1) * stride] : y_rec[2 * x_ - ((x_ + x) > 0) + stride];
-        pred[x_ / 2 + y_ * width / 2] = s >> 3;
+        s += !x_scu && !x_ && x ? state->tile->frame->rec->y[x - 1 + (y + y_ * 2 + 1) * stride] : y_rec[2 * x_ - ((x_ + x) > 0) + LCU_WIDTH];
+        pred[x_  + y_ * width] = s >> 3;
       }
-      y_rec += LCU_WIDTH;
+      y_rec += LCU_WIDTH * 2;
     }
     if(cclm_params == NULL) {
       cclm_parameters_t temp_params;
       kvz_predict_cclm(
-        state, color, width, width, x, y, stride, intra_mode, lcu->rec.y, &refs, pred, &temp_params);
+        state, color, width, width, x, y, stride, intra_mode, lcu, &refs, pred, &temp_params);
     }
     else {
       linear_transform_cclm(&cclm_params[color == COLOR_U ? 0 : 1], pred, pred, width, width);
@@ -996,10 +1029,10 @@ void kvz_intra_recon_cu(
     const int32_t x2 = x + offset;
     const int32_t y2 = y + offset;
 
-    kvz_intra_recon_cu(state, x,  y,  depth + 1, mode_luma, mode_chroma, NULL, cclm_params, lcu);
-    kvz_intra_recon_cu(state, x2, y,  depth + 1, mode_luma, mode_chroma, NULL, cclm_params, lcu);
-    kvz_intra_recon_cu(state, x,  y2, depth + 1, mode_luma, mode_chroma, NULL, cclm_params, lcu);
-    kvz_intra_recon_cu(state, x2, y2, depth + 1, mode_luma, mode_chroma, NULL, cclm_params, lcu);
+    kvz_intra_recon_cu(state, x,  y,  depth + 1, mode_luma, mode_chroma, NULL, NULL, lcu);
+    kvz_intra_recon_cu(state, x2, y,  depth + 1, mode_luma, mode_chroma, NULL, NULL, lcu);
+    kvz_intra_recon_cu(state, x,  y2, depth + 1, mode_luma, mode_chroma, NULL, NULL, lcu);
+    kvz_intra_recon_cu(state, x2, y2, depth + 1, mode_luma, mode_chroma, NULL, NULL, lcu);
 
     // Propagate coded block flags from child CUs to parent CU.
     uint16_t child_cbfs[3] = {
diff --git a/src/intra.h b/src/intra.h
index eb737be7..846d77b2 100644
--- a/src/intra.h
+++ b/src/intra.h
@@ -137,7 +137,7 @@ void kvz_predict_cclm(
   const int16_t y0,
   const int16_t stride,
   const int8_t mode,
-  kvz_pixel const* y_rec,
+  lcu_t* const lcu,
   kvz_intra_references* chroma_ref,
   kvz_pixel* dst,
   cclm_parameters_t* cclm_params
diff --git a/src/search.c b/src/search.c
index 685c8428..5691d361 100644
--- a/src/search.c
+++ b/src/search.c
@@ -241,6 +241,33 @@ static double cu_zero_coeff_cost(const encoder_state_t *state, lcu_t *work_tree,
 }
 
 
+static void downsample_cclm_rec(encoder_state_t *state, int x, int y, int width, int height, kvz_pixel *y_rec) {
+  if (!state->encoder_control->cfg.cclm) return;
+  int x_scu = SUB_SCU(x);
+  int y_scu = SUB_SCU(y);
+  y_rec += x_scu + y_scu * LCU_WIDTH;
+  int stride = state->tile->frame->source->stride;
+
+  for (int y_ = 0; y_ < height && y_ * 2 + y < state->encoder_control->cfg.height; y_++) {
+    for (int x_ = 0; x_ < width; x_++) {
+      int s = 4;
+      s += y_rec[2 * x_] * 2;
+      s += y_rec[2 * x_ + 1];
+      // Left-neighbour tap: at the left edge of the CTU (x_scu == 0, x_ == 0) with x > 0 it
+      // is read from the frame reconstruction buffer at x - 1; when x is also 0 (frame
+      // edge) the offset ((x_ + x) > 0) is 0, so the first pixel itself is duplicated.
+      s += !x_scu && !x_ && x ? state->tile->frame->rec->y[x - 1 + (y + y_ * 2) * stride] : y_rec[2 * x_ - ((x_ + x) > 0)];
+      s += y_rec[2 * x_ + LCU_WIDTH] * 2;
+      s += y_rec[2 * x_ + 1 + LCU_WIDTH];
+      s += !x_scu && !x_ && x ? state->tile->frame->rec->y[x - 1 + (y + y_ * 2 + 1) * stride] : y_rec[2 * x_ - ((x_ + x) > 0) + LCU_WIDTH];
+      int index = x / 2 + x_ + (y / 2 + y_ )* stride / 2;
+      state->tile->frame->cclm_luma_rec[index] = s >> 3;
+    }
+    y_rec += LCU_WIDTH * 2;
+  }
+}
+
+
 /**
 * Calculate RD cost for a Coding Unit.
 * \return Cost of block
@@ -711,6 +738,10 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,
                          cur_cu->intra.mode, -1, // skip chroma
                          NULL, NULL, lcu);
 
+      downsample_cclm_rec(
+        state, x, y, cu_width / 2, cu_width / 2, lcu->rec.y
+      );
+
       // TODO: This heavily relies to square CUs
       if ((depth != 4 || (x % 8 && y % 8)) && state->encoder_control->chroma_format != KVZ_CSP_400) {
         // There is almost no benefit to doing the chroma mode search for
@@ -863,7 +894,7 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,
     // gets used, at least in the most obvious cases, while avoiding any
     // searching.
     if (cur_cu->type == CU_NOTSET && depth < MAX_PU_DEPTH
-        && x + cu_width <= frame->width && y + cu_width <= frame->height)
+        && x + cu_width <= frame->width && y + cu_width <= frame->height && 0)
     {
       cu_info_t *cu_d1 = LCU_GET_CU_AT_PX(&work_tree[depth + 1], x_local, y_local);
 
@@ -913,6 +944,9 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,
       // Copy this CU's mode all the way down for use in adjacent CUs mode
       // search.
       work_tree_copy_down(x_local, y_local, depth, work_tree);
+      downsample_cclm_rec(
+        state, x, y, cu_width / 2, cu_width / 2, lcu->rec.y
+      );
 
       if (state->frame->slicetype != KVZ_SLICE_I) {
         // Reset HMVP to the beginning of this CU level search and add this CU as the mvp
@@ -925,6 +959,9 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,
     // Need to copy modes down since the lower level of the work tree is used
     // when searching SMP and AMP blocks.
     work_tree_copy_down(x_local, y_local, depth, work_tree);
+    downsample_cclm_rec(
+      state, x, y, cu_width / 2, cu_width / 2, lcu->rec.y
+    );
 
     if (state->frame->slicetype != KVZ_SLICE_I) {
       // Reset HMVP to the beginning of this CU level search and add this CU as the mvp
diff --git a/src/search_intra.c b/src/search_intra.c
index e4afdcf1..f225c777 100644
--- a/src/search_intra.c
+++ b/src/search_intra.c
@@ -488,7 +488,7 @@ static void search_intra_chroma_rough(encoder_state_t * const state,
     assert(state->encoder_control->cfg.cclm);
     kvz_predict_cclm(
       state,
-      COLOR_U, width, width, x_px, y_px, state->tile->frame->source->stride, modes[i], lcu->rec.y, refs_u,  pred, &cclm_params);
+      COLOR_U, width, width, x_px, y_px, state->tile->frame->source->stride, modes[i], lcu, refs_u,  pred, &cclm_params);
   }
 
   kvz_pixels_blit(orig_v, orig_block, width, width, origstride, width);
@@ -498,6 +498,12 @@ static void search_intra_chroma_rough(encoder_state_t * const state,
     //costs[i] += get_cost(encoder_state, pred, orig_block, satd_func, sad_func, width);
     costs[i] += satd_func(pred, orig_block);
   }
+  for (int i = 5; i < 8; i++) {
+    assert(state->encoder_control->cfg.cclm);
+    kvz_predict_cclm(
+      state,
+      COLOR_V, width, width, x_px, y_px, state->tile->frame->source->stride, modes[i], lcu, refs_u, pred, &cclm_params);
+  }
 
   kvz_sort_modes(modes, costs, 5);
 }
@@ -836,17 +842,22 @@ int8_t kvz_search_intra_chroma_rdo(encoder_state_t * const state,
 
 
   kvz_intra_references refs[2];
-  const vector2d_t luma_px = { x_px, y_px };
+  const vector2d_t luma_px = { x_px & ~7, y_px & ~7 };
   const vector2d_t pic_px = {
     state->tile->frame->width,
     state->tile->frame->height,
   };
-  kvz_intra_build_reference(MAX(LOG2_LCU_WIDTH - depth - 1, 2), COLOR_U, &luma_px, &pic_px, lcu, &refs[0], state->encoder_control->cfg.wpp);
-  kvz_intra_build_reference(MAX(LOG2_LCU_WIDTH - depth - 1, 2), COLOR_V, &luma_px, &pic_px, lcu, &refs[1], state->encoder_control->cfg.wpp);
 
-  cclm_parameters_t cclm_params[2] = {0};
 
   if (reconstruct_chroma) {
+
+    int c_width = MAX(32 >> (depth), 4);
+
+    kvz_intra_build_reference(MAX(LOG2_LCU_WIDTH - depth - 1, 2), COLOR_U, &luma_px, &pic_px, lcu, &refs[0], state->encoder_control->cfg.wpp);
+    kvz_intra_build_reference(MAX(LOG2_LCU_WIDTH - depth - 1, 2), COLOR_V, &luma_px, &pic_px, lcu, &refs[1], state->encoder_control->cfg.wpp);
+
+    cclm_parameters_t cclm_params[2] = { 0 };
+
     const vector2d_t lcu_px = { SUB_SCU(x_px), SUB_SCU(y_px) };
     cu_info_t *const tr_cu = LCU_GET_CU_AT_PX(lcu, lcu_px.x, lcu_px.y);
 
@@ -864,7 +875,7 @@ int8_t kvz_search_intra_chroma_rdo(encoder_state_t * const state,
     for (int8_t chroma_mode_i = 0; chroma_mode_i < num_modes; ++chroma_mode_i) {
       chroma.mode = modes[chroma_mode_i];
       if (chroma.mode == -1) continue;
-      if(chroma.mode < 67) {
+      if(chroma.mode < 67 || depth == 0) {
         kvz_intra_recon_cu(state,
           x_px, y_px,
           depth,
@@ -872,18 +883,38 @@ int8_t kvz_search_intra_chroma_rdo(encoder_state_t * const state,
           NULL, NULL, lcu);
       }
       else {
+
         kvz_predict_cclm(
-          state, COLOR_U, 32 >> (depth), 32 >> (depth), x_px, y_px, state->tile->frame->source->stride, chroma.mode, lcu->rec.y, &refs[0], NULL, &cclm_params[0]);
+          state, COLOR_U,
+          c_width, c_width,
+          x_px & ~7, y_px & ~7,
+          state->tile->frame->source->stride,
+          chroma.mode, 
+          lcu,
+          &refs[0], NULL,
+          &cclm_params[0]);
+
         chroma.cclm[0] = cclm_params[0];
+
         kvz_predict_cclm(
-          state, COLOR_V, 32 >> (depth), 32 >> (depth), x_px, y_px, state->tile->frame->source->stride, chroma.mode, lcu->rec.y, &refs[1], NULL, &cclm_params[1]);
+          state, COLOR_V,
+          c_width, c_width,
+          x_px & ~7, y_px & ~7,
+          state->tile->frame->source->stride, 
+          chroma.mode, 
+          lcu, 
+          &refs[1], NULL,
+          &cclm_params[1]);
+
         chroma.cclm[1] = cclm_params[1];
 
-        kvz_intra_recon_cu(state,
+        kvz_intra_recon_cu(
+          state,
           x_px, y_px,
           depth,
           -1, chroma.mode, // skip luma
-          NULL, cclm_params, lcu);
+          NULL, cclm_params, lcu
+        );
       }
       chroma.cost = kvz_cu_rd_cost_chroma(state, lcu_px.x, lcu_px.y, depth, tr_cu, lcu);
 
diff --git a/src/videoframe.c b/src/videoframe.c
index 76ab1da7..77919a84 100644
--- a/src/videoframe.c
+++ b/src/videoframe.c
@@ -46,7 +46,7 @@
 videoframe_t * kvz_videoframe_alloc(int32_t width,
                                     int32_t height,
                                     enum kvz_chroma_format chroma_format,
-                                    enum kvz_alf alf_type)
+                                    enum kvz_alf alf_type, bool cclm)
 {
   videoframe_t *frame = calloc(1, sizeof(videoframe_t));
   if (!frame) return 0;
@@ -59,8 +59,12 @@ videoframe_t * kvz_videoframe_alloc(int32_t width,
   frame->sao_luma = MALLOC(sao_info_t, frame->width_in_lcu * frame->height_in_lcu);
   if (chroma_format != KVZ_CSP_400) {
     frame->sao_chroma = MALLOC(sao_info_t, frame->width_in_lcu * frame->height_in_lcu);
+    if (cclm) {
+      assert(chroma_format == KVZ_CSP_420);
+      frame->cclm_luma_rec = MALLOC(kvz_pixel, (((width + 7) & ~7) + FRAME_PADDING_LUMA) * (((height + 7) & ~7) + FRAME_PADDING_LUMA) / 4);
+    }
   }
-
+  
   return frame;
 }
 
@@ -76,6 +80,9 @@ int kvz_videoframe_free(videoframe_t * const frame)
     kvz_image_free(frame->rec_lmcs);
     frame->source_lmcs_mapped = false;
   }
+  if(frame->cclm_luma_rec) {
+    FREE_POINTER(frame->cclm_luma_rec);
+  }
 
   kvz_image_free(frame->source);
   frame->source = NULL;
diff --git a/src/videoframe.h b/src/videoframe.h
index 3e8a6ed4..74963d85 100644
--- a/src/videoframe.h
+++ b/src/videoframe.h
@@ -53,6 +53,8 @@ typedef struct videoframe
   kvz_picture *rec;            //!< \brief Reconstructed image.
   kvz_picture *rec_lmcs;       //!< \brief LMCS mapped reconstructed image, if available, otherwise points to source.
 
+  kvz_pixel *cclm_luma_rec;    //!< \brief buffer for the downsampled luma reconstruction for cclm
+
   uint8_t* lmcs_avg_processed; //!< \brief For each LCU, indicates if already calculated average of border pixels is available
   int32_t* lmcs_avg;           //!< \brief Average of LCU border pixels
 
@@ -78,7 +80,7 @@ typedef struct videoframe
 } videoframe_t;
 
 
-videoframe_t *kvz_videoframe_alloc(int32_t width, int32_t height, enum kvz_chroma_format chroma_format, enum kvz_alf alf_type);
+videoframe_t *kvz_videoframe_alloc(int32_t width, int32_t height, enum kvz_chroma_format chroma_format, enum kvz_alf alf_type, bool cclm);
 int kvz_videoframe_free(videoframe_t * const frame);
 
 void kvz_videoframe_set_poc(videoframe_t * frame, int32_t poc);

From 80ddb60ccf0b8a892526726b68779bca94e099c1 Mon Sep 17 00:00:00 2001
From: Joose Sainio <joose.sainio@tuni.fi>
Date: Wed, 24 Nov 2021 08:46:08 +0200
Subject: [PATCH 15/19] [cclm] fix cclm when deblocking is enabled

---
 src/intra.c      | 10 ++--------
 src/search.c     | 19 +++++++++++++++----
 src/videoframe.c |  4 ++++
 src/videoframe.h |  1 +
 4 files changed, 22 insertions(+), 12 deletions(-)

diff --git a/src/intra.c b/src/intra.c
index f786781c..59a47cd7 100644
--- a/src/intra.c
+++ b/src/intra.c
@@ -497,17 +497,11 @@ void kvz_predict_cclm(
     for (; available_above_right < width / 2; available_above_right++) {
       int x_extension = x_scu + width * 2 + 4 * available_above_right;
       cu_info_t* pu = LCU_GET_CU_AT_PX(lcu, x_extension, y_scu - 4);
-      if (pu->type == CU_NOTSET || x_extension > LCU_WIDTH) break;
+      if (x_extension >= LCU_WIDTH || pu->type == CU_NOTSET) break;
     }
     if(y_scu == 0) {
       if(!state->encoder_control->cfg.wpp) available_above_right = MIN(width / 2, (state->tile->frame->width - x0 - width * 2) / 4);
-      for (int x = 0; x < width * (available_above_right ? 4 : 2); x += 2) {
-        bool left_padding = x0 || x;
-        sampled_luma_ref.top[x / 2] = (state->tile->frame->rec->y[x0 + x + (y0 - 1) * stride] * 2 +
-          state->tile->frame->rec->y[x0 + x + 1 + (y0 - 1) * stride] +
-          state->tile->frame->rec->y[x0 + x - left_padding + (y0 - 1) * stride] + 
-          2) >> 2;
-      }
+      memcpy(sampled_luma_ref.top, &state->tile->frame->cclm_luma_rec_top_line[x0 / 2 + (y0 / 64 - 1) * (stride / 2)], sizeof(kvz_pixel) * (width + available_above_right * 2));
     }
     else {
       for (int x = 0; x < width * (available_above_right ? 4 : 2); x += 2) {
diff --git a/src/search.c b/src/search.c
index 5691d361..aaa98c40 100644
--- a/src/search.c
+++ b/src/search.c
@@ -241,7 +241,7 @@ static double cu_zero_coeff_cost(const encoder_state_t *state, lcu_t *work_tree,
 }
 
 
-static void downsample_cclm_rec(encoder_state_t *state, int x, int y, int width, int height, kvz_pixel *y_rec) {
+static void downsample_cclm_rec(encoder_state_t *state, int x, int y, int width, int height, kvz_pixel *y_rec, kvz_pixel extra_pixel) {
   if (!state->encoder_control->cfg.cclm) return;
   int x_scu = SUB_SCU(x);
   int y_scu = SUB_SCU(y);
@@ -265,6 +265,17 @@ static void downsample_cclm_rec(encoder_state_t *state, int x, int y, int width,
     }
     y_rec += LCU_WIDTH * 2;
   }
+  if((y + height * 2) % 64 == 0) {
+    int line = y / 64 * stride / 2;
+    y_rec -= LCU_WIDTH;
+    for (int i = 0; i < width; ++i) {
+      int s = 2;
+      s += y_rec[i * 2] * 2;
+      s += y_rec[i * 2 + 1];
+      s += !x_scu && !i && x ? extra_pixel : y_rec[i * 2 - ((i + x) > 0)] ;
+      state->tile->frame->cclm_luma_rec_top_line[i + x / 2 + line] = s >> 2;
+    }
+  }
 }
 
 
@@ -739,7 +750,7 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,
                          NULL, NULL, lcu);
 
       downsample_cclm_rec(
-        state, x, y, cu_width / 2, cu_width / 2, lcu->rec.y
+        state, x, y, cu_width / 2, cu_width / 2, lcu->rec.y, lcu->left_ref.y[64]
       );
 
       // TODO: This heavily relies to square CUs
@@ -945,7 +956,7 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,
       // search.
       work_tree_copy_down(x_local, y_local, depth, work_tree);
       downsample_cclm_rec(
-        state, x, y, cu_width / 2, cu_width / 2, lcu->rec.y
+        state, x, y, cu_width / 2, cu_width / 2, lcu->rec.y, lcu->left_ref.y[64]
       );
 
       if (state->frame->slicetype != KVZ_SLICE_I) {
@@ -960,7 +971,7 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,
     // when searching SMP and AMP blocks.
     work_tree_copy_down(x_local, y_local, depth, work_tree);
     downsample_cclm_rec(
-      state, x, y, cu_width / 2, cu_width / 2, lcu->rec.y
+      state, x, y, cu_width / 2, cu_width / 2, lcu->rec.y, lcu->left_ref.y[64]
     );
 
     if (state->frame->slicetype != KVZ_SLICE_I) {
diff --git a/src/videoframe.c b/src/videoframe.c
index 77919a84..a483202d 100644
--- a/src/videoframe.c
+++ b/src/videoframe.c
@@ -62,6 +62,7 @@ videoframe_t * kvz_videoframe_alloc(int32_t width,
     if (cclm) {
       assert(chroma_format == KVZ_CSP_420);
       frame->cclm_luma_rec = MALLOC(kvz_pixel, (((width + 7) & ~7) + FRAME_PADDING_LUMA) * (((height + 7) & ~7) + FRAME_PADDING_LUMA) / 4);
+      frame->cclm_luma_rec_top_line = MALLOC(kvz_pixel, (((width + 7) & ~7) + FRAME_PADDING_LUMA) / 2 * CEILDIV(height, 64));
     }
   }
   
@@ -83,6 +84,9 @@ int kvz_videoframe_free(videoframe_t * const frame)
   if(frame->cclm_luma_rec) {
     FREE_POINTER(frame->cclm_luma_rec);
   }
+  if(frame->cclm_luma_rec_top_line) {
+    FREE_POINTER(frame->cclm_luma_rec_top_line);
+  }
 
   kvz_image_free(frame->source);
   frame->source = NULL;
diff --git a/src/videoframe.h b/src/videoframe.h
index 74963d85..f77ec840 100644
--- a/src/videoframe.h
+++ b/src/videoframe.h
@@ -54,6 +54,7 @@ typedef struct videoframe
   kvz_picture *rec_lmcs;       //!< \brief LMCS mapped reconstructed image, if available, otherwise points to source.
 
   kvz_pixel *cclm_luma_rec;    //!< \brief buffer for the downsampled luma reconstruction for cclm
+  kvz_pixel *cclm_luma_rec_top_line;    //!< \brief per-LCU-row buffer holding the horizontally downsampled bottom luma line of each LCU, read as the cclm top reference at the top edge of the LCU below
 
   uint8_t* lmcs_avg_processed; //!< \brief For each LCU, indicates if already calculated average of border pixels is available
   int32_t* lmcs_avg;           //!< \brief Average of LCU border pixels

From 85ff5f23b1cf380a7f00f62efe515b53ff8d5783 Mon Sep 17 00:00:00 2001
From: Joose Sainio <joose.sainio@tuni.fi>
Date: Mon, 22 Nov 2021 07:08:03 +0200
Subject: [PATCH 16/19] [cclm] Fix accessing elements beyond the CU array

# Conflicts:
#	src/intra.c
---
 src/intra.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/intra.c b/src/intra.c
index 59a47cd7..2d576208 100644
--- a/src/intra.c
+++ b/src/intra.c
@@ -522,7 +522,7 @@ void kvz_predict_cclm(
     for (; available_left_below < height / 2; available_left_below++) {
       int y_extension = y_scu + height * 2 + 4 * available_left_below;
       cu_info_t* pu = LCU_GET_CU_AT_PX(lcu, x_scu - 4, y_extension);
-      if (pu->type == CU_NOTSET || y_extension > LCU_WIDTH) break;
+      if (y_extension >= LCU_WIDTH || pu->type == CU_NOTSET) break;
       if(x_scu == 32 && y_scu == 0 && pu->depth == 0) break;
     }
     for(int y = 0; y < height * (available_left_below ? 4 : 2); y+=2) {

From 697d4c0652e2eb9aa7c6df6c149d2210cf27f9d2 Mon Sep 17 00:00:00 2001
From: Joose Sainio <joose.sainio@tuni.fi>
Date: Mon, 22 Nov 2021 08:15:08 +0200
Subject: [PATCH 17/19] [cclm] Add bits to RD calculation

---
 src/search_intra.c | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/src/search_intra.c b/src/search_intra.c
index f225c777..a232d3d9 100644
--- a/src/search_intra.c
+++ b/src/search_intra.c
@@ -825,7 +825,19 @@ double kvz_chroma_mode_bits(const encoder_state_t *state, int8_t chroma_mode, in
   if (chroma_mode == luma_mode) {
     mode_bits = CTX_ENTROPY_FBITS(ctx, 0);
   } else {
-    mode_bits = 2.0 + CTX_ENTROPY_FBITS(ctx, 1);
+    if(chroma_mode > 67) {
+      mode_bits = 2.0 + CTX_ENTROPY_FBITS(ctx, 1);
+    }
+    else {
+      ctx = &(state->cabac.ctx.cclm_model);
+      mode_bits = CTX_ENTROPY_FBITS(ctx, chroma_mode != 81);
+      if (chroma_mode != 81) mode_bits += 1;
+    }
+  }
+  // Technically this is encoded first but for this method of counting bits it does not matter
+  if(state->encoder_control->cfg.cclm) {
+    ctx = &(state->cabac.ctx.cclm_flag);
+    mode_bits += CTX_ENTROPY_FBITS(ctx, chroma_mode > 67);
   }
 
   return mode_bits;

From ce175c503fafa33dfb1944e90820c39a9679feaa Mon Sep 17 00:00:00 2001
From: Joose Sainio <joose.sainio@tuni.fi>
Date: Wed, 24 Nov 2021 09:36:36 +0200
Subject: [PATCH 18/19] [cclm] Remove unnecessary calculation of the
 downsampled luma reference

---
 src/intra.c | 53 +++++------------------------------------------------
 1 file changed, 5 insertions(+), 48 deletions(-)

diff --git a/src/intra.c b/src/intra.c
index 2d576208..6def8cfc 100644
--- a/src/intra.c
+++ b/src/intra.c
@@ -525,38 +525,12 @@ void kvz_predict_cclm(
       if (y_extension >= LCU_WIDTH || pu->type == CU_NOTSET) break;
       if(x_scu == 32 && y_scu == 0 && pu->depth == 0) break;
     }
-    for(int y = 0; y < height * (available_left_below ? 4 : 2); y+=2) {
-      int s = 4;
-      s += x_scu ? y_rec[y * LCU_WIDTH - 2] * 2       : state->tile->frame->rec->y[x0 - 2 + (y0 + y) * stride] * 2;
-      s += x_scu ? y_rec[y * LCU_WIDTH - 1]           : state->tile->frame->rec->y[x0 - 1 + (y0 + y) * stride];
-      s += x_scu ? y_rec[y * LCU_WIDTH - 3]           : state->tile->frame->rec->y[x0 - 3 + (y0 + y) * stride];
-      s += x_scu ? y_rec[(y + 1) * LCU_WIDTH - 2] * 2 : state->tile->frame->rec->y[x0 - 2 + (y0 + y + 1) * stride] * 2;
-      s += x_scu ? y_rec[(y + 1) * LCU_WIDTH - 1]     : state->tile->frame->rec->y[x0 - 1 + (y0 + y + 1) * stride];
-      s += x_scu ? y_rec[(y + 1) * LCU_WIDTH - 3]     : state->tile->frame->rec->y[x0 - 3 + (y0 + y + 1) * stride];
-      sampled_luma_ref.left[y/2] = s >> 3;
-    }
+    for(int i = 0; i < height + available_left_below * 2; i++) {
+      sampled_luma_ref.left[i] = state->tile->frame->cclm_luma_rec[(y0/2 + i) * (stride/2) + x0 / 2 - 1];
+    }    
   }
 
-
-
-  // Downsample the reconstructed luma sample so that they can be mapped into the chroma
-  // to generate the chroma prediction
-  for (int y = 0; y < height; y++) {
-    for (int x = 0; x <  width; x++) {
-      int s = 4;
-      s += y_rec[2 * x] * 2;
-      s += y_rec[2 * x + 1];
-      // If we are at the edge of the CTU read the pixel from the frame reconstruct buffer,
-      // *except* when we are also at the edge of the frame, in which case we want to duplicate
-      // the edge pixel
-      s += !x_scu && !x && x0 ? state->tile->frame->rec->y[x0 - 1 + (y0 + y*2) * stride] : y_rec[2 * x - ((x + x0) > 0)];
-      s += y_rec[2 * x + LCU_WIDTH] * 2;
-      s += y_rec[2 * x + 1 + LCU_WIDTH];
-      s += !x_scu && !x && x0 ? state->tile->frame->rec->y[x0 - 1 + (y0 + y * 2 + 1) * stride] : y_rec[2 * x - ((x + x0) > 0) + LCU_WIDTH];
-      sampled_luma[x + y * width] = s >> 3;
-    }
-    y_rec += LCU_WIDTH * 2;
-  }
+  kvz_pixels_blit(&state->tile->frame->cclm_luma_rec[x0 / 2 + (y0 * stride) / 4], sampled_luma, width, height, stride / 2, width);
 
   int16_t a, b, shift;
   get_cclm_parameters(state, width, height, mode,x0, y0, available_above_right, available_left_below, &sampled_luma_ref, chroma_ref, &a, &b, &shift);
@@ -926,24 +900,7 @@ static void intra_recon_tb_leaf(
   if(intra_mode < 68) {
     kvz_intra_predict(state, &refs, log2width, intra_mode, color, pred, filter_boundary);
   } else {
-    kvz_pixel *y_rec = lcu->rec.y;
-    y_rec += x_scu + y_scu * LCU_WIDTH;
-    for (int y_ = 0; y_ < width; y_++) {
-      for (int x_ = 0; x_ < width; x_++) {
-        int s = 4;
-        s += y_rec[2 * x_] * 2;
-        s += y_rec[2 * x_ + 1];
-        // If we are at the edge of the CTU read the pixel from the frame reconstruct buffer,
-        // *except* when we are also at the edge of the frame, in which case we want to duplicate
-        // the edge pixel
-        s += !x_scu && !x_ && x ? state->tile->frame->rec->y[x - 1 + (y + y_ * 2) * stride] : y_rec[2 * x_ - ((x_ + x) > 0)];
-        s += y_rec[2 * x_ + LCU_WIDTH] * 2;
-        s += y_rec[2 * x_ + 1 + LCU_WIDTH];
-        s += !x_scu && !x_ && x ? state->tile->frame->rec->y[x - 1 + (y + y_ * 2 + 1) * stride] : y_rec[2 * x_ - ((x_ + x) > 0) + LCU_WIDTH];
-        pred[x_  + y_ * width] = s >> 3;
-      }
-      y_rec += LCU_WIDTH * 2;
-    }
+    kvz_pixels_blit(&state->tile->frame->cclm_luma_rec[x / 2 + (y * stride) / 4], pred, width, width, stride / 2, width);
     if(cclm_params == NULL) {
       cclm_parameters_t temp_params;
       kvz_predict_cclm(

From 385e91399aa62c77deb877ee8feb8006dd770902 Mon Sep 17 00:00:00 2001
From: Joose Sainio <joose.sainio@tuni.fi>
Date: Thu, 25 Nov 2021 12:32:27 +0200
Subject: [PATCH 19/19] [intra rdo] Raise maximum rd level to 4

Separate the rd option for chroma intra mode search from the full intra
mode search, i.e., rd=3 enables chroma mode search and rd=4 enables full
intra mode search.
---
 src/cli.c          | 4 ++--
 src/search.c       | 2 +-
 src/search_inter.c | 2 +-
 src/search_intra.c | 8 ++++----
 4 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/src/cli.c b/src/cli.c
index 270e924f..28d7b773 100644
--- a/src/cli.c
+++ b/src/cli.c
@@ -552,8 +552,8 @@ void print_help(void)
     "                                   - 0: Skip intra if inter is good enough.\n"
     "                                   - 1: Rough intra mode search with SATD.\n"
     "                                   - 2: Refine intra mode search with SSE.\n"
-    "                                   - 3: Try all intra modes and enable intra\n"
-    "                                        chroma mode search.\n"
+    "                                   - 3: Enable intra chroma mode search.\n"
+    "                                   - 4: Try all intra modes.\n"
     "      --(no-)mv-rdo          : Rate-distortion optimized motion vector costs\n"
     "                               [disabled]\n"
     "      --(no-)zero-coeff-rdo  : If a CU is set inter, check if forcing zero\n"
diff --git a/src/search.c b/src/search.c
index aaa98c40..3e715d53 100644
--- a/src/search.c
+++ b/src/search.c
@@ -760,7 +760,7 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,
         // into account, so there is less of a chanse of luma mode being
         // really bad for chroma.
         cclm_parameters_t cclm_params[2];
-        if (ctrl->cfg.rdo == 3) {
+        if (ctrl->cfg.rdo >= 3) {
           cur_cu->intra.mode_chroma = kvz_search_cu_intra_chroma(state, x, y, depth, lcu, cclm_params);
           lcu_fill_cu_info(lcu, x_local, y_local, cu_width, cu_width, cur_cu);
         }
diff --git a/src/search_inter.c b/src/search_inter.c
index 3b83c5a4..6ecb422d 100644
--- a/src/search_inter.c
+++ b/src/search_inter.c
@@ -1937,7 +1937,7 @@ static void search_pu_inter(encoder_state_t * const state,
     }
 
     // TODO: this probably should have a separate command line option
-    if (cfg->rdo == 3) {
+    if (cfg->rdo >= 3) {
       search_pu_inter_bipred(&info, depth, lcu, cur_cu, inter_cost, inter_bitcost);
     }
   }
diff --git a/src/search_intra.c b/src/search_intra.c
index a232d3d9..e6890d5a 100644
--- a/src/search_intra.c
+++ b/src/search_intra.c
@@ -969,7 +969,7 @@ int8_t kvz_search_cu_intra_chroma(encoder_state_t * const state,
   const int8_t modes_in_depth[5] = { 1, 1, 1, 1, 2 };
   int num_modes = modes_in_depth[depth];
 
-  if (state->encoder_control->cfg.rdo == 3) {
+  if (state->encoder_control->cfg.rdo >= 3) {
     num_modes = state->encoder_control->cfg.cclm ? 8 : 5;
   }
 
@@ -1054,7 +1054,7 @@ void kvz_search_cu_intra(encoder_state_t * const state,
   kvz_pixel *ref_pixels = &lcu->ref.y[lcu_px.x + lcu_px.y * LCU_WIDTH];
 
   int8_t number_of_modes = 0;
-  bool skip_rough_search = (depth == 0 || state->encoder_control->cfg.rdo >= 3);
+  bool skip_rough_search = (depth == 0 || state->encoder_control->cfg.rdo >= 4);
   if (!skip_rough_search) {
     number_of_modes = search_intra_rough(state,
                                          ref_pixels, LCU_WIDTH,
@@ -1075,9 +1075,9 @@ void kvz_search_cu_intra(encoder_state_t * const state,
   const int32_t rdo_level = state->encoder_control->cfg.rdo;
   if (rdo_level >= 2 || skip_rough_search) {
     int number_of_modes_to_search;
-    if (rdo_level == 3) {
+    if (rdo_level == 4) {
       number_of_modes_to_search = 67;
-    } else if (rdo_level == 2) {
+    } else if (rdo_level == 2 || rdo_level == 3) {
       number_of_modes_to_search = (cu_width == 4) ? 3 : 2;
     } else {
       // Check only the predicted modes.