/*****************************************************************************
 * This file is part of uvg266 VVC encoder.
 *
 * Copyright (c) 2021, Tampere University, ITU/ISO/IEC, project contributors
 * All rights reserved.
 * 
 * Redistribution and use in source and binary forms, with or without modification,
 * are permitted provided that the following conditions are met:
 * 
 * * Redistributions of source code must retain the above copyright notice, this
 *   list of conditions and the following disclaimer.
 * 
 * * Redistributions in binary form must reproduce the above copyright notice, this
 *   list of conditions and the following disclaimer in the documentation and/or
 *   other materials provided with the distribution.
 * 
 * * Neither the name of the Tampere University or ITU/ISO/IEC nor the names of its
 *   contributors may be used to endorse or promote products derived from
 *   this software without specific prior written permission.
 * 
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION HOWEVER CAUSED AND ON
 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * INCLUDING NEGLIGENCE OR OTHERWISE ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ****************************************************************************/

#include "intra.h"

#include <stdlib.h>

#include "image.h"
#include "kvz_math.h"
#include "mip_data.h"
#include "strategies/strategies-intra.h"
#include "tables.h"
#include "transform.h"
#include "videoframe.h"

// Tables for looking up the number of intra reference pixels based on the
// prediction unit's coordinates within an LCU.
// Generated by "tools/generate_ref_pixel_tables.py".
// Lookup table for the top reference: 16x16 entries, each a multiple of 4
// between 4 and 64. Row 0 is uniformly 64.
// NOTE(review): the meaning of the two indices (presumably a block position in
// units of 4 pixels inside the LCU) is not visible in this part of the file -
// confirm at the lookup site.
static const uint8_t num_ref_pixels_top[16][16] = {
  { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
  {  8,  4,  8,  4,  8,  4,  8,  4,  8,  4,  8,  4,  8,  4,  8,  4 },
  { 16, 12,  8,  4, 16, 12,  8,  4, 16, 12,  8,  4, 16, 12,  8,  4 },
  {  8,  4,  8,  4,  8,  4,  8,  4,  8,  4,  8,  4,  8,  4,  8,  4 },
  { 32, 28, 24, 20, 16, 12,  8,  4, 32, 28, 24, 20, 16, 12,  8,  4 },
  {  8,  4,  8,  4,  8,  4,  8,  4,  8,  4,  8,  4,  8,  4,  8,  4 },
  { 16, 12,  8,  4, 16, 12,  8,  4, 16, 12,  8,  4, 16, 12,  8,  4 },
  {  8,  4,  8,  4,  8,  4,  8,  4,  8,  4,  8,  4,  8,  4,  8,  4 },
  { 64, 60, 56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12,  8,  4 },
  {  8,  4,  8,  4,  8,  4,  8,  4,  8,  4,  8,  4,  8,  4,  8,  4 },
  { 16, 12,  8,  4, 16, 12,  8,  4, 16, 12,  8,  4, 16, 12,  8,  4 },
  {  8,  4,  8,  4,  8,  4,  8,  4,  8,  4,  8,  4,  8,  4,  8,  4 },
  { 32, 28, 24, 20, 16, 12,  8,  4, 32, 28, 24, 20, 16, 12,  8,  4 },
  {  8,  4,  8,  4,  8,  4,  8,  4,  8,  4,  8,  4,  8,  4,  8,  4 },
  { 16, 12,  8,  4, 16, 12,  8,  4, 16, 12,  8,  4, 16, 12,  8,  4 },
  {  8,  4,  8,  4,  8,  4,  8,  4,  8,  4,  8,  4,  8,  4,  8,  4 }
};
// Lookup table for the left reference: 16x16 entries, each a multiple of 4
// between 4 and 64. Column 0 counts down from 64 to 4 in steps of 4.
// NOTE(review): the meaning of the two indices (presumably a block position in
// units of 4 pixels inside the LCU) is not visible in this part of the file -
// confirm at the lookup site.
static const uint8_t num_ref_pixels_left[16][16] = {
  { 64,  4,  8,  4, 16,  4,  8,  4, 32,  4,  8,  4, 16,  4,  8,  4 },
  { 60,  4,  4,  4, 12,  4,  4,  4, 28,  4,  4,  4, 12,  4,  4,  4 },
  { 56,  4,  8,  4,  8,  4,  8,  4, 24,  4,  8,  4,  8,  4,  8,  4 },
  { 52,  4,  4,  4,  4,  4,  4,  4, 20,  4,  4,  4,  4,  4,  4,  4 },
  { 48,  4,  8,  4, 16,  4,  8,  4, 16,  4,  8,  4, 16,  4,  8,  4 },
  { 44,  4,  4,  4, 12,  4,  4,  4, 12,  4,  4,  4, 12,  4,  4,  4 },
  { 40,  4,  8,  4,  8,  4,  8,  4,  8,  4,  8,  4,  8,  4,  8,  4 },
  { 36,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4 },
  { 32,  4,  8,  4, 16,  4,  8,  4, 32,  4,  8,  4, 16,  4,  8,  4 },
  { 28,  4,  4,  4, 12,  4,  4,  4, 28,  4,  4,  4, 12,  4,  4,  4 },
  { 24,  4,  8,  4,  8,  4,  8,  4, 24,  4,  8,  4,  8,  4,  8,  4 },
  { 20,  4,  4,  4,  4,  4,  4,  4, 20,  4,  4,  4,  4,  4,  4,  4 },
  { 16,  4,  8,  4, 16,  4,  8,  4, 16,  4,  8,  4, 16,  4,  8,  4 },
  { 12,  4,  4,  4, 12,  4,  4,  4, 12,  4,  4,  4, 12,  4,  4,  4 },
  { 8,  4,  8,  4,  8,  4,  8,  4,  8,  4,  8,  4,  8,  4,  8,  4 },
  { 4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4 }
};

/**
 * \brief Build the list of six candidate luma intra modes for a block.
 *
 * \param x         Not used by this function.
 * \param y         Luma y coordinate; the above neighbour is ignored when
 *                  y is a multiple of LCU_WIDTH.
 * \param preds     Output array, receives six mode candidates.
 * \param cur_pu    Not used by this function.
 * \param left_pu   Left neighbour, may be NULL.
 * \param above_pu  Above neighbour, may be NULL.
 * \return 1 when both neighbours yield the same mode, 2 otherwise.
 */
int8_t kvz_intra_get_dir_luma_predictor(
  const uint32_t x,
  const uint32_t y,
  int8_t *preds,
  const cu_info_t *const cur_pu,
  const cu_info_t *const left_pu,
  const cu_info_t *const above_pu)
{
  enum {
    PLANAR_IDX = 0,
    DC_IDX = 1,
    HOR_IDX = 18,
    VER_IDX = 50,
  };

  // Neighbouring angular modes are derived with modular arithmetic:
  //   ((m + offset) % mod) + 2     -> m - 1
  //   ((m - 1) % mod) + 2          -> m + 1
  //   ((m + offset - 1) % mod) + 2 -> m - 2
  //   (m % mod) + 2                -> m + 2
  // each wrapping around inside the angular mode range.
  const int offset = 61;
  const int mod = 64;

  // A neighbour only contributes its mode when it exists and is intra coded;
  // otherwise it counts as planar.
  const int8_t left_mode =
    (left_pu && left_pu->type == CU_INTRA) ? left_pu->intra.mode : PLANAR_IDX;
  const int8_t above_mode =
    (above_pu && above_pu->type == CU_INTRA && y % LCU_WIDTH != 0) ? above_pu->intra.mode : PLANAR_IDX;

  // Fallback list, kept whenever no angular neighbour is found.
  static const int8_t default_preds[6] = {
    PLANAR_IDX, DC_IDX, VER_IDX, HOR_IDX, VER_IDX - 4, VER_IDX + 4
  };
  for (int i = 0; i < 6; ++i) {
    preds[i] = default_preds[i];
  }

  // Both neighbours agree.
  if (left_mode == above_mode) {
    if (left_mode > DC_IDX) { // shared angular mode
      const int m = left_mode;
      preds[0] = PLANAR_IDX;
      preds[1] = left_mode;
      preds[2] = ((m + offset) % mod) + 2;
      preds[3] = ((m - 1) % mod) + 2;
      preds[4] = ((m + offset - 1) % mod) + 2;
      preds[5] = (m % mod) + 2;
    }
    return 1;
  }

  // Two distinct neighbour modes.
  const int hi = (left_mode > above_mode) ? left_mode : above_mode;
  const int lo = (left_mode > above_mode) ? above_mode : left_mode;

  if (left_mode > DC_IDX && above_mode > DC_IDX) {
    // Both angular: keep them and fill the rest depending on their distance.
    preds[0] = PLANAR_IDX;
    preds[1] = left_mode;
    preds[2] = above_mode;

    const int gap = hi - lo;
    if (gap == 1) {
      preds[3] = ((lo + offset) % mod) + 2;
      preds[4] = ((hi - 1) % mod) + 2;
      preds[5] = ((lo + offset - 1) % mod) + 2;
    } else if (gap >= 62) {
      preds[3] = ((lo - 1) % mod) + 2;
      preds[4] = ((hi + offset) % mod) + 2;
      preds[5] = (lo % mod) + 2;
    } else if (gap == 2) {
      preds[3] = ((lo - 1) % mod) + 2;
      preds[4] = ((lo + offset) % mod) + 2;
      preds[5] = ((hi - 1) % mod) + 2;
    } else {
      preds[3] = ((lo + offset) % mod) + 2;
      preds[4] = ((lo - 1) % mod) + 2;
      preds[5] = ((hi + offset) % mod) + 2;
    }
  } else if (left_mode + above_mode >= 2) {
    // For non-negative modes this means exactly one neighbour is angular
    // (the planar/DC pair sums to 1); build the list around that mode.
    preds[0] = PLANAR_IDX;
    preds[1] = hi;
    preds[2] = ((hi + offset) % mod) + 2;
    preds[3] = ((hi - 1) % mod) + 2;
    preds[4] = ((hi + offset - 1) % mod) + 2;
    preds[5] = (hi % mod) + 2;
  }

  return 2;
}

/**
 * \brief Fill refs->filtered_ref with a [1 2 1] / 4 smoothed copy of refs->ref.
 *
 * Returns immediately when refs->filtered_initialized is already set;
 * otherwise sets the flag and filters 2 * width + 1 samples per edge.
 *
 * \param log2_width  Log2 of the block width.
 * \param refs        Reference pixels; filtered_ref is written.
 */
static void intra_filter_reference(
  int_fast8_t log2_width,
  kvz_intra_references *refs)
{
  if (refs->filtered_initialized) {
    return;
  }
  refs->filtered_initialized = true;

  const int_fast8_t ref_width = 2 * (1 << log2_width) + 1;
  const int_fast8_t last = ref_width - 1;

  const kvz_pixel *const src_left = refs->ref.left;
  const kvz_pixel *const src_top = refs->ref.top;
  kvz_pixel *const dst_left = refs->filtered_ref.left;
  kvz_pixel *const dst_top = refs->filtered_ref.top;

  // The shared top-left corner is smoothed using one neighbour from each edge.
  const kvz_pixel corner = (src_left[1] + 2 * src_left[0] + src_top[1] + 2) >> 2;
  dst_left[0] = corner;
  dst_top[0] = corner;

  // TODO: use block height here instead of ref_width
  // Interior samples of both edges (left: top to bottom, top: left to right).
  for (int_fast8_t i = 1; i < last; ++i) {
    dst_left[i] = (src_left[i - 1] + 2 * src_left[i] + src_left[i + 1] + 2) >> 2;
    dst_top[i] = (src_top[i - 1] + 2 * src_top[i] + src_top[i + 1] + 2) >> 2;
  }

  // The far end of each edge (bottom left / top right) is copied unfiltered.
  dst_left[last] = src_left[last];
  dst_top[last] = src_top[last];
}


/**
* \brief Generate dc prediction.
*
* Averages `width` samples from each of the top and left references and
* fills the whole width*width block with the rounded mean.
*
* \param log2_width    Log2 of width, range 2..5.
* \param ref_top       Pointer to -1 index of above reference, length=width*2+1.
* \param ref_left      Pointer to -1 index of left reference, length=width*2+1.
* \param out_block     Buffer of size width*width.
* \param multi_ref_idx Multi reference line index for use with MRL; shifts the
*                      first sample read from both references.
*/
static void intra_pred_dc(
  const int_fast8_t log2_width,
  const kvz_pixel *const ref_top,
  const kvz_pixel *const ref_left,
  kvz_pixel *const out_block,
  const uint8_t multi_ref_idx)
{
  const int_fast8_t width = 1 << log2_width;

  // Index 0 of each reference is the corner, so samples start at 1 (+ MRL offset).
  const kvz_pixel *const top = ref_top + 1 + multi_ref_idx;
  const kvz_pixel *const left = ref_left + 1 + multi_ref_idx;

  int_fast16_t sum = 0;
  for (int_fast8_t i = 0; i < width; ++i) {
    sum += top[i];
    sum += left[i];
  }

  // JVET_K0122
  // TODO: take non-square blocks into account
  const int denom     = width << 1;
  const int divShift  = kvz_math_floor_log2(denom);
  const int divOffset = denom >> 1;
  const kvz_pixel dc_val = (sum + divOffset) >> divShift;

  const int_fast16_t block_size = 1 << (log2_width * 2);
  kvz_pixel *px = out_block;
  for (int_fast16_t remaining = block_size; remaining > 0; --remaining) {
    *px++ = dc_val;
  }
}


// Intra chroma mode numbers of the cross-component linear model (CCLM) modes
// accepted by kvz_predict_cclm.
enum lm_mode
{
  LM_CHROMA_IDX = 81,   // model derived from both the top and the left template
  LM_CHROMA_L_IDX = 82, // left template only (top is marked unavailable)
  LM_CHROMA_T_IDX = 83, // top template only (left is marked unavailable)
};


/**
 * \brief Derive the CCLM linear model parameters a, b and shift.
 *
 * Picks up to four (luma, chroma) sample pairs from the top and/or left
 * templates, averages the two smallest and the two largest luma samples
 * (with their chroma partners) and fits a line through the two resulting
 * points. The model is applied elsewhere as ((luma * a) >> shift) + b.
 *
 * \param state       Only state->encoder_control->bitdepth is read.
 * \param width       Block width; used as the chroma width (c_width).
 * \param height      Block height; used as the chroma height (c_height).
 * \param mode        One of LM_CHROMA_IDX, LM_CHROMA_L_IDX, LM_CHROMA_T_IDX.
 * \param x0          Block x coordinate; 0 makes the left template unavailable.
 * \param y0          Block y coordinate; 0 makes the top template unavailable.
 * \param avai_above_right_units  Available units right of the top template
 *                    (only used for LM_CHROMA_T_IDX).
 * \param avai_left_below_units   Available units below the left template
 *                    (only used for LM_CHROMA_L_IDX).
 * \param luma_src    Downsampled luma references, read from index 0.
 * \param chroma_ref  Chroma references, read from index 1 of ref.top / ref.left.
 * \param a           Output: scale.
 * \param b           Output: offset.
 * \param shift       Output: right shift applied after scaling.
 */
static void get_cclm_parameters(
  encoder_state_t const* const state,
  int8_t width, int8_t height, int8_t mode,
  int x0, int y0, int avai_above_right_units, int avai_left_below_units,
  kvz_intra_ref* luma_src, kvz_intra_references*chroma_ref,
  int16_t *a, int16_t*b, int16_t*shift) {

  const int base_unit_size = 1 << (6 - PU_DEPTH_INTRA_MAX);

  // TODO: take into account YUV422
  const int unit_w = base_unit_size >> 1;
  const int unit_h = base_unit_size >> 1;

  const int c_height = height;
  const int c_width = width;
  // NOTE: the doubled width/height are only referenced by the commented-out
  // code below; nothing live in this function reads them afterwards.
  height *= 2;
  width *= 2;

  const int tu_width_in_units = c_width / unit_w;
  const int tu_height_in_units = c_height / unit_h;


  //int top_template_samp_num = width; // for MDLM, the template sample number is 2W or 2H;
  //int left_template_samp_num = height;

  // These are used for calculating some stuff for non-square CUs
  //int total_above_units = (top_template_samp_num + (unit_w - 1)) / unit_w;
  //int total_left_units = (left_template_samp_num + (unit_h - 1)) / unit_h;
  //int total_units = total_left_units + total_above_units + 1;
  //int above_right_units = total_above_units - tu_width_in_units;
  //int left_below_units = total_left_units - tu_height_in_units;
  //int avai_above_right_units = 0;  // TODO these are non zero only with non-square CUs
  //int avai_left_below_units = 0;
  // NOTE(review): the above count is clipped with the height in units and the
  // left count with the width in units; these only coincide for square blocks.
  int avai_above_units = CLIP(0, tu_height_in_units, y0/base_unit_size);
  int avai_left_units = CLIP(0, tu_width_in_units, x0 / base_unit_size);

  bool above_available = avai_above_units != 0;
  bool left_available = avai_left_units != 0;
    
  char internal_bit_depth = state->encoder_control->bitdepth;

  // Index 0 holds the luma value, index 1 the matching chroma value.
  // The initial values are always overwritten below.
  int min_luma[2] = { MAX_INT, 0 };
  int max_luma[2] = { -MAX_INT, 0 };
  
  kvz_pixel* src;
  int actualTopTemplateSampNum = 0;
  int actualLeftTemplateSampNum = 0;
  // The single-sided modes disable the other template and extend their own
  // template with the additionally available units (capped by the block size).
  if (mode == LM_CHROMA_T_IDX)
  {
    left_available = 0;
    avai_above_right_units = avai_above_right_units > (c_height / unit_w) ? c_height / unit_w : avai_above_right_units;
    actualTopTemplateSampNum = unit_w * (avai_above_units + avai_above_right_units);
  }
  else if (mode == LM_CHROMA_L_IDX)
  {
    above_available = 0;
    avai_left_below_units = avai_left_below_units > (c_width / unit_h) ? c_width / unit_h : avai_left_below_units;
    actualLeftTemplateSampNum = unit_h * (avai_left_units + avai_left_below_units);
  }
  else if (mode == LM_CHROMA_IDX)
  {
    actualTopTemplateSampNum = c_width;
    actualLeftTemplateSampNum = c_height;
  }
  int startPos[2]; //0:Above, 1: Left
  int pickStep[2];

  // When only one template is usable, four samples are taken from it
  // instead of two, so the start position and step are halved.
  int aboveIs4 = left_available ? 0 : 1;
  int leftIs4 = above_available ? 0 : 1;

  startPos[0] = actualTopTemplateSampNum >> (2 + aboveIs4);
  pickStep[0] = MAX(1, actualTopTemplateSampNum >> (1 + aboveIs4));

  startPos[1] = actualLeftTemplateSampNum >> (2 + leftIs4);
  pickStep[1] = MAX(1, actualLeftTemplateSampNum >> (1 + leftIs4));

  kvz_pixel selectLumaPix[4] = { 0, 0, 0, 0 };
  kvz_pixel selectChromaPix[4] = { 0, 0, 0, 0 };

  int cntT, cntL;
  cntT = cntL = 0;
  int cnt = 0;
  if (above_available)
  {
    cntT = MIN(actualTopTemplateSampNum, (1 + aboveIs4) << 1);
    src = luma_src->top;
    const kvz_pixel* cur = chroma_ref->ref.top + 1;
    for (int pos = startPos[0]; cnt < cntT; pos += pickStep[0], cnt++)
    {
      selectLumaPix[cnt] = src[pos];
      selectChromaPix[cnt] = cur[pos];
    }
  }

  if (left_available)
  {
    cntL = MIN(actualLeftTemplateSampNum, (1 + leftIs4) << 1);
    src = luma_src->left;
    const kvz_pixel* cur = chroma_ref->ref.left + 1;
    // NOTE: this declares a new loop-local `cnt` that shadows the outer one;
    // the outer `cnt` is recomputed right after the loop.
    for (int pos = startPos[1], cnt = 0; cnt < cntL; pos += pickStep[1], cnt++)
    {
      selectLumaPix[cnt + cntT] = src[pos];
      selectChromaPix[cnt + cntT] = cur[pos];
    }
  }
  cnt = cntL + cntT;

  // With only two samples, duplicate them (in swapped order) so that the
  // four-sample logic below can be used unchanged.
  if (cnt == 2)
  {
    selectLumaPix[3] = selectLumaPix[0]; selectChromaPix[3] = selectChromaPix[0];
    selectLumaPix[2] = selectLumaPix[1]; selectChromaPix[2] = selectChromaPix[1];
    selectLumaPix[0] = selectLumaPix[1]; selectChromaPix[0] = selectChromaPix[1];
    selectLumaPix[1] = selectLumaPix[3]; selectChromaPix[1] = selectChromaPix[3];
  }

  // Split the four sample indices into a "min" pair and a "max" pair by luma
  // value using a fixed sequence of compare-and-swap steps.
  int minGrpIdx[2] = { 0, 2 };
  int maxGrpIdx[2] = { 1, 3 };
  int* tmpMinGrp = minGrpIdx;
  int* tmpMaxGrp = maxGrpIdx;
  if (selectLumaPix[tmpMinGrp[0]] > selectLumaPix[tmpMinGrp[1]])
  {
    SWAP(tmpMinGrp[0], tmpMinGrp[1], int);
  }
  if (selectLumaPix[tmpMaxGrp[0]] > selectLumaPix[tmpMaxGrp[1]])
  {
    SWAP(tmpMaxGrp[0], tmpMaxGrp[1], int);
  }
  if (selectLumaPix[tmpMinGrp[0]] > selectLumaPix[tmpMaxGrp[1]])
  {
    SWAP(tmpMinGrp, tmpMaxGrp, int*);
  }
  if (selectLumaPix[tmpMinGrp[1]] > selectLumaPix[tmpMaxGrp[0]])
  {
    SWAP(tmpMinGrp[1], tmpMaxGrp[0], int);
  }

  // Rounded average of each pair: [0] is luma, [1] is chroma.
  min_luma[0] = (selectLumaPix[tmpMinGrp[0]] + selectLumaPix[tmpMinGrp[1]] + 1) >> 1;
  min_luma[1] = (selectChromaPix[tmpMinGrp[0]] + selectChromaPix[tmpMinGrp[1]] + 1) >> 1;
  max_luma[0] = (selectLumaPix[tmpMaxGrp[0]] + selectLumaPix[tmpMaxGrp[1]] + 1) >> 1;
  max_luma[1] = (selectChromaPix[tmpMaxGrp[0]] + selectChromaPix[tmpMaxGrp[1]] + 1) >> 1;

  if (left_available || above_available)
  {
    int diff = max_luma[0] - min_luma[0];
    if (diff > 0)
    {
      // Integer approximation of a = diffC / diff: the luma difference is
      // normalised to a 4-bit significand and looked up in DivSigTable.
      int diffC = max_luma[1] - min_luma[1];
      int x = kvz_math_floor_log2(diff);
      static const uint8_t DivSigTable[1 << 4] = {
        // 4bit significands - 8 ( MSB is omitted )
        0,  7,  6,  5,  5,  4,  4,  3,  3,  2,  2,  1,  1,  1,  1,  0
      };
      int normDiff = (diff << 4 >> x) & 15;
      int v = DivSigTable[normDiff] | 8;
      x += normDiff != 0;

      int y = diffC ? kvz_math_floor_log2(abs(diffC)) + 1 : 0;
      int add = 1 << y >> 1;
      *a = (diffC * v + add) >> y;
      *shift = 3 + x - y;
      if (*shift < 1)
      {
        *shift = 1;
        *a = ((*a == 0) ? 0 : (*a < 0) ? -15 : 15);   // a=Sign(a)*15
      }
      *b = min_luma[1] - ((*a * min_luma[0]) >> *shift);
    }
    else
    {
      // Flat luma: constant prediction equal to the averaged minimum chroma.
      *a = 0;
      *b = min_luma[1];
      *shift = 0;
    }
  }
  else
  {
    // No template available: constant prediction at half the sample range.
    *a = 0;

    *b = 1 << (internal_bit_depth - 1);

    *shift = 0;
  }
}

/**
 * \brief Apply the CCLM linear model to a block of samples.
 *
 * Each output sample is CLIP_TO_PIXEL(((src * a) >> shift) + b).
 *
 * \param cclm_params  Model parameters a, b and shift.
 * \param src          Input samples, `height` rows of `stride` samples.
 * \param dst          Output samples, same layout as src.
 * \param stride       Row length; also the number of samples processed per row.
 * \param height       Number of rows.
 */
static void linear_transform_cclm(cclm_parameters_t* cclm_params, kvz_pixel * src, kvz_pixel * dst, int stride, int height) {
  const int a = cclm_params->a;
  const int b = cclm_params->b;
  const int right_shift = cclm_params->shift;

  for (int row = 0; row < height; ++row) {
    const kvz_pixel *in_row = src + row * stride;
    kvz_pixel *out_row = dst + row * stride;

    for (int col = 0; col < stride; ++col) {
      int mapped = ((in_row[col] * a) >> right_shift) + b;
      mapped = CLIP_TO_PIXEL(mapped);
      out_row[col] = mapped;
    }
  }
}


/**
 * \brief Derive CCLM parameters for a chroma block and optionally predict it.
 *
 * Gathers downsampled luma references above and left of the block, copies
 * the co-located downsampled luma block, derives the linear model with
 * get_cclm_parameters() and, when dst is non-NULL, writes the prediction.
 *
 * \param state        Encoder state; the frame buffers and cfg are read.
 * \param color        Not used by this function.
 * \param width        Block width passed on as the chroma block width.
 * \param height       Block height passed on as the chroma block height.
 * \param x0           Block x coordinate (luma samples, judging by the / 2
 *                     used when indexing cclm_luma_rec).
 * \param y0           Block y coordinate, same units as x0.
 * \param stride       Stride of the frame luma buffer; stride / 2 is used for
 *                     the downsampled luma buffers.
 * \param mode         One of LM_CHROMA_IDX, LM_CHROMA_L_IDX, LM_CHROMA_T_IDX.
 * \param lcu          LCU whose reconstruction and CU info are read.
 * \param chroma_ref   Chroma reference pixels of the block.
 * \param dst          Output buffer of width * height samples, or NULL to only
 *                     compute the parameters.
 * \param cclm_params  Output: receives a, b and shift.
 */
void kvz_predict_cclm(
  encoder_state_t const* const state,
  const color_t color,
  const int8_t width,
  const int8_t height,
  const int16_t x0,
  const int16_t y0,
  const int16_t stride,
  const int8_t mode,
  lcu_t* const lcu,
  kvz_intra_references* chroma_ref,
  kvz_pixel* dst,
  cclm_parameters_t* cclm_params
)
{
  assert(mode == LM_CHROMA_IDX || mode == LM_CHROMA_L_IDX || mode == LM_CHROMA_T_IDX);
  assert(state->encoder_control->cfg.cclm);

  
  kvz_intra_ref sampled_luma_ref;
  kvz_pixel sampled_luma[LCU_CHROMA_SIZE];

  int x_scu = SUB_SCU(x0);
  int y_scu = SUB_SCU(y0);

  int available_above_right = 0;
  int available_left_below = 0;


  kvz_pixel *y_rec = lcu->rec.y + x_scu + y_scu * LCU_WIDTH;

  // Essentially what this does is that it uses 6-tap filtering to downsample
  // the luma intra references down to match the resolution of the chroma channel.
  // The luma reference is only needed when we are not on the edge of the picture.
  // Because the reference pixels that are needed on the edge of the ctu this code
  // is kinda messy but what can you do

  if (y0) {
    // Count how many 4-pixel units to the right of the block, in the row
    // above it, already hold coded CUs inside this LCU.
    for (; available_above_right < width / 2; available_above_right++) {
      int x_extension = x_scu + width * 2 + 4 * available_above_right;
      cu_info_t* pu = LCU_GET_CU_AT_PX(lcu, x_extension, y_scu - 4);
      if (x_extension >= LCU_WIDTH || pu->type == CU_NOTSET) break;
    }
    if(y_scu == 0) {
      // Block sits on the top edge of the LCU: the already downsampled top
      // line stored in the frame is copied instead of filtering here.
      if(!state->encoder_control->cfg.wpp) available_above_right = MIN(width / 2, (state->tile->frame->width - x0 - width * 2) / 4);
      memcpy(sampled_luma_ref.top, &state->tile->frame->cclm_luma_rec_top_line[x0 / 2 + (y0 / 64 - 1) * (stride / 2)], sizeof(kvz_pixel) * (width + available_above_right * 2));
    }
    else {
      // y_scu is non-zero in this branch, so the plain `y_scu ? ... : ...`
      // selections always read from the LCU buffer; only the left-padding taps
      // fall back to the frame buffer (when x0 != 0, x == 0 and x_scu == 0).
      // Tap weights are 2,1,1 on each of the two rows above, plus 4 for
      // rounding before the shift by 3.
      for (int x = 0; x < width * (available_above_right ? 4 : 2); x += 2) {
        bool left_padding = x0 || x;
        int s = 4;
        s += y_scu ? y_rec[x - LCU_WIDTH * 2] * 2            : state->tile->frame->rec->y[x0 + x + (y0 - 2) * stride] * 2;
        s += y_scu ? y_rec[x - LCU_WIDTH * 2 + 1]            : state->tile->frame->rec->y[x0 + x + 1 + (y0 - 2) * stride];
        s += y_scu && !(x0 && !x && !x_scu) ? y_rec[x - LCU_WIDTH * 2 - left_padding] : state->tile->frame->rec->y[x0 + x - left_padding + (y0 - 2) * stride];
        s += y_scu ? y_rec[x - LCU_WIDTH] * 2                : state->tile->frame->rec->y[x0 + x + (y0 - 1) * stride] * 2;
        s += y_scu ? y_rec[x - LCU_WIDTH + 1]                : state->tile->frame->rec->y[x0 + x + 1 + (y0 - 1) * stride];
        s += y_scu && !(x0 && !x && !x_scu) ? y_rec[x - LCU_WIDTH - left_padding]     : state->tile->frame->rec->y[x0 + x - left_padding + (y0 - 1) * stride];
        sampled_luma_ref.top[x / 2] = s >> 3;
      }
    }
  }

  if(x0) {
    // Count how many 4-pixel units below the block, in the column left of it,
    // already hold coded CUs inside this LCU.
    for (; available_left_below < height / 2; available_left_below++) {
      int y_extension = y_scu + height * 2 + 4 * available_left_below;
      cu_info_t* pu = LCU_GET_CU_AT_PX(lcu, x_scu - 4, y_extension);
      if (y_extension >= LCU_WIDTH || pu->type == CU_NOTSET) break;
      if(x_scu == 32 && y_scu == 0 && pu->depth == 0) break;
    }
    // The left reference is read from the column left of the block in the
    // frame's downsampled luma buffer.
    for(int i = 0; i < height + available_left_below * 2; i++) {
      sampled_luma_ref.left[i] = state->tile->frame->cclm_luma_rec[(y0/2 + i) * (stride/2) + x0 / 2 - 1];
    }    
  }

  // Copy the co-located downsampled luma block into a compact
  // width-strided buffer.
  kvz_pixels_blit(&state->tile->frame->cclm_luma_rec[x0 / 2 + (y0 * stride) / 4], sampled_luma, width, height, stride / 2, width);

  int16_t a, b, shift;
  get_cclm_parameters(state, width, height, mode,x0, y0, available_above_right, available_left_below, &sampled_luma_ref, chroma_ref, &a, &b, &shift);
  cclm_params->shift = shift;
  cclm_params->a = a;
  cclm_params->b = b;

  // The parameters are always stored; the prediction itself is optional.
  if(dst)
    linear_transform_cclm(cclm_params, sampled_luma, dst, width, height);
}


/**
 * \brief Select the context index used for the MIP flag.
 *
 * The context is the number of neighbours (left, above) that have their
 * mip_flag set, unless one block dimension is more than twice the other,
 * in which case the context is 3. Neighbours are looked up either in the
 * LCU (when lcu is given) or in the CU array; at most one of the two may be
 * non-NULL.
 *
 * \param x       Block x coordinate.
 * \param y       Block y coordinate.
 * \param width   Block width.
 * \param height  Block height.
 * \param lcu     LCU to read neighbours from, or NULL.
 * \param cu_a    CU array to read neighbours from when lcu is NULL.
 * \return Context index in the range 0..3.
 */
int kvz_get_mip_flag_context(int x, int y, int width, int height, lcu_t* const lcu, cu_array_t* const cu_a) {
  assert(!(lcu && cu_a));

  int mip_neighbours = 0;

  if (lcu) {
    // Neighbours are addressed relative to the LCU origin.
    const int local_x = SUB_SCU(x);
    const int local_y = SUB_SCU(y);
    if (x) {
      mip_neighbours += LCU_GET_CU_AT_PX(lcu, local_x - 1, local_y)->intra.mip_flag;
    }
    if (y) {
      mip_neighbours += LCU_GET_CU_AT_PX(lcu, local_x, local_y - 1)->intra.mip_flag;
    }
  } else {
    if (x > 0) {
      mip_neighbours += kvz_cu_array_at_const(cu_a, x - 1, y)->intra.mip_flag;
    }
    if (y > 0) {
      mip_neighbours += kvz_cu_array_at_const(cu_a, x, y - 1)->intra.mip_flag;
    }
  }

  // Strongly elongated blocks use a dedicated context.
  const bool elongated = (width > 2 * height) || (height > 2 * width);
  return elongated ? 3 : mip_neighbours;
}


/**
 * \brief Reduce a boundary of src_len samples to dst_len samples.
 *
 * When dst_len < src_len, each output is the rounded mean of
 * src_len / dst_len consecutive inputs. Otherwise the first dst_len inputs
 * are copied unchanged.
 *
 * \param reduced_dst  Output, dst_len entries.
 * \param ref_src      Input boundary samples.
 * \param src_len      Number of input samples.
 * \param dst_len      Number of output samples.
 */
void kvz_mip_boundary_downsampling_1D(int* reduced_dst, const int* const ref_src, int src_len, int dst_len)
{
  if (!(dst_len < src_len))
  {
    // No downsampling needed, plain copy.
    for (uint16_t i = 0; i < dst_len; ++i)
    {
      reduced_dst[i] = ref_src[i];
    }
    return;
  }

  // Average groups of `group_len` consecutive samples.
  uint16_t group_len = src_len / dst_len;
  const int log2_group = kvz_math_floor_log2(group_len);
  const int half = (1 << (log2_group - 1));

  uint16_t read_pos = 0;
  for (uint16_t out = 0; out < dst_len; out++)
  {
    int group_sum = 0;
    for (int k = 0; k < group_len; k++)
    {
      group_sum += ref_src[read_pos++];
    }
    reduced_dst[out] = (group_sum + half) >> log2_group;
  }
}


/**
 * \brief Compute the reduced MIP prediction: one weighted sum of the input
 *        vector per output sample.
 *
 * \param output         Receives red_pred_size * red_pred_size samples.
 * \param input          Reduced boundary vector, 2 * red_bdry_size entries.
 * \param matrix         Weights, consumed row by row (one row per output).
 * \param transpose      If true, the result is written transposed.
 * \param red_bdry_size  Reduced boundary size per edge.
 * \param red_pred_size  Side length of the reduced prediction.
 * \param size_id        MIP size class; for 2 the first weight column is
 *                       absent from the matrix and input[0] is skipped.
 * \param in_offset      Offset added to each output when not transposing.
 * \param in_offset_tr   Offset added to each output when transposing.
 */
void kvz_mip_reduced_pred(int* const output,
                          const int* const input,
                          const uint8_t* matrix,
                          const bool transpose,
                          const int red_bdry_size,
                          const int red_pred_size,
                          const int size_id,
                          const int in_offset,
                          const int in_offset_tr)
{
  const int input_size = 2 * red_bdry_size;

  // A transposed result is first produced into scratch memory and then
  // copied to the output with rows and columns swapped.
  int scratch[LCU_WIDTH * LCU_WIDTH];
  int* const raw_out = transpose ? scratch : output;

  int input_total = 0;
  for (int i = 0; i < input_size; i++) {
    input_total += input[i];
  }
  const int offset = (1 << (MIP_SHIFT_MATRIX - 1)) - MIP_OFFSET_MATRIX * input_total;
  assert((input_size == 4 * (input_size >> 2)) && "MIP input size must be divisible by four");

  const int input_offset = transpose ? in_offset_tr : in_offset;
  const bool skip_first_col = (size_id == 2);

  const uint8_t* row_start = matrix;
  for (int row = 0; row < red_pred_size; row++) {
    for (int col = 0; col < red_pred_size; col++) {
      // Without a first column the row is shifted back by one so that
      // coef[1] lines up with input[1]; coef[0] is then never read.
      const uint8_t* coef = skip_first_col ? row_start - 1 : row_start;

      int lane0 = skip_first_col ? 0 : (input[0] * coef[0]);
      int lane1 = input[1] * coef[1];
      int lane2 = input[2] * coef[2];
      int lane3 = input[3] * coef[3];
      for (int i = 4; i < input_size; i += 4) {
        lane0 += input[i] * coef[i];
        lane1 += input[i + 1] * coef[i + 1];
        lane2 += input[i + 2] * coef[i + 2];
        lane3 += input[i + 3] * coef[i + 3];
      }

      const int scaled = (lane0 + lane1 + lane2 + lane3 + offset) >> MIP_SHIFT_MATRIX;
      raw_out[row * red_pred_size + col] = CLIP_TO_PIXEL(scaled + input_offset);

      row_start = coef + input_size;
    }
  }

  if (!transpose) {
    return;
  }
  for (int row = 0; row < red_pred_size; row++) {
    for (int col = 0; col < red_pred_size; col++) {
      output[row * red_pred_size + col] = raw_out[col * red_pred_size + row];
    }
  }
}


/**
 * \brief Linearly upsample a reduced prediction along one dimension.
 *
 * For every line, ups_factor outputs are interpolated between each pair of
 * consecutive samples; the first pair of a line starts from a boundary sample.
 *
 * \param dst                Output samples.
 * \param src                Input samples; may be located inside dst, so
 *                           samples are re-read on every step.
 * \param boundary           Boundary samples; line n uses
 *                           boundary[(n + 1) * boundary_step - 1].
 * \param src_size_ups_dim   Input samples per line along the upsampled dimension.
 * \param src_size_orth_dim  Number of lines.
 * \param src_step           Distance between input samples within a line.
 * \param src_stride         Distance between input lines.
 * \param dst_step           Distance between output samples within a line.
 * \param dst_stride         Distance between output lines.
 * \param boundary_step      Distance between boundary samples of adjacent lines.
 * \param ups_factor         Upsampling factor, at least 2.
 */
void kvz_mip_pred_upsampling_1D(int* const dst, const int* const src, const int* const boundary,
                                const uint16_t src_size_ups_dim, const uint16_t src_size_orth_dim,
                                const uint16_t src_step, const uint16_t src_stride,
                                const uint16_t dst_step, const uint16_t dst_stride,
                                const uint16_t boundary_step,
                                const uint16_t ups_factor)
{
  const int log2_factor = kvz_math_floor_log2(ups_factor);
  assert(ups_factor >= 2 && "Upsampling factor must be at least 2.");
  const int rounding_offset = 1 << (log2_factor - 1);

  const int* src_line = src;
  int* dst_line = dst;
  const int* boundary_line = boundary + boundary_step - 1;

  for (uint16_t line = 0; line < src_size_orth_dim; line++)
  {
    const int* prev = boundary_line;
    const int* next = src_line;
    int* out = dst_line;

    for (uint16_t sample = 0; sample < src_size_ups_dim; sample++)
    {
      // The weight of `prev` falls from ups_factor - 1 to 0 while the weight
      // of `next` rises from 1 to ups_factor.
      int prev_weighted = (*prev) << log2_factor;
      int next_weighted = 0;
      for (uint16_t pos = 1; pos <= ups_factor; pos++)
      {
        prev_weighted -= *prev;
        next_weighted += *next;
        *out = (prev_weighted + next_weighted + rounding_offset) >> log2_factor;
        out += dst_step;
      }

      prev = next;
      next += src_step;
    }

    src_line += src_stride;
    dst_line += dst_stride;
    boundary_line += boundary_step;
  }
}


/** \brief Matrix weighted intra prediction.
*
* Downsamples the top and left references to a reduced boundary, multiplies
* it with the weight matrix selected by block size and mip_mode, and, when the
* block is larger than the reduced prediction, upsamples the result
* horizontally and then vertically.
*
* \param state              Not used by this function.
* \param refs               Reference pixels; refs->ref.top / .left are read
*                           starting from index 1.
* \param pred_block_width   Width of the predicted block.
* \param pred_block_height  Height of the predicted block.
* \param color              Not used by this function.
* \param dst                Output buffer. NOTE: 32 * 32 entries are always
*                           written, regardless of the block size.
* \param mip_mode           Index of the weight matrix within its size class.
* \param mip_transp         If true, the transposed boundary is used and the
*                           reduced prediction is transposed.
*/
void kvz_mip_predict(encoder_state_t const* const state, kvz_intra_references* const refs,
                     const uint16_t pred_block_width, const uint16_t pred_block_height,
                     const color_t color,
                     kvz_pixel* dst,
                     const int mip_mode, const bool mip_transp)
{
  // MIP prediction uses int values instead of kvz_pixel as some temp values may be negative
  
  kvz_pixel* out = dst;
  int result[32*32] = {0};
  const int mode_idx = mip_mode;

  // *** INPUT PREP ***

  // Initialize prediction parameters START
  uint16_t width = pred_block_width;
  uint16_t height = pred_block_height;

  // size_id 0: 4x4 blocks, 1: 8x8 blocks and blocks with one side of 4,
  // 2: everything else.
  int size_id; // Prediction block type
  if (width == 4 && height == 4) {
    size_id = 0;
  }
  else if (width == 4 || height == 4 || (width == 8 && height == 8)) {
    size_id = 1;
  }
  else {
    size_id = 2;
  }

  // Reduced boundary and prediction sizes
  int red_bdry_size = (size_id == 0) ? 2 : 4;
  int red_pred_size = (size_id < 2) ? 4 : 8;

  // Upsampling factors
  uint16_t ups_hor_factor = width / red_pred_size;
  uint16_t ups_ver_factor = height / red_pred_size;

  // Upsampling factors must be powers of two
  assert(!((ups_hor_factor < 1) || ((ups_hor_factor & (ups_hor_factor - 1))) != 0) && "Horizontal upsampling factor must be power of two.");
  assert(!((ups_ver_factor < 1) || ((ups_ver_factor & (ups_ver_factor - 1))) != 0) && "Vertical upsampling factor must be power of two.");

  // Initialize prediction parameters END

  int ref_samples_top[INTRA_REF_LENGTH]; 
  int ref_samples_left[INTRA_REF_LENGTH];

  // Copy the references as ints, dropping element 0 of each array. The last
  // element of both int arrays is left unwritten.
  for (int i = 1; i < INTRA_REF_LENGTH; i++) {
    ref_samples_top[i-1] =  (int)refs->ref.top[i]; // NOTE: in VTM code these are indexed as x + 1 & y + 1 during init
    ref_samples_left[i-1] = (int)refs->ref.left[i];
  }

  // Compute reduced boundary with Haar-downsampling
  const int input_size = 2 * red_bdry_size;

  int red_bdry[MIP_MAX_INPUT_SIZE];
  int red_bdry_trans[MIP_MAX_INPUT_SIZE];

  // Normal layout: reduced top samples first, then reduced left samples.
  int* const top_reduced = &red_bdry[0];
  int* const left_reduced = &red_bdry[red_bdry_size];

  kvz_mip_boundary_downsampling_1D(top_reduced, ref_samples_top, width, red_bdry_size);
  kvz_mip_boundary_downsampling_1D(left_reduced, ref_samples_left, height, red_bdry_size);

  // Transposed reduced boundaries
  // Transposed layout: reduced left samples first, then reduced top samples.
  int* const left_reduced_trans = &red_bdry_trans[0];
  int* const top_reduced_trans = &red_bdry_trans[red_bdry_size];

  for (int x = 0; x < red_bdry_size; x++) {
    top_reduced_trans[x] = top_reduced[x];
  }
  for (int y = 0; y < red_bdry_size; y++) {
    left_reduced_trans[y] = left_reduced[y];
  }

  // The first boundary sample is subtracted from all other entries and is
  // passed to kvz_mip_reduced_pred() as the input offset.
  int input_offset = red_bdry[0];
  int input_offset_trans = red_bdry_trans[0];

  const bool has_first_col = (size_id < 2);
  // First column of matrix not needed for large blocks
  red_bdry[0] = has_first_col ? ((1 << (KVZ_BIT_DEPTH - 1)) - input_offset) : 0;
  red_bdry_trans[0] = has_first_col ? ((1 << (KVZ_BIT_DEPTH - 1)) - input_offset_trans) : 0;

  for (int i = 1; i < input_size; ++i) {
    red_bdry[i] -= input_offset;
    red_bdry_trans[i] -= input_offset_trans;
  }

  // *** INPUT PREP *** END

  // *** BLOCK PREDICT ***

  const bool need_upsampling = (ups_hor_factor > 1) || (ups_ver_factor > 1);
  const bool transpose = mip_transp;

  const uint8_t* matrix;
  switch (size_id) {
    case 0: 
      matrix = &kvz_mip_matrix_4x4[mode_idx][0][0];
      break;
    case 1: 
      matrix = &kvz_mip_matrix_8x8[mode_idx][0][0];
      break;
    case 2: 
      matrix = &kvz_mip_matrix_16x16[mode_idx][0][0];
      break;
    default:
      // Not reachable: size_id is always 0, 1 or 2 here.
      assert(false && "Invalid MIP size id.");
  }

  // Max possible size is red_pred_size * red_pred_size, red_pred_size can be either 4 or 8
  int red_pred_buffer[8*8];
  // Without upsampling the reduced prediction is the final result and is
  // written straight into `result`.
  int* const reduced_pred = need_upsampling ? red_pred_buffer : result;

  const int* const reduced_bdry = transpose ? red_bdry_trans : red_bdry;

  kvz_mip_reduced_pred(reduced_pred, reduced_bdry, matrix, transpose, red_bdry_size, red_pred_size, size_id, input_offset, input_offset_trans);
  if (need_upsampling) {
    const int* ver_src = reduced_pred;
    uint16_t ver_src_step = width;
    
    if (ups_hor_factor > 1) {
      // Horizontal pass: the upsampled rows are written into `result` at
      // every ups_ver_factor-th row, starting from row ups_ver_factor - 1,
      // where the vertical pass then reads them.
      int* const hor_dst = result + (ups_ver_factor - 1) * width;
      ver_src = hor_dst;
      ver_src_step *= ups_ver_factor;

      kvz_mip_pred_upsampling_1D(hor_dst, reduced_pred, ref_samples_left,
        red_pred_size, red_pred_size,
        1, red_pred_size, 1, ver_src_step,
        ups_ver_factor, ups_hor_factor);
    }

    if (ups_ver_factor > 1) {
      // Vertical pass: fills the rows in between, one column per line.
      kvz_mip_pred_upsampling_1D(result, ver_src, ref_samples_top,
        red_pred_size, width,
        ver_src_step, 1, width, 1,
        1, ups_ver_factor);
    }
  }

  // Assign and cast values from temp array to output
  // NOTE: the whole 32x32 temp array is copied, not just width * height entries.
  for (int i = 0; i < 32 * 32; i++) {
    out[i] = (kvz_pixel)result[i];
  }
  // *** BLOCK PREDICT *** END
}


/**
 * \brief Generate an intra prediction for a square block.
 *
 * Chooses between the unfiltered and the smoothed reference pixels, runs the
 * planar (mode 0), DC (mode 1) or angular predictor, and applies PDPC to
 * planar and DC predictions when no multi reference line is in use.
 *
 * \param state            Encoder state; the encoder configuration is read.
 * \param refs             Reference pixels; the filtered copy is created on
 *                         demand.
 * \param log2_width       Log2 of the block width.
 * \param mode             Intra prediction mode.
 * \param color            Colour channel being predicted.
 * \param dst              Output buffer for the prediction.
 * \param filter_boundary  Not used by this function.
 * \param multi_ref_idx    Multi reference line index; ignored for chroma.
 */
void kvz_intra_predict(
  encoder_state_t *const state,
  kvz_intra_references *refs,
  int_fast8_t log2_width,
  int_fast8_t mode,
  color_t color,
  kvz_pixel *dst,
  bool filter_boundary,
  const uint8_t multi_ref_idx)
{
  const int_fast8_t width = 1 << log2_width;
  const kvz_config *cfg = &state->encoder_control->cfg;

  // MRL only for luma
  const uint8_t multi_ref_index = (color == COLOR_Y) ? multi_ref_idx : 0;

  // Chroma, DC, 4x4 blocks and MRL always use the unfiltered reference, as
  // does everything when smoothing is disabled in the configuration.
  const bool smoothing_possible =
    !(cfg->intra_smoothing_disabled || color != COLOR_Y || mode == 1 || width == 4 || multi_ref_index);

  bool use_filtered = false;
  if (smoothing_possible) {
    if (mode == 0) {
      // Planar uses the filtered reference for large enough blocks.
      use_filtered = (width * width > 32);
    } else {
      // Angular modes use smoothed reference pixels, unless the mode is close
      // to being either vertical or horizontal.
      static const int dist_threshold_by_log2[8] = {24, 24, 24, 14, 2, 0, 0, 0 };
      const int threshold = dist_threshold_by_log2[(log2_width + log2_width) >> 1];
      const int dist_from_vert_or_hor = MIN(abs(mode - 50), abs(mode - 18));

      if (dist_from_vert_or_hor > threshold) {
        static const int16_t modedisp2sampledisp[32] = { 0,    1,    2,    3,    4,    6,     8,   10,   12,   14,   16,   18,   20,   23,   26,   29,   32,   35,   39,  45,  51,  57,  64,  73,  86, 102, 128, 171, 256, 341, 512, 1024 };
        const int_fast8_t mode_disp = (mode >= 34) ? mode - 50 : 18 - mode;
        const int_fast8_t sample_disp = (mode_disp < 0 ? -1 : 1) * modedisp2sampledisp[abs(mode_disp)];
        // Only displacements that are a multiple of 32 select the filtered reference.
        if ((abs(sample_disp) & 0x1F) == 0) {
          use_filtered = true;
        }
      }
    }
  }

  const kvz_intra_ref *used_ref = use_filtered ? &refs->filtered_ref : &refs->ref;
  if (use_filtered && !refs->filtered_initialized) {
    intra_filter_reference(log2_width, refs);
  }

  switch (mode) {
    case 0:
      kvz_intra_pred_planar(log2_width, used_ref->top, used_ref->left, dst);
      break;
    case 1:
      intra_pred_dc(log2_width, used_ref->top, used_ref->left, dst, multi_ref_index);
      break;
    default:
      kvz_angular_pred(log2_width, mode, color, used_ref->top, used_ref->left, dst, multi_ref_index);
      break;
  }

  // PDPC is applied to planar and DC only, and cannot be used with MRL.
  const bool planar_or_dc = (mode == 0 || mode == 1);
  if (planar_or_dc && multi_ref_index == 0) {
    kvz_pdpc_planar_dc(mode, width, log2_width, used_ref, dst);
  }
}


/**
 * \brief Build the unfiltered intra reference pixels for a block at any position.
 *
 * General version that also handles blocks on the left and/or top picture
 * edge, where no reconstructed neighbors exist. Fills refs->ref.left and
 * refs->ref.top and marks the filtered reference as not initialized.
 *
 * \param log2_width       log2 of the block width, asserted to be in [2, 5]
 * \param color            color plane; multi_ref_idx is ignored unless COLOR_Y
 * \param luma_px          block position in luma pixels
 * \param pic_px           picture dimensions, used to limit how many
 *                         reference pixels are copied
 * \param lcu              LCU providing the rec, top_ref and left_ref buffers
 * \param refs             output reference structure
 * \param multi_ref_idx    multi reference line (MRL) index, asserted to be
 *                         below MAX_REF_LINE_IDX for luma
 * \param extra_ref_lines  extra left reference lines, indexed with a stride
 *                         of 128 pixels per line; only read when MRL is used
 *                         on the left LCU edge
 */
void kvz_intra_build_reference_any(
  const int_fast8_t log2_width,
  const color_t color,
  const vector2d_t *const luma_px,
  const vector2d_t *const pic_px,
  const lcu_t *const lcu,
  kvz_intra_references *const refs,
  const uint8_t multi_ref_idx,
  kvz_pixel *extra_ref_lines)
{
  assert(log2_width >= 2 && log2_width <= 5);

  // The filtered reference is not built here, so mark it as stale.
  refs->filtered_initialized = false;
  kvz_pixel *out_left_ref = &refs->ref.left[0];
  kvz_pixel *out_top_ref = &refs->ref.top[0];

  // Mid-range value used when no neighboring pixel exists at all.
  const kvz_pixel dc_val = 1 << (KVZ_BIT_DEPTH - 1); //TODO: add used bitdepth as a variable
  const int is_chroma = color != COLOR_Y ? 1 : 0;
  // TODO: height for non-square blocks
  const int_fast8_t width = 1 << log2_width;

  // Get multi ref index from CU under prediction or reconstruction. Do not use MRL if not luma
  const uint8_t multi_ref_index = !is_chroma ? multi_ref_idx : 0;
  assert(multi_ref_index < MAX_REF_LINE_IDX);

  // Position inside the LCU in luma pixels, and the same position converted
  // to the coordinates of the current plane (halved for chroma).
  const vector2d_t lcu_px = {
    luma_px->x % LCU_WIDTH,
    luma_px->y % LCU_WIDTH
  };
  const vector2d_t px = {
    lcu_px.x >> is_chroma,
    lcu_px.y >> is_chroma,
  };

  // Init pointers to LCUs reconstruction buffers, such that index 0 refers to block coordinate 0.
  const kvz_pixel *left_ref;
  bool extra_ref = false;
  // On the left LCU edge, if left neighboring LCU is available,
  // left_ref needs to point to correct extra reference line if MRL is used.
  if (luma_px->x > 0 && lcu_px.x == 0 && multi_ref_index != 0) {
    left_ref = &extra_ref_lines[multi_ref_index * 128];
    extra_ref = true;
  }
  else {
    // Element 0 of the 1D border buffers is skipped so that index 0 of
    // left_ref corresponds to row 0 of the LCU.
    left_ref = !color ? &lcu->left_ref.y[1] : (color == 1) ? &lcu->left_ref.u[1] : &lcu->left_ref.v[1];
  }

  const kvz_pixel *top_ref = !color ? &lcu->top_ref.y[1] : (color == 1) ? &lcu->top_ref.u[1] : &lcu->top_ref.v[1];
  const kvz_pixel *rec_ref = !color ? lcu->rec.y : (color == 1) ? lcu->rec.u : lcu->rec.v;

  // Init top borders pointer to point to the correct place in the correct reference array.
  const kvz_pixel *top_border;
  if (px.y) {
    // Inside the LCU: take the row (1 + multi_ref_index) rows above the block.
    top_border = &rec_ref[px.x + (px.y - 1 - multi_ref_index) * (LCU_WIDTH >> is_chroma)];
  } else {
    top_border = &top_ref[px.x]; // Top row, no need for multi_ref_index
  }

  // Init left borders pointer to point to the correct place in the correct reference array.
  const kvz_pixel *left_border;
  int left_stride; // Distance between reference samples.
  if (px.x) {
    // Inside the LCU: walk down the column (1 + multi_ref_index) pixels to
    // the left of the block, one LCU row per sample.
    left_border = &rec_ref[px.x - 1 - multi_ref_index + px.y * (LCU_WIDTH >> is_chroma)];
    left_stride = LCU_WIDTH >> is_chroma;
  } else {
    if (extra_ref) {
      // NOTE(review): presumably the extra line starts MAX_REF_LINE_IDX rows
      // above the block, so this offset lands on the block's first row —
      // confirm against the code that fills extra_ref_lines.
      left_border = &left_ref[MAX_REF_LINE_IDX];
    }
    else {
      left_border = &left_ref[px.y];
    }
    left_stride = 1;
  }

  // Generate left reference.
  if (luma_px->x > 0) {
    // Get the number of reference pixels based on the PU coordinate within the LCU.
    int px_available_left = num_ref_pixels_left[lcu_px.y / 4][lcu_px.x / 4] >> is_chroma;

    // Limit the number of available pixels based on block size and dimensions
    // of the picture.
    // TODO: height for non-square blocks
    px_available_left = MIN(px_available_left, width * 2 + multi_ref_index);
    px_available_left = MIN(px_available_left, (pic_px->y - luma_px->y) >> is_chroma);

    // Copy pixels from coded CUs.
    for (int i = 0; i < px_available_left; ++i) {
      // Reserve space for top left reference
      out_left_ref[i + 1 + multi_ref_index] = left_border[i * left_stride];
    }
    // Extend the last pixel for the rest of the reference values.
    // NOTE(review): reads index -1 of left_border if px_available_left is 0;
    // presumably never the case when a left neighbor exists — confirm.
    kvz_pixel nearest_pixel = left_border[(px_available_left - 1) * left_stride];
    for (int i = px_available_left; i < width * 2 + multi_ref_index * 2; ++i) {
      out_left_ref[i + 1 + multi_ref_index] = nearest_pixel;
    }
  } else {
    // If we are on the left edge, extend the first pixel of the top row.
    // At the top-left picture corner there is no top row either, so fall
    // back to dc_val.
    kvz_pixel nearest_pixel = luma_px->y > 0 ? top_border[0] : dc_val;
    for (int i = 0; i < width * 2 + multi_ref_index; i++) {
      // Reserve space for top left reference
      out_left_ref[i + 1 + multi_ref_index] = nearest_pixel;
    }
  }

  // Generate top-left reference
  // With MRL the corner spans indices 0..multi_ref_index of both outputs;
  // without MRL it is the single pixel at index 0.
  if (multi_ref_index)
  {
    if (luma_px->x > 0 && luma_px->y > 0) {
      // If the block is at an LCU border, the top-left must be copied from
      // the border that points to the LCUs 1D reference buffer.

      // Inner picture cases
      if (px.x == 0 && px.y == 0) {
        // LCU top left corner case. Multi ref will be 0.
        // NOTE(review): this branch is only reachable with a nonzero
        // multi_ref_index, yet only index 0 is written; the remark above
        // presumably means MRL is never expected here — confirm.
        out_left_ref[0] = out_left_ref[1];
        out_top_ref[0] = out_left_ref[1];
      }
      else if (px.x == 0) {
        // LCU left border case
        // The left part comes from above left_border; the top part steps
        // back one extra reference line (128 pixels) per index.
        kvz_pixel *top_left_corner = &extra_ref_lines[multi_ref_index * 128];
        for (int i = 0; i <= multi_ref_index; ++i) {
          out_left_ref[i] = left_border[(i - 1 - multi_ref_index) * left_stride];
          out_top_ref[i] = top_left_corner[(128 * -i) + MAX_REF_LINE_IDX - 1 - multi_ref_index];
        }
      }
      else if (px.y == 0) {
        // LCU top border case. Multi ref will be 0.
        out_left_ref[0] = top_border[-1];
        out_top_ref[0] = top_border[-1];
      }
      else {
        // Inner case
        // Both borders lie inside the LCU reconstruction buffer.
        for (int i = 0; i <= multi_ref_index; ++i) {
          out_left_ref[i] = left_border[(i - 1 - multi_ref_index) * left_stride];
          out_top_ref[i] = top_border[i - 1 - multi_ref_index];
        }
      }
    }
    else {
      // Picture border cases
      if (px.x == 0 && px.y == 0) {
        // Top left picture corner case. Multi ref will be 0.
        out_left_ref[0] = out_left_ref[1];
        out_top_ref[0] = out_left_ref[1];
      }
      else if (px.x == 0) {
        // Picture left border case. Reference pixel cannot be taken from outside LCU border
        // Replicate the first left reference pixel written above instead.
        kvz_pixel nearest = out_left_ref[1 + multi_ref_index];
        for (int i = 0; i <= multi_ref_index; ++i) {
          out_left_ref[i] = nearest;
          out_top_ref[i] = nearest;
        }
      }
      else {
        // Picture top border case. Multi ref will be 0.
        out_left_ref[0] = top_border[-1];
        out_top_ref[0] = top_border[-1];
      }
    }
  }
  else {
    if (luma_px->x > 0 && luma_px->y > 0) {
      // If the block is at an LCU border, the top-left must be copied from
      // the border that points to the LCUs 1D reference buffer.
      if (px.x == 0) {
        out_left_ref[0] = left_border[-1 * left_stride];
        out_top_ref[0] = left_border[-1 * left_stride];
      }
      else {
        out_left_ref[0] = top_border[-1];
        out_top_ref[0] = top_border[-1];
      }
    }
    else {
      // Copy reference clockwise.
      // On a picture edge there is no real corner pixel, so reuse the first
      // left reference pixel generated above.
      out_left_ref[0] = out_left_ref[1];
      out_top_ref[0] = out_left_ref[1];
    }
  }

  // Generate top reference.
  if (luma_px->y > 0) {
    // Get the number of reference pixels based on the PU coordinate within the LCU.
    int px_available_top = num_ref_pixels_top[lcu_px.y / 4][lcu_px.x / 4] >> is_chroma;

    // Limit the number of available pixels based on block size and dimensions
    // of the picture.
    px_available_top = MIN(px_available_top, width * 2 + multi_ref_index);
    px_available_top = MIN(px_available_top, (pic_px->x - luma_px->x) >> is_chroma);

    // Copy all the pixels we can.
    for (int i = 0; i < px_available_top; ++i) {
      out_top_ref[i + 1 + multi_ref_index] = top_border[i];
    }
    // Extend the last pixel for the rest of the reference values.
    kvz_pixel nearest_pixel = top_border[px_available_top - 1];
    for (int i = px_available_top; i < width * 2 + multi_ref_index * 2; ++i) {
      out_top_ref[i + 1 + multi_ref_index] = nearest_pixel;
    }
  } else {
    // Extend nearest pixel.
    // At the top-left picture corner there is no left column either, so fall
    // back to dc_val.
    // NOTE(review): unlike the other fill loops, the write below is not
    // offset by multi_ref_index; presumably MRL is never used on the top
    // picture row — confirm.
    kvz_pixel nearest_pixel = luma_px->x > 0 ? left_border[0] : dc_val;
    for (int i = 0; i < width * 2 + multi_ref_index; i++) {
      out_top_ref[i + 1] = nearest_pixel;
    }
  }
}

/**
 * \brief Build the unfiltered intra reference pixels for a block that has
 *        both a left and a top neighbor inside the picture.
 *
 * Faster variant of kvz_intra_build_reference_any without the picture edge
 * handling; kvz_intra_build_reference only dispatches here when both
 * luma_px->x and luma_px->y are greater than zero. Pixels are copied and
 * extended in groups of four.
 *
 * \param log2_width       log2 of the block width, asserted to be in [2, 5]
 * \param color            color plane; multi_ref_idx is ignored unless COLOR_Y
 * \param luma_px          block position in luma pixels
 * \param pic_px           picture dimensions, used to limit how many
 *                         reference pixels are copied
 * \param lcu              LCU providing the rec, top_ref and left_ref buffers
 * \param refs             output reference structure
 * \param entropy_sync     when true, one pixel fewer is taken from the top
 *                         border for blocks on the first row of the LCU
 * \param multi_ref_idx    multi reference line (MRL) index, asserted to be
 *                         below MAX_REF_LINE_IDX for luma
 * \param extra_ref_lines  extra left reference lines, indexed with a stride
 *                         of 128 pixels per line; only read when MRL is used
 *                         on the left LCU edge
 */
void kvz_intra_build_reference_inner(
  const int_fast8_t log2_width,
  const color_t color,
  const vector2d_t *const luma_px,
  const vector2d_t *const pic_px,
  const lcu_t *const lcu,
  kvz_intra_references *const refs,
  bool entropy_sync,
  const uint8_t multi_ref_idx,
  kvz_pixel* extra_ref_lines)
{
  assert(log2_width >= 2 && log2_width <= 5);

  // The filtered reference is not built here, so mark it as stale.
  refs->filtered_initialized = false;
  kvz_pixel * __restrict out_left_ref = &refs->ref.left[0];
  kvz_pixel * __restrict out_top_ref = &refs->ref.top[0];

  const int is_chroma = color != COLOR_Y ? 1 : 0;
  // TODO: height for non-square blocks
  const int_fast8_t width = 1 << log2_width;

  // Get multiRefIdx from CU under prediction. Do not use MRL if not luma
  const uint8_t multi_ref_index = !is_chroma ? multi_ref_idx : 0;
  assert(multi_ref_index < MAX_REF_LINE_IDX);

  // Position inside the LCU in luma pixels, and the same position converted
  // to the coordinates of the current plane (halved for chroma).
  const vector2d_t lcu_px = {
    luma_px->x % LCU_WIDTH,
    luma_px->y % LCU_WIDTH
  };
  const vector2d_t px = {
    lcu_px.x >> is_chroma,
    lcu_px.y >> is_chroma,
  };

  // Init pointers to LCUs reconstruction buffers, such that index 0 refers to block coordinate 0.
  const kvz_pixel* left_ref;
  bool extra_ref = false;
  // On the left LCU edge, if left neighboring LCU is available,
  // left_ref needs to point to correct extra reference line if MRL is used.
  // No luma_px->x > 0 check here, unlike the general version.
  if (lcu_px.x == 0 && multi_ref_index != 0) {
    left_ref = &extra_ref_lines[multi_ref_index * 128];
    extra_ref = true;
  }
  else {
    // Element 0 of the 1D border buffers is skipped so that index 0 of
    // left_ref corresponds to row 0 of the LCU.
    left_ref = !color ? &lcu->left_ref.y[1] : (color == 1) ? &lcu->left_ref.u[1] : &lcu->left_ref.v[1];
  }

  const kvz_pixel * __restrict top_ref = !color ? &lcu->top_ref.y[1] : (color == 1) ? &lcu->top_ref.u[1] : &lcu->top_ref.v[1];
  const kvz_pixel * __restrict rec_ref = !color ? lcu->rec.y : (color == 1) ? lcu->rec.u : lcu->rec.v;

  // Init top borders pointer to point to the correct place in the correct reference array.
  const kvz_pixel * __restrict top_border;
  if (px.y) {
    // Inside the LCU: take the row (1 + multi_ref_index) rows above the block.
    top_border = &rec_ref[px.x + (px.y - 1 - multi_ref_index) * (LCU_WIDTH >> is_chroma)];
  } else {
    top_border = &top_ref[px.x]; // At the top line. No need for multi_ref_index
  }

  // Init left borders pointer to point to the correct place in the correct reference array.
  const kvz_pixel * __restrict left_border;
  int left_stride; // Distance between reference samples.
  if (px.x) {
    // Inside the LCU: walk down the column (1 + multi_ref_index) pixels to
    // the left of the block, one LCU row per sample.
    left_border = &rec_ref[px.x - 1 - multi_ref_index + px.y * (LCU_WIDTH >> is_chroma)];
    left_stride = LCU_WIDTH >> is_chroma;
  } else {
    if (extra_ref) {
      // NOTE(review): presumably the extra line starts MAX_REF_LINE_IDX rows
      // above the block, so this offset lands on the block's first row —
      // confirm against the code that fills extra_ref_lines.
      left_border = &left_ref[MAX_REF_LINE_IDX];
    }
    else {
      left_border = &left_ref[px.y];
    }
    left_stride = 1;
  }

  // Generate top-left reference
  // With MRL the corner spans indices 0..multi_ref_index of both outputs;
  // without MRL it is the single pixel at index 0.
  if (multi_ref_index)
  {
    // Inner picture cases
    if (px.x == 0 && px.y == 0) {
      // LCU top left corner case. Multi ref will be 0.
      // NOTE(review): out_left_ref[1] has not been written yet at this point
      // (the left reference is generated further below), and this branch is
      // only reachable with a nonzero multi_ref_index; presumably MRL is
      // never used at the LCU corner — confirm.
      out_left_ref[0] = out_left_ref[1];
      out_top_ref[0] = out_left_ref[1];
    }
    else if (px.x == 0) {
      // LCU left border case
      // The left part comes from above left_border; the top part steps back
      // one extra reference line (128 pixels) per index.
      kvz_pixel* top_left_corner = &extra_ref_lines[multi_ref_index * 128];
      for (int i = 0; i <= multi_ref_index; ++i) {
        out_left_ref[i] = left_border[(i - 1 - multi_ref_index) * left_stride];
        out_top_ref[i] = top_left_corner[(128 * -i) + MAX_REF_LINE_IDX - 1 - multi_ref_index];
      }
    }
    else if (px.y == 0) {
      // LCU top border case. Multi ref will be 0.
      out_left_ref[0] = top_border[-1];
      out_top_ref[0] = top_border[-1];
    }
    else {
      // Inner case
      // Both borders lie inside the LCU reconstruction buffer.
      for (int i = 0; i <= multi_ref_index; ++i) {
        out_left_ref[i] = left_border[(i - 1 - multi_ref_index) * left_stride];
        out_top_ref[i] = top_border[i - 1 - multi_ref_index];
      }
    }
  }
  else {
    // If the block is at an LCU border, the top-left must be copied from
    // the border that points to the LCUs 1D reference buffer.
    if (px.x == 0) {
      out_left_ref[0] = left_border[-1 * left_stride];
      out_top_ref[0] = left_border[-1 * left_stride];
    }
    else {
      out_left_ref[0] = top_border[-1];
      out_top_ref[0] = top_border[-1];
    }
  }

  // Generate left reference.

  // Get the number of reference pixels based on the PU coordinate within the LCU.
  int px_available_left = num_ref_pixels_left[lcu_px.y / 4][lcu_px.x / 4] >> is_chroma;

  // Limit the number of available pixels based on block size and dimensions
  // of the picture.
  px_available_left = MIN(px_available_left, width * 2);
  px_available_left = MIN(px_available_left, (pic_px->y - luma_px->y) >> is_chroma);

  // Copy pixels from coded CUs.
  // Unrolled by four. The do-while always copies at least one group of four
  // and keeps going in whole groups until i reaches px_available_left.
  int i = multi_ref_index;  // Offset by multi_ref_index
  do {
    out_left_ref[i + 1] = left_border[(i + 0 - multi_ref_index) * left_stride];
    out_left_ref[i + 2] = left_border[(i + 1 - multi_ref_index) * left_stride];
    out_left_ref[i + 3] = left_border[(i + 2 - multi_ref_index) * left_stride];
    out_left_ref[i + 4] = left_border[(i + 3 - multi_ref_index) * left_stride];
    i += 4;
  } while (i < px_available_left);

  // Extend the last pixel for the rest of the reference values.
  // out_left_ref[i] is the last element written by the copy loop above.
  kvz_pixel nearest_pixel = out_left_ref[i];
  for (; i < width * 2; i += 4) {
    out_left_ref[i + 1] = nearest_pixel;
    out_left_ref[i + 2] = nearest_pixel;
    out_left_ref[i + 3] = nearest_pixel;
    out_left_ref[i + 4] = nearest_pixel;
  }

  // Extend for MRL
  // Pad up to multi_ref_index more entries, one at a time.
  if (multi_ref_index) {
    for (; i < width * 2 + multi_ref_index; ++i) {
      out_left_ref[i + 1] = nearest_pixel;
    }
  }

  // Generate top reference.

  // Get the number of reference pixels based on the PU coordinate within the LCU.
  int px_available_top = num_ref_pixels_top[lcu_px.y / 4][lcu_px.x / 4] >> is_chroma;

  // Limit the number of available pixels based on block size and dimensions
  // of the picture.
  px_available_top = MIN(px_available_top, width * 2 + multi_ref_index);
  px_available_top = MIN(px_available_top, (pic_px->x - luma_px->x) >> is_chroma);

  // With entropy_sync set, blocks on the first LCU row take one pixel fewer
  // than the distance to the right LCU edge.
  // NOTE(review): presumably because the LCU above and to the right is not
  // guaranteed to be finished under wavefront processing — confirm.
  if (entropy_sync && px.y == 0) px_available_top = MIN(px_available_top, ((LCU_WIDTH >> is_chroma) - px.x) -1);

  // Copy all the pixels we can.
  // Copied four pixels at a time; at least one group is always copied.
  i = 0;
  do {
    memcpy(out_top_ref + i + 1 + multi_ref_index, top_border + i, 4 * sizeof(kvz_pixel));
    i += 4;
  } while (i < px_available_top);

  // Extend the last pixel for the rest of the reference values.
  // out_top_ref[i + multi_ref_index] is the last element written by the copy
  // loop above.
  nearest_pixel = out_top_ref[i + multi_ref_index];
  for (; i < (width + multi_ref_index) * 2; i += 4) {
    out_top_ref[i + 1 + multi_ref_index] = nearest_pixel;
    out_top_ref[i + 2 + multi_ref_index] = nearest_pixel;
    out_top_ref[i + 3 + multi_ref_index] = nearest_pixel;
    out_top_ref[i + 4 + multi_ref_index] = nearest_pixel;
  }
}

/**
 * \brief Build intra reference pixels, picking the cheapest builder that
 *        is valid for the block position.
 *
 * \param log2_width       log2 of the block width
 * \param color            color plane of the block
 * \param luma_px          block position in luma pixels
 * \param pic_px           picture dimensions
 * \param lcu              containing LCU
 * \param refs             output reference structure
 * \param entropy_sync     forwarded to the inner builder only
 * \param extra_ref_lines  extra reference lines for MRL; must not be NULL
 *                         when multi_ref_idx is nonzero
 * \param multi_ref_idx    multi reference line index
 */
void kvz_intra_build_reference(
  const int_fast8_t log2_width,
  const color_t color,
  const vector2d_t *const luma_px,
  const vector2d_t *const pic_px,
  const lcu_t *const lcu,
  kvz_intra_references *const refs,
  bool entropy_sync,
  kvz_pixel *extra_ref_lines,
  uint8_t multi_ref_idx)
{
  assert(!(extra_ref_lines == NULL && multi_ref_idx != 0) && "Trying to use MRL with NULL extra references.");

  // Blocks touching the left or top picture edge need the general builder,
  // which knows how to synthesize missing neighbors.
  const bool touches_picture_edge = luma_px->x <= 0 || luma_px->y <= 0;
  if (touches_picture_edge) {
    kvz_intra_build_reference_any(log2_width, color, luma_px, pic_px, lcu, refs, multi_ref_idx, extra_ref_lines);
    return;
  }

  // Everything else can skip the edge handling.
  kvz_intra_build_reference_inner(log2_width, color, luma_px, pic_px, lcu, refs, entropy_sync, multi_ref_idx, extra_ref_lines);
}

/**
 * \brief Predict a single intra transform block and store the prediction
 *        into the reconstruction buffer of the LCU.
 *
 * Builds the reference pixels, runs either MIP, regular intra prediction or
 * CCLM depending on the mode, and copies the predicted block into lcu->rec
 * (and into the joint chroma buffer when cfg->jccr is set).
 *
 * \param state          encoder state
 * \param x              x-coordinate of the block in luma pixels
 * \param y              y-coordinate of the block in luma pixels
 * \param depth          depth in the CU tree
 * \param intra_mode     intra mode; values below 68 select MIP or regular
 *                       prediction, other values are handled as CCLM
 * \param cclm_params    precalculated CCLM parameters, or NULL to derive
 *                       them here; only used for CCLM modes
 * \param lcu            containing LCU
 * \param color          color plane to predict
 * \param multi_ref_idx  multi reference line index; ignored unless COLOR_Y
 * \param use_mip        true if intra_mode is a MIP mode in [0, 15]
 * \param mip_transp     true if the MIP mode is transposed
 */
static void intra_recon_tb_leaf(
  encoder_state_t *const state,
  int x,
  int y,
  int depth,
  int8_t intra_mode,
  cclm_parameters_t *cclm_params,
  lcu_t *lcu,
  color_t color,
  uint8_t multi_ref_idx,
  bool use_mip,
  bool mip_transp)
{
  const kvz_config *cfg = &state->encoder_control->cfg;
  const int shift = color == COLOR_Y ? 0 : 1;

  int log2width = LOG2_LCU_WIDTH - depth;
  if (color != COLOR_Y && depth < MAX_PU_DEPTH) {
    // Chroma width is half of luma width, when not at maximum depth.
    log2width -= 1;
  }
  const int width = 1 << log2width;
  const int height = width; // TODO: proper height for non-square blocks
  const int lcu_width = LCU_WIDTH >> shift;

  const vector2d_t luma_px = { x, y };
  const vector2d_t pic_px = {
    state->tile->frame->width,
    state->tile->frame->height,
  };
  int x_scu = SUB_SCU(x);
  int y_scu = SUB_SCU(y);
  const vector2d_t lcu_px = {x_scu >> shift, y_scu >> shift };
  uint8_t multi_ref_index = color == COLOR_Y ? multi_ref_idx : 0;

  kvz_intra_references refs;
  // Extra reference lines for use with MRL. Extra lines needed only for left edge.
  kvz_pixel extra_refs[128 * MAX_REF_LINE_IDX] = { 0 };

  if (luma_px.x > 0 && lcu_px.x == 0 && lcu_px.y > 0 && multi_ref_index != 0) {
    videoframe_t* const frame = state->tile->frame;

    // Number of pixels to fetch per line. It does not depend on the line, so
    // it is computed once. It gets its own name so that it does not shadow
    // the block height above.
    int ref_height = (LCU_WIDTH >> depth) * 2 + MAX_REF_LINE_IDX;
    // Cut short if on bottom LCU edge. Cannot take references from below since they don't exist.
    ref_height = MIN(ref_height, (LCU_WIDTH - lcu_px.y + MAX_REF_LINE_IDX));
    // Likewise, do not read past the bottom of the picture.
    ref_height = MIN(ref_height, pic_px.y - luma_px.y + MAX_REF_LINE_IDX);

    // Copy extra ref lines, including ref line 1 and top left corner.
    // Each line is a one pixel wide column of the frame reconstruction,
    // starting MAX_REF_LINE_IDX rows above the block, stored contiguously
    // at a 128 pixel offset per line.
    for (int i = 0; i < MAX_REF_LINE_IDX; ++i) {
      kvz_pixels_blit(&frame->rec->y[(luma_px.y - MAX_REF_LINE_IDX) * frame->rec->stride + luma_px.x - (1 + i)],
        &extra_refs[i * 128],
        1, ref_height,
        frame->rec->stride, 1);
    }
  }
  kvz_intra_build_reference(log2width, color, &luma_px, &pic_px, lcu, &refs, cfg->wpp, extra_refs, multi_ref_index);

  kvz_pixel pred[32 * 32];
  int stride = state->tile->frame->source->stride;
  const bool filter_boundary = color == COLOR_Y && !(cfg->lossless && cfg->implicit_rdpcm);
  if(intra_mode < 68) {
    if (use_mip) {
      assert(intra_mode >= 0 && intra_mode < 16 && "MIP mode must be between [0, 15]");
      kvz_mip_predict(state, &refs, width, height, color, pred, intra_mode, mip_transp);
    }
    else {
      kvz_intra_predict(state, &refs, log2width, intra_mode, color, pred, filter_boundary, multi_ref_index);
    }
  } else {
    // CCLM: start from the stored luma reconstruction for CCLM and apply the
    // linear model, either derived here or supplied by the caller.
    kvz_pixels_blit(&state->tile->frame->cclm_luma_rec[x / 2 + (y * stride) / 4], pred, width, width, stride / 2, width);
    if(cclm_params == NULL) {
      cclm_parameters_t temp_params;
      kvz_predict_cclm(
        state, color, width, width, x, y, stride, intra_mode, lcu, &refs, pred, &temp_params);
    }
    else {
      // Index 0 holds the parameters for U, index 1 those for V.
      linear_transform_cclm(&cclm_params[color == COLOR_U ? 0 : 1], pred, pred, width, width);
    }
  }

  // Select the destination inside the LCU reconstruction buffers.
  const int index = lcu_px.x + lcu_px.y * lcu_width;
  kvz_pixel *block = NULL;
  kvz_pixel *block2 = NULL;
  switch (color) {
    case COLOR_Y:
      block = &lcu->rec.y[index];
      break;
    case COLOR_U:
      block = &lcu->rec.u[index];
      block2 = &lcu->rec.joint_u[index];
      break;
    case COLOR_V:
      block = &lcu->rec.v[index];
      block2 = &lcu->rec.joint_v[index];
      break;
    default: break;
  }

  kvz_pixels_blit(pred, block , width, width, width, lcu_width);
  if(color != COLOR_Y && cfg->jccr) {
    // Joint chroma coding keeps its own copy of the chroma prediction.
    kvz_pixels_blit(pred, block2, width, width, width, lcu_width);
  }
}

/**
 * \brief Reconstruct an intra CU
 *
 * Recurses into four quadrants until the transform depth of the CU is
 * reached, then predicts and quantizes each leaf. Coded block flags of the
 * children are merged back into the parent after the recursion.
 *
 * \param state         encoder state
 * \param x             x-coordinate of the CU in luma pixels
 * \param y             y-coordinate of the CU in luma pixels
 * \param depth         depth in the CU tree
 * \param mode_luma     intra mode for luma, or -1 to skip luma recon
 * \param mode_chroma   intra mode for chroma, or -1 to skip chroma recon
 * \param cur_cu        pointer to the CU, or NULL to fetch CU from LCU
 * \param cclm_params   pointer for the cclm_parameters, can be NULL if the mode is not cclm mode
 * \param multi_ref_idx multi reference line index used for luma; chroma always uses 0
 * \param mip_flag      indicates whether the passed mode_luma is a MIP mode
 * \param mip_transp    indicates whether the used MIP mode is transposed
 * \param lcu           containing LCU
 */
void kvz_intra_recon_cu(
  encoder_state_t *const state,
  int x,
  int y,
  int depth,
  int8_t mode_luma,
  int8_t mode_chroma,
  cu_info_t *cur_cu,
  cclm_parameters_t *cclm_params,
  uint8_t multi_ref_idx,
  bool mip_flag,
  bool mip_transp,
  lcu_t *lcu)
{
  const vector2d_t lcu_px = { SUB_SCU(x), SUB_SCU(y) };
  const int8_t width = LCU_WIDTH >> depth;
  if (!cur_cu) {
    cur_cu = LCU_GET_CU_AT_PX(lcu, lcu_px.x, lcu_px.y);
  }

  // CBFs may hold stale values from an earlier pass at this depth, so
  // clear the ones that are about to be recomputed.
  if (mode_luma >= 0) {
    cbf_clear(&cur_cu->cbf, depth, COLOR_Y);
  }
  if (mode_chroma >= 0) {
    cbf_clear(&cur_cu->cbf, depth, COLOR_U);
    cbf_clear(&cur_cu->cbf, depth, COLOR_V);
  }

  const bool is_split = depth == 0 || cur_cu->tr_depth > depth;

  if (!is_split) {
    // Leaf TU: predict the requested planes, then quantize the residual.
    const bool has_luma = mode_luma != -1;
    // Chroma is only processed at positions aligned to 8 luma pixels.
    const bool has_chroma = mode_chroma != -1 && x % 8 == 0 && y % 8 == 0;

    if (has_luma) {
      intra_recon_tb_leaf(state, x, y, depth, mode_luma, cclm_params, lcu, COLOR_Y, multi_ref_idx, mip_flag, mip_transp);
    }
    if (has_chroma) {
      intra_recon_tb_leaf(state, x, y, depth, mode_chroma, cclm_params, lcu, COLOR_U, 0, false, false);
      intra_recon_tb_leaf(state, x, y, depth, mode_chroma, cclm_params, lcu, COLOR_V, 0, false, false);
    }

    kvz_quantize_lcu_residual(state, has_luma, has_chroma, x, y, depth, cur_cu, lcu, false);
    return;
  }

  // Split case: visit the quadrants in the order top-left, top-right,
  // bottom-left, bottom-right. Children look up their own CU and do not
  // receive the CCLM parameters.
  const int offset = width / 2;
  for (int quadrant = 0; quadrant < 4; ++quadrant) {
    const int32_t child_x = x + (quadrant & 1) * offset;
    const int32_t child_y = y + (quadrant >> 1) * offset;
    kvz_intra_recon_cu(state, child_x, child_y, depth + 1, mode_luma, mode_chroma, NULL, NULL, multi_ref_idx, mip_flag, mip_transp, lcu);
  }

  // Propagate coded block flags from child CUs to parent CU.
  // The top-left child shares its CU with the parent, so only the other
  // three are collected.
  uint16_t child_cbfs[3] = {
    LCU_GET_CU_AT_PX(lcu, lcu_px.x + offset, lcu_px.y         )->cbf,
    LCU_GET_CU_AT_PX(lcu, lcu_px.x,          lcu_px.y + offset)->cbf,
    LCU_GET_CU_AT_PX(lcu, lcu_px.x + offset, lcu_px.y + offset)->cbf,
  };

  if (depth <= MAX_DEPTH) {
    if (mode_luma != -1) {
      cbf_set_conditionally(&cur_cu->cbf, child_cbfs, depth, COLOR_Y);
    }
    if (mode_chroma != -1) {
      cbf_set_conditionally(&cur_cu->cbf, child_cbfs, depth, COLOR_U);
      cbf_set_conditionally(&cur_cu->cbf, child_cbfs, depth, COLOR_V);
    }
  }
}