From e7da0712e538b16033f16c6e425b278e72a58fb6 Mon Sep 17 00:00:00 2001
From: siivonek <kari.siivonen@tuni.fi>
Date: Wed, 15 Dec 2021 16:25:08 +0200
Subject: [PATCH 01/28] [mip] Add commandline option for MIP.

---
 src/cfg.c     | 5 +++++
 src/cli.c     | 3 +++
 src/kvazaar.h | 3 +++
 3 files changed, 11 insertions(+)
diff --git a/src/cfg.c b/src/cfg.c
index 10946d28..2be8c8c6 100644
--- a/src/cfg.c
+++ b/src/cfg.c
@@ -203,6 +203,8 @@ int kvz_config_init(kvz_config *cfg)
   cfg->chroma_scale_out[2][0] = cfg->chroma_scale_in[2][0] = -1;
 
   cfg->mrl = false;
+  
+  cfg->mip = false;
 
   parse_qp_map(cfg, 0);
 
@@ -1488,6 +1490,9 @@ int kvz_config_parse(kvz_config *cfg, const char *name, const char *value)
   else if OPT("mrl") {
     cfg->mrl = atobool(value);
   }
+  else if OPT("mip") {
+    cfg->mip = atobool(value);
+  }
   else if OPT("jccr") {
     cfg->jccr = (bool)atobool(value);
   }
diff --git a/src/cli.c b/src/cli.c
index 9bacb135..baa5a07a 100644
--- a/src/cli.c
+++ b/src/cli.c
@@ -175,6 +175,8 @@ static const struct option long_options[] = {
   { "chroma-qp-out",      required_argument, NULL, 0 },
   { "mrl",                      no_argument, NULL, 0 },
   { "no-mrl",                   no_argument, NULL, 0 },
+  { "mip",                      no_argument, NULL, 0 },
+  { "no-mip",                   no_argument, NULL, 0 },
   { "jccr",                     no_argument, NULL, 0 },
   { "no-jccr",                  no_argument, NULL, 0 },
   { "amvr",                     no_argument, NULL, 0 },
@@ -626,6 +628,7 @@ void print_help(void)
     "      --(no-)tmvp            : Temporal motion vector prediction [enabled]\n"
     "      --(no-)mrl             : Enable use of multiple reference lines in intra\n"
     "                               predictions.\n"
+    "      --(no-)mip             : Enable matrix weighted intra prediction.\n"
     "      --mts <string>         : Multiple Transform Selection [off].\n"
     "                               (Currently only implemented for intra\n"
     "                               and has effect only when rd >= 2)\n"
diff --git a/src/kvazaar.h b/src/kvazaar.h
index 9b4da0cb..00052f83 100644
--- a/src/kvazaar.h
+++ b/src/kvazaar.h
@@ -517,6 +517,9 @@ typedef struct kvz_config
   /** \brief enable use of multiple reference lines in intra prediction */
   int8_t mrl; 
 
+  /** \brief enable matrix weighted intra prediction */
+  int8_t mip;
+
 
   int8_t jccr;
 

From 07d78e07a68d3dafc207a8723f34452ca0d8093c Mon Sep 17 00:00:00 2001
From: siivonek <kari.siivonen@tuni.fi>
Date: Wed, 22 Dec 2021 15:42:15 +0200
Subject: [PATCH 02/28] [mip] Add defines.

---
 src/global.h | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/src/global.h b/src/global.h
index 7d8507b8..8b959f99 100644
--- a/src/global.h
+++ b/src/global.h
@@ -216,6 +216,11 @@ typedef int16_t mv_t;
 */
 #define MAX_REF_LINE_IDX 3
 
+#define MIP_MAX_INPUT_SIZE 8 // Capacity of the reduced boundary buffers in kvz_mip_predict.
+#define MIP_MAX_REDUCED_OUTPUT_SAMPLES 64 // NOTE(review): presumably 8x8, the largest reduced prediction - confirm.
+#define MIP_MAX_WIDTH 64 // NOTE(review): presumably the widest block MIP is applied to - confirm.
+#define MIP_MAX_HEIGHT 64 // NOTE(review): presumably the tallest block MIP is applied to - confirm.
+
 /**
  * \brief Number of pixels to delay deblocking.
  *

From 746ae8ab85101b4b2da72d38e64efb97be6be7ab Mon Sep 17 00:00:00 2001
From: siivonek <kari.siivonen@tuni.fi>
Date: Wed, 22 Dec 2021 15:42:56 +0200
Subject: [PATCH 03/28] [mip] WIP Implement mip functions.

---
 src/intra.c | 229 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 229 insertions(+)

diff --git a/src/intra.c b/src/intra.c
index 01f0a767..545a4740 100644
--- a/src/intra.c
+++ b/src/intra.c
@@ -544,6 +544,235 @@ void kvz_predict_cclm(
     linear_transform_cclm(cclm_params, sampled_luma, dst, width, height);
 }
 
+
+/**
+ * \brief Reduce a line of boundary reference samples for MIP.
+ *
+ * When dst_len < src_len, every output sample is the rounded average of
+ * src_len / dst_len consecutive input samples; otherwise the first dst_len
+ * input samples are copied unchanged.
+ *
+ * \param reduced_dst  Output buffer, dst_len elements are written.
+ * \param ref_src      Input samples (src_len read when downsampling, else dst_len).
+ * \param src_len      Number of input samples.
+ * \param dst_len      Number of output samples. NOTE(review): the shift-based
+ *                     average assumes src_len / dst_len is a power of two.
+ */
+void kvz_mip_boundary_downsampling(int* reduced_dst, const kvz_pixel* const ref_src, int src_len, int dst_len)
+{
+  if (dst_len < src_len)
+  {
+    // Create reduced boundary by downsampling
+    const int down_smp_factor = src_len / dst_len;
+
+    // Floor log2 of the factor, computed on a scratch copy: the factor
+    // itself bounds the averaging loop below and must stay intact. Shifting
+    // it in place would leave it at 1 and every output sample would be a
+    // single input sample instead of an average.
+    int log2_factor = 0;
+    for (int rest = down_smp_factor; rest > 1; rest >>= 1)
+    {
+      log2_factor++;
+    }
+    // NOTE(review): assumes src_len is a multiple of dst_len, so the factor
+    // is at least 2 here and the shift count below is never negative.
+    const int rounding_offset = (1 << (log2_factor - 1));
+
+    int src_idx = 0;
+    for (int dst_idx = 0; dst_idx < dst_len; dst_idx++)
+    {
+      int sum = 0;
+      for (int k = 0; k < down_smp_factor; k++)
+      {
+        sum += ref_src[src_idx++];
+      }
+      reduced_dst[dst_idx] = (sum + rounding_offset) >> log2_factor;
+    }
+  }
+  else
+  {
+    // Copy boundary if no downsampling is needed
+    for (int i = 0; i < dst_len; ++i)
+    {
+      reduced_dst[i] = ref_src[i];
+    }
+  }
+}
+
+
+/**
+ * \brief Reduced MIP prediction: weight matrix applied to the reduced boundary.
+ *
+ * \param output         Receives red_pred_size * red_pred_size samples.
+ * \param input          Reduced boundary vector, 2 * red_bdry_size entries.
+ * \param matrix         Weights, one row of coefficients per output sample.
+ * \param transpose      If true, the result is stored transposed.
+ * \param red_bdry_size  Length of the reduced boundary on one side.
+ * \param red_pred_size  Side length of the reduced prediction block.
+ * \param size_id        MIP size class; for 2 the rows lack the first column.
+ */
+void kvz_mip_reduced_pred(kvz_pixel* const output,
+                          const kvz_pixel* const input,
+                          const uint8_t* matrix,
+                          const bool transpose,
+                          const int red_bdry_size,
+                          const int red_pred_size,
+                          const int size_id)
+{
+  const int input_size = 2 * red_bdry_size;
+  assert((input_size == 4 * (input_size >> 2)) && "MIP input size must be divisible by four");
+  // Use local buffer for transposed result
+  kvz_pixel* out_buf_transposed = MALLOC(kvz_pixel, red_pred_size * red_pred_size); // TODO: get rid of MALLOC & FREE
+  kvz_pixel* const out_ptr = transpose ? out_buf_transposed : output;
+
+  int sum = 0;
+  for (int i = 0; i < input_size; i++) {
+    sum += input[i];
+  }
+  const int offset = (1 << (MIP_SHIFT_MATRIX - 1)) - MIP_OFFSET_MATRIX * sum;
+
+  // TODO: m_inputOffset / m_inputOffsetTransp do not exist here yet; the caller
+  // computes these offsets and they still have to be handed to this function.
+  const int input_offset = transpose ? m_inputOffsetTransp : m_inputOffset;
+  const int max_val = (1 << KVZ_BIT_DEPTH) - 1;
+
+  // With size_id 2 the first weight of each row is not stored and input[0] is skipped.
+  const int first = (size_id == 2) ? 1 : 0;
+  const uint8_t* weight = matrix;
+  int pos_res = 0;
+  for (int y = 0; y < red_pred_size; y++) {
+    for (int x = 0; x < red_pred_size; x++) {
+      int acc = 0;
+      for (int i = first; i < input_size; i++) {
+        acc += input[i] * weight[i - first];
+      }
+      const int val = ((acc + offset) >> MIP_SHIFT_MATRIX) + input_offset;
+      out_ptr[pos_res++] = (kvz_pixel)(val < 0 ? 0 : (val > max_val ? max_val : val));
+      weight += input_size - first;
+    }
+  }
+
+  if (transpose) {
+    for (int y = 0; y < red_pred_size; y++) {
+      for (int x = 0; x < red_pred_size; x++) {
+        output[y * red_pred_size + x] = out_ptr[x * red_pred_size + y];
+      }
+    }
+  }
+  FREE_POINTER(out_buf_transposed);
+}
+
+
+void kvz_mip_pred_upsampling()
+{
+  // Stub: upsampling is not implemented yet. kvz_mip_predict calls this with (result, reduced_pred).
+}
+
+
+/**
+ * \brief Matrix weighted intra prediction (work in progress).
+ *
+ * Picks the MIP size class from the block dimensions, builds the reduced
+ * boundary vectors from the reference samples, runs the reduced prediction
+ * and, for blocks larger than the reduced prediction, the upsampling step.
+ *
+ * \param state              Encoder state (not used yet).
+ * \param refs               Intra references; ref.top and ref.left are read.
+ * \param pred_block_width   Width of the predicted block.
+ * \param pred_block_height  Height of the predicted block.
+ */
+void kvz_mip_predict(encoder_state_t const* const state,
+                     kvz_intra_references* const refs,
+                     const uint16_t pred_block_width,
+                     const uint16_t pred_block_height)
+{
+  // TODO: pass the dst buffer to this function. Until then there is no valid
+  // destination, so the pointer is NULL instead of being left indeterminate.
+  kvz_pixel* result = NULL;
+
+  // *** INPUT PREP ***
+  const uint16_t width = pred_block_width;
+  const uint16_t height = pred_block_height;
+
+  int size_id; // Prediction block type
+  if (width == 4 && height == 4) {
+    size_id = 0;
+  }
+  else if (width == 4 || height == 4 || (width == 8 && height == 8)) {
+    size_id = 1;
+  }
+  else {
+    size_id = 2;
+  }
+
+  // Reduced boundary and prediction sizes
+  const int red_bdry_size = (size_id == 0) ? 2 : 4;
+  const int red_pred_size = (size_id < 2) ? 4 : 8;
+
+  // Upsampling factors; both must be non-zero powers of two.
+  const unsigned int ups_hor_factor = width / red_pred_size;
+  const unsigned int ups_ver_factor = height / red_pred_size;
+  assert(ups_hor_factor >= 1 && (ups_hor_factor & (ups_hor_factor - 1)) == 0 && "Horizontal upsampling factor must be power of two.");
+  assert(ups_ver_factor >= 1 && (ups_ver_factor & (ups_ver_factor - 1)) == 0 && "Vertical upsampling factor must be power of two.");
+
+  const kvz_pixel* ref_samples_top = refs->ref.top; // NOTE: in VTM code these are indexed as x + 1 & y + 1 during init
+  const kvz_pixel* ref_samples_left = refs->ref.left;
+
+  // Compute reduced boundary with Haar-downsampling
+  const int input_size = 2 * red_bdry_size;
+
+  // Kept as int: the entries are turned into differences to the first sample
+  // below and can be negative; kvz_mip_boundary_downsampling also writes int.
+  int red_bdry[MIP_MAX_INPUT_SIZE];        // [top | left]
+  int red_bdry_trans[MIP_MAX_INPUT_SIZE];  // [left | top]
+
+  int* const top_reduced = &red_bdry[0];
+  int* const left_reduced = &red_bdry[red_bdry_size];
+  kvz_mip_boundary_downsampling(top_reduced, ref_samples_top, width, red_bdry_size);
+  kvz_mip_boundary_downsampling(left_reduced, ref_samples_left, height, red_bdry_size);
+
+  // Transposed reduced boundaries
+  int* const left_reduced_trans = &red_bdry_trans[0];
+  int* const top_reduced_trans = &red_bdry_trans[red_bdry_size];
+
+  for (int i = 0; i < red_bdry_size; i++) {
+    top_reduced_trans[i] = top_reduced[i];
+    left_reduced_trans[i] = left_reduced[i];
+  }
+
+  const int input_offset = red_bdry[0];
+  const int input_offset_trans = red_bdry_trans[0];
+
+  const bool has_first_col = (size_id < 2);
+  // First column of matrix not needed for large blocks
+  red_bdry[0] = has_first_col ? ((1 << (KVZ_BIT_DEPTH - 1)) - input_offset) : 0;
+  red_bdry_trans[0] = has_first_col ? ((1 << (KVZ_BIT_DEPTH - 1)) - input_offset_trans) : 0;
+
+  for (int i = 1; i < input_size; ++i) {
+    red_bdry[i] -= input_offset;
+    red_bdry_trans[i] -= input_offset_trans;
+  }
+
+  // *** BLOCK PREDICT ***
+  const bool need_upsampling = (ups_hor_factor > 1) || (ups_ver_factor > 1);
+  const bool transpose = 0; // TODO: pass transpose
+
+  const uint8_t* matrix = 0; // TODO: function for fetching correct matrix
+  kvz_pixel* red_pred_buffer = MALLOC(kvz_pixel, red_pred_size * red_pred_size); // TODO: get rid of MALLOC and FREE
+  kvz_pixel* const reduced_pred = need_upsampling ? red_pred_buffer : result;
+
+  // TODO: kvz_mip_reduced_pred still declares its input as kvz_pixel*; it
+  // has to take this int data, which can hold negative values.
+  const int* const reduced_bdry = transpose ? red_bdry_trans : red_bdry;
+
+  kvz_mip_reduced_pred(reduced_pred, reduced_bdry, matrix, transpose, red_bdry_size, red_pred_size, size_id);
+  if (need_upsampling) {
+    kvz_mip_pred_upsampling(result, reduced_pred);
+  }
+  FREE_POINTER(red_pred_buffer);
+}
+
+
 void kvz_intra_predict(
   encoder_state_t *const state,
   kvz_intra_references *refs,

From 3bbef3dff682be0ad8f1efa8d738e7fd928d8a25 Mon Sep 17 00:00:00 2001
From: siivonek <kari.siivonen@tuni.fi>
Date: Wed, 22 Dec 2021 16:25:01 +0200
Subject: [PATCH 04/28] [mip] Add mip weight matrices.

---
 build/kvazaar_lib/kvazaar_lib.vcxproj         |   1 +
 build/kvazaar_lib/kvazaar_lib.vcxproj.filters |   5 +-
 src/Makefile.am                               |   1 +
 src/intra.c                                   |   1 +
 src/mip_data.h                                | 885 ++++++++++++++++++
 5 files changed, 891 insertions(+), 2 deletions(-)
 create mode 100644 src/mip_data.h

diff --git a/build/kvazaar_lib/kvazaar_lib.vcxproj b/build/kvazaar_lib/kvazaar_lib.vcxproj
index 67ee5ac4..de6bd428 100644
--- a/build/kvazaar_lib/kvazaar_lib.vcxproj
+++ b/build/kvazaar_lib/kvazaar_lib.vcxproj
@@ -267,6 +267,7 @@
     <ClInclude Include="..\..\src\input_frame_buffer.h" />
     <ClInclude Include="..\..\src\kvazaar_internal.h" />
     <ClInclude Include="..\..\src\kvz_math.h" />
+    <ClInclude Include="..\..\src\mip_data.h" />
     <ClInclude Include="..\..\src\ml_intra_cu_depth_pred.h" />
     <ClInclude Include="..\..\src\search_inter.h" />
     <ClInclude Include="..\..\src\search_intra.h" />
diff --git a/build/kvazaar_lib/kvazaar_lib.vcxproj.filters b/build/kvazaar_lib/kvazaar_lib.vcxproj.filters
index 06ee72a1..63cbd564 100644
--- a/build/kvazaar_lib/kvazaar_lib.vcxproj.filters
+++ b/build/kvazaar_lib/kvazaar_lib.vcxproj.filters
@@ -266,7 +266,6 @@
     <ClCompile Include="..\..\src\strategies\avx2\alf-avx2.c">
       <Filter>Optimization\strategies\avx2</Filter>
     </ClCompile>
-    <ClCompile Include="..\..\src\debug.c" />
   </ItemGroup>
   <ItemGroup>
     <ClInclude Include="..\..\src\bitstream.h">
@@ -498,7 +497,9 @@
     <ClInclude Include="..\..\src\strategies\avx2\alf-avx2.h">
       <Filter>Optimization\strategies\avx2</Filter>
     </ClInclude>
-    <ClInclude Include="..\..\src\debug.h" />
+    <ClInclude Include="..\..\src\mip_data.h">
+      <Filter>Reconstruction</Filter>
+    </ClInclude>
   </ItemGroup>
   <ItemGroup>
     <YASM Include="..\..\src\extras\x86inc.asm">
diff --git a/src/Makefile.am b/src/Makefile.am
index 57e50e41..1412aeef 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -90,6 +90,7 @@ libkvazaar_la_SOURCES = \
 	kvazaar.c \
 	kvazaar_internal.h \
 	kvz_math.h \
+    mip_data.h \
 	ml_intra_cu_depth_pred.c \
 	ml_intra_cu_depth_pred.h \
 	nal.c \
diff --git a/src/intra.c b/src/intra.c
index 545a4740..28715a20 100644
--- a/src/intra.c
+++ b/src/intra.c
@@ -36,6 +36,7 @@
 
 #include "image.h"
 #include "kvz_math.h"
+#include "mip_data.h"
 #include "strategies/strategies-intra.h"
 #include "tables.h"
 #include "transform.h"
diff --git a/src/mip_data.h b/src/mip_data.h
new file mode 100644
index 00000000..b789994d
--- /dev/null
+++ b/src/mip_data.h
@@ -0,0 +1,885 @@
+/*****************************************************************************
+ * This file is part of uvg266 VVC encoder.
+ *
+ * Copyright (c) 2021, Tampere University, ITU/ISO/IEC, project contributors
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice, this
+ *   list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright notice, this
+ *   list of conditions and the following disclaimer in the documentation and/or
+ *   other materials provided with the distribution.
+ *
+ * * Neither the name of the Tampere University or ITU/ISO/IEC nor the names of its
+ *   contributors may be used to endorse or promote products derived from
+ *   this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION HOWEVER CAUSED AND ON
+ * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * INCLUDING NEGLIGENCE OR OTHERWISE ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ ****************************************************************************/
+
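+/**
+ * \ingroup Reconstruction
+ * \file
+ * \brief MIP weight matrix data.
+ *
+ * Weight and bias data for matrix-based intra prediction (MIP).
+ * Counterpart of MipData.h (the name used in the original comment,
+ * presumably the reference-software file these tables come from).
+ */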
+
+#define MIP_SHIFT_MATRIX 6
+#define MIP_OFFSET_MATRIX 32
+
+// NOTE: these matrices need to be aligned if used with avx2
+const uint8_t mip_matrix_4x4[16][16][4] =
+{
+  {
+    {   32,   30,   90,   28},
+    {   32,   32,   72,   28},
+    {   34,   77,   53,   30},
+    {   51,  124,   36,   37},
+    {   31,   31,   95,   37},
+    {   33,   31,   70,   50},
+    {   52,   80,   25,   60},
+    {   78,  107,    1,   65},
+    {   31,   29,   37,   95},
+    {   38,   34,   19,  101},
+    {   73,   85,    0,   81},
+    {   92,   99,    0,   65},
+    {   34,   29,   14,  111},
+    {   48,   48,    7,  100},
+    {   80,   91,    0,   74},
+    {   89,   97,    0,   64}
+  },
+  {
+    {   31,   23,   34,   29},
+    {   31,   43,   34,   31},
+    {   30,   95,   34,   32},
+    {   29,  100,   35,   33},
+    {   31,   23,   34,   29},
+    {   31,   43,   34,   31},
+    {   30,   95,   34,   32},
+    {   29,   99,   35,   33},
+    {   31,   24,   35,   29},
+    {   31,   44,   34,   31},
+    {   30,   95,   35,   32},
+    {   29,   99,   35,   33},
+    {   31,   24,   35,   30},
+    {   31,   44,   35,   31},
+    {   30,   95,   35,   32},
+    {   29,   99,   35,   33}
+  },
+  {
+    {   32,   32,   36,   58},
+    {   32,   29,   26,   66},
+    {   36,   37,   23,   61},
+    {   79,   84,    3,   37},
+    {   32,   32,   30,   69},
+    {   33,   29,   24,   71},
+    {   44,   16,   21,   70},
+    {   96,   18,    0,   57},
+    {   32,   31,   24,   74},
+    {   33,   30,   23,   71},
+    {   36,   24,   24,   71},
+    {   59,    9,   16,   68},
+    {   32,   32,   23,   75},
+    {   33,   30,   24,   70},
+    {   32,   30,   25,   71},
+    {   36,   26,   25,   70}
+  },
+  {
+    {   32,   33,   34,   32},
+    {   32,   30,   22,   38},
+    {   29,   46,   25,   38},
+    {   53,  123,   28,   22},
+    {   32,   33,   30,   37},
+    {   32,   30,   21,   38},
+    {   32,   40,   24,   38},
+    {   64,  116,   26,   17},
+    {   32,   32,   23,   49},
+    {   32,   30,   21,   39},
+    {   34,   39,   24,   37},
+    {   72,  109,   23,   16},
+    {   33,   31,   17,   60},
+    {   32,   31,   21,   39},
+    {   35,   41,   24,   37},
+    {   72,  106,   22,   18}
+  },
+  {
+    {   34,   25,   89,   20},
+    {   38,   32,   47,   24},
+    {   40,   86,   29,   27},
+    {   38,   98,   32,   29},
+    {   34,   31,   94,   40},
+    {   44,   25,   83,   27},
+    {   54,   72,   43,   16},
+    {   47,   94,   33,   22},
+    {   33,   31,   36,   94},
+    {   43,   23,   51,   76},
+    {   62,   55,   64,   25},
+    {   57,   89,   38,   15},
+    {   32,   32,   28,  101},
+    {   38,   26,   33,   94},
+    {   55,   38,   68,   47},
+    {   59,   80,   52,   16}
+  },
+  {
+    {   28,   30,   68,   29},
+    {   23,   48,   23,   48},
+    {   39,   98,   16,   42},
+    {   84,   86,   20,   17},
+    {   25,   31,   52,   74},
+    {   38,   68,    5,   70},
+    {   95,   78,    7,   21},
+    {  127,   54,   12,    0},
+    {   30,   47,   14,  107},
+    {   79,   76,    0,   53},
+    {  127,   59,    7,    1},
+    {  127,   51,    9,    0},
+    {   50,   71,    1,   96},
+    {  109,   69,    7,   25},
+    {  127,   56,    9,    0},
+    {  123,   53,   13,    0}
+  },
+  {
+    {   40,   20,   72,   18},
+    {   48,   29,   44,   18},
+    {   53,   81,   35,   18},
+    {   48,   96,   33,   22},
+    {   45,   23,   79,   49},
+    {   61,   21,   56,   49},
+    {   72,   52,   32,   48},
+    {   65,   69,   20,   50},
+    {   41,   27,   29,   96},
+    {   49,   22,   28,   94},
+    {   52,   22,   28,   93},
+    {   49,   27,   27,   92},
+    {   37,   29,   26,   98},
+    {   39,   28,   28,   97},
+    {   38,   28,   30,   97},
+    {   38,   29,   30,   95}
+  },
+  {
+    {   33,   27,   43,   27},
+    {   32,   29,   31,   31},
+    {   31,   73,   33,   31},
+    {   35,  104,   34,   28},
+    {   32,   30,   63,   22},
+    {   33,   26,   33,   29},
+    {   33,   57,   33,   30},
+    {   37,  100,   35,   27},
+    {   32,   31,   85,   25},
+    {   34,   25,   39,   25},
+    {   35,   39,   32,   28},
+    {   40,   91,   35,   25},
+    {   32,   30,   77,   50},
+    {   34,   26,   54,   22},
+    {   37,   31,   34,   27},
+    {   45,   75,   34,   23}
+  },
+  {
+    {   34,   25,   77,   19},
+    {   36,   34,   56,   24},
+    {   41,   83,   39,   30},
+    {   47,   96,   28,   35},
+    {   34,   31,   70,   65},
+    {   38,   29,   53,   77},
+    {   43,   36,   37,   83},
+    {   48,   39,   28,   83},
+    {   33,   31,   31,   98},
+    {   33,   31,   30,   99},
+    {   34,   30,   31,   98},
+    {   36,   29,   31,   96},
+    {   32,   32,   30,   97},
+    {   32,   32,   31,   96},
+    {   31,   33,   33,   96},
+    {   32,   33,   34,   94}
+  },
+  {
+    {   30,   30,   93,   19},
+    {   31,   59,   67,   34},
+    {   31,   79,   36,   59},
+    {   30,   67,   17,   79},
+    {   30,   38,   68,   69},
+    {   29,   40,   43,   91},
+    {   26,   35,   32,  101},
+    {   23,   32,   30,  101},
+    {   26,   34,   30,  101},
+    {   23,   33,   30,  102},
+    {   20,   32,   31,  102},
+    {   18,   33,   32,  102},
+    {   23,   33,   31,  100},
+    {   20,   34,   32,  100},
+    {   18,   35,   33,  100},
+    {   18,   35,   33,  100}
+  },
+  {
+    {   31,   54,   90,   26},
+    {   32,   60,   53,   61},
+    {   34,   49,   37,   84},
+    {   34,   39,   35,   89},
+    {   35,   38,   41,   88},
+    {   35,   35,   32,   96},
+    {   35,   31,   33,   96},
+    {   35,   32,   35,   94},
+    {   34,   34,   30,   97},
+    {   35,   32,   33,   95},
+    {   35,   32,   34,   94},
+    {   35,   34,   34,   93},
+    {   34,   34,   34,   93},
+    {   35,   34,   34,   93},
+    {   35,   34,   34,   92},
+    {   36,   34,   35,   91}
+  },
+  {
+    {   32,   29,   54,   24},
+    {   31,   32,   34,   29},
+    {   31,   43,   34,   29},
+    {   32,   67,   36,   28},
+    {   31,   34,   69,   37},
+    {   31,   35,   46,   33},
+    {   30,   35,   39,   33},
+    {   30,   42,   39,   36},
+    {   31,   35,   39,   88},
+    {   30,   38,   41,   84},
+    {   30,   39,   40,   81},
+    {   39,   46,   38,   78},
+    {   31,   36,   34,   96},
+    {   34,   38,   37,   93},
+    {   55,   42,   38,   82},
+    {   89,   53,   38,   65}
+  },
+  {
+    {   32,   33,   43,   29},
+    {   32,   30,   29,   33},
+    {   31,   47,   31,   33},
+    {   33,  100,   31,   31},
+    {   32,   33,   74,   25},
+    {   32,   32,   34,   31},
+    {   32,   33,   30,   33},
+    {   32,   68,   30,   32},
+    {   32,   31,   91,   40},
+    {   32,   32,   58,   26},
+    {   31,   31,   30,   32},
+    {   31,   42,   30,   33},
+    {   32,   31,   49,   85},
+    {   32,   31,   83,   35},
+    {   31,   33,   48,   29},
+    {   31,   36,   32,   33}
+  },
+  {
+    {   31,   29,   81,   35},
+    {   32,   28,   34,   50},
+    {   31,   75,   16,   43},
+    {   34,  103,   29,   32},
+    {   32,   32,   53,   78},
+    {   31,   28,   36,   88},
+    {   30,   52,   18,   73},
+    {   52,   88,   17,   35},
+    {   32,   32,   35,   94},
+    {   30,   31,   35,   95},
+    {   36,   29,   31,   92},
+    {  100,   43,   16,   40},
+    {   32,   32,   35,   93},
+    {   30,   32,   38,   93},
+    {   55,   18,   37,   83},
+    {  127,    0,   30,   40}
+  },
+  {
+    {   31,   22,   47,   30},
+    {   31,   48,   25,   34},
+    {   30,   95,   31,   32},
+    {   32,  103,   33,   32},
+    {   30,   24,   57,   31},
+    {   30,   47,   26,   34},
+    {   31,   95,   31,   32},
+    {   43,   97,   35,   25},
+    {   29,   26,   44,   63},
+    {   37,   38,   24,   47},
+    {   74,   63,   28,   20},
+    {  110,   58,   34,    3},
+    {   46,   22,    5,  108},
+    {   93,    5,    9,   77},
+    {  127,    0,   17,   52},
+    {  127,    0,   15,   50}
+  },
+  {
+    {   32,   27,   68,   24},
+    {   35,   23,   35,   28},
+    {   35,   64,   29,   29},
+    {   37,  104,   33,   28},
+    {   32,   32,   91,   40},
+    {   36,   23,   67,   36},
+    {   49,   23,   39,   28},
+    {   60,   67,   30,   20},
+    {   32,   32,   36,   95},
+    {   35,   29,   38,   93},
+    {   50,   16,   30,   84},
+    {   72,   16,   15,   65},
+    {   32,   32,   27,  100},
+    {   33,   32,   29,  100},
+    {   37,   29,   30,   98},
+    {   48,   21,   29,   90}
+  }
+};
+
+const uint8_t mip_matrix_8x8[8][16][8] =
+{
+  {
+    {   30,   63,   46,   37,   25,   33,   33,   34},
+    {   30,   60,   66,   38,   32,   31,   32,   33},
+    {   29,   45,   74,   42,   32,   32,   32,   33},
+    {   30,   39,   62,   58,   32,   33,   32,   33},
+    {   30,   66,   55,   39,   32,   30,   30,   36},
+    {   29,   54,   69,   40,   33,   31,   31,   33},
+    {   28,   48,   71,   43,   32,   33,   32,   33},
+    {   28,   41,   72,   46,   32,   34,   32,   33},
+    {   30,   66,   56,   40,   32,   33,   28,   33},
+    {   29,   55,   69,   39,   33,   33,   30,   32},
+    {   27,   46,   72,   43,   33,   33,   32,   33},
+    {   27,   42,   69,   48,   32,   34,   32,   33},
+    {   30,   63,   55,   40,   32,   33,   35,   30},
+    {   29,   56,   66,   40,   33,   33,   33,   30},
+    {   27,   47,   69,   44,   33,   33,   33,   32},
+    {   27,   42,   65,   50,   32,   34,   32,   33}
+  },
+  {
+    {   32,   33,   30,   31,   74,   30,   31,   32},
+    {   33,   56,   28,   30,   41,   29,   32,   32},
+    {   33,   77,   52,   26,   29,   34,   30,   32},
+    {   33,   37,   80,   41,   31,   34,   30,   32},
+    {   32,   32,   33,   31,   59,   76,   28,   31},
+    {   33,   31,   31,   30,   78,   40,   28,   32},
+    {   33,   47,   28,   29,   53,   27,   31,   31},
+    {   33,   61,   44,   28,   34,   32,   31,   31},
+    {   32,   31,   34,   30,   26,   64,   76,   27},
+    {   32,   31,   34,   29,   45,   86,   36,   29},
+    {   33,   27,   34,   29,   73,   55,   25,   32},
+    {   33,   33,   34,   30,   62,   33,   30,   31},
+    {   32,   31,   34,   30,   30,   29,   58,   74},
+    {   32,   31,   35,   29,   27,   53,   77,   35},
+    {   32,   30,   36,   29,   40,   80,   44,   31},
+    {   33,   28,   37,   30,   58,   60,   31,   33}
+  },
+  {
+    {   32,   51,   27,   32,   27,   50,   29,   32},
+    {   32,   95,   42,   29,   29,   42,   30,   32},
+    {   32,   27,   99,   34,   31,   41,   29,   32},
+    {   32,   34,   21,  104,   31,   42,   30,   32},
+    {   32,   45,   30,   32,    9,   88,   40,   30},
+    {   32,   77,   38,   30,    9,   76,   38,   30},
+    {   32,   38,   78,   33,   14,   67,   37,   30},
+    {   32,   30,   30,   87,   20,   59,   38,   31},
+    {   33,   37,   32,   32,   27,   18,  106,   34},
+    {   34,   44,   34,   31,   25,   17,  108,   31},
+    {   36,   39,   45,   31,   24,   15,  108,   30},
+    {   37,   31,   31,   54,   25,   14,  101,   32},
+    {   36,   33,   32,   30,   29,   37,   13,  110},
+    {   39,   32,   32,   29,   27,   37,   15,  108},
+    {   44,   33,   31,   27,   25,   37,   16,  106},
+    {   47,   30,   31,   32,   25,   34,   19,  102}
+  },
+  {
+    {   32,   48,   35,   35,   47,   68,   31,   31},
+    {   32,   33,   59,   40,   27,   71,   33,   30},
+    {   32,   29,   47,   65,   24,   62,   37,   30},
+    {   33,   33,   31,   81,   26,   50,   42,   32},
+    {   32,   30,   40,   38,   30,   70,   55,   31},
+    {   32,   20,   46,   50,   26,   55,   64,   31},
+    {   33,   30,   29,   66,   25,   41,   72,   33},
+    {   36,   34,   27,   69,   26,   31,   67,   39},
+    {   33,   28,   36,   40,   30,   26,   85,   47},
+    {   36,   27,   33,   50,   31,   20,   79,   53},
+    {   43,   30,   26,   57,   28,   17,   67,   62},
+    {   51,   27,   28,   55,   22,   23,   49,   70},
+    {   38,   29,   32,   39,   28,   30,   22,  104},
+    {   51,   31,   28,   43,   24,   31,   17,  102},
+    {   69,   23,   30,   40,   15,   38,   10,   95},
+    {   77,   13,   35,   38,    8,   43,    8,   90}
+  },
+  {
+    {   32,   38,   32,   33,  101,   40,   29,   32},
+    {   32,   40,   37,   32,  100,   36,   30,   32},
+    {   32,   37,   46,   35,   94,   33,   30,   31},
+    {   33,   34,   30,   62,   81,   35,   30,   31},
+    {   32,   32,   33,   32,   22,  102,   39,   29},
+    {   32,   31,   33,   33,   26,  104,   34,   28},
+    {   33,   33,   33,   33,   31,  103,   32,   28},
+    {   33,   32,   34,   36,   37,   94,   33,   28},
+    {   32,   33,   32,   32,   34,   24,   99,   36},
+    {   32,   34,   33,   33,   33,   30,   98,   32},
+    {   33,   33,   34,   33,   31,   37,   95,   29},
+    {   33,   33,   33,   36,   30,   46,   85,   31},
+    {   32,   33,   32,   33,   30,   34,   23,  104},
+    {   32,   34,   33,   33,   31,   32,   30,   98},
+    {   32,   33,   34,   34,   31,   29,   39,   91},
+    {   33,   33,   32,   37,   32,   30,   47,   82}
+  },
+  {
+    {   32,   52,   48,   31,   38,   76,   26,   32},
+    {   33,   19,   62,   50,   25,   50,   51,   31},
+    {   33,   30,   20,   74,   29,   29,   54,   51},
+    {   34,   35,   23,   56,   31,   25,   41,   76},
+    {   33,   25,   38,   39,   28,   39,   83,   35},
+    {   35,   28,   25,   47,   31,   23,   57,   74},
+    {   37,   35,   22,   38,   31,   27,   30,  101},
+    {   38,   32,   33,   29,   30,   31,   27,  103},
+    {   34,   32,   27,   37,   32,   25,   41,   92},
+    {   38,   33,   28,   32,   30,   31,   18,  111},
+    {   40,   32,   33,   27,   29,   33,   18,  111},
+    {   40,   32,   34,   27,   28,   33,   23,  105},
+    {   35,   32,   30,   33,   31,   33,   20,  107},
+    {   38,   31,   33,   30,   29,   33,   21,  106},
+    {   40,   32,   33,   29,   29,   34,   22,  105},
+    {   40,   32,   33,   30,   29,   34,   24,  101}
+  },
+  {
+    {   32,   28,   31,   33,   92,   33,   30,   31},
+    {   33,   30,   28,   33,   71,   26,   32,   30},
+    {   33,   60,   26,   33,   47,   28,   33,   30},
+    {   33,   63,   44,   36,   37,   31,   33,   30},
+    {   33,   30,   31,   33,   43,   90,   33,   29},
+    {   33,   28,   29,   34,   71,   71,   26,   30},
+    {   33,   30,   26,   33,   86,   45,   28,   30},
+    {   33,   38,   29,   32,   74,   32,   33,   29},
+    {   33,   32,   30,   32,   29,   41,   95,   27},
+    {   34,   31,   29,   33,   26,   71,   73,   22},
+    {   34,   31,   29,   33,   37,   88,   46,   25},
+    {   33,   32,   28,   34,   55,   75,   36,   28},
+    {   34,   31,   30,   32,   33,   27,   43,   89},
+    {   35,   32,   28,   33,   33,   23,   77,   59},
+    {   34,   33,   28,   33,   30,   35,   91,   37},
+    {   34,   34,   28,   34,   33,   53,   74,   31}
+  },
+  {
+    {   33,   49,   26,   32,   26,   52,   28,   31},
+    {   33,   71,   72,   24,   30,   32,   34,   31},
+    {   32,   23,   70,   68,   32,   32,   32,   32},
+    {   31,   33,   21,  106,   33,   32,   32,   33},
+    {   34,   47,   32,   29,    5,   86,   44,   26},
+    {   34,   44,   89,   28,   28,   37,   33,   30},
+    {   32,   27,   46,   89,   33,   31,   31,   32},
+    {   30,   33,   20,  107,   33,   33,   32,   33},
+    {   35,   39,   42,   27,   26,   24,   92,   35},
+    {   34,   27,   87,   43,   30,   34,   38,   31},
+    {   31,   31,   32,  100,   32,   33,   30,   32},
+    {   29,   32,   22,  106,   33,   33,   32,   33},
+    {   35,   29,   47,   32,   32,   32,   17,  100},
+    {   34,   24,   69,   60,   34,   33,   28,   44},
+    {   31,   33,   31,   99,   32,   33,   32,   31},
+    {   29,   33,   25,  103,   33,   33,   32,   35}
+  }
+};
+
+const uint8_t mip_matrix_16x16[6][64][7] =
+{
+  {
+    {   42,   37,   33,   27,   44,   33,   35},
+    {   71,   39,   34,   24,   36,   35,   36},
+    {   77,   46,   35,   33,   30,   34,   36},
+    {   64,   60,   35,   33,   31,   32,   36},
+    {   49,   71,   38,   32,   32,   31,   36},
+    {   42,   66,   50,   33,   31,   32,   36},
+    {   40,   52,   67,   33,   31,   32,   35},
+    {   38,   43,   75,   33,   32,   32,   35},
+    {   56,   40,   33,   26,   43,   38,   36},
+    {   70,   49,   34,   30,   28,   38,   38},
+    {   65,   57,   36,   34,   28,   33,   39},
+    {   59,   60,   39,   33,   30,   31,   38},
+    {   55,   60,   43,   33,   30,   31,   38},
+    {   51,   61,   47,   33,   30,   32,   37},
+    {   46,   62,   51,   34,   30,   32,   37},
+    {   42,   60,   55,   33,   31,   32,   37},
+    {   60,   42,   34,   30,   37,   43,   38},
+    {   68,   52,   35,   35,   22,   37,   40},
+    {   62,   58,   37,   34,   28,   31,   40},
+    {   58,   59,   41,   33,   30,   30,   39},
+    {   56,   59,   44,   34,   30,   31,   38},
+    {   53,   60,   45,   33,   30,   31,   38},
+    {   49,   65,   45,   33,   30,   31,   38},
+    {   45,   64,   47,   33,   31,   32,   38},
+    {   59,   44,   35,   31,   34,   43,   41},
+    {   66,   53,   36,   35,   25,   31,   43},
+    {   61,   58,   38,   34,   29,   30,   40},
+    {   59,   57,   41,   33,   30,   31,   39},
+    {   57,   58,   43,   33,   30,   31,   39},
+    {   54,   61,   43,   33,   31,   31,   39},
+    {   51,   64,   43,   33,   31,   31,   39},
+    {   48,   64,   45,   33,   32,   31,   39},
+    {   57,   45,   35,   30,   35,   40,   44},
+    {   65,   54,   37,   33,   33,   24,   44},
+    {   63,   56,   38,   34,   30,   29,   39},
+    {   61,   56,   41,   34,   30,   32,   39},
+    {   58,   58,   42,   33,   31,   31,   39},
+    {   54,   62,   41,   33,   31,   31,   39},
+    {   51,   65,   42,   33,   31,   31,   39},
+    {   48,   63,   43,   33,   32,   31,   39},
+    {   55,   46,   35,   30,   36,   38,   47},
+    {   65,   53,   37,   32,   36,   26,   40},
+    {   65,   54,   38,   33,   31,   30,   38},
+    {   63,   55,   39,   33,   30,   32,   38},
+    {   59,   58,   40,   33,   31,   31,   39},
+    {   54,   64,   40,   33,   31,   30,   40},
+    {   49,   66,   40,   32,   32,   30,   41},
+    {   48,   64,   42,   32,   32,   30,   41},
+    {   54,   46,   35,   30,   34,   39,   49},
+    {   64,   52,   36,   32,   34,   34,   35},
+    {   65,   53,   37,   33,   32,   32,   37},
+    {   63,   55,   38,   33,   31,   31,   39},
+    {   59,   60,   38,   33,   31,   31,   40},
+    {   54,   64,   38,   33,   32,   30,   40},
+    {   49,   66,   39,   33,   32,   29,   41},
+    {   47,   64,   42,   32,   33,   29,   42},
+    {   51,   46,   35,   31,   33,   37,   54},
+    {   61,   51,   36,   32,   33,   38,   36},
+    {   63,   53,   37,   32,   32,   34,   37},
+    {   62,   55,   37,   33,   32,   32,   39},
+    {   58,   59,   37,   33,   32,   31,   40},
+    {   53,   63,   38,   33,   32,   31,   40},
+    {   49,   64,   40,   33,   33,   30,   41},
+    {   46,   62,   42,   33,   33,   30,   42}
+  },
+  {
+    {   39,   34,   33,   58,   44,   31,   32},
+    {   60,   38,   32,   40,   51,   30,   31},
+    {   73,   49,   31,   39,   48,   32,   31},
+    {   60,   73,   30,   39,   46,   33,   32},
+    {   43,   87,   35,   38,   45,   33,   32},
+    {   35,   78,   54,   36,   45,   33,   32},
+    {   33,   47,   86,   35,   44,   33,   32},
+    {   31,   17,  114,   34,   44,   34,   33},
+    {   43,   37,   32,   53,   70,   30,   31},
+    {   53,   50,   30,   42,   72,   31,   30},
+    {   52,   66,   30,   39,   70,   32,   30},
+    {   46,   78,   35,   37,   68,   34,   30},
+    {   43,   75,   48,   37,   66,   34,   30},
+    {   40,   62,   68,   35,   65,   35,   30},
+    {   33,   37,   97,   33,   62,   37,   31},
+    {   26,   14,  122,   32,   59,   38,   33},
+    {   40,   39,   33,   34,   87,   37,   30},
+    {   45,   54,   32,   34,   84,   41,   29},
+    {   41,   70,   35,   33,   83,   40,   29},
+    {   37,   73,   44,   32,   82,   40,   30},
+    {   37,   65,   60,   31,   81,   41,   29},
+    {   35,   48,   82,   30,   79,   43,   29},
+    {   28,   27,  108,   28,   76,   45,   30},
+    {   19,   11,  127,   27,   70,   46,   32},
+    {   38,   40,   34,   27,   73,   62,   28},
+    {   39,   54,   35,   30,   73,   62,   28},
+    {   33,   65,   41,   29,   75,   59,   28},
+    {   30,   65,   53,   27,   76,   58,   29},
+    {   29,   53,   72,   26,   77,   58,   29},
+    {   27,   35,   95,   24,   77,   60,   28},
+    {   19,   19,  117,   23,   74,   61,   30},
+    {    9,   16,  127,   23,   68,   60,   34},
+    {   35,   40,   35,   29,   44,   89,   30},
+    {   33,   51,   39,   29,   49,   86,   30},
+    {   28,   57,   49,   28,   53,   83,   30},
+    {   24,   52,   65,   26,   56,   82,   30},
+    {   22,   39,   86,   24,   58,   82,   30},
+    {   18,   22,  108,   23,   59,   82,   31},
+    {   10,   13,  125,   22,   58,   80,   33},
+    {    0,   19,  127,   22,   56,   74,   40},
+    {   33,   40,   36,   31,   28,   90,   45},
+    {   29,   46,   44,   29,   31,   92,   43},
+    {   24,   45,   58,   28,   34,   91,   43},
+    {   19,   37,   78,   26,   37,   91,   43},
+    {   15,   22,   99,   25,   38,   91,   42},
+    {   11,   11,  118,   24,   39,   90,   44},
+    {    2,   11,  127,   23,   41,   85,   48},
+    {    0,   17,  127,   23,   43,   75,   55},
+    {   31,   37,   39,   30,   28,   54,   82},
+    {   27,   37,   52,   28,   30,   58,   79},
+    {   22,   30,   70,   27,   32,   58,   79},
+    {   15,   19,   91,   26,   33,   58,   79},
+    {   10,    8,  111,   25,   34,   58,   79},
+    {    5,    2,  125,   25,   35,   57,   80},
+    {    0,    9,  127,   25,   36,   53,   84},
+    {    0,   13,  127,   25,   39,   47,   88},
+    {   28,   29,   46,   28,   39,    2,  123},
+    {   24,   24,   62,   27,   41,    1,  125},
+    {   19,   14,   81,   25,   43,    0,  126},
+    {   13,    4,  101,   24,   44,    0,  127},
+    {    6,    0,  116,   23,   45,    0,  127},
+    {    0,    0,  126,   23,   45,    1,  127},
+    {    0,    4,  127,   25,   44,    2,  127},
+    {    0,    9,  127,   25,   44,    3,  127}
+  },
+  {
+    {   30,   32,   32,   42,   34,   32,   32},
+    {   63,   26,   34,   16,   38,   32,   32},
+    {   98,   26,   34,   25,   34,   33,   32},
+    {   75,   61,   30,   31,   32,   33,   32},
+    {   36,   94,   32,   30,   33,   32,   32},
+    {   26,   76,   58,   30,   33,   32,   32},
+    {   30,   39,   91,   31,   32,   33,   31},
+    {   32,   23,  105,   32,   32,   32,   32},
+    {   34,   30,   33,   31,   52,   29,   32},
+    {   66,   24,   34,   11,   41,   33,   32},
+    {   97,   28,   34,   24,   34,   33,   32},
+    {   71,   65,   30,   30,   32,   33,   32},
+    {   34,   92,   35,   30,   33,   32,   32},
+    {   26,   70,   64,   29,   34,   32,   32},
+    {   30,   37,   94,   30,   33,   32,   31},
+    {   32,   23,  105,   31,   33,   33,   31},
+    {   37,   29,   33,    8,   79,   27,   32},
+    {   71,   22,   35,    5,   50,   32,   32},
+    {   98,   29,   34,   23,   34,   34,   32},
+    {   66,   70,   30,   31,   31,   33,   32},
+    {   31,   92,   38,   30,   33,   32,   32},
+    {   26,   66,   68,   29,   34,   32,   31},
+    {   30,   34,   97,   30,   34,   33,   31},
+    {   31,   22,  106,   30,   34,   33,   31},
+    {   40,   28,   34,    0,   76,   46,   28},
+    {   76,   21,   35,    0,   55,   35,   32},
+    {   97,   32,   34,   21,   37,   33,   33},
+    {   61,   75,   29,   30,   32,   32,   32},
+    {   29,   92,   40,   29,   33,   32,   32},
+    {   26,   62,   73,   29,   34,   32,   31},
+    {   29,   32,   99,   30,   34,   33,   30},
+    {   31,   22,  107,   30,   34,   33,   31},
+    {   42,   27,   34,    1,   48,   79,   25},
+    {   80,   20,   35,    0,   48,   47,   31},
+    {   94,   36,   32,   17,   40,   33,   33},
+    {   55,   80,   29,   27,   35,   31,   32},
+    {   27,   90,   43,   28,   34,   32,   31},
+    {   26,   58,   76,   29,   33,   33,   30},
+    {   29,   30,  101,   29,   34,   34,   30},
+    {   31,   21,  108,   29,   35,   34,   30},
+    {   44,   26,   34,    6,   30,   80,   40},
+    {   81,   21,   35,    0,   41,   52,   35},
+    {   90,   41,   31,   14,   41,   35,   33},
+    {   51,   82,   29,   24,   37,   32,   32},
+    {   27,   87,   47,   27,   35,   32,   31},
+    {   26,   54,   79,   29,   34,   33,   30},
+    {   29,   29,  102,   28,   34,   33,   30},
+    {   31,   21,  108,   28,   35,   33,   31},
+    {   47,   26,   34,    7,   34,   44,   75},
+    {   80,   24,   34,    0,   41,   41,   50},
+    {   84,   45,   31,   12,   40,   36,   36},
+    {   49,   81,   31,   22,   37,   33,   32},
+    {   28,   81,   51,   26,   35,   33,   31},
+    {   28,   51,   81,   28,   34,   33,   30},
+    {   29,   30,  101,   28,   35,   33,   31},
+    {   31,   22,  107,   28,   35,   33,   32},
+    {   48,   27,   34,   10,   40,   16,   97},
+    {   75,   27,   34,    3,   42,   26,   66},
+    {   77,   47,   33,   12,   40,   32,   43},
+    {   49,   75,   36,   21,   37,   33,   35},
+    {   32,   72,   55,   25,   36,   33,   32},
+    {   30,   49,   81,   27,   35,   33,   31},
+    {   30,   32,   98,   28,   35,   32,   32},
+    {   31,   24,  104,   28,   35,   32,   33}
+  },
+  {
+    {   36,   29,   33,   43,   47,   29,   31},
+    {   74,   20,   35,   19,   47,   34,   32},
+    {   92,   35,   32,   29,   31,   40,   34},
+    {   53,   80,   26,   33,   28,   36,   37},
+    {   24,   91,   41,   31,   31,   31,   38},
+    {   25,   57,   74,   31,   32,   30,   37},
+    {   32,   28,   99,   32,   32,   29,   36},
+    {   34,   20,  105,   33,   32,   30,   35},
+    {   50,   26,   34,   33,   74,   30,   31},
+    {   75,   28,   33,   23,   46,   47,   33},
+    {   64,   58,   29,   30,   26,   46,   40},
+    {   31,   85,   37,   31,   27,   33,   44},
+    {   22,   67,   64,   30,   31,   28,   42},
+    {   29,   35,   93,   31,   32,   27,   40},
+    {   33,   20,  105,   32,   33,   27,   37},
+    {   34,   19,  106,   33,   32,   29,   36},
+    {   51,   29,   33,   25,   72,   51,   30},
+    {   61,   42,   31,   30,   31,   60,   39},
+    {   40,   70,   34,   32,   24,   41,   50},
+    {   22,   72,   54,   30,   31,   27,   50},
+    {   25,   44,   83,   30,   33,   25,   44},
+    {   32,   23,  102,   32,   33,   26,   40},
+    {   34,   18,  107,   32,   33,   28,   37},
+    {   34,   19,  105,   33,   32,   30,   35},
+    {   45,   35,   32,   30,   39,   79,   33},
+    {   43,   53,   33,   35,   24,   53,   55},
+    {   27,   67,   45,   32,   29,   27,   61},
+    {   22,   53,   72,   30,   33,   22,   52},
+    {   28,   31,   95,   31,   33,   25,   43},
+    {   32,   20,  105,   32,   33,   27,   38},
+    {   34,   18,  107,   32,   32,   29,   36},
+    {   34,   20,  105,   33,   31,   31,   35},
+    {   38,   40,   32,   35,   23,   72,   54},
+    {   31,   55,   39,   34,   29,   32,   73},
+    {   22,   57,   60,   31,   35,   18,   64},
+    {   25,   39,   86,   31,   35,   22,   49},
+    {   30,   24,  101,   32,   33,   27,   40},
+    {   33,   19,  106,   32,   32,   30,   36},
+    {   34,   18,  107,   33,   31,   31,   35},
+    {   34,   20,  104,   33,   31,   32,   34},
+    {   33,   42,   35,   34,   28,   39,   82},
+    {   26,   51,   50,   33,   34,   18,   80},
+    {   23,   46,   74,   31,   35,   20,   59},
+    {   27,   32,   93,   32,   34,   26,   44},
+    {   31,   22,  103,   32,   32,   30,   37},
+    {   33,   19,  106,   33,   31,   31,   35},
+    {   34,   19,  106,   33,   31,   32,   34},
+    {   35,   21,  103,   34,   31,   32,   34},
+    {   29,   41,   41,   33,   34,   20,   92},
+    {   24,   44,   62,   34,   35,   18,   73},
+    {   24,   37,   83,   34,   33,   25,   52},
+    {   28,   28,   97,   33,   32,   30,   40},
+    {   32,   23,  103,   33,   31,   32,   36},
+    {   34,   20,  105,   34,   30,   33,   34},
+    {   35,   20,  104,   34,   30,   33,   33},
+    {   35,   22,  102,   34,   30,   33,   34},
+    {   27,   38,   51,   34,   34,   20,   86},
+    {   26,   37,   71,   35,   34,   24,   64},
+    {   27,   33,   87,   35,   32,   30,   47},
+    {   30,   28,   96,   34,   31,   32,   39},
+    {   32,   24,  100,   35,   30,   32,   36},
+    {   34,   23,  101,   34,   30,   33,   34},
+    {   35,   23,  101,   34,   30,   32,   34},
+    {   34,   24,   99,   35,   30,   33,   34}
+  },
+  {
+    {   39,   30,   31,   67,   33,   34,   31},
+    {   72,   21,   32,   43,   39,   33,   31},
+    {  100,   23,   32,   35,   39,   34,   31},
+    {   75,   63,   24,   32,   38,   34,   32},
+    {   32,   98,   26,   29,   37,   35,   32},
+    {   22,   77,   55,   29,   36,   35,   31},
+    {   31,   37,   90,   31,   35,   35,   32},
+    {   35,   22,  100,   33,   33,   36,   33},
+    {   47,   29,   32,   74,   54,   32,   31},
+    {   71,   24,   32,   60,   50,   36,   30},
+    {   86,   31,   30,   46,   48,   37,   30},
+    {   65,   63,   25,   34,   46,   39,   30},
+    {   33,   85,   32,   28,   43,   40,   30},
+    {   26,   64,   60,   27,   39,   41,   30},
+    {   33,   33,   87,   29,   35,   41,   31},
+    {   37,   23,   93,   32,   33,   41,   32},
+    {   41,   32,   32,   45,   84,   32,   32},
+    {   55,   31,   32,   50,   70,   40,   30},
+    {   62,   37,   31,   45,   61,   45,   29},
+    {   53,   55,   31,   36,   55,   48,   29},
+    {   38,   63,   40,   29,   48,   50,   28},
+    {   34,   49,   60,   27,   43,   51,   29},
+    {   38,   30,   78,   28,   38,   50,   31},
+    {   40,   24,   83,   30,   36,   48,   33},
+    {   35,   33,   33,   29,   75,   58,   29},
+    {   39,   35,   33,   34,   68,   59,   29},
+    {   41,   39,   34,   36,   61,   62,   29},
+    {   41,   43,   37,   33,   54,   64,   28},
+    {   41,   43,   45,   30,   48,   65,   29},
+    {   42,   36,   56,   27,   44,   63,   30},
+    {   42,   30,   65,   27,   41,   60,   33},
+    {   42,   28,   68,   28,   37,   56,   36},
+    {   33,   34,   33,   31,   42,   88,   30},
+    {   31,   36,   34,   31,   44,   84,   31},
+    {   31,   37,   35,   32,   43,   83,   31},
+    {   35,   35,   39,   32,   40,   82,   31},
+    {   40,   32,   44,   31,   38,   81,   31},
+    {   44,   30,   48,   30,   37,   78,   33},
+    {   44,   30,   52,   28,   37,   72,   36},
+    {   43,   30,   55,   29,   35,   66,   40},
+    {   32,   33,   33,   34,   25,   85,   48},
+    {   30,   34,   34,   33,   25,   88,   44},
+    {   30,   34,   36,   34,   25,   90,   41},
+    {   33,   32,   38,   34,   25,   90,   40},
+    {   38,   29,   41,   34,   26,   88,   40},
+    {   42,   29,   41,   33,   27,   85,   41},
+    {   43,   30,   42,   31,   28,   80,   43},
+    {   42,   31,   45,   31,   30,   72,   47},
+    {   32,   33,   33,   33,   26,   54,   79},
+    {   31,   32,   34,   35,   20,   68,   68},
+    {   32,   32,   35,   36,   17,   76,   62},
+    {   34,   31,   36,   36,   17,   79,   59},
+    {   37,   29,   37,   36,   18,   78,   58},
+    {   39,   29,   37,   35,   20,   77,   58},
+    {   41,   30,   37,   34,   22,   74,   58},
+    {   40,   31,   40,   32,   26,   68,   59},
+    {   33,   31,   34,   33,   29,   31,   98},
+    {   34,   30,   34,   35,   23,   45,   88},
+    {   34,   31,   34,   36,   20,   54,   82},
+    {   35,   31,   34,   36,   18,   59,   78},
+    {   36,   31,   34,   37,   19,   60,   76},
+    {   38,   30,   34,   36,   20,   61,   74},
+    {   39,   31,   35,   35,   22,   60,   73},
+    {   39,   31,   37,   34,   24,   59,   71}
+  },
+  {
+    {   30,   33,   32,   55,   32,   32,   32},
+    {   47,   30,   31,   29,   36,   32,   32},
+    {   81,   28,   32,   28,   34,   32,   32},
+    {   85,   46,   29,   32,   32,   33,   32},
+    {   54,   82,   26,   32,   32,   33,   32},
+    {   30,   90,   38,   31,   32,   33,   32},
+    {   30,   56,   73,   31,   33,   32,   32},
+    {   37,   21,  102,   32,   32,   32,   32},
+    {   33,   32,   31,   68,   39,   31,   31},
+    {   38,   32,   31,   43,   34,   33,   31},
+    {   63,   30,   31,   29,   34,   32,   32},
+    {   82,   37,   30,   29,   33,   32,   32},
+    {   71,   63,   27,   31,   32,   33,   32},
+    {   44,   86,   30,   30,   33,   33,   32},
+    {   33,   72,   55,   30,   32,   32,   31},
+    {   37,   37,   86,   31,   32,   33,   31},
+    {   34,   33,   32,   60,   61,   29,   32},
+    {   36,   33,   31,   56,   38,   32,   31},
+    {   51,   30,   31,   38,   33,   33,   32},
+    {   75,   31,   31,   30,   33,   33,   32},
+    {   80,   47,   29,   30,   32,   33,   31},
+    {   60,   73,   27,   30,   33,   33,   31},
+    {   41,   78,   41,   30,   33,   32,   31},
+    {   38,   53,   68,   30,   32,   33,   31},
+    {   33,   33,   32,   43,   77,   35,   30},
+    {   35,   33,   31,   55,   54,   29,   32},
+    {   43,   32,   31,   46,   39,   31,   32},
+    {   64,   30,   31,   35,   34,   33,   32},
+    {   79,   37,   30,   31,   32,   33,   31},
+    {   73,   57,   28,   30,   32,   33,   31},
+    {   54,   73,   33,   30,   32,   33,   31},
+    {   43,   64,   52,   30,   32,   33,   31},
+    {   33,   33,   32,   34,   68,   58,   28},
+    {   34,   33,   31,   45,   70,   33,   31},
+    {   38,   33,   31,   48,   52,   29,   32},
+    {   54,   31,   31,   40,   39,   31,   32},
+    {   73,   32,   31,   34,   34,   33,   31},
+    {   77,   45,   29,   31,   32,   32,   32},
+    {   65,   63,   30,   31,   31,   33,   31},
+    {   51,   66,   42,   30,   32,   33,   31},
+    {   33,   32,   32,   34,   44,   81,   31},
+    {   34,   33,   31,   38,   66,   52,   28},
+    {   36,   33,   30,   44,   62,   34,   31},
+    {   47,   31,   31,   43,   48,   30,   32},
+    {   64,   31,   31,   38,   38,   32,   32},
+    {   75,   38,   30,   33,   34,   32,   32},
+    {   71,   53,   30,   31,   32,   33,   32},
+    {   59,   61,   37,   30,   32,   33,   32},
+    {   33,   32,   31,   35,   31,   71,   54},
+    {   34,   33,   31,   37,   49,   70,   33},
+    {   36,   33,   31,   41,   60,   48,   30},
+    {   43,   32,   31,   43,   54,   35,   31},
+    {   56,   31,   31,   40,   44,   32,   32},
+    {   68,   35,   30,   36,   37,   32,   32},
+    {   70,   45,   30,   33,   34,   33,   32},
+    {   63,   55,   35,   31,   33,   33,   32},
+    {   33,   32,   31,   33,   34,   36,   87},
+    {   34,   32,   31,   36,   38,   62,   52},
+    {   36,   33,   31,   39,   50,   57,   36},
+    {   41,   33,   31,   41,   53,   43,   33},
+    {   50,   33,   31,   41,   48,   36,   32},
+    {   59,   35,   31,   37,   41,   34,   32},
+    {   65,   42,   31,   35,   36,   33,   32},
+    {   62,   49,   35,   33,   34,   34,   33}
+  }
+};

From cb2ccce753617216d24dcc36576a7c6f5a2afd98 Mon Sep 17 00:00:00 2001
From: siivonek <kari.siivonen@tuni.fi>
Date: Wed, 22 Dec 2021 23:14:44 +0200
Subject: [PATCH 05/28] [mip] WIP Implement mip functions.

---
 src/intra.c | 57 ++++++++++++++++++++++++++++++++---------------------
 1 file changed, 35 insertions(+), 22 deletions(-)

diff --git a/src/intra.c b/src/intra.c
index 28715a20..b5c254c1 100644
--- a/src/intra.c
+++ b/src/intra.c
@@ -607,7 +607,9 @@ void kvz_mip_reduced_pred(kvz_pixel* const output,
                           const bool transpose,
                           const int red_bdry_size,
                           const int red_pred_size,
-                          const int size_id)
+                          const int size_id,
+                          const int in_offset,
+                          const int in_offset_tr)
 {
   const int input_size = 2 * red_bdry_size;
 
@@ -623,39 +625,35 @@ void kvz_mip_reduced_pred(kvz_pixel* const output,
   assert((input_size == 4 * (input_size >> 2)) && "MIP input size must be divisible by four");
 
   const uint8_t* weight = matrix;
-  const int input_offset = transpose ? m_inputOffsetTransp : m_inputOffset;
+  const int input_offset = transpose ? in_offset_tr : in_offset;
 
   const bool red_size = (size_id == 2);
   int pos_res = 0;
-  for (int y = 0; y < m_reducedPredSize; y++)
-  {
-    for (int x = 0; x < m_reducedPredSize; x++)
-    {
-      if (red_size) weight -= 1;
+  for (int y = 0; y < red_pred_size; y++) {
+    for (int x = 0; x < red_pred_size; x++) {
+      if (red_size) {
+        weight -= 1;
+      }
       int tmp0 = red_size ? 0 : (input[0] * weight[0]);
       int tmp1 = input[1] * weight[1];
       int tmp2 = input[2] * weight[2];
       int tmp3 = input[3] * weight[3];
-      for (int i = 4; i < input_size; i += 4)
-      {
+      for (int i = 4; i < input_size; i += 4) {
         tmp0 += input[i] * weight[i];
         tmp1 += input[i + 1] * weight[i + 1];
         tmp2 += input[i + 2] * weight[i + 2];
         tmp3 += input[i + 3] * weight[i + 3];
       }
-      out_ptr[posRes++] = ClipBD<int>(((tmp0 + tmp1 + tmp2 + tmp3 + offset) >> MIP_SHIFT_MATRIX) + inputOffset, bitDepth);
-
-      weight += inputSize;
+      out_ptr[pos_res] = CLIP_TO_PIXEL(((tmp0 + tmp1 + tmp2 + tmp3 + offset) >> MIP_SHIFT_MATRIX) + input_offset);
+      pos_res++;
+      weight += input_size;
     }
   }
 
-  if (transpose)
-  {
-    for (int y = 0; y < m_reducedPredSize; y++)
-    {
-      for (int x = 0; x < m_reducedPredSize; x++)
-      {
-        result[y * m_reducedPredSize + x] = resPtr[x * m_reducedPredSize + y];
+  if (transpose) {
+    for (int y = 0; y < red_pred_size; y++) {
+      for (int x = 0; x < red_pred_size; x++) {
+        output[y * red_pred_size + x] = out_ptr[x * red_pred_size + y];
       }
     }
   }
@@ -680,6 +678,7 @@ void kvz_mip_predict(encoder_state_t const* const state,
   // Separate this function into smaller bits if needed
   
   int* result; // TODO: pass the dst buffer to this function
+  const int mode_idx = 0; // TODO: pass mode
 
   // *** INPUT PREP ***
 
@@ -758,19 +757,33 @@ void kvz_mip_predict(encoder_state_t const* const state,
   const bool need_upsampling = (ups_hor_factor > 1) || (ups_ver_factor > 1);
   const bool transpose = 0; // TODO: pass transpose
 
-  const uint8_t* matrix = 0; // TODO: function for fetching correct matrix
+  uint8_t* matrix;
+  switch (size_id) {
+    case 0: 
+      matrix = &mip_matrix_4x4[mode_idx][0][0];
+      break;
+    case 1: 
+      matrix = &mip_matrix_8x8[mode_idx][0][0];
+      break;
+    case 2: 
+      matrix = &mip_matrix_16x16[mode_idx][0][0];
+      break;
+    default:
+      assert(false && "Invalid MIP size id.");
+  }
+
   kvz_pixel* red_pred_buffer = MALLOC(kvz_pixel, red_pred_size * red_pred_size); // TODO: get rid of MALLOC and FREE
   kvz_pixel* const reduced_pred = need_upsampling ? red_pred_buffer[0] : result;
 
   const int* const reduced_bdry = transpose ? red_bdry_trans[0] : red_bdry[0];
 
-  kvz_mip_reduced_pred(reduced_pred, reduced_bdry, matrix, transpose, red_bdry_size, red_pred_size, size_id);
+  kvz_mip_reduced_pred(reduced_pred, reduced_bdry, matrix, transpose, red_bdry_size, red_pred_size, size_id, input_offset, input_offset_trans);
   if (need_upsampling) {
     kvz_mip_pred_upsampling(result, reduced_pred);
   }
 
   FREE_POINTER(red_pred_buffer);
-  // *** BLOCK PREDITC *** END
+  // *** BLOCK PREDICT *** END
 }
 
 

From 0b9568b4669fd0ed48f1c9245560ed232b670bd2 Mon Sep 17 00:00:00 2001
From: siivonek <kari.siivonen@tuni.fi>
Date: Tue, 11 Jan 2022 11:03:08 +0200
Subject: [PATCH 06/28] Re-add debug files to project file filters. Convert
 Makefile spaces to tabs.

---
 build/kvazaar_lib/kvazaar_lib.vcxproj.filters | 2 ++
 src/Makefile.am                               | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/build/kvazaar_lib/kvazaar_lib.vcxproj.filters b/build/kvazaar_lib/kvazaar_lib.vcxproj.filters
index 63cbd564..45e196d1 100644
--- a/build/kvazaar_lib/kvazaar_lib.vcxproj.filters
+++ b/build/kvazaar_lib/kvazaar_lib.vcxproj.filters
@@ -266,6 +266,7 @@
     <ClCompile Include="..\..\src\strategies\avx2\alf-avx2.c">
       <Filter>Optimization\strategies\avx2</Filter>
     </ClCompile>
+    <ClCompile Include="..\..\src\debug.c" />
   </ItemGroup>
   <ItemGroup>
     <ClInclude Include="..\..\src\bitstream.h">
@@ -500,6 +501,7 @@
     <ClInclude Include="..\..\src\mip_data.h">
       <Filter>Reconstruction</Filter>
     </ClInclude>
+    <ClInclude Include="..\..\src\debug.h" />
   </ItemGroup>
   <ItemGroup>
     <YASM Include="..\..\src\extras\x86inc.asm">
diff --git a/src/Makefile.am b/src/Makefile.am
index 1412aeef..20db2fc0 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -90,7 +90,7 @@ libkvazaar_la_SOURCES = \
 	kvazaar.c \
 	kvazaar_internal.h \
 	kvz_math.h \
-    mip_data.h \
+	mip_data.h \
 	ml_intra_cu_depth_pred.c \
 	ml_intra_cu_depth_pred.h \
 	nal.c \

From e672f9b24a10c0d309979afd2b914734eb2067cf Mon Sep 17 00:00:00 2001
From: siivonek <kari.siivonen@tuni.fi>
Date: Tue, 11 Jan 2022 15:39:37 +0200
Subject: [PATCH 07/28] [mip] Implement MIP functions.

---
 src/intra.c | 110 ++++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 98 insertions(+), 12 deletions(-)

diff --git a/src/intra.c b/src/intra.c
index b5c254c1..5d61a4f4 100644
--- a/src/intra.c
+++ b/src/intra.c
@@ -546,7 +546,7 @@ void kvz_predict_cclm(
 }
 
 
-void kvz_mip_boundary_downsampling(int* reduced_dst, const kvz_pixel* const ref_src, int src_len, int dst_len)
+void kvz_mip_boundary_downsampling(kvz_pixel* reduced_dst, const kvz_pixel* const ref_src, int src_len, int dst_len)
 {
   if (dst_len < src_len)
   {
@@ -662,9 +662,76 @@ void kvz_mip_reduced_pred(kvz_pixel* const output,
 }
 
 
-void kvz_mip_pred_upsampling()
+void kvz_mip_pred_upsampling_1D(kvz_pixel* const dst, const kvz_pixel* const src, const kvz_pixel* const boundary,
+                                const uint16_t src_size_ups_dim, const uint16_t src_size_orth_dim,
+                                const uint16_t src_step, const uint16_t src_stride,
+                                const uint16_t dst_step, const uint16_t dst_stride,
+                                const uint16_t boundary_step,
+                                const uint16_t ups_factor)
 {
+  // Calculate floor log2. TODO: find a better / faster solution
+  uint16_t upsample_factor = ups_factor;
+  int tmp = 0;
+  if (upsample_factor & 0xffff0000) {
+    upsample_factor >>= 16;
+    tmp += 16;
+  }
+  if (upsample_factor & 0xff00) {
+    upsample_factor >>= 8;
+    tmp += 8;
+  }
+  if (upsample_factor & 0xf0) {
+    upsample_factor >>= 4;
+    tmp += 4;
+  }
+  if (upsample_factor & 0xc) {
+    upsample_factor >>= 2;
+    tmp += 2;
+  }
+  if (upsample_factor & 0x2) {
+    upsample_factor >>= 1;
+    tmp += 1;
+  }
 
+  const int log2_factor = tmp;
+  assert(ups_factor >= 2 && "Upsampling factor must be at least 2.");
+  const int rounding_offset = 1 << (log2_factor - 1);
+
+  uint16_t idx_orth_dim = 0;
+  const kvz_pixel* src_line = src;
+  kvz_pixel* dst_line = dst;
+  const kvz_pixel* boundary_line = boundary + boundary_step - 1;
+  while (idx_orth_dim < src_size_orth_dim)
+  {
+    uint16_t idx_upsample_dim = 0;
+    const kvz_pixel* before = boundary_line;
+    const kvz_pixel* behind = src_line;
+    kvz_pixel* cur_dst = dst_line;
+    while (idx_upsample_dim < src_size_ups_dim)
+    {
+      uint16_t pos = 1;
+      int scaled_before = (*before) << log2_factor;
+      int scaled_behind = 0;
+      while (pos <= ups_factor)
+      {
+        scaled_before -= *before;
+        scaled_behind += *behind;
+        *cur_dst = (scaled_before + scaled_behind + rounding_offset) >> log2_factor;
+
+        pos++;
+        cur_dst += dst_step;
+      }
+
+      idx_upsample_dim++;
+      before = behind;
+      behind += src_step;
+    }
+
+    idx_orth_dim++;
+    src_line += src_stride;
+    dst_line += dst_stride;
+    boundary_line += boundary_step;
+  }
 }
 
 
@@ -677,7 +744,7 @@ void kvz_mip_predict(encoder_state_t const* const state,
 {
   // Separate this function into smaller bits if needed
   
-  int* result; // TODO: pass the dst buffer to this function
+  kvz_pixel* result; // TODO: pass the dst buffer to this function
   const int mode_idx = 0; // TODO: pass mode
 
   // *** INPUT PREP ***
@@ -702,8 +769,8 @@ void kvz_mip_predict(encoder_state_t const* const state,
   int red_pred_size = (size_id < 2) ? 4 : 8;
 
   // Upsampling factors
-  unsigned int ups_hor_factor = width / red_pred_size;
-  unsigned int ups_ver_factor = height / red_pred_size;
+  uint16_t ups_hor_factor = width / red_pred_size;
+  uint16_t ups_ver_factor = height / red_pred_size;
 
   // Upsampling factors must be powers of two
   assert((ups_hor_factor < 1) || ((ups_hor_factor & (ups_hor_factor - 1)) != 0) && "Horizontal upsampling factor must be power of two.");
@@ -720,15 +787,15 @@ void kvz_mip_predict(encoder_state_t const* const state,
   kvz_pixel red_bdry[MIP_MAX_INPUT_SIZE];
   kvz_pixel red_bdry_trans[MIP_MAX_INPUT_SIZE];
 
-  kvz_pixel* const top_reduced = red_bdry[0];
-  kvz_pixel* const left_reduced = red_bdry[red_bdry_size];
+  kvz_pixel* const top_reduced = &red_bdry[0];
+  kvz_pixel* const left_reduced = &red_bdry[red_bdry_size];
 
   kvz_mip_boundary_downsampling(top_reduced, ref_samples_top, width, red_bdry_size);
   kvz_mip_boundary_downsampling(left_reduced, ref_samples_left, height, red_bdry_size);
 
   // Transposed reduced boundaries
-  int* const left_reduced_trans = red_bdry_trans[0];
-  int* const top_reduced_trans = red_bdry_trans[red_bdry_size];
+  kvz_pixel* const left_reduced_trans = &red_bdry_trans[0];
+  kvz_pixel* const top_reduced_trans = &red_bdry_trans[red_bdry_size];
 
   for (int x = 0; x < red_bdry_size; x++) {
     top_reduced_trans[x] = top_reduced[x];
@@ -773,13 +840,32 @@ void kvz_mip_predict(encoder_state_t const* const state,
   }
 
   kvz_pixel* red_pred_buffer = MALLOC(kvz_pixel, red_pred_size * red_pred_size); // TODO: get rid of MALLOC and FREE
-  kvz_pixel* const reduced_pred = need_upsampling ? red_pred_buffer[0] : result;
+  kvz_pixel* const reduced_pred = need_upsampling ? red_pred_buffer : result;
 
-  const int* const reduced_bdry = transpose ? red_bdry_trans[0] : red_bdry[0];
+  const kvz_pixel* const reduced_bdry = transpose ? red_bdry_trans : red_bdry;
 
   kvz_mip_reduced_pred(reduced_pred, reduced_bdry, matrix, transpose, red_bdry_size, red_pred_size, size_id, input_offset, input_offset_trans);
   if (need_upsampling) {
-    kvz_mip_pred_upsampling(result, reduced_pred);
+    const kvz_pixel* ver_src = reduced_pred;
+    uint16_t ver_src_step = width;
+    
+    if (ups_hor_factor > 1) {
+      kvz_pixel* const hor_dst = result + (ups_ver_factor - 1) * width;
+      ver_src = hor_dst;
+      ver_src_step *= ups_ver_factor;
+
+      kvz_mip_pred_upsampling_1D(hor_dst, reduced_pred, ref_samples_left,
+        red_pred_size, red_pred_size,
+        1, red_pred_size, 1, ver_src_step,
+        ups_ver_factor, ups_hor_factor);
+    }
+
+    if (ups_ver_factor > 1) {
+      kvz_mip_pred_upsampling_1D(result, ver_src, ref_samples_top,
+        red_pred_size, width,
+        ver_src_step, 1, width, 1,
+        1, ups_ver_factor);
+    }
   }
 
   FREE_POINTER(red_pred_buffer);

From 59a86f339ec023259f7f8c30bedcb1e695a266c4 Mon Sep 17 00:00:00 2001
From: siivonek <kari.siivonen@tuni.fi>
Date: Thu, 20 Jan 2022 00:11:50 +0200
Subject: [PATCH 08/28] [mip] Implement MIP search.

---
 src/cu.h           |   2 +
 src/intra.c        |  60 ++++++++++++--------
 src/intra.h        |  13 +++++
 src/search.c       |  16 ++++--
 src/search_intra.c | 138 +++++++++++++++++++++++++++++++++++++++++----
 src/search_intra.h |   4 +-
 6 files changed, 194 insertions(+), 39 deletions(-)

diff --git a/src/cu.h b/src/cu.h
index be081f61..779fe1fd 100644
--- a/src/cu.h
+++ b/src/cu.h
@@ -169,6 +169,8 @@ typedef struct
       int8_t mode;
       int8_t mode_chroma;
       uint8_t multi_ref_idx;
+      bool mip_flag;
+      bool mip_is_transposed;
     } intra;
     struct {
       mv_t    mv[2][2];  // \brief Motion vectors for L0 and L1
diff --git a/src/intra.c b/src/intra.c
index 5d61a4f4..d1723faa 100644
--- a/src/intra.c
+++ b/src/intra.c
@@ -546,14 +546,14 @@ void kvz_predict_cclm(
 }
 
 
-void kvz_mip_boundary_downsampling(kvz_pixel* reduced_dst, const kvz_pixel* const ref_src, int src_len, int dst_len)
+void kvz_mip_boundary_downsampling_1D(kvz_pixel* reduced_dst, const kvz_pixel* const ref_src, int src_len, int dst_len)
 {
   if (dst_len < src_len)
   {
     // Create reduced boundary by downsampling
     uint16_t down_smp_factor = src_len / dst_len;
 
-    // Calculate floor log2. TODO: find a better / faster solution
+    // Calculate floor log2. MIP_TODO: find a better / faster solution
     int tmp = 0;
     if (down_smp_factor & 0xffff0000) {
       down_smp_factor >>= 16;
@@ -614,7 +614,7 @@ void kvz_mip_reduced_pred(kvz_pixel* const output,
   const int input_size = 2 * red_bdry_size;
 
   // Use local buffer for transposed result
-  kvz_pixel* out_buf_transposed = MALLOC(kvz_pixel, red_pred_size * red_pred_size); // TODO: get rid of MALLOC & FREE
+  kvz_pixel* out_buf_transposed = MALLOC(kvz_pixel, red_pred_size * red_pred_size); // MIP_TODO: get rid of MALLOC & FREE
   kvz_pixel* const out_ptr = transpose ? out_buf_transposed : output;
 
   int sum = 0;
@@ -669,7 +669,7 @@ void kvz_mip_pred_upsampling_1D(kvz_pixel* const dst, const kvz_pixel* const src
                                 const uint16_t boundary_step,
                                 const uint16_t ups_factor)
 {
-  // Calculate floor log2. TODO: find a better / faster solution
+  // Calculate floor log2. MIP_TODO: find a better / faster solution
   uint16_t upsample_factor = ups_factor;
   int tmp = 0;
   if (upsample_factor & 0xffff0000) {
@@ -737,15 +737,16 @@ void kvz_mip_pred_upsampling_1D(kvz_pixel* const dst, const kvz_pixel* const src
 
 /** \brief Matrix weighted intra prediction.
 */
-void kvz_mip_predict(encoder_state_t const* const state,
-                     kvz_intra_references* const refs,
-                     const uint16_t pred_block_width,
-                     const uint16_t pred_block_height)
+void kvz_mip_predict(encoder_state_t const* const state, kvz_intra_references* const refs,
+                     const uint16_t pred_block_width, const uint16_t pred_block_height,
+                     const color_t color,
+                     kvz_pixel* dst,
+                     const int mip_mode, const bool mip_transp)
 {
   // Separate this function into smaller bits if needed
   
-  kvz_pixel* result; // TODO: pass the dst buffer to this function
-  const int mode_idx = 0; // TODO: pass mode
+  kvz_pixel* result = dst;
+  const int mode_idx = mip_mode;
 
   // *** INPUT PREP ***
 
@@ -790,8 +791,8 @@ void kvz_mip_predict(encoder_state_t const* const state,
   kvz_pixel* const top_reduced = &red_bdry[0];
   kvz_pixel* const left_reduced = &red_bdry[red_bdry_size];
 
-  kvz_mip_boundary_downsampling(top_reduced, ref_samples_top, width, red_bdry_size);
-  kvz_mip_boundary_downsampling(left_reduced, ref_samples_left, height, red_bdry_size);
+  kvz_mip_boundary_downsampling_1D(top_reduced, ref_samples_top, width, red_bdry_size);
+  kvz_mip_boundary_downsampling_1D(left_reduced, ref_samples_left, height, red_bdry_size);
 
   // Transposed reduced boundaries
   kvz_pixel* const left_reduced_trans = &red_bdry_trans[0];
@@ -822,7 +823,7 @@ void kvz_mip_predict(encoder_state_t const* const state,
   // *** BLOCK PREDICT ***
 
   const bool need_upsampling = (ups_hor_factor > 1) || (ups_ver_factor > 1);
-  const bool transpose = 0; // TODO: pass transpose
+  const bool transpose = mip_transp;
 
   uint8_t* matrix;
   switch (size_id) {
@@ -839,7 +840,7 @@ void kvz_mip_predict(encoder_state_t const* const state,
       assert(false && "Invalid MIP size id.");
   }
 
-  kvz_pixel* red_pred_buffer = MALLOC(kvz_pixel, red_pred_size * red_pred_size); // TODO: get rid of MALLOC and FREE
+  kvz_pixel* red_pred_buffer = MALLOC(kvz_pixel, red_pred_size * red_pred_size); // MIP_TODO: get rid of MALLOC and FREE
   kvz_pixel* const reduced_pred = need_upsampling ? red_pred_buffer : result;
 
   const kvz_pixel* const reduced_bdry = transpose ? red_bdry_trans : red_bdry;
@@ -1357,7 +1358,9 @@ static void intra_recon_tb_leaf(
   cclm_parameters_t *cclm_params,
   lcu_t *lcu,
   color_t color,
-  uint8_t multi_ref_idx)
+  uint8_t multi_ref_idx,
+  bool use_mip,
+  bool mip_transp)
 {
   const kvz_config *cfg = &state->encoder_control->cfg;
   const int shift = color == COLOR_Y ? 0 : 1;
@@ -1368,6 +1371,7 @@ static void intra_recon_tb_leaf(
     log2width -= 1;
   }
   const int width = 1 << log2width;
+  const int height = width; // TODO: proper height for non-square blocks
   const int lcu_width = LCU_WIDTH >> shift;
 
   const vector2d_t luma_px = { x, y };
@@ -1404,7 +1408,13 @@ static void intra_recon_tb_leaf(
   int stride = state->tile->frame->source->stride;
   const bool filter_boundary = color == COLOR_Y && !(cfg->lossless && cfg->implicit_rdpcm);
   if(intra_mode < 68) {
-    kvz_intra_predict(state, &refs, log2width, intra_mode, color, pred, filter_boundary, multi_ref_index);
+    if (use_mip) {
+      assert(intra_mode < 16 && "MIP mode must be between [0, 16]");
+      kvz_mip_predict(state, &refs, width, height, color, pred, intra_mode, mip_transp);
+    }
+    else {
+      kvz_intra_predict(state, &refs, log2width, intra_mode, color, pred, filter_boundary, multi_ref_index);
+    }
   } else {
     kvz_pixels_blit(&state->tile->frame->cclm_luma_rec[x / 2 + (y * stride) / 4], pred, width, width, stride / 2, width);
     if(cclm_params == NULL) {
@@ -1464,6 +1474,8 @@ void kvz_intra_recon_cu(
   cu_info_t *cur_cu,
   cclm_parameters_t *cclm_params,
   uint8_t multi_ref_idx,
+  bool mip_flag,
+  bool mip_transp,
   lcu_t *lcu)
 {
   const vector2d_t lcu_px = { SUB_SCU(x), SUB_SCU(y) };
@@ -1472,6 +1484,8 @@ void kvz_intra_recon_cu(
     cur_cu = LCU_GET_CU_AT_PX(lcu, lcu_px.x, lcu_px.y);
   }
   uint8_t multi_ref_index = multi_ref_idx;
+  bool use_mip = mip_flag;
+  bool mip_transposed = mip_transp;
 
   // Reset CBFs because CBFs might have been set
   // for depth earlier
@@ -1489,10 +1503,10 @@ void kvz_intra_recon_cu(
     const int32_t x2 = x + offset;
     const int32_t y2 = y + offset;
 
-    kvz_intra_recon_cu(state, x,  y,  depth + 1, mode_luma, mode_chroma, NULL, NULL, multi_ref_index, lcu);
-    kvz_intra_recon_cu(state, x2, y,  depth + 1, mode_luma, mode_chroma, NULL, NULL, multi_ref_index, lcu);
-    kvz_intra_recon_cu(state, x,  y2, depth + 1, mode_luma, mode_chroma, NULL, NULL, multi_ref_index, lcu);
-    kvz_intra_recon_cu(state, x2, y2, depth + 1, mode_luma, mode_chroma, NULL, NULL, multi_ref_index, lcu);
+    kvz_intra_recon_cu(state, x,  y,  depth + 1, mode_luma, mode_chroma, NULL, NULL, multi_ref_index, use_mip, mip_transposed, lcu);
+    kvz_intra_recon_cu(state, x2, y,  depth + 1, mode_luma, mode_chroma, NULL, NULL, multi_ref_index, use_mip, mip_transposed, lcu);
+    kvz_intra_recon_cu(state, x,  y2, depth + 1, mode_luma, mode_chroma, NULL, NULL, multi_ref_index, use_mip, mip_transposed, lcu);
+    kvz_intra_recon_cu(state, x2, y2, depth + 1, mode_luma, mode_chroma, NULL, NULL, multi_ref_index, use_mip, mip_transposed, lcu);
 
     // Propagate coded block flags from child CUs to parent CU.
     uint16_t child_cbfs[3] = {
@@ -1513,11 +1527,11 @@ void kvz_intra_recon_cu(
     const bool has_chroma = mode_chroma != -1 &&  (x % 8 == 0 && y % 8 == 0);
     // Process a leaf TU.
     if (has_luma) {
-      intra_recon_tb_leaf(state, x, y, depth, mode_luma, cclm_params, lcu, COLOR_Y, multi_ref_index);
+      intra_recon_tb_leaf(state, x, y, depth, mode_luma, cclm_params, lcu, COLOR_Y, multi_ref_index, use_mip, mip_transposed);
     }
     if (has_chroma) {
-      intra_recon_tb_leaf(state, x, y, depth, mode_chroma, cclm_params, lcu, COLOR_U, 0);
-      intra_recon_tb_leaf(state, x, y, depth, mode_chroma, cclm_params, lcu, COLOR_V, 0);
+      intra_recon_tb_leaf(state, x, y, depth, mode_chroma, cclm_params, lcu, COLOR_U, 0, use_mip, mip_transposed);
+      intra_recon_tb_leaf(state, x, y, depth, mode_chroma, cclm_params, lcu, COLOR_V, 0, use_mip, mip_transposed);
     }
 
     kvz_quantize_lcu_residual(state, has_luma, has_chroma, x, y, depth, cur_cu, lcu, false);
diff --git a/src/intra.h b/src/intra.h
index 1d05fea0..436e20bf 100644
--- a/src/intra.h
+++ b/src/intra.h
@@ -130,6 +130,8 @@ void kvz_intra_recon_cu(
   cu_info_t *cur_cu,
   cclm_parameters_t* cclm_params,
   uint8_t multi_ref_idx,
+  bool mip_flag,
+  bool mip_transp,
   lcu_t *lcu);
 
 
@@ -146,4 +148,15 @@ void kvz_predict_cclm(
   kvz_intra_references* chroma_ref,
   kvz_pixel* dst,
   cclm_parameters_t* cclm_params
+);
+
+void kvz_mip_predict(
+  encoder_state_t const * const state,
+  kvz_intra_references * refs,
+  const uint16_t width,
+  const uint16_t height,
+  const color_t color,
+  kvz_pixel* dst,
+  const int mip_mode,
+  const bool mip_transp
 );
\ No newline at end of file
diff --git a/src/search.c b/src/search.c
index 5c8be359..beedbf0f 100644
--- a/src/search.c
+++ b/src/search.c
@@ -727,14 +727,18 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,
       int8_t intra_trafo;
       double intra_cost;
       uint8_t multi_ref_index = 0;
+      bool mip_flag;
+      bool mip_transposed;
       kvz_search_cu_intra(state, x, y, depth, lcu,
-                          &intra_mode, &intra_trafo, &intra_cost, &multi_ref_index);
+                          &intra_mode, &intra_trafo, &intra_cost, &multi_ref_index, &mip_flag, &mip_transposed);
       if (intra_cost < cost) {
         cost = intra_cost;
         cur_cu->type = CU_INTRA;
         cur_cu->part_size = depth > MAX_DEPTH ? SIZE_NxN : SIZE_2Nx2N;
         cur_cu->intra.mode = intra_mode;
         cur_cu->intra.multi_ref_idx = multi_ref_index;
+        cur_cu->intra.mip_flag = mip_flag;
+        cur_cu->intra.mip_is_transposed = mip_transposed;
 
         //If the CU is not split from 64x64 block, the MTS is disabled for that CU.
         cur_cu->tr_idx = (depth > 0) ? intra_trafo : 0;
@@ -751,7 +755,9 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,
                          x, y,
                          depth,
                          cur_cu->intra.mode, -1, // skip chroma
-                         NULL, NULL, cur_cu->intra.multi_ref_idx, lcu);
+                         NULL, NULL, cur_cu->intra.multi_ref_idx, 
+                         cur_cu->intra.mip_flag, cur_cu->intra.mip_is_transposed, 
+                         lcu);
 
       downsample_cclm_rec(
         state, x, y, cu_width / 2, cu_width / 2, lcu->rec.y, lcu->left_ref.y[64]
@@ -773,7 +779,8 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,
                            x & ~7, y & ~7, // TODO: as does this
                            depth,
                            -1, cur_cu->intra.mode_chroma, // skip luma
-                           NULL, cclm_params, 0, lcu);
+                           NULL, cclm_params, 0, cur_cu->intra.mip_flag, cur_cu->intra.mip_is_transposed,
+                           lcu);
       }
     } else if (cur_cu->type == CU_INTER) {
 
@@ -933,7 +940,8 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,
                            x, y,
                            depth,
                            cur_cu->intra.mode, mode_chroma,
-                           NULL,NULL, 0, lcu);
+                           NULL,NULL, 0, cur_cu->intra.mip_flag, cur_cu->intra.mip_is_transposed,
+                           lcu);
 
         cost += kvz_cu_rd_cost_luma(state, x_local, y_local, depth, cur_cu, lcu);
         if (has_chroma) {
diff --git a/src/search_intra.c b/src/search_intra.c
index 8615565a..c5503e24 100644
--- a/src/search_intra.c
+++ b/src/search_intra.c
@@ -333,7 +333,9 @@ static double search_intra_trdepth(encoder_state_t * const state,
         x_px, y_px,
         depth,
         intra_mode, -1,
-        pred_cu, cclm_params, pred_cu->intra.multi_ref_idx, lcu);
+        pred_cu, cclm_params, pred_cu->intra.multi_ref_idx, 
+        pred_cu->intra.mip_flag, pred_cu->intra.mip_is_transposed,
+        lcu);
 
       // TODO: Not sure if this should be 0 or 1 but at least seems to work with 1
       if (pred_cu->tr_idx > 1)
@@ -361,7 +363,9 @@ static double search_intra_trdepth(encoder_state_t * const state,
         x_px, y_px,
         depth,
         -1, chroma_mode,
-        pred_cu, cclm_params, 0, lcu);
+        pred_cu, cclm_params, 0, 
+        pred_cu->intra.mip_flag, pred_cu->intra.mip_is_transposed,
+        lcu);
       best_rd_cost += kvz_cu_rd_cost_chroma(state, lcu_px.x, lcu_px.y, depth, pred_cu, lcu);
     }
     pred_cu->tr_skip = best_tr_idx == MTS_SKIP;
@@ -715,6 +719,8 @@ static int8_t search_intra_rdo(encoder_state_t * const state,
                              int8_t *intra_preds,
                              int modes_to_check,
                              int8_t modes[67], int8_t trafo[67], double costs[67],
+                             int num_mip_modes,
+                             int8_t mip_modes[32], int8_t mip_trafo[32], double mip_costs[32],
                              lcu_t *lcu,
                              uint8_t multi_ref_idx)
 {
@@ -749,6 +755,47 @@ static int8_t search_intra_rdo(encoder_state_t * const state,
     }
   }
 
+  // MIP_TODO: implement this inside the standard intra for loop. Code duplication is bad.
+  // MIP search
+  const int transp_off = num_mip_modes >> 1;
+  for (int mip_mode = 0; mip_mode < num_mip_modes; ++mip_mode) {
+    int rdo_bitcost = 0; // MIP_TODO: MIP needs own bit cost calculation
+
+    mip_costs[mip_mode] = rdo_bitcost * (int)(state->lambda + 0.5); // MIP_TODO: check if this is also correct in the case when MIP is used.
+
+    const bool is_transposed = (mip_modes[mip_mode] >= transp_off ? true : false);
+    // There can be 32 MIP modes, but only mode numbers [0, 15] are ever written to bitstream.
+    // Modes [16, 31] are indicated with the separate transpose flag.
+    int8_t pred_mode = (is_transposed ? mip_modes[mip_mode] - transp_off : mip_modes[mip_mode]);
+
+    // Perform transform split search and save mode RD cost for the best one.
+    cu_info_t pred_cu;
+    pred_cu.depth = depth;
+    pred_cu.type = CU_INTRA;
+    pred_cu.part_size = ((depth == MAX_PU_DEPTH) ? SIZE_NxN : SIZE_2Nx2N); // TODO: non-square blocks
+    pred_cu.intra.mode = pred_mode;
+    pred_cu.intra.mode_chroma = pred_mode;
+    pred_cu.intra.multi_ref_idx = 0;
+    pred_cu.intra.mip_is_transposed = is_transposed;
+    pred_cu.intra.mip_flag = true;
+    pred_cu.joint_cb_cr = 0;
+    FILL(pred_cu.cbf, 0);
+
+    // Reset transform split data in lcu.cu for this area.
+    kvz_lcu_fill_trdepth(lcu, x_px, y_px, depth, depth);
+
+    double mode_cost = search_intra_trdepth(state, x_px, y_px, depth, tr_depth, pred_mode, MAX_INT, &pred_cu, lcu, NULL, -1);
+    mip_costs[mip_mode] += mode_cost;
+    mip_trafo[mip_mode] = pred_cu.tr_idx;
+
+    // MIP_TODO: check if ET is viable when MIP is used
+    // Early termination if no coefficients have to be coded
+    if (state->encoder_control->cfg.intra_rdo_et && !cbf_is_set_any(pred_cu.cbf, depth)) {
+      modes_to_check = mip_mode + 1;
+      break;
+    }
+  }
+
   for(int rdo_mode = 0; rdo_mode < modes_to_check; rdo_mode ++) {
     int rdo_bitcost = kvz_luma_mode_bits(state, modes[rdo_mode], intra_preds, multi_ref_idx);
 
@@ -762,6 +809,8 @@ static int8_t search_intra_rdo(encoder_state_t * const state,
     pred_cu.intra.mode = modes[rdo_mode];
     pred_cu.intra.mode_chroma = modes[rdo_mode];
     pred_cu.intra.multi_ref_idx = multi_ref_idx;
+    pred_cu.intra.mip_is_transposed = false;
+    pred_cu.intra.mip_flag = false;
     pred_cu.joint_cb_cr = 0;
     FILL(pred_cu.cbf, 0);
 
@@ -780,6 +829,9 @@ static int8_t search_intra_rdo(encoder_state_t * const state,
   }
 
   // Update order according to new costs
+  if (num_mip_modes) {
+    kvz_sort_modes_intra_luma(mip_modes, mip_trafo, mip_costs, num_mip_modes);
+  }
   kvz_sort_modes_intra_luma(modes, trafo, costs, modes_to_check);
 
   // The best transform split hierarchy is not saved anywhere, so to get the
@@ -921,7 +973,7 @@ int8_t kvz_search_intra_chroma_rdo(encoder_state_t * const state,
           x_px, y_px,
           depth,
           -1, chroma.mode, // skip luma
-          NULL, NULL, 0, lcu);
+          NULL, NULL, 0, false, false, lcu);
       }
       else {
 
@@ -954,7 +1006,7 @@ int8_t kvz_search_intra_chroma_rdo(encoder_state_t * const state,
           x_px, y_px,
           depth,
           -1, chroma.mode, // skip luma
-          NULL, cclm_params, 0, lcu);
+          NULL, cclm_params, 0, false, false, lcu);
       }
       chroma.cost = kvz_cu_rd_cost_chroma(state, lcu_px.x, lcu_px.y, depth, tr_cu, lcu);
 
@@ -1044,7 +1096,9 @@ void kvz_search_cu_intra(encoder_state_t * const state,
                          int8_t *mode_out, 
                          int8_t *trafo_out, 
                          double *cost_out,
-                         uint8_t *multi_ref_idx_out)
+                         uint8_t *multi_ref_idx_out,
+                         bool *mip_flag_out,
+                         bool * mip_transposed_out)
 {
   const vector2d_t lcu_px = { SUB_SCU(x_px), SUB_SCU(y_px) };
   const int8_t cu_width = LCU_WIDTH >> depth;
@@ -1081,6 +1135,38 @@ void kvz_search_cu_intra(encoder_state_t * const state,
   int8_t trafo[MAX_REF_LINE_IDX][67] = { 0 };
   double costs[MAX_REF_LINE_IDX][67];
 
+  bool enable_mip = state->encoder_control->cfg.mip;
+  int8_t mip_modes[32]; // Modes [0, 15] are non-transposed. Modes [16,31] are transposed.
+  int8_t mip_trafo[32];
+  double mip_costs[32];
+
+  if (enable_mip) {
+    for (int i = 0; i < 32; ++i) {
+      mip_modes[i] = i;
+      mip_costs[i] = MAX_INT;
+    }
+  }
+
+  // The maximum number of possible MIP modes depends on block size & shape
+  int width = LCU_WIDTH >> depth;
+  int height = width; // TODO: proper height for non-square blocks.
+  int tmp_modes;
+  // MIP_TODO: check for illegal block sizes.
+  if (width == 4 && height == 4) {
+    // Mip size_id = 0. Num modes = 32
+    tmp_modes = 32;
+  }
+  else if (width == 4 || height == 4 || (width == 8 && height == 8)) {
+    // Mip size_id = 1. Num modes = 16
+    tmp_modes = 16;
+  }
+  else {
+    // Mip size_id = 2. Num modes = 12
+    tmp_modes = 12;
+  }
+  
+  uint8_t num_mip_modes = enable_mip ? tmp_modes : 0;
+
   // Find best intra mode for 2Nx2N.
   kvz_pixel *ref_pixels = &lcu->ref.y[lcu_px.x + lcu_px.y * LCU_WIDTH];
 
@@ -1132,24 +1218,37 @@ void kvz_search_cu_intra(encoder_state_t * const state,
     }
     
     for(int8_t line = 0; line < lines; ++line) {
-      // For extra reference lines, only check predicted modes
+      // For extra reference lines, only check predicted modes & no MIP search.
       if (line != 0) {
         number_of_modes_to_search = 0;
+        num_mip_modes = 0;
       }
       int num_modes_to_check = MIN(number_of_modes[line], number_of_modes_to_search);
       kvz_sort_modes(modes[line], costs[line], number_of_modes[line]);
+      // TODO: if rough search is implemented for MIP, sort mip_modes here.
       number_of_modes[line] = search_intra_rdo(state,
                             x_px, y_px, depth,
                             ref_pixels, LCU_WIDTH,
                             candidate_modes,
                             num_modes_to_check,
-                            modes[line], trafo[line], costs[line], lcu, line);
+                            modes[line], trafo[line], costs[line],
+                            num_mip_modes,
+                            mip_modes, mip_trafo, mip_costs,
+                            lcu, line);
     }
   }
   
   uint8_t best_line = 0;
   double best_line_mode_cost = costs[0][0];
+  uint8_t best_mip_mode_idx = 0;
   uint8_t best_mode_indices[MAX_REF_LINE_IDX];
+
+  int8_t tmp_best_mode;
+  int8_t tmp_best_trafo;
+  double tmp_best_cost;
+  bool tmp_mip_flag = false;
+  bool tmp_mip_transp = false;
+
   for (int line = 0; line < lines; ++line) {
     best_mode_indices[line] = select_best_mode_index(modes[line], costs[line], number_of_modes[line]);
     if (best_line_mode_cost > costs[line][best_mode_indices[line]]) {
@@ -1158,8 +1257,25 @@ void kvz_search_cu_intra(encoder_state_t * const state,
     }
   }
 
-  *mode_out =  modes[best_line][best_mode_indices[best_line]];
-  *trafo_out = trafo[best_line][best_mode_indices[best_line]];
-  *cost_out =  costs[best_line][best_mode_indices[best_line]];
-  *multi_ref_idx_out = best_line;
+  tmp_best_mode = modes[best_line][best_mode_indices[best_line]];
+  tmp_best_trafo = trafo[best_line][best_mode_indices[best_line]];
+  tmp_best_cost = costs[best_line][best_mode_indices[best_line]];
+
+  if (num_mip_modes) {
+    best_mip_mode_idx = select_best_mode_index(mip_modes, mip_costs, num_mip_modes);
+    if (tmp_best_cost > mip_costs[best_mip_mode_idx]) {
+      tmp_best_mode = mip_modes[best_mip_mode_idx];
+      tmp_best_trafo = mip_trafo[best_mip_mode_idx];
+      tmp_best_cost = mip_costs[best_mip_mode_idx];
+      tmp_mip_flag = true;
+      tmp_mip_transp = (tmp_best_mode >= (num_mip_modes >> 1)) ? 1 : 0;
+    }
+  }
+
+  *mode_out =  tmp_best_mode;
+  *trafo_out = tmp_best_trafo;
+  *cost_out =  tmp_best_cost;
+  *mip_flag_out = tmp_mip_flag;
+  *mip_transposed_out = tmp_mip_transp;
+  *multi_ref_idx_out = tmp_mip_flag ? 0 : best_line;
 }
diff --git a/src/search_intra.h b/src/search_intra.h
index 4fc7210d..3ee84512 100644
--- a/src/search_intra.h
+++ b/src/search_intra.h
@@ -60,6 +60,8 @@ void kvz_search_cu_intra(encoder_state_t * const state,
                          int8_t *mode_out,
                          int8_t *trafo_out, 
                          double *cost_out,
-                         uint8_t *multi_ref_idx_out);
+                         uint8_t *multi_ref_idx_out,
+                         bool *mip_flag,
+                         bool *mip_transp);
 
 #endif // SEARCH_INTRA_H_

From 2daa8ad5376e45795de092a27637e176330a748e Mon Sep 17 00:00:00 2001
From: siivonek <kari.siivonen@tuni.fi>
Date: Fri, 21 Jan 2022 02:17:07 +0200
Subject: [PATCH 09/28] [mip] Implement cabac write.

---
 src/cabac.h              |   1 +
 src/encode_coding_tree.c | 255 +++++++++++++++++++++++----------------
 2 files changed, 149 insertions(+), 107 deletions(-)

diff --git a/src/cabac.h b/src/cabac.h
index 77d6251b..4d1c4d70 100644
--- a/src/cabac.h
+++ b/src/cabac.h
@@ -107,6 +107,7 @@ typedef struct
     cabac_ctx_t sig_coeff_group_model[4];
     cabac_ctx_t luma_planar_model[2];
     cabac_ctx_t multi_ref_line[2];
+    cabac_ctx_t mip_flag;
     cabac_ctx_t bdpcm_mode[4];
     cabac_ctx_t joint_cb_cr[3];
     cabac_ctx_t transform_skip_model_luma;
diff --git a/src/encode_coding_tree.c b/src/encode_coding_tree.c
index 97d6115f..6f96735e 100644
--- a/src/encode_coding_tree.c
+++ b/src/encode_coding_tree.c
@@ -854,6 +854,39 @@ static void encode_intra_coding_unit(encoder_state_t * const state,
   //isp_mode += ((height > TR_MAX_WIDTH) || !enough_samples) ? 2 : 0;
   bool allow_isp = enough_samples;
 
+  // Code MIP related bits
+  bool enable_mip = state->encoder_control->cfg.mip;
+  bool mip_flag = enable_mip ? cur_cu->intra.mip_flag : false;
+  bool mip_transpose = enable_mip ? cur_cu->intra.mip_is_transposed : false;
+  int8_t mip_mode = enable_mip ? cur_cu->intra.mode : 0;
+  uint8_t num_mip_modes;
+  
+  // Number of MIP modes for this block
+  if (width == 4 && height == 4) {
+    num_mip_modes = 16;
+  }
+  else if (width == 4 || height == 4 || (width == 8 && height == 8)) {
+    num_mip_modes = 8;
+  }
+  else {
+    num_mip_modes = 6;
+  }
+
+  if (mip_flag) {
+    assert(mip_mode >= 0 && mip_mode < 16 && "MIP mode must be between [0, 15]");
+  }
+
+  if (cur_cu->type == CU_INTRA && !cur_cu->bdpcmMode && enable_mip) {
+    // Write MIP flag
+    cabac->cur_ctx = &(cabac->ctx.mip_flag);
+    CABAC_BIN(cabac, mip_flag, "mip_flag");
+    if (mip_flag) {
+      // Write MIP transpose flag & mode
+      CABAC_BIN_EP(cabac, (cur_cu->intra.mip_is_transposed), "mip_transposed");
+      kvz_cabac_encode_trunc_bin(cabac, mip_mode, num_mip_modes);
+    }
+  }
+
   // Code MRL related bits
   bool enable_mrl = state->encoder_control->cfg.mrl;
   int multi_ref_idx = enable_mrl ? cur_cu->intra.multi_ref_idx : 0;
@@ -862,7 +895,7 @@ static void encode_intra_coding_unit(encoder_state_t * const state,
   if(multi_ref_idx) DBG_YUVIEW_VALUE(state->frame->poc, DBG_YUVIEW_MRL, x, y, width, width, multi_ref_idx);
 #endif
 
-  if (cur_cu->type == CU_INTRA && (y % LCU_WIDTH) != 0 && !cur_cu->bdpcmMode && enable_mrl) {
+  if (cur_cu->type == CU_INTRA && (y % LCU_WIDTH) != 0 && !cur_cu->bdpcmMode && enable_mrl && !mip_flag) {
     if (MAX_REF_LINE_IDX > 1) {
       cabac->cur_ctx = &(cabac->ctx.multi_ref_line[0]);
       CABAC_BIN(cabac, multi_ref_idx != 0, "multi_ref_line");
@@ -875,7 +908,7 @@ static void encode_intra_coding_unit(encoder_state_t * const state,
 
 
   // ToDo: update real usage, these if clauses as such don't make any sense
-  if (isp_mode != 0 && multi_ref_idx == 0) {
+  if (isp_mode != 0 && multi_ref_idx == 0 && !mip_flag) {
     if (isp_mode) {
       cabac->cur_ctx = &(cabac->ctx.intra_subpart_model[0]);
       CABAC_BIN(cabac, 0, "intra_subPartitions");
@@ -890,126 +923,134 @@ static void encode_intra_coding_unit(encoder_state_t * const state,
     }
   }
 
-  // PREDINFO CODING
-  // If intra prediction mode is found from the predictors,
-  // it can be signaled with two EP's. Otherwise we can send
-  // 5 EP bins with the full predmode
-  // ToDo: fix comments for VVC
   const int cu_width = LCU_WIDTH >> depth;
-
-  cabac->cur_ctx = &(cabac->ctx.intra_luma_mpm_flag_model);
-  for (int j = 0; j < num_pred_units; ++j) {
-    const int pu_x = PU_GET_X(cur_cu->part_size, cu_width, x, j);
-    const int pu_y = PU_GET_Y(cur_cu->part_size, cu_width, y, j);
-    const cu_info_t *cur_pu = kvz_cu_array_at_const(frame->cu_array, pu_x, pu_y);
-
-    const cu_info_t *left_pu = NULL;
-    const cu_info_t *above_pu = NULL;
-
-    if (pu_x > 0) {
-      assert(pu_x >> 2 > 0);
-      left_pu = kvz_cu_array_at_const(frame->cu_array, pu_x - 1, pu_y + cu_width - 1);
-    }
-    // Don't take the above PU across the LCU boundary.
-    if (pu_y % LCU_WIDTH > 0 && pu_y > 0) {
-      assert(pu_y >> 2 > 0);
-      above_pu = kvz_cu_array_at_const(frame->cu_array, pu_x + cu_width - 1, pu_y - 1);
-    }
-
-
-    kvz_intra_get_dir_luma_predictor(pu_x, pu_y,
-                                      intra_preds[j],
-                                      cur_pu,
-                                      left_pu, above_pu);
-
-
-    intra_pred_mode_actual[j] = cur_pu->intra.mode;
-
-    for (int i = 0; i < INTRA_MPM_COUNT; i++) {
-      if (intra_preds[j][i] == intra_pred_mode[j]) {
-        mpm_preds[j] = (int8_t)i;
-        break;
-      }
-    }
-    // Is the mode in the MPM array or not
-    flag[j] = (mpm_preds[j] == -1) ? 0 : 1;
-    if (!(cur_pu->intra.multi_ref_idx || (isp_mode))) {
-      CABAC_BIN(cabac, flag[j], "prev_intra_luma_pred_flag");
-    }
-  }
-
-  for (int j = 0; j < num_pred_units; ++j) {
-    // Signal index of the prediction mode in the prediction list, if it is there
-    if (flag[j]) {
-      
+  // If MIP is used, skip writing normal intra modes
+  if (!mip_flag) {
+    // PREDINFO CODING
+    // If intra prediction mode is found from the predictors,
+    // it can be signaled with two EP's. Otherwise we can send
+    // 5 EP bins with the full predmode
+    // ToDo: fix comments for VVC
+    
+    cabac->cur_ctx = &(cabac->ctx.intra_luma_mpm_flag_model);
+    for (int j = 0; j < num_pred_units; ++j) {
       const int pu_x = PU_GET_X(cur_cu->part_size, cu_width, x, j);
       const int pu_y = PU_GET_Y(cur_cu->part_size, cu_width, y, j);
-      const cu_info_t *cur_pu = kvz_cu_array_at_const(frame->cu_array, pu_x, pu_y);
-      cabac->cur_ctx = &(cabac->ctx.luma_planar_model[(isp_mode ? 0 : 1)]);
-      if (cur_pu->intra.multi_ref_idx == 0) {
-        CABAC_BIN(cabac, (mpm_preds[j] > 0 ? 1 : 0), "mpm_idx_luma_planar");
-      }
-      //CABAC_BIN_EP(cabac, (mpm_preds[j] > 0 ? 1 : 0), "mpm_idx");
-      if (mpm_preds[j] > 0) {
-        CABAC_BIN_EP(cabac, (mpm_preds[j] > 1 ? 1 : 0), "mpm_idx");
-      }
-      if (mpm_preds[j] > 1) {
-        CABAC_BIN_EP(cabac, (mpm_preds[j] > 2 ? 1 : 0), "mpm_idx");
-      }
-      if (mpm_preds[j] > 2) {
-        CABAC_BIN_EP(cabac, (mpm_preds[j] > 3 ? 1 : 0), "mpm_idx");
-      }
-      if (mpm_preds[j] > 3) {
-        CABAC_BIN_EP(cabac, (mpm_preds[j] > 4 ? 1 : 0), "mpm_idx");
-      }
-    } else {
-      // Signal the actual prediction mode.
-      int32_t tmp_pred = intra_pred_mode[j];
+      const cu_info_t* cur_pu = kvz_cu_array_at_const(frame->cu_array, pu_x, pu_y);
 
-      uint8_t intra_preds_temp[INTRA_MPM_COUNT+2];
-      memcpy(intra_preds_temp, intra_preds[j], sizeof(int8_t)*3);
-      memcpy(intra_preds_temp+4, &intra_preds[j][3], sizeof(int8_t)*3);
-      intra_preds_temp[3] = 255;
-      intra_preds_temp[7] = 255;
+      const cu_info_t* left_pu = NULL;
+      const cu_info_t* above_pu = NULL;
 
-      // Improvised merge sort
-      // Sort prediction list from lowest to highest.
-      if (intra_preds_temp[0] > intra_preds_temp[1]) SWAP(intra_preds_temp[0], intra_preds_temp[1], uint8_t);
-      if (intra_preds_temp[0] > intra_preds_temp[2]) SWAP(intra_preds_temp[0], intra_preds_temp[2], uint8_t);
-      if (intra_preds_temp[1] > intra_preds_temp[2]) SWAP(intra_preds_temp[1], intra_preds_temp[2], uint8_t);
+      if (pu_x > 0) {
+        assert(pu_x >> 2 > 0);
+        left_pu = kvz_cu_array_at_const(frame->cu_array, pu_x - 1, pu_y + cu_width - 1);
+      }
+      // Don't take the above PU across the LCU boundary.
+      if (pu_y % LCU_WIDTH > 0 && pu_y > 0) {
+        assert(pu_y >> 2 > 0);
+        above_pu = kvz_cu_array_at_const(frame->cu_array, pu_x + cu_width - 1, pu_y - 1);
+      }
 
-      if (intra_preds_temp[4] > intra_preds_temp[5]) SWAP(intra_preds_temp[4], intra_preds_temp[5], uint8_t);
-      if (intra_preds_temp[4] > intra_preds_temp[6]) SWAP(intra_preds_temp[4], intra_preds_temp[6], uint8_t);
-      if (intra_preds_temp[5] > intra_preds_temp[6]) SWAP(intra_preds_temp[5], intra_preds_temp[6], uint8_t);
 
-      // Merge two subarrays
-      int32_t array1 = 0;
-      int32_t array2 = 4;
-      for (int item = 0; item < INTRA_MPM_COUNT; item++) {
-        if (intra_preds_temp[array1] < intra_preds_temp[array2]) {
-          intra_preds[j][item] = intra_preds_temp[array1];
-          array1++;
-        } else {
-          intra_preds[j][item] = intra_preds_temp[array2];
-          array2++;
+      kvz_intra_get_dir_luma_predictor(pu_x, pu_y,
+        intra_preds[j],
+        cur_pu,
+        left_pu, above_pu);
+
+
+      intra_pred_mode_actual[j] = cur_pu->intra.mode;
+
+      for (int i = 0; i < INTRA_MPM_COUNT; i++) {
+        if (intra_preds[j][i] == intra_pred_mode[j]) {
+          mpm_preds[j] = (int8_t)i;
+          break;
         }
       }
+      // Is the mode in the MPM array or not
+      flag[j] = (mpm_preds[j] == -1) ? 0 : 1;
+      if (!(cur_pu->intra.multi_ref_idx || (isp_mode))) {
+        CABAC_BIN(cabac, flag[j], "prev_intra_luma_pred_flag");
+      }
+    }
 
-      // Reduce the index of the signaled prediction mode according to the
-      // prediction list, as it has been already signaled that it's not one
-      // of the prediction modes.
-      for (int i = INTRA_MPM_COUNT-1; i >= 0; i--) {
-        if (tmp_pred > intra_preds[j][i]) {
-          tmp_pred--;
+    for (int j = 0; j < num_pred_units; ++j) {
+      // TODO: this loop is unnecessary in VVC. Remove in future
+      assert(j == 0 && "In VVC this loop should be run only once.");
+
+      // Signal index of the prediction mode in the prediction list, if it is there
+      if (flag[j]) {
+
+        const int pu_x = PU_GET_X(cur_cu->part_size, cu_width, x, j);
+        const int pu_y = PU_GET_Y(cur_cu->part_size, cu_width, y, j);
+        const cu_info_t* cur_pu = kvz_cu_array_at_const(frame->cu_array, pu_x, pu_y);
+        cabac->cur_ctx = &(cabac->ctx.luma_planar_model[(isp_mode ? 0 : 1)]);
+        if (cur_pu->intra.multi_ref_idx == 0) {
+          CABAC_BIN(cabac, (mpm_preds[j] > 0 ? 1 : 0), "mpm_idx_luma_planar");
+        }
+        //CABAC_BIN_EP(cabac, (mpm_preds[j] > 0 ? 1 : 0), "mpm_idx");
+        if (mpm_preds[j] > 0) {
+          CABAC_BIN_EP(cabac, (mpm_preds[j] > 1 ? 1 : 0), "mpm_idx");
+        }
+        if (mpm_preds[j] > 1) {
+          CABAC_BIN_EP(cabac, (mpm_preds[j] > 2 ? 1 : 0), "mpm_idx");
+        }
+        if (mpm_preds[j] > 2) {
+          CABAC_BIN_EP(cabac, (mpm_preds[j] > 3 ? 1 : 0), "mpm_idx");
+        }
+        if (mpm_preds[j] > 3) {
+          CABAC_BIN_EP(cabac, (mpm_preds[j] > 4 ? 1 : 0), "mpm_idx");
         }
       }
-      
-      kvz_cabac_encode_trunc_bin(cabac, tmp_pred, 67 - INTRA_MPM_COUNT);
+      else {
+        // Signal the actual prediction mode.
+        int32_t tmp_pred = intra_pred_mode[j];
+
+        uint8_t intra_preds_temp[INTRA_MPM_COUNT + 2];
+        memcpy(intra_preds_temp, intra_preds[j], sizeof(int8_t) * 3);
+        memcpy(intra_preds_temp + 4, &intra_preds[j][3], sizeof(int8_t) * 3);
+        intra_preds_temp[3] = 255;
+        intra_preds_temp[7] = 255;
+
+        // Improvised merge sort
+        // Sort prediction list from lowest to highest.
+        if (intra_preds_temp[0] > intra_preds_temp[1]) SWAP(intra_preds_temp[0], intra_preds_temp[1], uint8_t);
+        if (intra_preds_temp[0] > intra_preds_temp[2]) SWAP(intra_preds_temp[0], intra_preds_temp[2], uint8_t);
+        if (intra_preds_temp[1] > intra_preds_temp[2]) SWAP(intra_preds_temp[1], intra_preds_temp[2], uint8_t);
+
+        if (intra_preds_temp[4] > intra_preds_temp[5]) SWAP(intra_preds_temp[4], intra_preds_temp[5], uint8_t);
+        if (intra_preds_temp[4] > intra_preds_temp[6]) SWAP(intra_preds_temp[4], intra_preds_temp[6], uint8_t);
+        if (intra_preds_temp[5] > intra_preds_temp[6]) SWAP(intra_preds_temp[5], intra_preds_temp[6], uint8_t);
+
+        // Merge two subarrays
+        int32_t array1 = 0;
+        int32_t array2 = 4;
+        for (int item = 0; item < INTRA_MPM_COUNT; item++) {
+          if (intra_preds_temp[array1] < intra_preds_temp[array2]) {
+            intra_preds[j][item] = intra_preds_temp[array1];
+            array1++;
+          }
+          else {
+            intra_preds[j][item] = intra_preds_temp[array2];
+            array2++;
+          }
+        }
+
+        // Reduce the index of the signaled prediction mode according to the
+        // prediction list, as it has been already signaled that it's not one
+        // of the prediction modes.
+        for (int i = INTRA_MPM_COUNT - 1; i >= 0; i--) {
+          if (tmp_pred > intra_preds[j][i]) {
+            tmp_pred--;
+          }
+        }
+
+        kvz_cabac_encode_trunc_bin(cabac, tmp_pred, 67 - INTRA_MPM_COUNT);
+      }
     }
   }
 
   // Code chroma prediction mode.
-  if (state->encoder_control->chroma_format != KVZ_CSP_400 && depth != 4) {
+  if (state->encoder_control->chroma_format != KVZ_CSP_400 && depth != 4 && !mip_flag) {
     encode_chroma_intra_cu(cabac, cur_cu, x, y, frame, cu_width, state->encoder_control->cfg.cclm);
   }
 
@@ -1017,7 +1058,7 @@ static void encode_intra_coding_unit(encoder_state_t * const state,
 
   encode_mts_idx(state, cabac, cur_cu);
 
-  if (state->encoder_control->chroma_format != KVZ_CSP_400 && depth == 4 && x % 8 && y % 8) {
+  if (state->encoder_control->chroma_format != KVZ_CSP_400 && depth == 4 && x % 8 && y % 8 && !mip_flag) {
     encode_chroma_intra_cu(cabac, cur_cu, x, y, frame, cu_width, state->encoder_control->cfg.cclm);
     encode_transform_coeff(state, x, y, depth, 0, 0, 0, 1, coeff);
   }

From d5e2bbd824fcb5a3dcda38f479af0e855389c6d2 Mon Sep 17 00:00:00 2001
From: siivonek <kari.siivonen@tuni.fi>
Date: Fri, 21 Jan 2022 02:20:37 +0200
Subject: [PATCH 10/28] [mip] Fix things according to comments. Fix asserts.
 Remove MIP from chroma recon and search calls. WIP mip mode cost calculation.

---
 src/cu.h           |  4 ++--
 src/intra.c        | 19 ++++++++-----------
 src/search.c       |  8 ++++++--
 src/search_intra.c | 16 +++++++++++++---
 src/search_intra.h |  1 +
 5 files changed, 30 insertions(+), 18 deletions(-)

diff --git a/src/cu.h b/src/cu.h
index 779fe1fd..56ece914 100644
--- a/src/cu.h
+++ b/src/cu.h
@@ -169,8 +169,8 @@ typedef struct
       int8_t mode;
       int8_t mode_chroma;
       uint8_t multi_ref_idx;
-      bool mip_flag;
-      bool mip_is_transposed;
+      uint8_t mip_flag;
+      uint8_t mip_is_transposed;
     } intra;
     struct {
       mv_t    mv[2][2];  // \brief Motion vectors for L0 and L1
diff --git a/src/intra.c b/src/intra.c
index d1723faa..bac1a019 100644
--- a/src/intra.c
+++ b/src/intra.c
@@ -614,7 +614,7 @@ void kvz_mip_reduced_pred(kvz_pixel* const output,
   const int input_size = 2 * red_bdry_size;
 
   // Use local buffer for transposed result
-  kvz_pixel* out_buf_transposed = MALLOC(kvz_pixel, red_pred_size * red_pred_size); // MIP_TODO: get rid of MALLOC & FREE
+  kvz_pixel out_buf_transposed[LCU_WIDTH * LCU_WIDTH];
   kvz_pixel* const out_ptr = transpose ? out_buf_transposed : output;
 
   int sum = 0;
@@ -657,8 +657,6 @@ void kvz_mip_reduced_pred(kvz_pixel* const output,
       }
     }
   }
-
-  FREE_POINTER(out_buf_transposed);
 }
 
 
@@ -774,8 +772,8 @@ void kvz_mip_predict(encoder_state_t const* const state, kvz_intra_references* c
   uint16_t ups_ver_factor = height / red_pred_size;
 
   // Upsampling factors must be powers of two
-  assert((ups_hor_factor < 1) || ((ups_hor_factor & (ups_hor_factor - 1)) != 0) && "Horizontal upsampling factor must be power of two.");
-  assert((ups_ver_factor < 1) || ((ups_ver_factor & (ups_ver_factor - 1)) != 0) && "Vertical upsampling factor must be power of two.");
+  assert(!(ups_hor_factor < 1) || ((ups_hor_factor & (ups_hor_factor - 1)) != 0) && "Horizontal upsampling factor must be power of two.");
+  assert(!(ups_ver_factor < 1) || ((ups_ver_factor & (ups_ver_factor - 1)) != 0) && "Vertical upsampling factor must be power of two.");
 
   // Initialize prediction parameters END
 
@@ -840,7 +838,8 @@ void kvz_mip_predict(encoder_state_t const* const state, kvz_intra_references* c
       assert(false && "Invalid MIP size id.");
   }
 
-  kvz_pixel* red_pred_buffer = MALLOC(kvz_pixel, red_pred_size * red_pred_size); // MIP_TODO: get rid of MALLOC and FREE
+  // Max possible size is red_pred_size * red_pred_size, red_pred_size can be either 4 or 8
+  kvz_pixel red_pred_buffer[8*8];
   kvz_pixel* const reduced_pred = need_upsampling ? red_pred_buffer : result;
 
   const kvz_pixel* const reduced_bdry = transpose ? red_bdry_trans : red_bdry;
@@ -868,8 +867,6 @@ void kvz_mip_predict(encoder_state_t const* const state, kvz_intra_references* c
         1, ups_ver_factor);
     }
   }
-
-  FREE_POINTER(red_pred_buffer);
   // *** BLOCK PREDICT *** END
 }
 
@@ -1409,7 +1406,7 @@ static void intra_recon_tb_leaf(
   const bool filter_boundary = color == COLOR_Y && !(cfg->lossless && cfg->implicit_rdpcm);
   if(intra_mode < 68) {
     if (use_mip) {
-      assert(intra_mode < 16 && "MIP mode must be between [0, 16]");
+      assert(intra_mode >= 0 && intra_mode < 16 && "MIP mode must be between [0, 15]");
       kvz_mip_predict(state, &refs, width, height, color, pred, intra_mode, mip_transp);
     }
     else {
@@ -1530,8 +1527,8 @@ void kvz_intra_recon_cu(
       intra_recon_tb_leaf(state, x, y, depth, mode_luma, cclm_params, lcu, COLOR_Y, multi_ref_index, use_mip, mip_transposed);
     }
     if (has_chroma) {
-      intra_recon_tb_leaf(state, x, y, depth, mode_chroma, cclm_params, lcu, COLOR_U, 0, use_mip, mip_transposed);
-      intra_recon_tb_leaf(state, x, y, depth, mode_chroma, cclm_params, lcu, COLOR_V, 0, use_mip, mip_transposed);
+      intra_recon_tb_leaf(state, x, y, depth, mode_chroma, cclm_params, lcu, COLOR_U, 0, false, false);
+      intra_recon_tb_leaf(state, x, y, depth, mode_chroma, cclm_params, lcu, COLOR_V, 0, false, false);
     }
 
     kvz_quantize_lcu_residual(state, has_luma, has_chroma, x, y, depth, cur_cu, lcu, false);
diff --git a/src/search.c b/src/search.c
index beedbf0f..2c91a5d8 100644
--- a/src/search.c
+++ b/src/search.c
@@ -739,6 +739,10 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,
         cur_cu->intra.multi_ref_idx = multi_ref_index;
         cur_cu->intra.mip_flag = mip_flag;
         cur_cu->intra.mip_is_transposed = mip_transposed;
+        // If a MIP mode is selected, set chroma mode to planar and skip further chroma search
+        if (mip_flag) {
+          cur_cu->intra.mode_chroma = 0;
+        }
 
         //If the CU is not split from 64x64 block, the MTS is disabled for that CU.
         cur_cu->tr_idx = (depth > 0) ? intra_trafo : 0;
@@ -770,7 +774,7 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,
         // into account, so there is less of a chanse of luma mode being
         // really bad for chroma.
         cclm_parameters_t cclm_params[2];
-        if (ctrl->cfg.rdo >= 3) {
+        if (ctrl->cfg.rdo >= 3 && !cur_cu->intra.mip_flag) {
           cur_cu->intra.mode_chroma = kvz_search_cu_intra_chroma(state, x, y, depth, lcu, cclm_params);
           lcu_fill_cu_info(lcu, x_local, y_local, cu_width, cu_width, cur_cu);
         }
@@ -779,7 +783,7 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,
                            x & ~7, y & ~7, // TODO: as does this
                            depth,
                            -1, cur_cu->intra.mode_chroma, // skip luma
-                           NULL, cclm_params, 0, cur_cu->intra.mip_flag, cur_cu->intra.mip_is_transposed,
+                           NULL, cclm_params, 0, false, false,
                            lcu);
       }
     } else if (cur_cu->type == CU_INTER) {
diff --git a/src/search_intra.c b/src/search_intra.c
index c5503e24..48e70914 100644
--- a/src/search_intra.c
+++ b/src/search_intra.c
@@ -364,7 +364,7 @@ static double search_intra_trdepth(encoder_state_t * const state,
         depth,
         -1, chroma_mode,
         pred_cu, cclm_params, 0, 
-        pred_cu->intra.mip_flag, pred_cu->intra.mip_is_transposed,
+        false, false,
         lcu);
       best_rd_cost += kvz_cu_rd_cost_chroma(state, lcu_px.x, lcu_px.y, depth, pred_cu, lcu);
     }
@@ -758,8 +758,8 @@ static int8_t search_intra_rdo(encoder_state_t * const state,
   // MIP_TODO: implement this inside the standard intra for loop. Code duplication is bad.
   // MIP search
   const int transp_off = num_mip_modes >> 1;
-  for (int mip_mode = 0; mip_mode < num_mip_modes; ++mip_mode) {
-    int rdo_bitcost = 0; // MIP_TODO: MIP needs own bit cost calculation
+  for (uint8_t mip_mode = 0; mip_mode < num_mip_modes; ++mip_mode) {
+    int rdo_bitcost = kvz_mip_mode_bits(state, mip_mode, num_mip_modes);
 
     mip_costs[mip_mode] = rdo_bitcost * (int)(state->lambda + 0.5); // MIP_TODO: check if this is also correct in the case when MIP is used.
 
@@ -853,6 +853,16 @@ static int8_t search_intra_rdo(encoder_state_t * const state,
 }
 
 
+double kvz_mip_mode_bits(const encoder_state_t *state, int mip_mode, int num_mip_modes)
+{
+  double mode_bits = 0.0;
+
+  // MIP_TODO: calculate bit costs of writing the following: mip_flag, mip_transpose_flag & mip_mode
+
+  return mode_bits;
+}
+
+
 double kvz_luma_mode_bits(const encoder_state_t *state, int8_t luma_mode, const int8_t *intra_preds, const uint8_t multi_ref_idx)
 {
   double mode_bits = 0.0;
diff --git a/src/search_intra.h b/src/search_intra.h
index 3ee84512..7d25e09d 100644
--- a/src/search_intra.h
+++ b/src/search_intra.h
@@ -43,6 +43,7 @@
 #include "global.h" // IWYU pragma: keep
 #include "intra.h"
 
+double kvz_mip_mode_bits(const encoder_state_t *state, int mip_mode, int num_mip_modes);
 
 double kvz_luma_mode_bits(const encoder_state_t *state, 
                           int8_t luma_mode, const int8_t *intra_preds, uint8_t multi_ref_idx);

From 0cf89e9516751cf41f53e2cdd418ff7829a472b9 Mon Sep 17 00:00:00 2001
From: siivonek <kari.siivonen@tuni.fi>
Date: Fri, 21 Jan 2022 13:32:41 +0200
Subject: [PATCH 11/28] [mip] Fix CI errors.

---
 src/encode_coding_tree.c |  2 +-
 src/intra.c              | 12 ++++++------
 src/mip_data.h           |  6 +++---
 3 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/src/encode_coding_tree.c b/src/encode_coding_tree.c
index 6f96735e..5edca0f8 100644
--- a/src/encode_coding_tree.c
+++ b/src/encode_coding_tree.c
@@ -882,7 +882,7 @@ static void encode_intra_coding_unit(encoder_state_t * const state,
     CABAC_BIN(cabac, mip_flag, "mip_flag");
     if (mip_flag) {
       // Write MIP transpose flag & mode
-      CABAC_BIN_EP(cabac, (cur_cu->intra.mip_is_transposed), "mip_transposed");
+      CABAC_BIN_EP(cabac, mip_transpose, "mip_transposed");
       kvz_cabac_encode_trunc_bin(cabac, mip_mode, num_mip_modes);
     }
   }
diff --git a/src/intra.c b/src/intra.c
index bac1a019..1350aa69 100644
--- a/src/intra.c
+++ b/src/intra.c
@@ -772,8 +772,8 @@ void kvz_mip_predict(encoder_state_t const* const state, kvz_intra_references* c
   uint16_t ups_ver_factor = height / red_pred_size;
 
   // Upsampling factors must be powers of two
-  assert(!(ups_hor_factor < 1) || ((ups_hor_factor & (ups_hor_factor - 1)) != 0) && "Horizontal upsampling factor must be power of two.");
-  assert(!(ups_ver_factor < 1) || ((ups_ver_factor & (ups_ver_factor - 1)) != 0) && "Vertical upsampling factor must be power of two.");
+  assert(!((ups_hor_factor < 1) || ((ups_hor_factor & (ups_hor_factor - 1))) != 0) && "Horizontal upsampling factor must be power of two.");
+  assert(!((ups_ver_factor < 1) || ((ups_ver_factor & (ups_ver_factor - 1))) != 0) && "Vertical upsampling factor must be power of two.");
 
   // Initialize prediction parameters END
 
@@ -823,16 +823,16 @@ void kvz_mip_predict(encoder_state_t const* const state, kvz_intra_references* c
   const bool need_upsampling = (ups_hor_factor > 1) || (ups_ver_factor > 1);
   const bool transpose = mip_transp;
 
-  uint8_t* matrix;
+  const uint8_t* matrix;
   switch (size_id) {
     case 0: 
-      matrix = &mip_matrix_4x4[mode_idx][0][0];
+      matrix = &kvz_mip_matrix_4x4[mode_idx][0][0];
       break;
     case 1: 
-      matrix = &mip_matrix_8x8[mode_idx][0][0];
+      matrix = &kvz_mip_matrix_8x8[mode_idx][0][0];
       break;
     case 2: 
-      matrix = &mip_matrix_16x16[mode_idx][0][0];
+      matrix = &kvz_mip_matrix_16x16[mode_idx][0][0];
       break;
     default:
       assert(false && "Invalid MIP size id.");
diff --git a/src/mip_data.h b/src/mip_data.h
index b789994d..2ace73ab 100644
--- a/src/mip_data.h
+++ b/src/mip_data.h
@@ -44,7 +44,7 @@
 #define MIP_OFFSET_MATRIX 32
 
 // NOTE: these matrices need to be aligned if used with avx2
-const uint8_t mip_matrix_4x4[16][16][4] =
+const uint8_t kvz_mip_matrix_4x4[16][16][4] =
 {
   {
     {   32,   30,   90,   28},
@@ -336,7 +336,7 @@ const uint8_t mip_matrix_4x4[16][16][4] =
   }
 };
 
-const uint8_t mip_matrix_8x8[8][16][8] =
+const uint8_t kvz_mip_matrix_8x8[8][16][8] =
 {
   {
     {   30,   63,   46,   37,   25,   33,   33,   34},
@@ -484,7 +484,7 @@ const uint8_t mip_matrix_8x8[8][16][8] =
   }
 };
 
-const uint8_t mip_matrix_16x16[6][64][7] =
+const uint8_t kvz_mip_matrix_16x16[6][64][7] =
 {
   {
     {   42,   37,   33,   27,   44,   33,   35},

From 6b3395797862784d693988a03c5786cbb45f050a Mon Sep 17 00:00:00 2001
From: siivonek <kari.siivonen@tuni.fi>
Date: Mon, 24 Jan 2022 13:16:28 +0200
Subject: [PATCH 12/28] [mip] Implement MIP bit cost calculation.

---
 src/cu.h           |   4 +-
 src/search.c       |   3 +-
 src/search_intra.c | 106 +++++++++++++++++++++++++++------------------
 src/search_intra.h |   4 +-
 4 files changed, 70 insertions(+), 47 deletions(-)

diff --git a/src/cu.h b/src/cu.h
index 56ece914..4be18926 100644
--- a/src/cu.h
+++ b/src/cu.h
@@ -169,8 +169,8 @@ typedef struct
       int8_t mode;
       int8_t mode_chroma;
       uint8_t multi_ref_idx;
-      uint8_t mip_flag;
-      uint8_t mip_is_transposed;
+      int8_t mip_flag;
+      int8_t mip_is_transposed;
     } intra;
     struct {
       mv_t    mv[2][2];  // \brief Motion vectors for L0 and L1
diff --git a/src/search.c b/src/search.c
index 2c91a5d8..2cc14eb3 100644
--- a/src/search.c
+++ b/src/search.c
@@ -502,7 +502,8 @@ static double calc_mode_bits(const encoder_state_t *state,
     kvz_intra_get_dir_luma_predictor(x, y, candidate_modes, cur_cu, left_cu, above_cu);
   }
 
-  double mode_bits = kvz_luma_mode_bits(state, cur_cu->intra.mode, candidate_modes, cur_cu->intra.multi_ref_idx);
+  // MIP_TODO: calculation of MIP mode cost if this CU has MIP enabled.
+  double mode_bits = kvz_luma_mode_bits(state, cur_cu->intra.mode, candidate_modes, cur_cu->intra.multi_ref_idx, 0);
 
   if (((depth == 4 && x % 8 && y % 8) || (depth != 4)) && state->encoder_control->chroma_format != KVZ_CSP_400) {
     mode_bits += kvz_chroma_mode_bits(state, cur_cu->intra.mode_chroma, cur_cu->intra.mode);
diff --git a/src/search_intra.c b/src/search_intra.c
index 48e70914..46a34e2f 100644
--- a/src/search_intra.c
+++ b/src/search_intra.c
@@ -679,7 +679,7 @@ static int8_t search_intra_rough(encoder_state_t * const state,
   // affecting the halving search.
   int lambda_cost = (int)(state->lambda_sqrt + 0.5);
   for (int mode_i = 0; mode_i < modes_selected; ++mode_i) {
-    costs[mode_i] += lambda_cost * kvz_luma_mode_bits(state, modes[mode_i], intra_preds, 0);
+    costs[mode_i] += lambda_cost * kvz_luma_mode_bits(state, modes[mode_i], intra_preds, 0, 0);
   }
 
   #undef PARALLEL_BLKS
@@ -759,7 +759,7 @@ static int8_t search_intra_rdo(encoder_state_t * const state,
   // MIP search
   const int transp_off = num_mip_modes >> 1;
   for (uint8_t mip_mode = 0; mip_mode < num_mip_modes; ++mip_mode) {
-    int rdo_bitcost = kvz_mip_mode_bits(state, mip_mode, num_mip_modes);
+    int rdo_bitcost = kvz_luma_mode_bits(state, mip_modes[mip_mode], intra_preds, 0, num_mip_modes);
 
     mip_costs[mip_mode] = rdo_bitcost * (int)(state->lambda + 0.5); // MIP_TODO: check if this is also correct in the case when MIP is used.
 
@@ -797,7 +797,7 @@ static int8_t search_intra_rdo(encoder_state_t * const state,
   }
 
   for(int rdo_mode = 0; rdo_mode < modes_to_check; rdo_mode ++) {
-    int rdo_bitcost = kvz_luma_mode_bits(state, modes[rdo_mode], intra_preds, multi_ref_idx);
+    int rdo_bitcost = kvz_luma_mode_bits(state, modes[rdo_mode], intra_preds, multi_ref_idx, 0);
 
     costs[rdo_mode] = rdo_bitcost * (int)(state->lambda + 0.5);
 
@@ -853,56 +853,80 @@ static int8_t search_intra_rdo(encoder_state_t * const state,
 }
 
 
-double kvz_mip_mode_bits(const encoder_state_t *state, int mip_mode, int num_mip_modes)
+double kvz_luma_mode_bits(const encoder_state_t *state, int8_t luma_mode, const int8_t *intra_preds, const uint8_t multi_ref_idx, const uint8_t num_mip_modes)
 {
   double mode_bits = 0.0;
 
-  // MIP_TODO: calculate bit costs of writing the following: mip_flag, mip_transpose_flag & mip_mode
+  bool enable_mip = state->encoder_control->cfg.mip ? (num_mip_modes > 0 ? true : false) : false;
 
-  return mode_bits;
-}
+  if (enable_mip) {
+    // Make a copy of state->cabac for bit cost estimation.
+    cabac_data_t state_cabac_copy;
+    cabac_data_t* cabac;
+    memcpy(&state_cabac_copy, &state->cabac, sizeof(cabac_data_t));
+    // Clear data and set mode to count only
+    state_cabac_copy.only_count = 1;
+    state_cabac_copy.num_buffered_bytes = 0;
+    state_cabac_copy.bits_left = 23;
 
+    cabac = &state_cabac_copy;
 
-double kvz_luma_mode_bits(const encoder_state_t *state, int8_t luma_mode, const int8_t *intra_preds, const uint8_t multi_ref_idx)
-{
-  double mode_bits = 0.0;
-
-  int8_t mode_in_preds = -1;
-  for (int i = 0; i < INTRA_MPM_COUNT; ++i) {
-    if (luma_mode == intra_preds[i]) {
-      mode_in_preds = i;
-      break;
+    // Do cabac writes as normal
+    const int transp_off = num_mip_modes >> 1;
+    bool mip_flag = enable_mip;
+    const bool is_transposed = luma_mode >= transp_off ? true : false;
+    int8_t mip_mode = is_transposed ? luma_mode - transp_off : luma_mode;
+    // Write MIP flag
+    cabac->cur_ctx = &(cabac->ctx.mip_flag);
+    CABAC_BIN(cabac, mip_flag, "mip_flag");
+    if (mip_flag) {
+      // Write MIP transpose flag & mode
+      CABAC_BIN_EP(cabac, is_transposed, "mip_transposed");
+      kvz_cabac_encode_trunc_bin(cabac, mip_mode, transp_off);
     }
+
+    // Writes done. Get bit cost out of cabac
+    mode_bits += (23 - state_cabac_copy.bits_left) + (state_cabac_copy.num_buffered_bytes << 3); // MIP_TODO: check what this bit shifting means.
   }
-
-  bool enable_mrl = state->encoder_control->cfg.mrl;
-  uint8_t multi_ref_index = enable_mrl ? multi_ref_idx : 0;
-
-  const cabac_ctx_t* ctx = &(state->cabac.ctx.intra_luma_mpm_flag_model);
-
-  if (multi_ref_index == 0) {
-    mode_bits += CTX_ENTROPY_FBITS(ctx, mode_in_preds != -1);
-  }
-
-  // Add MRL bits.
-  if (enable_mrl && MAX_REF_LINE_IDX > 1) {
-    ctx = &(state->cabac.ctx.multi_ref_line[0]);
-    mode_bits += CTX_ENTROPY_FBITS(ctx, multi_ref_index != 0);
-
-    if (multi_ref_index != 0 && MAX_REF_LINE_IDX > 2) {
-      ctx = &(state->cabac.ctx.multi_ref_line[1]);
-      mode_bits += CTX_ENTROPY_FBITS(ctx, multi_ref_index != 1);
+  else {
+    int8_t mode_in_preds = -1;
+    for (int i = 0; i < INTRA_MPM_COUNT; ++i) {
+      if (luma_mode == intra_preds[i]) {
+        mode_in_preds = i;
+        break;
+      }
     }
-  }
 
-  if (mode_in_preds != -1 || multi_ref_index != 0) {
-    ctx = &(state->cabac.ctx.luma_planar_model[0]);
+    bool enable_mrl = state->encoder_control->cfg.mrl;
+    uint8_t multi_ref_index = enable_mrl ? multi_ref_idx : 0;
+
+    const cabac_ctx_t* ctx = &(state->cabac.ctx.intra_luma_mpm_flag_model);
+
     if (multi_ref_index == 0) {
-      mode_bits += CTX_ENTROPY_FBITS(ctx, mode_in_preds>0);
+      mode_bits += CTX_ENTROPY_FBITS(ctx, mode_in_preds != -1);
+    }
+
+    // Add MRL bits.
+    if (enable_mrl && MAX_REF_LINE_IDX > 1) {
+      ctx = &(state->cabac.ctx.multi_ref_line[0]);
+      mode_bits += CTX_ENTROPY_FBITS(ctx, multi_ref_index != 0);
+
+      if (multi_ref_index != 0 && MAX_REF_LINE_IDX > 2) {
+        ctx = &(state->cabac.ctx.multi_ref_line[1]);
+        mode_bits += CTX_ENTROPY_FBITS(ctx, multi_ref_index != 1);
+      }
+    }
+
+    if (mode_in_preds != -1 || multi_ref_index != 0) {
+      ctx = &(state->cabac.ctx.luma_planar_model[0]);
+      if (multi_ref_index == 0) {
+        mode_bits += CTX_ENTROPY_FBITS(ctx, mode_in_preds > 0);
+      }
+      mode_bits += MIN(4.0, mode_in_preds);
+    }
+    else {
+      mode_bits += 6.0;
     }
-    mode_bits += MIN(4.0,mode_in_preds);
-  } else {
-    mode_bits += 6.0;
   }
 
   return mode_bits;
diff --git a/src/search_intra.h b/src/search_intra.h
index 7d25e09d..06ef77ec 100644
--- a/src/search_intra.h
+++ b/src/search_intra.h
@@ -43,10 +43,8 @@
 #include "global.h" // IWYU pragma: keep
 #include "intra.h"
 
-double kvz_mip_mode_bits(const encoder_state_t *state, int mip_mode, int num_mip_modes);
-
 double kvz_luma_mode_bits(const encoder_state_t *state, 
-                          int8_t luma_mode, const int8_t *intra_preds, uint8_t multi_ref_idx);
+                          int8_t luma_mode, const int8_t *intra_preds, uint8_t multi_ref_idx, const uint8_t num_mip_modes);
                        
 double kvz_chroma_mode_bits(const encoder_state_t *state,
                         int8_t chroma_mode, int8_t luma_mode);

From a4366dbcc59313a0106a640a300b24eddb2ab0cd Mon Sep 17 00:00:00 2001
From: siivonek <kari.siivonen@tuni.fi>
Date: Tue, 25 Jan 2022 00:44:31 +0200
Subject: [PATCH 13/28] [mip] Fix error which caused the asan CI test to fail.
 It was caused by an uninitialized intra CU mip value.

---
 src/search.c       | 11 +++++--
 src/search_intra.c | 71 ++++++++++++++++++++++++++++++----------------
 2 files changed, 54 insertions(+), 28 deletions(-)

diff --git a/src/search.c b/src/search.c
index 2cc14eb3..faa8b900 100644
--- a/src/search.c
+++ b/src/search.c
@@ -161,6 +161,8 @@ static void lcu_fill_cu_info(lcu_t *lcu, int x_local, int y_local, int width, in
         to->intra.mode        = cu->intra.mode;
         to->intra.mode_chroma = cu->intra.mode_chroma;
         to->intra.multi_ref_idx = cu->intra.multi_ref_idx;
+        to->intra.mip_flag = cu->intra.mip_flag;
+        to->intra.mip_is_transposed = cu->intra.mip_is_transposed;
       } else {
         to->skipped   = cu->skipped;
         to->merged    = cu->merged;
@@ -728,8 +730,8 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,
       int8_t intra_trafo;
       double intra_cost;
       uint8_t multi_ref_index = 0;
-      bool mip_flag;
-      bool mip_transposed;
+      bool mip_flag = false;
+      bool mip_transposed = false;
       kvz_search_cu_intra(state, x, y, depth, lcu,
                           &intra_mode, &intra_trafo, &intra_cost, &multi_ref_index, &mip_flag, &mip_transposed);
       if (intra_cost < cost) {
@@ -754,7 +756,10 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,
     // mode search of adjacent CUs.
     if (cur_cu->type == CU_INTRA) {
       assert(cur_cu->part_size == SIZE_2Nx2N || cur_cu->part_size == SIZE_NxN);
-      cur_cu->intra.mode_chroma = cur_cu->intra.mode;
+      // Chroma mode must be planar if mip_flag is set.
+      if (!cur_cu->intra.mip_flag) {
+        cur_cu->intra.mode_chroma = cur_cu->intra.mode;
+      }
       lcu_fill_cu_info(lcu, x_local, y_local, cu_width, cu_width, cur_cu);
       kvz_intra_recon_cu(state,
                          x, y,
diff --git a/src/search_intra.c b/src/search_intra.c
index 46a34e2f..dc0d8d6e 100644
--- a/src/search_intra.c
+++ b/src/search_intra.c
@@ -829,10 +829,15 @@ static int8_t search_intra_rdo(encoder_state_t * const state,
   }
 
   // Update order according to new costs
+  kvz_sort_modes_intra_luma(modes, trafo, costs, modes_to_check);
+  bool use_mip = false;
   if (num_mip_modes) {
     kvz_sort_modes_intra_luma(mip_modes, mip_trafo, mip_costs, num_mip_modes);
+    if (costs[0] > mip_costs[0]) {
+      use_mip = true;
+    }
   }
-  kvz_sort_modes_intra_luma(modes, trafo, costs, modes_to_check);
+  
 
   // The best transform split hierarchy is not saved anywhere, so to get the
   // transform split hierarchy the search has to be performed again with the
@@ -842,9 +847,25 @@ static int8_t search_intra_rdo(encoder_state_t * const state,
     pred_cu.depth = depth;
     pred_cu.type = CU_INTRA;
     pred_cu.part_size = ((depth == MAX_PU_DEPTH) ? SIZE_NxN : SIZE_2Nx2N);
-    pred_cu.intra.mode = modes[0];
-    pred_cu.intra.mode_chroma = modes[0];
-    pred_cu.intra.multi_ref_idx = multi_ref_idx;
+    if (use_mip) {
+      pred_cu.intra.mode = mip_modes[0];
+      pred_cu.intra.mode_chroma = 0;
+      pred_cu.intra.multi_ref_idx = 0;
+      int transp_off = num_mip_modes >> 1;
+      bool is_transposed = (mip_modes[0] >= transp_off ? true : false);
+      int8_t pred_mode = (is_transposed ? mip_modes[0] - transp_off : mip_modes[0]);
+      pred_cu.intra.mode = pred_mode;
+      pred_cu.intra.mode_chroma = 0;
+      pred_cu.intra.mip_flag = true;
+      pred_cu.intra.mip_is_transposed = is_transposed;
+    }
+    else {
+      pred_cu.intra.mode = modes[0];
+      pred_cu.intra.mode_chroma = modes[0];
+      pred_cu.intra.multi_ref_idx = multi_ref_idx;
+      pred_cu.intra.mip_flag = false;
+      pred_cu.intra.mip_is_transposed = false;
+    }
     FILL(pred_cu.cbf, 0);
     search_intra_trdepth(state, x_px, y_px, depth, tr_depth, modes[0], MAX_INT, &pred_cu, lcu, NULL, trafo[0]);
   }
@@ -1170,37 +1191,37 @@ void kvz_search_cu_intra(encoder_state_t * const state,
   double costs[MAX_REF_LINE_IDX][67];
 
   bool enable_mip = state->encoder_control->cfg.mip;
-  int8_t mip_modes[32]; // Modes [0, 15] are non-transposed. Modes [16,31] are transposed.
+  // The maximum number of mip modes is 32. Max modes can be less depending on block size.
+  // Half of the possible modes are transposed, which is indicated by a separate transpose flag
+  int8_t mip_modes[32]; 
   int8_t mip_trafo[32];
   double mip_costs[32];
 
+  // The maximum number of possible MIP modes depend on block size & shape
+  int width = LCU_WIDTH >> depth;
+  int height = width; // TODO: proper height for non-square blocks.
+  int num_mip_modes = 0;
+
   if (enable_mip) {
     for (int i = 0; i < 32; ++i) {
       mip_modes[i] = i;
       mip_costs[i] = MAX_INT;
     }
+    // MIP_TODO: check for illegal block sizes.
+    if (width == 4 && height == 4) {
+      // Mip size_id = 0. Num modes = 32
+      num_mip_modes = 32;
+    }
+    else if (width == 4 || height == 4 || (width == 8 && height == 8)) {
+      // Mip size_id = 0. Num modes = 16
+      num_mip_modes = 16;
+    }
+    else {
+      // Mip size_id = 0. Num modes = 12
+      num_mip_modes = 12;
+    }
   }
 
-  // The maximum number of possible MIP modes depend on block size & shape
-  int width = LCU_WIDTH >> depth;
-  int height = width; // TODO: proper height for non-square blocks.
-  int tmp_modes;
-  // MIP_TODO: check for illegal block sizes.
-  if (width == 4 && height == 4) {
-    // Mip size_id = 0. Num modes = 32
-    tmp_modes = 32;
-  }
-  else if (width == 4 || height == 4 || (width == 8 && height == 8)) {
-    // Mip size_id = 0. Num modes = 16
-    tmp_modes = 16;
-  }
-  else {
-    // Mip size_id = 0. Num modes = 12
-    tmp_modes = 12;
-  }
-  
-  uint8_t num_mip_modes = enable_mip ? tmp_modes : 0;
-
   // Find best intra mode for 2Nx2N.
   kvz_pixel *ref_pixels = &lcu->ref.y[lcu_px.x + lcu_px.y * LCU_WIDTH];
 

From 55e4091030b6eba0b3d1130ff58ce3297dc56f8a Mon Sep 17 00:00:00 2001
From: siivonek <kari.siivonen@tuni.fi>
Date: Wed, 26 Jan 2022 13:33:51 +0200
Subject: [PATCH 14/28] [mip] Fix MIP cabac write.

---
 src/cabac.h                   |  2 +-
 src/context.c                 | 11 +++++++++++
 src/encode_coding_tree.c      | 34 ++++++++++++++++++++++++++++++----
 src/encoder_state-bitstream.c |  6 +++++-
 4 files changed, 47 insertions(+), 6 deletions(-)

diff --git a/src/cabac.h b/src/cabac.h
index 4d1c4d70..8489333c 100644
--- a/src/cabac.h
+++ b/src/cabac.h
@@ -107,7 +107,7 @@ typedef struct
     cabac_ctx_t sig_coeff_group_model[4];
     cabac_ctx_t luma_planar_model[2];
     cabac_ctx_t multi_ref_line[2];
-    cabac_ctx_t mip_flag;
+    cabac_ctx_t mip_flag[4];
     cabac_ctx_t bdpcm_mode[4];
     cabac_ctx_t joint_cb_cr[3];
     cabac_ctx_t transform_skip_model_luma;
diff --git a/src/context.c b/src/context.c
index b7b1142e..3e834744 100644
--- a/src/context.c
+++ b/src/context.c
@@ -93,6 +93,13 @@ static const uint8_t MULTI_REF_LINE_MODE[4][2] = {
   {   5,   8, },
 };
 
+static const uint8_t MIP_FLAG[4][4] = {
+  {  56,  57,  50,  26, },
+  {  41,  57,  58,  26, },
+  {  33,  49,  50,  25, },
+  {   9,  10,   9,   6, },
+};
+
 static const uint8_t INIT_INTRA_LUMA_MPM_FLAG[4] = {
   44, 36, 45, 6
 };
@@ -483,6 +490,10 @@ void kvz_init_contexts(encoder_state_t *state, int8_t QP, int8_t slice)
   kvz_ctx_init(&cabac->ctx.multi_ref_line[0], QP, MULTI_REF_LINE_MODE[slice][0], MULTI_REF_LINE_MODE[3][0]);
   kvz_ctx_init(&cabac->ctx.multi_ref_line[1], QP, MULTI_REF_LINE_MODE[slice][1], MULTI_REF_LINE_MODE[3][1]);
 
+  for (i = 0; i < 4; i++) {
+    kvz_ctx_init(&cabac->ctx.mip_flag[i], QP, MIP_FLAG[slice][i], MIP_FLAG[3][i]);
+  }
+
   kvz_ctx_init(&cabac->ctx.chroma_pred_model, QP, INIT_CHROMA_PRED_MODE[slice], INIT_CHROMA_PRED_MODE[3]);
 
   kvz_ctx_init(&cabac->ctx.cclm_flag, QP, INIT_CCLM_FLAG[slice], INIT_CCLM_FLAG[3]);
diff --git a/src/encode_coding_tree.c b/src/encode_coding_tree.c
index 5edca0f8..3b4a0bea 100644
--- a/src/encode_coding_tree.c
+++ b/src/encode_coding_tree.c
@@ -856,7 +856,7 @@ static void encode_intra_coding_unit(encoder_state_t * const state,
 
   // Code MIP related bits
   bool enable_mip = state->encoder_control->cfg.mip;
-  bool mip_flag = enable_mip ? cur_cu->intra.mip_flag : false;
+  int8_t mip_flag = enable_mip ? cur_cu->intra.mip_flag : false;
   bool mip_transpose = enable_mip ? cur_cu->intra.mip_is_transposed : false;
   int8_t mip_mode = enable_mip ? cur_cu->intra.mode : 0;
   uint8_t num_mip_modes;
@@ -877,8 +877,34 @@ static void encode_intra_coding_unit(encoder_state_t * const state,
   }
 
   if (cur_cu->type == CU_INTRA && !cur_cu->bdpcmMode && enable_mip) {
+    // Derive mip flag context id
+    uint8_t ctx_id = 0;
+    const int cu_width = LCU_WIDTH >> depth;
+    const int cu_height = cu_width; // TODO: height for non-square blocks
+    const int pu_x = PU_GET_X(cur_cu->part_size, cu_width, x, 0);
+    const int pu_y = PU_GET_Y(cur_cu->part_size, cu_width, y, 0);
+    const cu_info_t* left_pu = NULL;
+    const cu_info_t* above_pu = NULL;
+
+    if (pu_x > 0) {
+      assert(pu_x >> 2 > 0);
+      left_pu = kvz_cu_array_at_const(frame->cu_array, pu_x - 1, pu_y + cu_width - 1);
+    }
+    if (left_pu != NULL) {
+      ctx_id = left_pu->intra.mip_flag ? 1 : 0;
+    } 
+    // Don't take the above PU across the LCU boundary.
+    if (pu_y % LCU_WIDTH > 0 && pu_y > 0) {
+      assert(pu_y >> 2 > 0);
+      above_pu = kvz_cu_array_at_const(frame->cu_array, pu_x + cu_width - 1, pu_y - 1);
+    }
+    if (above_pu != NULL) {
+      ctx_id += above_pu->intra.mip_flag ? 1 : 0;
+    }
+    ctx_id = (cu_width > 2 * cu_height || cu_height > 2 * cu_width) ? 3 : ctx_id;
+
     // Write MIP flag
-    cabac->cur_ctx = &(cabac->ctx.mip_flag);
+    cabac->cur_ctx = &(cabac->ctx.mip_flag[ctx_id]);
     CABAC_BIN(cabac, mip_flag, "mip_flag");
     if (mip_flag) {
       // Write MIP transpose flag & mode
@@ -1050,7 +1076,7 @@ static void encode_intra_coding_unit(encoder_state_t * const state,
   }
 
   // Code chroma prediction mode.
-  if (state->encoder_control->chroma_format != KVZ_CSP_400 && depth != 4 && !mip_flag) {
+  if (state->encoder_control->chroma_format != KVZ_CSP_400 && depth != 4) {
     encode_chroma_intra_cu(cabac, cur_cu, x, y, frame, cu_width, state->encoder_control->cfg.cclm);
   }
 
@@ -1058,7 +1084,7 @@ static void encode_intra_coding_unit(encoder_state_t * const state,
 
   encode_mts_idx(state, cabac, cur_cu);
 
-  if (state->encoder_control->chroma_format != KVZ_CSP_400 && depth == 4 && x % 8 && y % 8 && !mip_flag) {
+  if (state->encoder_control->chroma_format != KVZ_CSP_400 && depth == 4 && x % 8 && y % 8) {
     encode_chroma_intra_cu(cabac, cur_cu, x, y, frame, cu_width, state->encoder_control->cfg.cclm);
     encode_transform_coeff(state, x, y, depth, 0, 0, 0, 1, coeff);
   }
diff --git a/src/encoder_state-bitstream.c b/src/encoder_state-bitstream.c
index b4645f92..f1d8b61d 100644
--- a/src/encoder_state-bitstream.c
+++ b/src/encoder_state-bitstream.c
@@ -668,7 +668,11 @@ static void encoder_state_write_bitstream_seq_parameter_set(bitstream_t* stream,
     WRITE_U(stream, 0, 1, "sps_mrl_enabled_flag");
   }
   
-  WRITE_U(stream, 0, 1, "sps_mip_enabled_flag");
+  if (state->encoder_control->cfg.mip) {
+    WRITE_U(stream, 1, 1, "sps_mip_enabled_flag");
+  } else {
+    WRITE_U(stream, 0, 1, "sps_mip_enabled_flag");
+  }
   // if(!no_cclm_constraint_flag)
   if(encoder->chroma_format != KVZ_CSP_400) {
     WRITE_U(stream, encoder->cfg.cclm, 1, "sps_cclm_enabled_flag");

From b8a8bce55a1b9f54cf5afbffd833d2300b118212 Mon Sep 17 00:00:00 2001
From: siivonek <kari.siivonen@tuni.fi>
Date: Wed, 26 Jan 2022 13:34:39 +0200
Subject: [PATCH 15/28] [mip] Fix MIP bit cost calculation.

---
 src/search.c       |  2 +-
 src/search_intra.c | 41 ++++++++++++++++++++++++++++++++++++-----
 src/search_intra.h |  2 +-
 3 files changed, 38 insertions(+), 7 deletions(-)

diff --git a/src/search.c b/src/search.c
index faa8b900..7db7d5b9 100644
--- a/src/search.c
+++ b/src/search.c
@@ -505,7 +505,7 @@ static double calc_mode_bits(const encoder_state_t *state,
   }
 
   // MIP_TODO: calculation of MIP mode cost if this CU has MIP enabled.
-  double mode_bits = kvz_luma_mode_bits(state, cur_cu->intra.mode, candidate_modes, cur_cu->intra.multi_ref_idx, 0);
+  double mode_bits = kvz_luma_mode_bits(state, cur_cu->intra.mode, candidate_modes, cur_cu->intra.multi_ref_idx, 0, 0);
 
   if (((depth == 4 && x % 8 && y % 8) || (depth != 4)) && state->encoder_control->chroma_format != KVZ_CSP_400) {
     mode_bits += kvz_chroma_mode_bits(state, cur_cu->intra.mode_chroma, cur_cu->intra.mode);
diff --git a/src/search_intra.c b/src/search_intra.c
index dc0d8d6e..6800bfef 100644
--- a/src/search_intra.c
+++ b/src/search_intra.c
@@ -679,7 +679,7 @@ static int8_t search_intra_rough(encoder_state_t * const state,
   // affecting the halving search.
   int lambda_cost = (int)(state->lambda_sqrt + 0.5);
   for (int mode_i = 0; mode_i < modes_selected; ++mode_i) {
-    costs[mode_i] += lambda_cost * kvz_luma_mode_bits(state, modes[mode_i], intra_preds, 0, 0);
+    costs[mode_i] += lambda_cost * kvz_luma_mode_bits(state, modes[mode_i], intra_preds, 0, 0, 0);
   }
 
   #undef PARALLEL_BLKS
@@ -756,10 +756,40 @@ static int8_t search_intra_rdo(encoder_state_t * const state,
   }
 
   // MIP_TODO: implement this inside the standard intra for loop. Code duplication is bad.
+  // MIP_TODO: deriving mip flag context id could be done in it's own function since the exact same code is used in encode_coding_tree.c
   // MIP search
   const int transp_off = num_mip_modes >> 1;
   for (uint8_t mip_mode = 0; mip_mode < num_mip_modes; ++mip_mode) {
-    int rdo_bitcost = kvz_luma_mode_bits(state, mip_modes[mip_mode], intra_preds, 0, num_mip_modes);
+    // Derive mip flag context id
+    uint8_t ctx_id = 0;
+    const videoframe_t* const frame = state->tile->frame;
+    const vector2d_t lcu_px = { SUB_SCU(x_px), SUB_SCU(y_px) };
+    cu_info_t* cur_cu;
+    cur_cu = LCU_GET_CU_AT_PX(lcu, lcu_px.x, lcu_px.y);
+    const int cu_width = width;
+    const int cu_height = cu_width; // TODO: height for non-square blocks
+    const int pu_x = PU_GET_X(cur_cu->part_size, cu_width, x_px, 0);
+    const int pu_y = PU_GET_Y(cur_cu->part_size, cu_width, y_px, 0);
+    const cu_info_t* left_pu = NULL;
+    const cu_info_t* above_pu = NULL;
+
+    if (pu_x > 0) {
+      assert(pu_x >> 2 > 0);
+      left_pu = kvz_cu_array_at_const(frame->cu_array, pu_x - 1, pu_y + cu_width - 1);
+    }
+    if (left_pu != NULL) {
+      ctx_id = left_pu->intra.mip_flag ? 1 : 0;
+    }
+    // Don't take the above PU across the LCU boundary.
+    if (pu_y % LCU_WIDTH > 0 && pu_y > 0) {
+      assert(pu_y >> 2 > 0);
+      above_pu = kvz_cu_array_at_const(frame->cu_array, pu_x + cu_width - 1, pu_y - 1);
+    }
+    if (above_pu != NULL) {
+      ctx_id += above_pu->intra.mip_flag ? 1 : 0;
+    }
+    ctx_id = (cu_width > 2 * cu_height || cu_height > 2 * cu_width) ? 3 : ctx_id;
+    int rdo_bitcost = kvz_luma_mode_bits(state, mip_modes[mip_mode], intra_preds, 0, num_mip_modes, ctx_id);
 
     mip_costs[mip_mode] = rdo_bitcost * (int)(state->lambda + 0.5); // MIP_TODO: check if this is also correct in the case when MIP is used.
 
@@ -797,7 +827,7 @@ static int8_t search_intra_rdo(encoder_state_t * const state,
   }
 
   for(int rdo_mode = 0; rdo_mode < modes_to_check; rdo_mode ++) {
-    int rdo_bitcost = kvz_luma_mode_bits(state, modes[rdo_mode], intra_preds, multi_ref_idx, 0);
+    int rdo_bitcost = kvz_luma_mode_bits(state, modes[rdo_mode], intra_preds, multi_ref_idx, 0, 0);
 
     costs[rdo_mode] = rdo_bitcost * (int)(state->lambda + 0.5);
 
@@ -874,7 +904,7 @@ static int8_t search_intra_rdo(encoder_state_t * const state,
 }
 
 
-double kvz_luma_mode_bits(const encoder_state_t *state, int8_t luma_mode, const int8_t *intra_preds, const uint8_t multi_ref_idx, const uint8_t num_mip_modes)
+double kvz_luma_mode_bits(const encoder_state_t *state, int8_t luma_mode, const int8_t *intra_preds, const uint8_t multi_ref_idx, const uint8_t num_mip_modes, int mip_flag_ctx_id)
 {
   double mode_bits = 0.0;
 
@@ -897,8 +927,9 @@ double kvz_luma_mode_bits(const encoder_state_t *state, int8_t luma_mode, const
     bool mip_flag = enable_mip;
     const bool is_transposed = luma_mode >= transp_off ? true : false;
     int8_t mip_mode = is_transposed ? luma_mode - transp_off : luma_mode;
+
     // Write MIP flag
-    cabac->cur_ctx = &(cabac->ctx.mip_flag);
+    cabac->cur_ctx = &(cabac->ctx.mip_flag[mip_flag_ctx_id]);
     CABAC_BIN(cabac, mip_flag, "mip_flag");
     if (mip_flag) {
       // Write MIP transpose flag & mode
diff --git a/src/search_intra.h b/src/search_intra.h
index 06ef77ec..659695b3 100644
--- a/src/search_intra.h
+++ b/src/search_intra.h
@@ -44,7 +44,7 @@
 #include "intra.h"
 
 double kvz_luma_mode_bits(const encoder_state_t *state, 
-                          int8_t luma_mode, const int8_t *intra_preds, uint8_t multi_ref_idx, const uint8_t num_mip_modes);
+                          int8_t luma_mode, const int8_t *intra_preds, uint8_t multi_ref_idx, const uint8_t num_mip_modes, int mip_flag_ctx_id);
                        
 double kvz_chroma_mode_bits(const encoder_state_t *state,
                         int8_t chroma_mode, int8_t luma_mode);

From 8aea4f67f7e977abb8ff947a03867c7ce7aab5df Mon Sep 17 00:00:00 2001
From: siivonek <kari.siivonen@tuni.fi>
Date: Mon, 31 Jan 2022 13:05:56 +0200
Subject: [PATCH 16/28] [mip] Fix reference indexing error.

---
 src/intra.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/intra.c b/src/intra.c
index 1350aa69..fa7f27d6 100644
--- a/src/intra.c
+++ b/src/intra.c
@@ -777,8 +777,8 @@ void kvz_mip_predict(encoder_state_t const* const state, kvz_intra_references* c
 
   // Initialize prediction parameters END
 
-  kvz_pixel* ref_samples_top = refs->ref.top; // NOTE: in VTM code these are indexed as x + 1 & y + 1 during init
-  kvz_pixel* ref_samples_left = refs->ref.left;
+  kvz_pixel* ref_samples_top = &refs->ref.top[1]; // NOTE: in VTM code these are indexed as x + 1 & y + 1 during init
+  kvz_pixel* ref_samples_left = &refs->ref.left[1];
 
   // Compute reduced boundary with Haar-downsampling
   const int input_size = 2 * red_bdry_size;

From d2c24c9a0c38802c8c81a2ee8c4a817cccac5e0b Mon Sep 17 00:00:00 2001
From: siivonek <kari.siivonen@tuni.fi>
Date: Mon, 31 Jan 2022 13:07:02 +0200
Subject: [PATCH 17/28] [mip] Fix error in deriving MIP flag context id.

---
 src/encode_coding_tree.c | 17 +++++------------
 1 file changed, 5 insertions(+), 12 deletions(-)

diff --git a/src/encode_coding_tree.c b/src/encode_coding_tree.c
index 3b4a0bea..c398fa97 100644
--- a/src/encode_coding_tree.c
+++ b/src/encode_coding_tree.c
@@ -882,24 +882,17 @@ static void encode_intra_coding_unit(encoder_state_t * const state,
     const int cu_width = LCU_WIDTH >> depth;
     const int cu_height = cu_width; // TODO: height for non-square blocks
     const int pu_x = PU_GET_X(cur_cu->part_size, cu_width, x, 0);
-    const int pu_y = PU_GET_Y(cur_cu->part_size, cu_width, y, 0);
-    const cu_info_t* left_pu = NULL;
-    const cu_info_t* above_pu = NULL;
+    const int pu_y = PU_GET_Y(cur_cu->part_size, cu_height, y, 0);
 
     if (pu_x > 0) {
       assert(pu_x >> 2 > 0);
-      left_pu = kvz_cu_array_at_const(frame->cu_array, pu_x - 1, pu_y + cu_width - 1);
+      // Get mip flag from left PU
+      ctx_id = kvz_cu_array_at_const(frame->cu_array, pu_x - 1, pu_y)->intra.mip_flag ? 1 : 0;
     }
-    if (left_pu != NULL) {
-      ctx_id = left_pu->intra.mip_flag ? 1 : 0;
-    } 
-    // Don't take the above PU across the LCU boundary.
     if (pu_y % LCU_WIDTH > 0 && pu_y > 0) {
       assert(pu_y >> 2 > 0);
-      above_pu = kvz_cu_array_at_const(frame->cu_array, pu_x + cu_width - 1, pu_y - 1);
-    }
-    if (above_pu != NULL) {
-      ctx_id += above_pu->intra.mip_flag ? 1 : 0;
+      // Get mip flag from above PU
+      ctx_id += kvz_cu_array_at_const(frame->cu_array, pu_x, pu_y - 1)->intra.mip_flag ? 1 : 0;
     }
     ctx_id = (cu_width > 2 * cu_height || cu_height > 2 * cu_width) ? 3 : ctx_id;
 

From df5cbbe82f85acfef18936fdc52d6fce179e84dc Mon Sep 17 00:00:00 2001
From: siivonek <kari.siivonen@tuni.fi>
Date: Tue, 1 Feb 2022 21:09:36 +0200
Subject: [PATCH 18/28] [mip] Fix issue with invalid MIP modes written into
 CABAC. Fix MIP mode cost estimation. Implement function to derive MIP flag
 context id. Fix some asserts. Replace floor log2 implementation with
 existing kvz math function.

---
 src/encode_coding_tree.c | 19 +---------
 src/intra.c              | 80 +++++++++++++++-------------------------
 src/intra.h              |  2 +
 src/search_intra.c       | 47 +++++++----------------
 4 files changed, 47 insertions(+), 101 deletions(-)

diff --git a/src/encode_coding_tree.c b/src/encode_coding_tree.c
index c398fa97..77b43f5f 100644
--- a/src/encode_coding_tree.c
+++ b/src/encode_coding_tree.c
@@ -873,28 +873,13 @@ static void encode_intra_coding_unit(encoder_state_t * const state,
   }
 
   if (mip_flag) {
-    assert(mip_mode >= 0 && mip_mode < 16 && "MIP mode must be between [0, 15]");
+    assert(mip_mode >= 0 && mip_mode < num_mip_modes && "Invalid MIP mode.");
   }
 
   if (cur_cu->type == CU_INTRA && !cur_cu->bdpcmMode && enable_mip) {
-    // Derive mip flag context id
-    uint8_t ctx_id = 0;
     const int cu_width = LCU_WIDTH >> depth;
     const int cu_height = cu_width; // TODO: height for non-square blocks
-    const int pu_x = PU_GET_X(cur_cu->part_size, cu_width, x, 0);
-    const int pu_y = PU_GET_Y(cur_cu->part_size, cu_height, y, 0);
-
-    if (pu_x > 0) {
-      assert(pu_x >> 2 > 0);
-      // Get mip flag from left PU
-      ctx_id = kvz_cu_array_at_const(frame->cu_array, pu_x - 1, pu_y)->intra.mip_flag ? 1 : 0;
-    }
-    if (pu_y % LCU_WIDTH > 0 && pu_y > 0) {
-      assert(pu_y >> 2 > 0);
-      // Get mip flag from above PU
-      ctx_id += kvz_cu_array_at_const(frame->cu_array, pu_x, pu_y - 1)->intra.mip_flag ? 1 : 0;
-    }
-    ctx_id = (cu_width > 2 * cu_height || cu_height > 2 * cu_width) ? 3 : ctx_id;
+    uint8_t ctx_id = kvz_get_mip_flag_context(x, y, cu_width, cu_height, NULL, frame->cu_array);
 
     // Write MIP flag
     cabac->cur_ctx = &(cabac->ctx.mip_flag[ctx_id]);
diff --git a/src/intra.c b/src/intra.c
index fa7f27d6..2d3d0939 100644
--- a/src/intra.c
+++ b/src/intra.c
@@ -546,37 +546,41 @@ void kvz_predict_cclm(
 }
 
 
+int kvz_get_mip_flag_context(int x, int y, int width, int height, lcu_t* const lcu, cu_array_t* const cu_a) {
+  assert(!(lcu && cu_a));
+  int context = 0;
+  
+  if (lcu) {
+    int x_local = SUB_SCU(x);
+    int y_local = SUB_SCU(y);
+    if (x) {
+      context += LCU_GET_CU_AT_PX(lcu, x_local - 1, y_local)->intra.mip_flag;
+    }
+    if (y) {
+      context += LCU_GET_CU_AT_PX(lcu, x_local, y_local - 1)->intra.mip_flag;
+    }
+    context = (width > 2 * height || height > 2 * width) ? 3 : context;
+  }
+  else {
+    if (x > 0) {
+      context += kvz_cu_array_at_const(cu_a, x - 1, y)->intra.mip_flag;
+    }
+    if (y > 0) {
+      context += kvz_cu_array_at_const(cu_a, x, y - 1)->intra.mip_flag;
+    }
+    context = (width > 2 * height || height > 2 * width) ? 3 : context;
+  }
+  return context;
+}
+
+
 void kvz_mip_boundary_downsampling_1D(kvz_pixel* reduced_dst, const kvz_pixel* const ref_src, int src_len, int dst_len)
 {
   if (dst_len < src_len)
   {
     // Create reduced boundary by downsampling
     uint16_t down_smp_factor = src_len / dst_len;
-
-    // Calculate floor log2. MIP_TODO: find a better / faster solution
-    int tmp = 0;
-    if (down_smp_factor & 0xffff0000) {
-      down_smp_factor >>= 16;
-      tmp += 16;
-    }
-    if (down_smp_factor & 0xff00) {
-      down_smp_factor >>= 8;
-      tmp += 8;
-    }
-    if (down_smp_factor & 0xf0) {
-      down_smp_factor >>= 4;
-      tmp += 4;
-    }
-    if (down_smp_factor & 0xc) {
-      down_smp_factor >>= 2;
-      tmp += 2;
-    }
-    if (down_smp_factor & 0x2) {
-      down_smp_factor >>= 1;
-      tmp += 1;
-    }
-
-    const int log2_factor = tmp;
+    const int log2_factor = kvz_math_floor_log2(down_smp_factor);
     const int rounding_offset = (1 << (log2_factor - 1));
 
     uint16_t src_idx = 0;
@@ -667,31 +671,7 @@ void kvz_mip_pred_upsampling_1D(kvz_pixel* const dst, const kvz_pixel* const src
                                 const uint16_t boundary_step,
                                 const uint16_t ups_factor)
 {
-  // Calculate floor log2. MIP_TODO: find a better / faster solution
-  uint16_t upsample_factor = ups_factor;
-  int tmp = 0;
-  if (upsample_factor & 0xffff0000) {
-    upsample_factor >>= 16;
-    tmp += 16;
-  }
-  if (upsample_factor & 0xff00) {
-    upsample_factor >>= 8;
-    tmp += 8;
-  }
-  if (upsample_factor & 0xf0) {
-    upsample_factor >>= 4;
-    tmp += 4;
-  }
-  if (upsample_factor & 0xc) {
-    upsample_factor >>= 2;
-    tmp += 2;
-  }
-  if (upsample_factor & 0x2) {
-    upsample_factor >>= 1;
-    tmp += 1;
-  }
-
-  const int log2_factor = tmp;
+  const int log2_factor = kvz_math_floor_log2(ups_factor);
   assert(ups_factor >= 2 && "Upsampling factor must be at least 2.");
   const int rounding_offset = 1 << (log2_factor - 1);
 
diff --git a/src/intra.h b/src/intra.h
index 436e20bf..7bd27e1f 100644
--- a/src/intra.h
+++ b/src/intra.h
@@ -150,6 +150,8 @@ void kvz_predict_cclm(
   cclm_parameters_t* cclm_params
 );
 
+int kvz_get_mip_flag_context(int x, int y, int width, int height, lcu_t* const lcu, cu_array_t* const cu_a);
+
 void kvz_mip_predict(
   encoder_state_t const * const state,
   kvz_intra_references * refs,
diff --git a/src/search_intra.c b/src/search_intra.c
index 6800bfef..efeda0e5 100644
--- a/src/search_intra.c
+++ b/src/search_intra.c
@@ -726,11 +726,11 @@ static int8_t search_intra_rdo(encoder_state_t * const state,
 {
   const int tr_depth = CLIP(1, MAX_PU_DEPTH, depth + state->encoder_control->cfg.tr_depth_intra);
   const int width = LCU_WIDTH >> depth;
+  const int height = width; // TODO: proper height for non-square blocks
 
   kvz_pixel orig_block[LCU_WIDTH * LCU_WIDTH + 1];
 
-  // TODO: height for non-square blocks
-  kvz_pixels_blit(orig, orig_block, width, width, origstride, width);
+  kvz_pixels_blit(orig, orig_block, width, height, origstride, width);
 
   // Check that the predicted modes are in the RDO mode list
   if (modes_to_check < 67) {
@@ -756,46 +756,19 @@ static int8_t search_intra_rdo(encoder_state_t * const state,
   }
 
   // MIP_TODO: implement this inside the standard intra for loop. Code duplication is bad.
-  // MIP_TODO: deriving mip flag context id could be done in it's own function since the exact same code is used in encode_coding_tree.c
   // MIP search
   const int transp_off = num_mip_modes >> 1;
   for (uint8_t mip_mode = 0; mip_mode < num_mip_modes; ++mip_mode) {
     // Derive mip flag context id
-    uint8_t ctx_id = 0;
-    const videoframe_t* const frame = state->tile->frame;
-    const vector2d_t lcu_px = { SUB_SCU(x_px), SUB_SCU(y_px) };
-    cu_info_t* cur_cu;
-    cur_cu = LCU_GET_CU_AT_PX(lcu, lcu_px.x, lcu_px.y);
-    const int cu_width = width;
-    const int cu_height = cu_width; // TODO: height for non-square blocks
-    const int pu_x = PU_GET_X(cur_cu->part_size, cu_width, x_px, 0);
-    const int pu_y = PU_GET_Y(cur_cu->part_size, cu_width, y_px, 0);
-    const cu_info_t* left_pu = NULL;
-    const cu_info_t* above_pu = NULL;
-
-    if (pu_x > 0) {
-      assert(pu_x >> 2 > 0);
-      left_pu = kvz_cu_array_at_const(frame->cu_array, pu_x - 1, pu_y + cu_width - 1);
-    }
-    if (left_pu != NULL) {
-      ctx_id = left_pu->intra.mip_flag ? 1 : 0;
-    }
-    // Don't take the above PU across the LCU boundary.
-    if (pu_y % LCU_WIDTH > 0 && pu_y > 0) {
-      assert(pu_y >> 2 > 0);
-      above_pu = kvz_cu_array_at_const(frame->cu_array, pu_x + cu_width - 1, pu_y - 1);
-    }
-    if (above_pu != NULL) {
-      ctx_id += above_pu->intra.mip_flag ? 1 : 0;
-    }
-    ctx_id = (cu_width > 2 * cu_height || cu_height > 2 * cu_width) ? 3 : ctx_id;
+    uint8_t ctx_id = kvz_get_mip_flag_context(x_px, y_px, width, height, lcu, NULL);
     int rdo_bitcost = kvz_luma_mode_bits(state, mip_modes[mip_mode], intra_preds, 0, num_mip_modes, ctx_id);
 
     mip_costs[mip_mode] = rdo_bitcost * (int)(state->lambda + 0.5); // MIP_TODO: check if this is also correct in the case when MIP is used.
 
     const bool is_transposed = (mip_modes[mip_mode] >= transp_off ? true : false);
     // There can be 32 MIP modes, but only mode numbers [0, 15] are ever written to bitstream.
-    // Modes [16, 31] are indicated with the separate transpose flag.
+    // Half of the modes [16, 31] are indicated with the separate transpose flag.
+    // Number of possible modes is less for larger blocks.
     int8_t pred_mode = (is_transposed ? mip_modes[mip_mode] - transp_off : mip_modes[mip_mode]);
 
     // Perform transform split search and save mode RD cost for the best one.
@@ -1244,11 +1217,11 @@ void kvz_search_cu_intra(encoder_state_t * const state,
       num_mip_modes = 32;
     }
     else if (width == 4 || height == 4 || (width == 8 && height == 8)) {
-      // Mip size_id = 0. Num modes = 16
+      // Mip size_id = 1. Num modes = 16
       num_mip_modes = 16;
     }
     else {
-      // Mip size_id = 0. Num modes = 12
+      // Mip size_id = 2. Num modes = 12
       num_mip_modes = 12;
     }
   }
@@ -1358,6 +1331,12 @@ void kvz_search_cu_intra(encoder_state_t * const state,
     }
   }
 
+  if (tmp_mip_flag) {
+    // Transform best mode index to proper form.
+    // Max mode index is half of max number of modes - 1 (i. e. for size id 2, max mode id is 5)
+    tmp_best_mode = (tmp_mip_transp ? tmp_best_mode - (num_mip_modes >> 1) : tmp_best_mode);
+  }
+
   *mode_out =  tmp_best_mode;
   *trafo_out = tmp_best_trafo;
   *cost_out =  tmp_best_cost;

From fcde90fbe0f1afe95736dd2515eb62b347e16e21 Mon Sep 17 00:00:00 2001
From: siivonek <kari.siivonen@tuni.fi>
Date: Fri, 4 Feb 2022 14:18:48 +0200
Subject: [PATCH 19/28] [mip] Change kvz_pixel to int inside MIP implementation
 since some temp values could be negative. Add define for intra reference line
 length. Fix bug where wrong intra mode was passed to recon after search.

---
 src/intra.c        | 68 ++++++++++++++++++++++++++++------------------
 src/intra.h        |  7 +++--
 src/search_intra.c |  4 +--
 3 files changed, 48 insertions(+), 31 deletions(-)

diff --git a/src/intra.c b/src/intra.c
index 2d3d0939..6ae57361 100644
--- a/src/intra.c
+++ b/src/intra.c
@@ -574,7 +574,7 @@ int kvz_get_mip_flag_context(int x, int y, int width, int height, lcu_t* const l
 }
 
 
-void kvz_mip_boundary_downsampling_1D(kvz_pixel* reduced_dst, const kvz_pixel* const ref_src, int src_len, int dst_len)
+void kvz_mip_boundary_downsampling_1D(int* reduced_dst, const int* const ref_src, int src_len, int dst_len)
 {
   if (dst_len < src_len)
   {
@@ -605,8 +605,8 @@ void kvz_mip_boundary_downsampling_1D(kvz_pixel* reduced_dst, const kvz_pixel* c
 }
 
 
-void kvz_mip_reduced_pred(kvz_pixel* const output,
-                          const kvz_pixel* const input,
+void kvz_mip_reduced_pred(int* const output,
+                          const int* const input,
                           const uint8_t* matrix,
                           const bool transpose,
                           const int red_bdry_size,
@@ -618,8 +618,8 @@ void kvz_mip_reduced_pred(kvz_pixel* const output,
   const int input_size = 2 * red_bdry_size;
 
   // Use local buffer for transposed result
-  kvz_pixel out_buf_transposed[LCU_WIDTH * LCU_WIDTH];
-  kvz_pixel* const out_ptr = transpose ? out_buf_transposed : output;
+  int out_buf_transposed[LCU_WIDTH * LCU_WIDTH];
+  int* const out_ptr = transpose ? out_buf_transposed : output;
 
   int sum = 0;
   for (int i = 0; i < input_size; i++) { 
@@ -664,7 +664,7 @@ void kvz_mip_reduced_pred(kvz_pixel* const output,
 }
 
 
-void kvz_mip_pred_upsampling_1D(kvz_pixel* const dst, const kvz_pixel* const src, const kvz_pixel* const boundary,
+void kvz_mip_pred_upsampling_1D(int* const dst, const int* const src, const int* const boundary,
                                 const uint16_t src_size_ups_dim, const uint16_t src_size_orth_dim,
                                 const uint16_t src_step, const uint16_t src_stride,
                                 const uint16_t dst_step, const uint16_t dst_stride,
@@ -676,15 +676,15 @@ void kvz_mip_pred_upsampling_1D(kvz_pixel* const dst, const kvz_pixel* const src
   const int rounding_offset = 1 << (log2_factor - 1);
 
   uint16_t idx_orth_dim = 0;
-  const kvz_pixel* src_line = src;
-  kvz_pixel* dst_line = dst;
-  const kvz_pixel* boundary_line = boundary + boundary_step - 1;
+  const int* src_line = src;
+  int* dst_line = dst;
+  const int* boundary_line = boundary + boundary_step - 1;
   while (idx_orth_dim < src_size_orth_dim)
   {
     uint16_t idx_upsample_dim = 0;
-    const kvz_pixel* before = boundary_line;
-    const kvz_pixel* behind = src_line;
-    kvz_pixel* cur_dst = dst_line;
+    const int* before = boundary_line;
+    const int* behind = src_line;
+    int* cur_dst = dst_line;
     while (idx_upsample_dim < src_size_ups_dim)
     {
       uint16_t pos = 1;
@@ -721,9 +721,10 @@ void kvz_mip_predict(encoder_state_t const* const state, kvz_intra_references* c
                      kvz_pixel* dst,
                      const int mip_mode, const bool mip_transp)
 {
-  // Separate this function into smaller bits if needed
+  // MIP prediction uses int values instead of kvz_pixel as some temp values may be negative
   
-  kvz_pixel* result = dst;
+  kvz_pixel* out = dst;
+  int result[32*32] = {0};
   const int mode_idx = mip_mode;
 
   // *** INPUT PREP ***
@@ -757,24 +758,29 @@ void kvz_mip_predict(encoder_state_t const* const state, kvz_intra_references* c
 
   // Initialize prediction parameters END
 
-  kvz_pixel* ref_samples_top = &refs->ref.top[1]; // NOTE: in VTM code these are indexed as x + 1 & y + 1 during init
-  kvz_pixel* ref_samples_left = &refs->ref.left[1];
+  int ref_samples_top[INTRA_REF_LENGTH]; 
+  int ref_samples_left[INTRA_REF_LENGTH];
+
+  for (int i = 0; i < INTRA_REF_LENGTH; i++) {
+    ref_samples_top[i] =  (int)refs->ref.top[i+1]; // NOTE: in VTM code these are indexed as x + 1 & y + 1 during init
+    ref_samples_left[i] = (int)refs->ref.left[i+1];
+  }
 
   // Compute reduced boundary with Haar-downsampling
   const int input_size = 2 * red_bdry_size;
 
-  kvz_pixel red_bdry[MIP_MAX_INPUT_SIZE];
-  kvz_pixel red_bdry_trans[MIP_MAX_INPUT_SIZE];
+  int red_bdry[MIP_MAX_INPUT_SIZE];
+  int red_bdry_trans[MIP_MAX_INPUT_SIZE];
 
-  kvz_pixel* const top_reduced = &red_bdry[0];
-  kvz_pixel* const left_reduced = &red_bdry[red_bdry_size];
+  int* const top_reduced = &red_bdry[0];
+  int* const left_reduced = &red_bdry[red_bdry_size];
 
   kvz_mip_boundary_downsampling_1D(top_reduced, ref_samples_top, width, red_bdry_size);
   kvz_mip_boundary_downsampling_1D(left_reduced, ref_samples_left, height, red_bdry_size);
 
   // Transposed reduced boundaries
-  kvz_pixel* const left_reduced_trans = &red_bdry_trans[0];
-  kvz_pixel* const top_reduced_trans = &red_bdry_trans[red_bdry_size];
+  int* const left_reduced_trans = &red_bdry_trans[0];
+  int* const top_reduced_trans = &red_bdry_trans[red_bdry_size];
 
   for (int x = 0; x < red_bdry_size; x++) {
     top_reduced_trans[x] = top_reduced[x];
@@ -819,18 +825,18 @@ void kvz_mip_predict(encoder_state_t const* const state, kvz_intra_references* c
   }
 
   // Max possible size is red_pred_size * red_pred_size, red_pred_size can be either 4 or 8
-  kvz_pixel red_pred_buffer[8*8];
-  kvz_pixel* const reduced_pred = need_upsampling ? red_pred_buffer : result;
+  int red_pred_buffer[8*8];
+  int* const reduced_pred = need_upsampling ? red_pred_buffer : result;
 
-  const kvz_pixel* const reduced_bdry = transpose ? red_bdry_trans : red_bdry;
+  const int* const reduced_bdry = transpose ? red_bdry_trans : red_bdry;
 
   kvz_mip_reduced_pred(reduced_pred, reduced_bdry, matrix, transpose, red_bdry_size, red_pred_size, size_id, input_offset, input_offset_trans);
   if (need_upsampling) {
-    const kvz_pixel* ver_src = reduced_pred;
+    const int* ver_src = reduced_pred;
     uint16_t ver_src_step = width;
     
     if (ups_hor_factor > 1) {
-      kvz_pixel* const hor_dst = result + (ups_ver_factor - 1) * width;
+      int* const hor_dst = result + (ups_ver_factor - 1) * width;
       ver_src = hor_dst;
       ver_src_step *= ups_ver_factor;
 
@@ -847,6 +853,11 @@ void kvz_mip_predict(encoder_state_t const* const state, kvz_intra_references* c
         1, ups_ver_factor);
     }
   }
+
+  // Assign and cast values from temp array to output
+  for (int i = 0; i < 32 * 32; i++) {
+    out[i] = (kvz_pixel)result[i];
+  }
   // *** BLOCK PREDICT *** END
 }
 
@@ -1439,6 +1450,8 @@ static void intra_recon_tb_leaf(
  * \param mode_chroma   intra mode for chroma, or -1 to skip chroma recon
  * \param cur_cu        pointer to the CU, or NULL to fetch CU from LCU
  * \param cclm_params   pointer for the cclm_parameters, can be NULL if the mode is not cclm mode
+ * \param mip_flag      indicates whether the passed mode_luma is a MIP mode
+ * \param mip_transp    indicates whether the used MIP mode is transposed
  * \param lcu           containing LCU
  */
 void kvz_intra_recon_cu(
@@ -1502,6 +1515,7 @@ void kvz_intra_recon_cu(
   } else {
     const bool has_luma = mode_luma != -1;
     const bool has_chroma = mode_chroma != -1 &&  (x % 8 == 0 && y % 8 == 0);
+   
     // Process a leaf TU.
     if (has_luma) {
       intra_recon_tb_leaf(state, x, y, depth, mode_luma, cclm_params, lcu, COLOR_Y, multi_ref_index, use_mip, mip_transposed);
diff --git a/src/intra.h b/src/intra.h
index 7bd27e1f..666044c5 100644
--- a/src/intra.h
+++ b/src/intra.h
@@ -42,9 +42,12 @@
 #include "global.h" // IWYU pragma: keep
 #include "kvazaar.h"
 
+// Maximum possible reference line length for intra blocks
+#define INTRA_REF_LENGTH (2 * 128 + 3 + 33 * MAX_REF_LINE_IDX)
+
 typedef struct {
-  kvz_pixel left[2 * 128 + 3 + 33 * MAX_REF_LINE_IDX];
-  kvz_pixel top[2 * 128 + 3 + 33 * MAX_REF_LINE_IDX];
+  kvz_pixel left[INTRA_REF_LENGTH];
+  kvz_pixel top[INTRA_REF_LENGTH];
 } kvz_intra_ref;
 typedef struct
 {
diff --git a/src/search_intra.c b/src/search_intra.c
index efeda0e5..586a633b 100644
--- a/src/search_intra.c
+++ b/src/search_intra.c
@@ -328,7 +328,7 @@ static double search_intra_trdepth(encoder_state_t * const state,
           continue;
         }
       }
-
+     
       kvz_intra_recon_cu(state,
         x_px, y_px,
         depth,
@@ -870,7 +870,7 @@ static int8_t search_intra_rdo(encoder_state_t * const state,
       pred_cu.intra.mip_is_transposed = false;
     }
     FILL(pred_cu.cbf, 0);
-    search_intra_trdepth(state, x_px, y_px, depth, tr_depth, modes[0], MAX_INT, &pred_cu, lcu, NULL, trafo[0]);
+    search_intra_trdepth(state, x_px, y_px, depth, tr_depth, pred_cu.intra.mode, MAX_INT, &pred_cu, lcu, NULL, trafo[0]);
   }
 
   return modes_to_check;

From e8ef0d2b28e7ee7b7cc65ed0b94a280c553f05a0 Mon Sep 17 00:00:00 2001
From: siivonek <kari.siivonen@tuni.fi>
Date: Fri, 4 Feb 2022 14:41:57 +0200
Subject: [PATCH 20/28] [mip] Fix undefined behaviour error in CI test.

---
 src/intra.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/intra.c b/src/intra.c
index 6ae57361..79c4d639 100644
--- a/src/intra.c
+++ b/src/intra.c
@@ -761,9 +761,9 @@ void kvz_mip_predict(encoder_state_t const* const state, kvz_intra_references* c
   int ref_samples_top[INTRA_REF_LENGTH]; 
   int ref_samples_left[INTRA_REF_LENGTH];
 
-  for (int i = 0; i < INTRA_REF_LENGTH; i++) {
-    ref_samples_top[i] =  (int)refs->ref.top[i+1]; // NOTE: in VTM code these are indexed as x + 1 & y + 1 during init
-    ref_samples_left[i] = (int)refs->ref.left[i+1];
+  for (int i = 1; i < INTRA_REF_LENGTH; i++) {
+    ref_samples_top[i-1] =  (int)refs->ref.top[i]; // NOTE: in VTM code these are indexed as x + 1 & y + 1 during init
+    ref_samples_left[i-1] = (int)refs->ref.left[i];
   }
 
   // Compute reduced boundary with Haar-downsampling

From 769703ea7153fa3910945e9cb5240a09e1f98a59 Mon Sep 17 00:00:00 2001
From: siivonek <kari.siivonen@tuni.fi>
Date: Wed, 9 Feb 2022 03:24:02 +0200
Subject: [PATCH 21/28] [mip] Fix MPM mode selection. When neighboring CU uses
 MIP, signaled intra mode must be planar. Fix chroma reconstruction when MIP
 is enabled. Only allow MIP to be used for chroma if chroma format is 4:4:4.
 Otherwise use planar mode.

---
 src/intra.c        | 41 ++++++++++++++++++++++++++++++++++++-----
 src/search.c       | 14 +++++---------
 src/search_intra.c |  2 +-
 3 files changed, 42 insertions(+), 15 deletions(-)

diff --git a/src/intra.c b/src/intra.c
index 79c4d639..de3efaab 100644
--- a/src/intra.c
+++ b/src/intra.c
@@ -100,14 +100,23 @@ int8_t kvz_intra_get_dir_luma_predictor(
   int8_t number_of_candidates = 0;
 
   // The default mode if block is not coded yet is INTRA_PLANAR.
+  // If the neighboring blocks were MIP blocks, intra mode is set to planar.
   int8_t left_intra_dir  = 0;
   if (left_pu && left_pu->type == CU_INTRA) {
-    left_intra_dir = left_pu->intra.mode;
+    if (left_pu->intra.mip_flag) {
+      left_intra_dir = PLANAR_IDX;
+    } else {
+      left_intra_dir = left_pu->intra.mode;
+    }
   }
 
   int8_t above_intra_dir = 0;
   if (above_pu && above_pu->type == CU_INTRA && y % LCU_WIDTH != 0) {
-    above_intra_dir = above_pu->intra.mode;
+    if (above_pu->intra.mip_flag) {
+      above_intra_dir = PLANAR_IDX;
+    } else {
+      above_intra_dir = above_pu->intra.mode;
+    }
   }
 
   const int offset = 61;
@@ -715,6 +724,7 @@ void kvz_mip_pred_upsampling_1D(int* const dst, const int* const src, const int*
 
 /** \brief Matrix weighted intra prediction.
 */
+// MIP_TODO: remove color parameter if it is not used
 void kvz_mip_predict(encoder_state_t const* const state, kvz_intra_references* const refs,
                      const uint16_t pred_block_width, const uint16_t pred_block_height,
                      const color_t color,
@@ -1347,7 +1357,7 @@ static void intra_recon_tb_leaf(
   lcu_t *lcu,
   color_t color,
   uint8_t multi_ref_idx,
-  bool use_mip,
+  bool mip_flag,
   bool mip_transp)
 {
   const kvz_config *cfg = &state->encoder_control->cfg;
@@ -1395,6 +1405,21 @@ static void intra_recon_tb_leaf(
   kvz_pixel pred[32 * 32];
   int stride = state->tile->frame->source->stride;
   const bool filter_boundary = color == COLOR_Y && !(cfg->lossless && cfg->implicit_rdpcm);
+  bool use_mip = false;
+  if (mip_flag) {
+    if (color == COLOR_Y) {
+      use_mip = true;
+    } else {
+      // MIP can be used for chroma if the chroma scheme is 444
+      if (state->encoder_control->chroma_format == KVZ_CSP_444) {
+        use_mip = true;
+      } else {
+        // If MIP cannot be used for chroma, set mode to planar
+        intra_mode = 0;
+      }
+    }
+  }
+
   if(intra_mode < 68) {
     if (use_mip) {
       assert(intra_mode >= 0 && intra_mode < 16 && "MIP mode must be between [0, 15]");
@@ -1476,6 +1501,12 @@ void kvz_intra_recon_cu(
   uint8_t multi_ref_index = multi_ref_idx;
   bool use_mip = mip_flag;
   bool mip_transposed = mip_transp;
+  
+  if (mode_luma != -1 && mode_chroma != -1) {
+    if (use_mip) {
+      assert(mode_luma == mode_chroma && "Chroma mode must be derived from luma mode if block uses MIP.");
+    }
+  }
 
   // Reset CBFs because CBFs might have been set
   // for depth earlier
@@ -1521,8 +1552,8 @@ void kvz_intra_recon_cu(
       intra_recon_tb_leaf(state, x, y, depth, mode_luma, cclm_params, lcu, COLOR_Y, multi_ref_index, use_mip, mip_transposed);
     }
     if (has_chroma) {
-      intra_recon_tb_leaf(state, x, y, depth, mode_chroma, cclm_params, lcu, COLOR_U, 0, false, false);
-      intra_recon_tb_leaf(state, x, y, depth, mode_chroma, cclm_params, lcu, COLOR_V, 0, false, false);
+      intra_recon_tb_leaf(state, x, y, depth, mode_chroma, cclm_params, lcu, COLOR_U, 0, use_mip, mip_transposed);
+      intra_recon_tb_leaf(state, x, y, depth, mode_chroma, cclm_params, lcu, COLOR_V, 0, use_mip, mip_transposed);
     }
 
     kvz_quantize_lcu_residual(state, has_luma, has_chroma, x, y, depth, cur_cu, lcu, false);
diff --git a/src/search.c b/src/search.c
index 7db7d5b9..99a9df27 100644
--- a/src/search.c
+++ b/src/search.c
@@ -742,10 +742,6 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,
         cur_cu->intra.multi_ref_idx = multi_ref_index;
         cur_cu->intra.mip_flag = mip_flag;
         cur_cu->intra.mip_is_transposed = mip_transposed;
-        // If a MIP mode is selected, set chroma mode to planar and skip further chroma search
-        if (mip_flag) {
-          cur_cu->intra.mode_chroma = 0;
-        }
 
         //If the CU is not split from 64x64 block, the MTS is disabled for that CU.
         cur_cu->tr_idx = (depth > 0) ? intra_trafo : 0;
@@ -756,10 +752,8 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,
     // mode search of adjacent CUs.
     if (cur_cu->type == CU_INTRA) {
       assert(cur_cu->part_size == SIZE_2Nx2N || cur_cu->part_size == SIZE_NxN);
-      // Chroma mode must be planar if mip_flag is set.
-      if (!cur_cu->intra.mip_flag) {
-        cur_cu->intra.mode_chroma = cur_cu->intra.mode;
-      }
+      cur_cu->intra.mode_chroma = cur_cu->intra.mode;
+      
       lcu_fill_cu_info(lcu, x_local, y_local, cu_width, cu_width, cur_cu);
       kvz_intra_recon_cu(state,
                          x, y,
@@ -789,7 +783,8 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,
                            x & ~7, y & ~7, // TODO: as does this
                            depth,
                            -1, cur_cu->intra.mode_chroma, // skip luma
-                           NULL, cclm_params, 0, false, false,
+                           NULL, cclm_params, 0, 
+                           cur_cu->intra.mip_flag, cur_cu->intra.mip_is_transposed,
                            lcu);
       }
     } else if (cur_cu->type == CU_INTER) {
@@ -925,6 +920,7 @@ static double search_cu(encoder_state_t * const state, int x, int y, int depth,
     // of the top left CU from the next depth. This should ensure that 64x64
     // gets used, at least in the most obvious cases, while avoiding any
     // searching.
+    
     if (cur_cu->type == CU_NOTSET && depth < MAX_PU_DEPTH
         && x + cu_width <= frame->width && y + cu_width <= frame->height && 0)
     {
diff --git a/src/search_intra.c b/src/search_intra.c
index 586a633b..fa60eeb9 100644
--- a/src/search_intra.c
+++ b/src/search_intra.c
@@ -364,7 +364,7 @@ static double search_intra_trdepth(encoder_state_t * const state,
         depth,
         -1, chroma_mode,
         pred_cu, cclm_params, 0, 
-        false, false,
+        pred_cu->intra.mip_flag, pred_cu->intra.mip_is_transposed,
         lcu);
       best_rd_cost += kvz_cu_rd_cost_chroma(state, lcu_px.x, lcu_px.y, depth, pred_cu, lcu);
     }

From 7577d5e4fb9329e14a72f3d833f72ec40a37c2ea Mon Sep 17 00:00:00 2001
From: siivonek <kari.siivonen@tuni.fi>
Date: Wed, 9 Feb 2022 03:52:27 +0200
Subject: [PATCH 22/28] [mip] Add CI test.

---
 tests/test_intra.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/test_intra.sh b/tests/test_intra.sh
index 1d417223..3a560002 100755
--- a/tests/test_intra.sh
+++ b/tests/test_intra.sh
@@ -11,6 +11,7 @@ valgrind_test $common_args --rd=1
 valgrind_test $common_args --rd=2 --no-transform-skip --qp 37
 valgrind_test $common_args --rd=2 --no-transform-skip --qp 37 --signhide --rdoq 
 valgrind_test $common_args --rd=2 --mrl
+valgrind_test $common_args --rd=2 --mip
 valgrind_test $common_args --rd=3
 valgrind_test $common_args --alf=full --no-wpp --threads=0 --owf=0
 valgrind_test $common_args --alf=full --wpp --threads=1

From 09f3af81c66721c9a4222b23d86969cf224bf391 Mon Sep 17 00:00:00 2001
From: siivonek <kari.siivonen@tuni.fi>
Date: Thu, 10 Feb 2022 00:41:20 +0200
Subject: [PATCH 23/28] [mip] Improve mip flag context function.

---
 src/intra.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/intra.c b/src/intra.c
index de3efaab..bd8396a3 100644
--- a/src/intra.c
+++ b/src/intra.c
@@ -557,8 +557,11 @@ void kvz_predict_cclm(
 
 int kvz_get_mip_flag_context(int x, int y, int width, int height, lcu_t* const lcu, cu_array_t* const cu_a) {
   assert(!(lcu && cu_a));
-  int context = 0;
+  if (width > 2 * height || height > 2 * width) {
+    return 3;
+  }
   
+  int context = 0;
   if (lcu) {
     int x_local = SUB_SCU(x);
     int y_local = SUB_SCU(y);
@@ -568,7 +571,6 @@ int kvz_get_mip_flag_context(int x, int y, int width, int height, lcu_t* const l
     if (y) {
       context += LCU_GET_CU_AT_PX(lcu, x_local, y_local - 1)->intra.mip_flag;
     }
-    context = (width > 2 * height || height > 2 * width) ? 3 : context;
   }
   else {
     if (x > 0) {
@@ -577,7 +579,6 @@ int kvz_get_mip_flag_context(int x, int y, int width, int height, lcu_t* const l
     if (y > 0) {
       context += kvz_cu_array_at_const(cu_a, x, y - 1)->intra.mip_flag;
     }
-    context = (width > 2 * height || height > 2 * width) ? 3 : context;
   }
   return context;
 }

From ac45a5299ce566d5845d8d42aa2280058597c9c2 Mon Sep 17 00:00:00 2001
From: siivonek <kari.siivonen@tuni.fi>
Date: Thu, 10 Feb 2022 02:12:06 +0200
Subject: [PATCH 24/28] [mip] Add define for number of mip modes. Fix mip cost
 calculation. If mip is enabled, the cost of writing the mip flag must always
 be included. Some code cleanup.

---
 src/intra.c        |  2 --
 src/intra.h        |  1 -
 src/search.c       |  7 ++++--
 src/search.h       |  3 +++
 src/search_intra.c | 56 +++++++++++++++++++++-------------------------
 5 files changed, 33 insertions(+), 36 deletions(-)

diff --git a/src/intra.c b/src/intra.c
index bd8396a3..1925b8d7 100644
--- a/src/intra.c
+++ b/src/intra.c
@@ -725,10 +725,8 @@ void kvz_mip_pred_upsampling_1D(int* const dst, const int* const src, const int*
 
 /** \brief Matrix weighted intra prediction.
 */
-// MIP_TODO: remove color parameter if it is not used
 void kvz_mip_predict(encoder_state_t const* const state, kvz_intra_references* const refs,
                      const uint16_t pred_block_width, const uint16_t pred_block_height,
-                     const color_t color,
                      kvz_pixel* dst,
                      const int mip_mode, const bool mip_transp)
 {
diff --git a/src/intra.h b/src/intra.h
index 666044c5..44ab404d 100644
--- a/src/intra.h
+++ b/src/intra.h
@@ -160,7 +160,6 @@ void kvz_mip_predict(
   kvz_intra_references * refs,
   const uint16_t width,
   const uint16_t height,
-  const color_t color,
   kvz_pixel* dst,
   const int mip_mode,
   const bool mip_transp
diff --git a/src/search.c b/src/search.c
index 99a9df27..1bdc67d5 100644
--- a/src/search.c
+++ b/src/search.c
@@ -504,8 +504,11 @@ static double calc_mode_bits(const encoder_state_t *state,
     kvz_intra_get_dir_luma_predictor(x, y, candidate_modes, cur_cu, left_cu, above_cu);
   }
 
-  // MIP_TODO: calculation of MIP mode cost if this CU has MIP enabled.
-  double mode_bits = kvz_luma_mode_bits(state, cur_cu->intra.mode, candidate_modes, cur_cu->intra.multi_ref_idx, 0, 0);
+  int width = LCU_WIDTH >> depth;
+  int height = width; // TODO: height for non-square blocks
+  int num_mip_modes_half = NUM_MIP_MODES_HALF(width, height);
+  int mip_flag_ctx_id = kvz_get_mip_flag_context(x, y, width, height, lcu, NULL);
+  double mode_bits = kvz_luma_mode_bits(state, cur_cu->intra.mode, candidate_modes, cur_cu->intra.multi_ref_idx, num_mip_modes_half, mip_flag_ctx_id);
 
   if (((depth == 4 && x % 8 && y % 8) || (depth != 4)) && state->encoder_control->chroma_format != KVZ_CSP_400) {
     mode_bits += kvz_chroma_mode_bits(state, cur_cu->intra.mode_chroma, cur_cu->intra.mode);
diff --git a/src/search.h b/src/search.h
index 3694a2ff..044aa8a2 100644
--- a/src/search.h
+++ b/src/search.h
@@ -44,6 +44,9 @@
 #include "image.h"
 #include "constraint.h"
 
+#define NUM_MIP_MODES_FULL(width, height) (width == 4 && height == 4) ? 32 : (width == 4 || height == 4 || (width == 8 && height == 8) ? 16 : 12)
+#define NUM_MIP_MODES_HALF(width, height) NUM_MIP_MODES_FULL(width, height) >> 1
+
 void kvz_sort_modes(int8_t *__restrict modes, double *__restrict costs, uint8_t length);
 void kvz_sort_modes_intra_luma(int8_t *__restrict modes, int8_t *__restrict trafo, double *__restrict costs, uint8_t length);
 
diff --git a/src/search_intra.c b/src/search_intra.c
index fa60eeb9..3b597c11 100644
--- a/src/search_intra.c
+++ b/src/search_intra.c
@@ -719,7 +719,7 @@ static int8_t search_intra_rdo(encoder_state_t * const state,
                              int8_t *intra_preds,
                              int modes_to_check,
                              int8_t modes[67], int8_t trafo[67], double costs[67],
-                             int num_mip_modes,
+                             int num_mip_modes_full,
                              int8_t mip_modes[32], int8_t mip_trafo[32], double mip_costs[32],
                              lcu_t *lcu,
                              uint8_t multi_ref_idx)
@@ -756,14 +756,15 @@ static int8_t search_intra_rdo(encoder_state_t * const state,
   }
 
   // MIP_TODO: implement this inside the standard intra for loop. Code duplication is bad.
+  // MIP_TODO: loop through normal intra modes first
   // MIP search
-  const int transp_off = num_mip_modes >> 1;
-  for (uint8_t mip_mode = 0; mip_mode < num_mip_modes; ++mip_mode) {
-    // Derive mip flag context id
-    uint8_t ctx_id = kvz_get_mip_flag_context(x_px, y_px, width, height, lcu, NULL);
-    int rdo_bitcost = kvz_luma_mode_bits(state, mip_modes[mip_mode], intra_preds, 0, num_mip_modes, ctx_id);
+  const int transp_off = num_mip_modes_full >> 1;
+  // Derive mip flag context id
+  uint8_t ctx_id = kvz_get_mip_flag_context(x_px, y_px, width, height, lcu, NULL);
+  for (uint8_t mip_mode = 0; mip_mode < num_mip_modes_full; ++mip_mode) {
+    int rdo_bitcost = kvz_luma_mode_bits(state, mip_modes[mip_mode], intra_preds, 0, transp_off, ctx_id);
 
-    mip_costs[mip_mode] = rdo_bitcost * (int)(state->lambda + 0.5); // MIP_TODO: check if this is also correct in the case when MIP is used.
+    mip_costs[mip_mode] = rdo_bitcost * (int)(state->lambda + 0.5);
 
     const bool is_transposed = (mip_modes[mip_mode] >= transp_off ? true : false);
     // There can be 32 MIP modes, but only mode numbers [0, 15] are ever written to bitstream.
@@ -791,7 +792,6 @@ static int8_t search_intra_rdo(encoder_state_t * const state,
     mip_costs[mip_mode] += mode_cost;
     mip_trafo[mip_mode] = pred_cu.tr_idx;
 
-    // MIP_TODO: check if ET is viable when MIP is used
     // Early termination if no coefficients has to be coded
     if (state->encoder_control->cfg.intra_rdo_et && !cbf_is_set_any(pred_cu.cbf, depth)) {
       modes_to_check = mip_mode + 1;
@@ -834,8 +834,8 @@ static int8_t search_intra_rdo(encoder_state_t * const state,
   // Update order according to new costs
   kvz_sort_modes_intra_luma(modes, trafo, costs, modes_to_check);
   bool use_mip = false;
-  if (num_mip_modes) {
-    kvz_sort_modes_intra_luma(mip_modes, mip_trafo, mip_costs, num_mip_modes);
+  if (num_mip_modes_full) {
+    kvz_sort_modes_intra_luma(mip_modes, mip_trafo, mip_costs, num_mip_modes_full);
     if (costs[0] > mip_costs[0]) {
       use_mip = true;
     }
@@ -854,7 +854,7 @@ static int8_t search_intra_rdo(encoder_state_t * const state,
       pred_cu.intra.mode = mip_modes[0];
       pred_cu.intra.mode_chroma = 0;
       pred_cu.intra.multi_ref_idx = 0;
-      int transp_off = num_mip_modes >> 1;
+      int transp_off = num_mip_modes_full >> 1;
       bool is_transposed = (mip_modes[0] >= transp_off ? true : false);
       int8_t pred_mode = (is_transposed ? mip_modes[0] - transp_off : mip_modes[0]);
       pred_cu.intra.mode = pred_mode;
@@ -877,12 +877,14 @@ static int8_t search_intra_rdo(encoder_state_t * const state,
 }
 
 
-double kvz_luma_mode_bits(const encoder_state_t *state, int8_t luma_mode, const int8_t *intra_preds, const uint8_t multi_ref_idx, const uint8_t num_mip_modes, int mip_flag_ctx_id)
+double kvz_luma_mode_bits(const encoder_state_t *state, int8_t luma_mode, const int8_t *intra_preds, const uint8_t multi_ref_idx, const uint8_t num_mip_modes_half, int mip_flag_ctx_id)
 {
   double mode_bits = 0.0;
 
-  bool enable_mip = state->encoder_control->cfg.mip ? (num_mip_modes > 0 ? true : false) : false;
+  bool enable_mip = state->encoder_control->cfg.mip;
+  bool mip_flag = enable_mip ? (num_mip_modes_half > 0 ? true : false) : false;
 
+  // Mip flag cost must be calculated even if mip is not used in this block
   if (enable_mip) {
     // Make a copy of state->cabac for bit cost estimation.
     cabac_data_t state_cabac_copy;
@@ -896,24 +898,25 @@ double kvz_luma_mode_bits(const encoder_state_t *state, int8_t luma_mode, const
     cabac = &state_cabac_copy;
 
     // Do cabac writes as normal
-    const int transp_off = num_mip_modes >> 1;
-    bool mip_flag = enable_mip;
+    const int transp_off = num_mip_modes_half;
     const bool is_transposed = luma_mode >= transp_off ? true : false;
     int8_t mip_mode = is_transposed ? luma_mode - transp_off : luma_mode;
 
     // Write MIP flag
     cabac->cur_ctx = &(cabac->ctx.mip_flag[mip_flag_ctx_id]);
     CABAC_BIN(cabac, mip_flag, "mip_flag");
+    
     if (mip_flag) {
       // Write MIP transpose flag & mode
       CABAC_BIN_EP(cabac, is_transposed, "mip_transposed");
       kvz_cabac_encode_trunc_bin(cabac, mip_mode, transp_off);
     }
-
-    // Writes done. Get bit cost out of cabac
-    mode_bits += (23 - state_cabac_copy.bits_left) + (state_cabac_copy.num_buffered_bytes << 3); // MIP_TODO: check what this bit shifting means.
+    
+    // Write is done. Get bit cost out of cabac
+    mode_bits += (23 - state_cabac_copy.bits_left) + (state_cabac_copy.num_buffered_bytes << 3);
   }
-  else {
+
+  if (!mip_flag) {
     int8_t mode_in_preds = -1;
     for (int i = 0; i < INTRA_MPM_COUNT; ++i) {
       if (luma_mode == intra_preds[i]) {
@@ -1211,18 +1214,9 @@ void kvz_search_cu_intra(encoder_state_t * const state,
       mip_modes[i] = i;
       mip_costs[i] = MAX_INT;
     }
-    // MIP_TODO: check for illegal block sizes.
-    if (width == 4 && height == 4) {
-      // Mip size_id = 0. Num modes = 32
-      num_mip_modes = 32;
-    }
-    else if (width == 4 || height == 4 || (width == 8 && height == 8)) {
-      // Mip size_id = 1. Num modes = 16
-      num_mip_modes = 16;
-    }
-    else {
-      // Mip size_id = 2. Num modes = 12
-      num_mip_modes = 12;
+    // MIP is not allowed for 64 x 4 or 4 x 64 blocks
+    if (!((width == 64 && height == 4) || (width == 4 && height == 64))) {
+      num_mip_modes = NUM_MIP_MODES_FULL(width, height);
     }
   }
 

From 9b04a6f302c89082645b5c14452397f1b675b247 Mon Sep 17 00:00:00 2001
From: siivonek <kari.siivonen@tuni.fi>
Date: Tue, 15 Feb 2022 11:24:01 +0200
Subject: [PATCH 25/28] [mip] Combine mip mode search loop into the original
 intra mode search loop. Some code clean up.

---
 src/intra.c        |   2 +-
 src/search_intra.c | 111 ++++++++++++++++++---------------------------
 2 files changed, 44 insertions(+), 69 deletions(-)

diff --git a/src/intra.c b/src/intra.c
index 1925b8d7..40a1f77c 100644
--- a/src/intra.c
+++ b/src/intra.c
@@ -1422,7 +1422,7 @@ static void intra_recon_tb_leaf(
   if(intra_mode < 68) {
     if (use_mip) {
       assert(intra_mode >= 0 && intra_mode < 16 && "MIP mode must be between [0, 15]");
-      kvz_mip_predict(state, &refs, width, height, color, pred, intra_mode, mip_transp);
+      kvz_mip_predict(state, &refs, width, height, pred, intra_mode, mip_transp);
     }
     else {
       kvz_intra_predict(state, &refs, log2width, intra_mode, color, pred, filter_boundary, multi_ref_index);
diff --git a/src/search_intra.c b/src/search_intra.c
index 3b597c11..87139b93 100644
--- a/src/search_intra.c
+++ b/src/search_intra.c
@@ -757,77 +757,53 @@ static int8_t search_intra_rdo(encoder_state_t * const state,
 
   // MIP_TODO: implement this inside the standard intra for loop. Code duplication is bad.
   // MIP_TODO: loop through normal intra modes first
-  // MIP search
-  const int transp_off = num_mip_modes_full >> 1;
-  // Derive mip flag context id
-  uint8_t ctx_id = kvz_get_mip_flag_context(x_px, y_px, width, height, lcu, NULL);
-  for (uint8_t mip_mode = 0; mip_mode < num_mip_modes_full; ++mip_mode) {
-    int rdo_bitcost = kvz_luma_mode_bits(state, mip_modes[mip_mode], intra_preds, 0, transp_off, ctx_id);
+  
+  for (int mip = 0; mip <= 1; mip++) {
+    const int transp_off = mip ? num_mip_modes_full >> 1 : 0;
+    uint8_t ctx_id = mip ? kvz_get_mip_flag_context(x_px, y_px, width, height, lcu, NULL) : 0;
+    uint8_t multi_ref_index = mip ? 0 : multi_ref_idx;
+    int *num_modes = mip ? &num_mip_modes_full : &modes_to_check;
 
-    mip_costs[mip_mode] = rdo_bitcost * (int)(state->lambda + 0.5);
+    for (uint8_t i = 0; i < *num_modes; i++) {
+      int8_t mode = mip ? mip_modes[i] : modes[i];
+      double *mode_cost_p = mip ? &mip_costs[i] : &costs[i];
+      int8_t *mode_trafo_p = mip ? &mip_trafo[i] : &trafo[i];
+      int rdo_bitcost = kvz_luma_mode_bits(state, mode, intra_preds, multi_ref_index, transp_off, ctx_id);
 
-    const bool is_transposed = (mip_modes[mip_mode] >= transp_off ? true : false);
-    // There can be 32 MIP modes, but only mode numbers [0, 15] are ever written to bitstream.
-    // Half of the modes [16, 31] are indicated with the separate transpose flag.
-    // Number of possible modes is less for larger blocks.
-    int8_t pred_mode = (is_transposed ? mip_modes[mip_mode] - transp_off : mip_modes[mip_mode]);
+      *mode_cost_p = rdo_bitcost * (int)(state->lambda + 0.5);
 
-    // Perform transform split search and save mode RD cost for the best one.
-    cu_info_t pred_cu;
-    pred_cu.depth = depth;
-    pred_cu.type = CU_INTRA;
-    pred_cu.part_size = ((depth == MAX_PU_DEPTH) ? SIZE_NxN : SIZE_2Nx2N); // TODO: non-square blocks
-    pred_cu.intra.mode = pred_mode;
-    pred_cu.intra.mode_chroma = pred_mode;
-    pred_cu.intra.multi_ref_idx = 0;
-    pred_cu.intra.mip_is_transposed = is_transposed;
-    pred_cu.intra.mip_flag = true;
-    pred_cu.joint_cb_cr = 0;
-    FILL(pred_cu.cbf, 0);
+      // Mip related stuff
+      // There can be 32 MIP modes, but only mode numbers [0, 15] are ever written to bitstream.
+      // Half of the modes [16, 31] are indicated with the separate transpose flag.
+      // Number of possible modes is less for larger blocks.
+      const bool is_transposed = mip ? (mode >= transp_off ? true : false) : 0;
+      int8_t pred_mode = (is_transposed ? mode - transp_off : mode);
 
-    // Reset transform split data in lcu.cu for this area.
-    kvz_lcu_fill_trdepth(lcu, x_px, y_px, depth, depth);
+      // Perform transform split search and save mode RD cost for the best one.
+      cu_info_t pred_cu;
+      pred_cu.depth = depth;
+      pred_cu.type = CU_INTRA;
+      pred_cu.part_size = ((depth == MAX_PU_DEPTH) ? SIZE_NxN : SIZE_2Nx2N); // TODO: non-square blocks
+      pred_cu.intra.mode = pred_mode;
+      pred_cu.intra.mode_chroma = pred_mode;
+      pred_cu.intra.multi_ref_idx = multi_ref_index;
+      pred_cu.intra.mip_is_transposed = is_transposed;
+      pred_cu.intra.mip_flag = mip ? true : false;
+      pred_cu.joint_cb_cr = 0;
+      FILL(pred_cu.cbf, 0);
 
-    double mode_cost = search_intra_trdepth(state, x_px, y_px, depth, tr_depth, pred_mode, MAX_INT, &pred_cu, lcu, NULL, -1);
-    mip_costs[mip_mode] += mode_cost;
-    mip_trafo[mip_mode] = pred_cu.tr_idx;
+      // Reset transform split data in lcu.cu for this area.
+      kvz_lcu_fill_trdepth(lcu, x_px, y_px, depth, depth);
 
-    // Early termination if no coefficients has to be coded
-    if (state->encoder_control->cfg.intra_rdo_et && !cbf_is_set_any(pred_cu.cbf, depth)) {
-      modes_to_check = mip_mode + 1;
-      break;
-    }
-  }
+      double mode_cost = search_intra_trdepth(state, x_px, y_px, depth, tr_depth, pred_mode, MAX_INT, &pred_cu, lcu, NULL, -1);
+      *mode_cost_p += mode_cost;
+      *mode_trafo_p = pred_cu.tr_idx;
 
-  for(int rdo_mode = 0; rdo_mode < modes_to_check; rdo_mode ++) {
-    int rdo_bitcost = kvz_luma_mode_bits(state, modes[rdo_mode], intra_preds, multi_ref_idx, 0, 0);
-
-    costs[rdo_mode] = rdo_bitcost * (int)(state->lambda + 0.5);
-
-    // Perform transform split search and save mode RD cost for the best one.
-    cu_info_t pred_cu;
-    pred_cu.depth = depth;
-    pred_cu.type = CU_INTRA;
-    pred_cu.part_size = ((depth == MAX_PU_DEPTH) ? SIZE_NxN : SIZE_2Nx2N);
-    pred_cu.intra.mode = modes[rdo_mode];
-    pred_cu.intra.mode_chroma = modes[rdo_mode];
-    pred_cu.intra.multi_ref_idx = multi_ref_idx;
-    pred_cu.intra.mip_is_transposed = false;
-    pred_cu.intra.mip_flag = false;
-    pred_cu.joint_cb_cr = 0;
-    FILL(pred_cu.cbf, 0);
-
-    // Reset transform split data in lcu.cu for this area.
-    kvz_lcu_fill_trdepth(lcu, x_px, y_px, depth, depth);
-
-    double mode_cost = search_intra_trdepth(state, x_px, y_px, depth, tr_depth, modes[rdo_mode], MAX_INT, &pred_cu, lcu, NULL, -1);
-    costs[rdo_mode] += mode_cost;
-    trafo[rdo_mode] = pred_cu.tr_idx;
-
-    // Early termination if no coefficients has to be coded
-    if (state->encoder_control->cfg.intra_rdo_et && !cbf_is_set_any(pred_cu.cbf, depth)) {
-      modes_to_check = rdo_mode + 1;
-      break;
+      // Early termination if no coefficients has to be coded
+      if (state->encoder_control->cfg.intra_rdo_et && !cbf_is_set_any(pred_cu.cbf, depth)) {
+        *num_modes = i + 1;
+        break;
+      }
     }
   }
 
@@ -851,14 +827,12 @@ static int8_t search_intra_rdo(encoder_state_t * const state,
     pred_cu.type = CU_INTRA;
     pred_cu.part_size = ((depth == MAX_PU_DEPTH) ? SIZE_NxN : SIZE_2Nx2N);
     if (use_mip) {
-      pred_cu.intra.mode = mip_modes[0];
-      pred_cu.intra.mode_chroma = 0;
-      pred_cu.intra.multi_ref_idx = 0;
       int transp_off = num_mip_modes_full >> 1;
       bool is_transposed = (mip_modes[0] >= transp_off ? true : false);
       int8_t pred_mode = (is_transposed ? mip_modes[0] - transp_off : mip_modes[0]);
       pred_cu.intra.mode = pred_mode;
-      pred_cu.intra.mode_chroma = 0;
+      pred_cu.intra.mode_chroma = pred_mode;
+      pred_cu.intra.multi_ref_idx = 0;
       pred_cu.intra.mip_flag = true;
       pred_cu.intra.mip_is_transposed = is_transposed;
     }
@@ -873,6 +847,7 @@ static int8_t search_intra_rdo(encoder_state_t * const state,
     search_intra_trdepth(state, x_px, y_px, depth, tr_depth, pred_cu.intra.mode, MAX_INT, &pred_cu, lcu, NULL, trafo[0]);
   }
 
+  // TODO: modes to check does not consider mip modes. Maybe replace with array when mip search is optimized?
   return modes_to_check;
 }
 

From fa963234a825b5c2456df031931e2e36644ba4c8 Mon Sep 17 00:00:00 2001
From: siivonek <kari.siivonen@tuni.fi>
Date: Wed, 16 Feb 2022 17:14:26 +0200
Subject: [PATCH 26/28] [mip] Fix CI error. Const modifier in wrong place.

---
 src/intra.c | 2 +-
 src/intra.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/intra.c b/src/intra.c
index 40a1f77c..cd103900 100644
--- a/src/intra.c
+++ b/src/intra.c
@@ -555,7 +555,7 @@ void kvz_predict_cclm(
 }
 
 
-int kvz_get_mip_flag_context(int x, int y, int width, int height, lcu_t* const lcu, cu_array_t* const cu_a) {
+int kvz_get_mip_flag_context(int x, int y, int width, int height, const lcu_t* lcu, cu_array_t* const cu_a) {
   assert(!(lcu && cu_a));
   if (width > 2 * height || height > 2 * width) {
     return 3;
diff --git a/src/intra.h b/src/intra.h
index 44ab404d..4e3542c3 100644
--- a/src/intra.h
+++ b/src/intra.h
@@ -153,7 +153,7 @@ void kvz_predict_cclm(
   cclm_parameters_t* cclm_params
 );
 
-int kvz_get_mip_flag_context(int x, int y, int width, int height, lcu_t* const lcu, cu_array_t* const cu_a);
+int kvz_get_mip_flag_context(int x, int y, int width, int height, const lcu_t* lcu, cu_array_t* const cu_a);
 
 void kvz_mip_predict(
   encoder_state_t const * const state,

From ae2e0da6ab9a5b58f1308adda3476c161c9576fd Mon Sep 17 00:00:00 2001
From: siivonek <kari.siivonen@tuni.fi>
Date: Fri, 4 Mar 2022 00:12:15 +0200
Subject: [PATCH 27/28] [mip] Add parentheses around macro parameters.

---
 src/search.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/search.h b/src/search.h
index 044aa8a2..4eb5943f 100644
--- a/src/search.h
+++ b/src/search.h
@@ -44,8 +44,8 @@
 #include "image.h"
 #include "constraint.h"
 
-#define NUM_MIP_MODES_FULL(width, height) (width == 4 && height == 4) ? 32 : (width == 4 || height == 4 || (width == 8 && height == 8) ? 16 : 12)
-#define NUM_MIP_MODES_HALF(width, height) NUM_MIP_MODES_FULL(width, height) >> 1
+#define NUM_MIP_MODES_FULL(width, height) (((width) == 4 && (height) == 4) ? 32 : ((width) == 4 || (height) == 4 || ((width) == 8 && (height) == 8) ? 16 : 12)) /* total number of MIP modes for the block size */
+#define NUM_MIP_MODES_HALF(width, height) (NUM_MIP_MODES_FULL((width), (height)) >> 1) /* half of the total; outer parentheses keep >> from binding only to the ?: else-branch */
 
 void kvz_sort_modes(int8_t *__restrict modes, double *__restrict costs, uint8_t length);
 void kvz_sort_modes_intra_luma(int8_t *__restrict modes, int8_t *__restrict trafo, double *__restrict costs, uint8_t length);

From 0dd7646a34a6653a5325a304424634b6da846b30 Mon Sep 17 00:00:00 2001
From: Joose Sainio <joose.sainio@tuni.fi>
Date: Thu, 10 Mar 2022 10:48:37 +0200
Subject: [PATCH 28/28] Fix mip context calculation for P and B slices

---
 src/intra.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/src/intra.c b/src/intra.c
index cd103900..4c6e204b 100644
--- a/src/intra.c
+++ b/src/intra.c
@@ -562,24 +562,28 @@ int kvz_get_mip_flag_context(int x, int y, int width, int height, const lcu_t* l
   }
   
   int context = 0;
+  const cu_info_t* left = NULL;
+  const cu_info_t* top = NULL;
   if (lcu) {
     int x_local = SUB_SCU(x);
     int y_local = SUB_SCU(y);
     if (x) {
-      context += LCU_GET_CU_AT_PX(lcu, x_local - 1, y_local)->intra.mip_flag;
+      left = LCU_GET_CU_AT_PX(lcu, x_local - 1, y_local); 
     }
     if (y) {
-      context += LCU_GET_CU_AT_PX(lcu, x_local, y_local - 1)->intra.mip_flag;
+      top = LCU_GET_CU_AT_PX(lcu, x_local, y_local - 1); // CU directly above; (x_local - 1, y_local) would fetch the left CU again
     }
   }
   else {
     if (x > 0) {
-      context += kvz_cu_array_at_const(cu_a, x - 1, y)->intra.mip_flag;
+      left = kvz_cu_array_at_const(cu_a, x - 1, y);
     }
     if (y > 0) {
-      context += kvz_cu_array_at_const(cu_a, x, y - 1)->intra.mip_flag;
+      top = kvz_cu_array_at_const(cu_a, x, y - 1);
     }
   }
+  context += left && left->type == CU_INTRA ? left->intra.mip_flag : 0;
+  context += top && top->type == CU_INTRA ? top->intra.mip_flag : 0;
   return context;
 }